In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Select seaborn's "whitegrid" plotting style.
sns.set_style("whitegrid")

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os Dados

In [4]:
# Load the cleaned leads dataset.
# Forward slashes keep the relative path portable (Windows accepts them as well)
# and avoid the invalid "\d" / "\l" escape sequences that the backslash-separated
# literal contained, which made Python emit an invalid-escape warning.
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

  df_leads = pd.read_csv('.\datasets\leads_cleaned.csv')


In [6]:
# Show summary information about the data (columns, non-null counts, dtypes)
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

In [5]:
# Show the first 10 rows
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


#### Preparação dos Dados

In [7]:
# Split the frame into the target (the 'Converted' column) and the features
# (every remaining column) for the model.
y = df_leads['Converted']
X = df_leads.drop('Converted', axis=1)

In [8]:
# Column lists used later by the interpretability section:
# object-typed columns are treated as categorical, numeric-typed ones as numeric.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

In [11]:
# Use the preprocessor saved earlier.
# Forward slashes keep the relative path portable and avoid the invalid "\p"
# escape sequence that the backslash-separated literal contained.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts that come from a trusted source.
import joblib
preprocessor = joblib.load('./preprocessor_leads.pkl')

  preprocessor = joblib.load('.\preprocessor_leads.pkl')


In [20]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=51,
    test_size=0.2,
)

In [21]:
# Apply the preprocessor: it is (re)fitted on the training split only and the
# fitted transformation is then applied to the test split.
# NOTE: X_train / X_test are overwritten with the transformed output, so re-running
# this cell alone would feed already-transformed data back into the preprocessor;
# re-run the train/test split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [22]:
# Print the (rows, columns) shape of the transformed train and test sets, in that order.
for split in (X_train, X_test):
    print(split.shape)


(7259, 68)
(1815, 68)


#### Treinamento do Modelo

In [23]:
# Build the boosting classifier: AdaBoost with logistic regression as the base estimator.
boosting_model = AdaBoostClassifier(
    random_state=51,
    estimator=LogisticRegression(),
    learning_rate=1.0,  # default value; the smaller it is, the longer boosting takes to converge
    n_estimators=50,  # default value
)


In [24]:
# Train the model on the preprocessed training split
boosting_model.fit(X_train, y_train)

#### Análise das Métricas

In [25]:
# Predict on the test set
y_pred = boosting_model.predict(X_test)

In [26]:
# Main classification metrics, each computed from the true test labels and the predictions.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)


In [27]:
# Results: one metric per line
print(
    f'Acuracia: {accuracy}',
    f'Precisão: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    sep='\n',
)

Acuracia: 0.7856749311294766
Precisão: 0.7004279600570613
Recall: 0.7328358208955223
F1 Score: 0.7162654996353027


In [28]:
# Show the confusion matrix as a heatmap (rows = actual class, columns = predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(
    conf_matrix,
    labels={'x': 'Predição', 'y': 'Real', 'color': 'Contagem'},
    x=['Not Converted', 'Converted'],
    y=['Not Converted', 'Converted'],
    color_continuous_scale='Viridis',
)

# Write each cell's count on the heatmap, hide the colour bar, then render.
(
    fig.update_traces(text=conf_matrix, texttemplate='%{z}')
       .update_layout(coloraxis_showscale=False)
       .show()
)

#### Análise de Importância

In [30]:
# Feature importance: stack the coefficient vector of every fitted base estimator,
# take absolute values and average them per feature (unweighted across estimators).
importances = np.abs(
    np.vstack([base.coef_[0] for base in boosting_model.estimators_])
).mean(axis=0)

In [31]:
# Recover readable feature names: the numeric column names followed by the names
# the categorical transformer generated for its output columns.
# NOTE(review): assumes the preprocessor outputs the numeric columns first and that
# its categorical transformer is registered as 'cat' -- confirm against the saved preprocessor.
feature_names = list(numeric_features) + list(
    preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
)

In [34]:
# Pair every feature name with its importance in a two-column DataFrame.
df_feature_impontances = pd.DataFrame(
    dict(Feature=feature_names, Importance=importances)
)

In [35]:
# Order the rows from the most to the least important feature.
df_feature_impontances = df_feature_impontances.sort_values(
    'Importance',
    ascending=False,
)

In [36]:
# Plot the importances as a horizontal bar chart, one bar per feature.
fig = px.bar(
    df_feature_impontances,
    y='Feature',
    x='Importance',
    orientation='h',
    title='Importância das Features (baseada nos coeficientes absolutos)',
)

# Tall figure so every feature label fits; bars are ordered by their total value.
fig.update_layout(
    width=1000,
    height=1200,
    yaxis=dict(categoryorder='total ascending'),
).show()