In [2]:
import pandas as pd
import plotly.express as px
import numpy as np
import seaborn as sns

sns.set_style("whitegrid")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os Dados

In [3]:
# Carregar os dados ja tratados
df_leads = pd.read_csv('.\datasets\leads_cleaned.csv')

  df_leads = pd.read_csv('.\datasets\leads_cleaned.csv')


In [4]:
# Mostrar primeiras linhas
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [5]:
# Estrutura do DataSet
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparação dos Dados

In [6]:
# Preparar os dados para o modelo
X = df_leads.drop(columns=['Converted'])
y = df_leads['Converted']

In [7]:
# Criar listas de colunas
numeric_features = df_leads.select_dtypes(include=['number']).columns
categorical_features = df_leads.select_dtypes(include=['object']).columns

In [8]:
# Usar preprocessor existente
import joblib
preprocessor = joblib.load('.\preprocessor_leads.pkl')

  preprocessor = joblib.load('.\preprocessor_leads.pkl')


In [9]:
# Dividir o dataset entre treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [10]:
# Aplicar preprocessor
X_train = preprocessor.fit_transform(X_train).toarray()
X_test = preprocessor.transform(X_test).toarray()

In [11]:
# Mostrar os Conjuntos
print(f'COnjunto de Treinamento: {X_train.shape}\nCOnjunto de Test: {X_test.shape}')

COnjunto de Treinamento: (7259, 68)
COnjunto de Test: (1815, 68)


### Treinamento do Modelo
- Criar o modelo de StackingClassifier

In [12]:
# Meta-Modelo
lr_model = LogisticRegression(random_state=51)

# Modelos Base
tree_model = DecisionTreeClassifier(random_state=51)
svc_model = SVC(kernel='linear')
sgd_model = SGDClassifier(penalty='elasticnet', random_state=51)

# Criar o objeto do StackingClassifier
stacking_model = StackingClassifier(
    estimators=[
        ('sgd classifier', sgd_model),
        ('svc', svc_model),
        ('decision tree', tree_model),
    ],
    final_estimator=lr_model,
    # passthrough=False, usa apenas os redultados dos estimadores de cada algoritmo base (Vanilla)
    # passthrough=True, usa os resultados dos estimadores de cada algoritmo base + dataset original (Blending)
    passthrough=False, 
)

In [13]:
# TReinar o Modelo
stacking_model.fit(X_train, y_train)

### Avaliação do Modelo

In [14]:
# Fazer predições no conjunto de teste
y_pred = stacking_model.predict(X_test)

In [16]:
# Calcular Métricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Apresentar as Métricas
print(f'Acurácia: {accuracy}\nPrecisão: {precision}\nRecall: {recall}\nF1 Score: {f1}')

Acurácia: 0.8
Precisão: 0.74481658692185
Recall: 0.6970149253731344
F1 Score: 0.7201233616037008


In [18]:
# Mostrar a Matriz de Confusão
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(conf_matrix,
                      labels=dict(x='Precisão', y='Real', color='Contagem'),
                      x=['Not Converted', 'Converted'],
                      y=['Not Converted', 'Converted'],
                      color_continuous_scale='Viridis')

fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)

### Análise das Importâncias

In [19]:
# Calcular a importância das variáveis considerando o Stacking Classifier

importances = []

for estimador in stacking_model.estimators_:
    # Modelos lineares possuem coeficiente
    if hasattr(estimador, 'coef_'):
        importances.append(np.abs(estimador.coef_[0]))
        print(f'Coeficientes do Modelo {type(estimador).__name__}')
    # Modelos baseados em árvores
    elif hasattr(estimador, 'feature_importances_'):
        importances.append(np.abs(estimador.feature_importances_))
        print(f'Feature Importances do Modelo {type(estimador).__name__}')
    else:
        print(f'Não foi possível calcular a importância para {type(estimador).__name__}')

Coeficientes do Modelo SGDClassifier
Coeficientes do Modelo SVC
Feature Importances do Modelo DecisionTreeClassifier


In [21]:
# Calcular a Média da Importâncias
importances_media = np.mean(importances, axis=0)

In [30]:
len(importances_media)

68

In [34]:
# Obter os nomes da Features
feature_names = (numeric_features.tolist() + 
                 preprocessor.named_transformers_['cat']
                 .get_feature_names_out(categorical_features).tolist())

In [31]:
len(feature_names)

69

In [29]:
# Criar um dataframe com nomes e importância
df_feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances_media})

ValueError: All arrays must be of the same length

### Propriedades do Modelo

In [36]:
# Mostrar evidências do Modelo

# Fazer uma predição num exemplo específico
X_sample = X_test[7].reshape(1, -1)

# Predições individuais dos estimadores
sgd_pred = stacking_model.named_estimators_['sgd classifier'].predict(X_sample)
svc_pred = stacking_model.named_estimators_['svc'].predict(X_sample)
tree_pred = stacking_model.named_estimators_['decision tree'].predict(X_sample)

# Predição final com o Stacking
stacking_pred = stacking_model.predict(X_sample)

In [37]:
# Exibir Resultados
print(f'Predição do SGD: {sgd_pred[0]}')
print(f'Predição do SVC: {svc_pred[0]}')
print(f'Predição do Árvore de Decisão: {tree_pred[0]}')
print(f'Predição final do Stacking (Logistic Regression): {stacking_pred[0]}')

Predição do SGD: 0
Predição do SVC: 0
Predição do Árvore de Decisão: 1
Predição final do Stacking (Logistic Regression): 0
