In [1]:
# Importar as bibliotecas 

# EDA
import pandas as pd
import plotly.express as px
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os dados

In [2]:
# Carregar o dataset já tratado
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [3]:
# Mostrar as primeiras linhas
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [4]:
# Mostrar as últimas linhas
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [5]:
# Mostrar a estrutura do dataset
df_leads.info(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparação dos dados

In [6]:
# Prepar os dados (X e y)
X = df_leads.drop(columns=['Converted'])
y = df_leads['Converted']

In [7]:
# Usar preprocessor já salvo
import joblib

preprocessor = joblib.load('./preprocessor_dataset_leads.pkl')

In [8]:
# Dividir dataset entre treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

# aplicar preprocessor
X_train = preprocessor.fit_transform(X_train).toarray()
X_test = preprocessor.transform(X_test).toarray()

In [9]:
# Mostrar a estrutura dos conjuntos
print(f'Conjunto de Treinamento: {X_train.shape}')
print(f'Conjunto de Testes: {X_test.shape}')


Conjunto de Treinamento: (7259, 68)
Conjunto de Testes: (1815, 68)


In [10]:
preprocessor.get_feature_names_out()

array(['num__Do Not Email', 'num__Do Not Call', 'num__TotalVisits',
       'num__Total Time Spent on Website', 'num__Page Views Per Visit',
       'num__Search', 'num__Newspaper Article', 'num__X Education Forums',
       'num__Newspaper', 'num__Digital Advertisement',
       'num__Through Recommendations',
       'num__A free copy of Mastering The Interview',
       'cat__Lead Origin_API', 'cat__Lead Origin_Landing Page Submission',
       'cat__Lead Origin_Lead Add Form', 'cat__Lead Origin_Lead Import',
       'cat__Lead Source_Click2call', 'cat__Lead Source_Direct Traffic',
       'cat__Lead Source_Facebook', 'cat__Lead Source_Google',
       'cat__Lead Source_Live Chat', 'cat__Lead Source_NC_EDM',
       'cat__Lead Source_Olark Chat', 'cat__Lead Source_Organic Search',
       'cat__Lead Source_Pay per Click Ads', 'cat__Lead Source_Reference',
       'cat__Lead Source_Referral Sites', 'cat__Lead Source_Social Media',
       'cat__Lead Source_WeLearn', 'cat__Lead Source_Welingak Webs

### Treinamento do Modelo do Voting Classifier 

In [35]:
# Criar o modelo de VotingClassifier

lr_model = LogisticRegression(random_state=51)
# Para executar o modo de Soft Voting, é necessario colocar o hiperparâmetro probability = True no modelo SVC
svc_model = SVC(probability=True, kernel='linear')
tree_model = DecisionTreeClassifier(random_state=51)

voting_model = VotingClassifier(
    estimators= [
        ('logistic regression', lr_model),
        ('svc', svc_model),
        ('decision tree', tree_model)
    ],
    # Hard faz a votação pela maioria das predições dos estimadores
    # Soft faz a votação pela média das probablidades de cada classe vindas de cada estimador
    voting='soft'
)

In [36]:
# Treinar o modelo
voting_model.fit(X_train, y_train)

0,1,2
,estimators,"[('logistic regression', ...), ('svc', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,51
,solver,'lbfgs'
,max_iter,100

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Análise dos Resultados

In [37]:
# Realizar predições no conjunto de testes
y_pred = voting_model.predict(X_test)

In [38]:
# Visualizar y_pred
y_pred

array([1, 0, 0, ..., 0, 0, 1], shape=(1815,))

In [15]:
# Calcular métricas
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recal = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

In [39]:
# Mostrar métricas
print(f'Acurácia: {accuracy}')
print(f'Precisão: {precision}')
print(f'Recall: {recal}')
print(f'F1 Score: {f1}')


Acurácia: 0.7983471074380165
Precisão: 0.7420382165605095
Recall: 0.6955223880597015
F1 Score: 0.7180277349768875


In [40]:
# Mostrar matriz de confusão
conf_matrix = confusion_matrix(y_test,y_pred)

fig = px.imshow(conf_matrix,
                labels=dict(x='Predição', y='Real', color='Contagem'),
                x=['Not Converted', 'Converted'],
                y=['Not Converted', 'Converted'],
                color_continuous_scale='Viridis')
fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)
fig.show()                

In [22]:
# Carregar as importâncias dos modelo

importances = []

for estimador in voting_model.estimators_:
        # Para modelo lineares, retorna coef
    if hasattr(estimador, 'coef_'):
        importances.append(np.abs(estimador.coef_[0]))
        # Para modelo basedos em árvores
    elif hasattr(estimador, 'feature_importances_'):
        importances.append(estimador.feature_importances_)
    else:
        print(f'Não foi possível carregar a importância das variáves do modelo {type(estimador).__name__}')

In [23]:
# Calcular a média das importâncias
importancia_media = np.mean(importances, axis=0)

In [27]:
# Nomes das features
feature_names = preprocessor.get_feature_names_out()

In [29]:
# Criar um dataframe com nome e importâncias das features
df_features_importances = pd.DataFrame({'feature': feature_names, 'importance': importancia_media})

In [30]:
# Ordenar o Dataframe pela importância
df_features_importances = df_features_importances.sort_values(by='importance' ,ascending=True)

In [31]:
# Plotar a importância das features
fig = px.bar(df_features_importances,
             x='importance',
             y='feature',
             orientation='h',
             title='Importância das Features (Voting Classifier)')

fig.update_layout(height=1280, width=1000)
fig.show()

### Propriedades do Modelo

In [None]:
# Mostrar evidencias - Hard Voting

# Selecionar um registro da base para fazer a predição
X_sample = X_test[7].reshape(1, -1)

# Predições individuais dos estimadores
log_pred = voting_model.named_estimators_['logistic regression'].predict(X_sample)
svc_pred = voting_model.named_estimators_['svc'].predict(X_sample)
tree_pred = voting_model.named_estimators_['decision tree'].predict(X_sample)

# Predição final do voting Classifier
voting_pred = voting_model.predict(X_sample)


In [34]:
# Exibir os resultados
print(f'Regressão Logística: {log_pred[0]}')
print(f'SVC: {svc_pred[0]}')
print(f'Árvore de Decisão: {tree_pred[0]}')
print(f'Predição final do Hard Voting:  (Votação Majoritaria{voting_pred[0]}')


Regressão Logística: 0
SVC: 0
Árvore de Decisão: 1
Predição final do Hard Voting:  (Votação Majoritaria0


In [43]:
# Mostrar evidencias - Soft Voting

# Selecionar um registro da base para fazer a predição
X_sample = X_test[7].reshape(1, -1)

# Predições individuais das probabilidades estimadores
log_proba = voting_model.named_estimators_['logistic regression'].predict_proba(X_sample)
svc_proba = voting_model.named_estimators_['svc'].predict_proba(X_sample)
tree_proba = voting_model.named_estimators_['decision tree'].predict_proba(X_sample)

# Predição final do voting Classifier
voting_pred = voting_model.predict(X_sample)
voting_proba = voting_model.predict_proba(X_sample)


In [44]:
# Exibir os resultados
print(f'Probabilidade de Regressão Logística: {log_proba}')
print(f'Probabilidade do SVC: {svc_proba}')
print(f'Probabilidade da Árvore de Decisão: {tree_proba}')
print(f'Predição final do Soft Voting:  (Votação Ponderada: {voting_proba}')
print(f'Predição final do Soft Voting:  (Votação Ponderada: {voting_pred[0]}')


Probabilidade de Regressão Logística: [[0.67050795 0.32949205]]
Probabilidade do SVC: [[0.69420552 0.30579448]]
Probabilidade da Árvore de Decisão: [[0. 1.]]
Predição final do Soft Voting:  (Votação Ponderada: [[0.45490449 0.54509551]]
Predição final do Soft Voting:  (Votação Ponderada: 1
