In [2]:


import pandas as pd
import numpy as np
import requests
import seaborn as sns
from pycaret.classification import *

from datetime import datetime


* O objetivo

O objetivo da equipe é construir um modelo preditivo que produza o maior lucro possível para a próxima campanha de marketing direto, programada para o próximo mês. A nova campanha, a sexta, visa a venda de um novo gadget para clientes cadastrados no banco de dados da empresa. Para construir o modelo, foi realizada uma campanha piloto envolvendo 2.240 clientes. Os clientes foram selecionados aleatoriamente e contatados por telefone para a aquisição do gadget. Durante os meses seguintes, os clientes que compraram a oferta foram devidamente etiquetados. O custo total da campanha da amostra foi de 6.720 MU e a receita gerada pelos clientes que aceitaram a oferta foi de 3.674 MU. Globalmente, a campanha teve um lucro de -3.046 MU (ou seja, um prejuízo). A taxa de sucesso da campanha foi de 15%. O objetivo da equipe é desenvolver um modelo que preveja o comportamento do cliente e aplicá-lo ao restante da base de clientes. Espera-se que o modelo permita à empresa escolher a dedo os clientes com maior probabilidade de comprar a oferta, deixando de fora os não respondentes e tornando a próxima campanha altamente lucrativa. Além de maximizar o lucro da campanha, o CMO está interessado em estudar as características dos clientes que desejam comprar o gadget.
Os dados
O conjunto de dados contém características sociodemográficas e firmográficas dos 2.240 clientes contatados. Além disso, contém um sinalizador (coluna `Response`) para os clientes que responderam à campanha, comprando o produto.

In [3]:
# Load the campaign dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv('data.csv')

In [4]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [5]:
# The customer identifier is not a feature; remove it before any modelling.
df = df.drop(labels=['ID'], axis='columns')
df.shape

(2240, 28)

In [6]:
# Count distinct values per column, lowest cardinality first, to spot constant columns.
unique_counts = df.nunique()
unique_counts.sort_values(ascending=True)

Z_Revenue                 1
Z_CostContact             1
Response                  2
AcceptedCmp3              2
AcceptedCmp4              2
AcceptedCmp5              2
AcceptedCmp2              2
AcceptedCmp1              2
Complain                  2
Teenhome                  3
Kidhome                   3
Education                 5
Marital_Status            8
NumCatalogPurchases      14
NumStorePurchases        14
NumDealsPurchases        15
NumWebPurchases          15
NumWebVisitsMonth        16
Year_Birth               59
Recency                 100
MntFruits               158
MntSweetProducts        177
MntFishProducts         182
MntGoldProds            213
MntMeatProducts         558
Dt_Customer             663
MntWines                776
Income                 1974
dtype: int64

In [7]:
# Both columns hold a single distinct value (see the cardinality check), so they
# cannot help a model discriminate between customers.
constant_columns = ['Z_Revenue', 'Z_CostContact']
df = df.drop(columns=constant_columns)
df.shape

(2240, 26)

In [8]:
# Count missing values per column.
df.isna().sum()

Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Response                0
dtype: int64

In [9]:
# Check how the target is distributed among the rows with missing Income before
# deciding whether those rows can be discarded.
missing_income = df['Income'].isna()
df.loc[missing_income, 'Response'].value_counts()

0    23
1     1
Name: Response, dtype: int64

In [10]:
# Discard every row that has at least one missing value.
df = df.dropna(axis='index', how='any')
df.shape

(2216, 26)

In [11]:
# Relative frequency of each target class (shows how imbalanced the problem is).
df['Response'].value_counts(normalize=True)

0    0.849729
1    0.150271
Name: Response, dtype: float64

In [12]:
# Inspect column dtypes to find columns that need conversion.
df.dtypes

Year_Birth               int64
Education               object
Marital_Status          object
Income                 float64
Kidhome                  int64
Teenhome                 int64
Dt_Customer             object
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Complain                 int64
Response                 int64
dtype: object

In [13]:
# Parse the enrolment date into datetime64 and turn the 0/1 target into a boolean.
df = df.assign(
    Dt_Customer=pd.to_datetime(df['Dt_Customer']),
    Response=df['Response'].astype(bool),
)
df.dtypes

Year_Birth                      int64
Education                      object
Marital_Status                 object
Income                        float64
Kidhome                         int64
Teenhome                        int64
Dt_Customer            datetime64[ns]
Recency                         int64
MntWines                        int64
MntFruits                       int64
MntMeatProducts                 int64
MntFishProducts                 int64
MntSweetProducts                int64
MntGoldProds                    int64
NumDealsPurchases               int64
NumWebPurchases                 int64
NumCatalogPurchases             int64
NumStorePurchases               int64
NumWebVisitsMonth               int64
AcceptedCmp3                    int64
AcceptedCmp4                    int64
AcceptedCmp5                    int64
AcceptedCmp1                    int64
AcceptedCmp2                    int64
Complain                        int64
Response                         bool
dtype: object

In [14]:

# Replace the birth year with the customer's age in years.
# `pd.datetime` was deprecated and later removed from pandas, so the standard
# library `datetime` (imported at the top of the notebook) is used instead.
# NOTE(review): the age is relative to the year in which the notebook runs, so the
# feature values shift from one year to the next; pin a reference year if exact
# reproducibility is required.
ano_atual = datetime.now().year
df = (
    df
    .assign(Age=ano_atual - df['Year_Birth'])
    .drop(columns='Year_Birth')
)
df.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Age
0,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,...,4,7,0,0,0,0,0,0,True,66
1,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,...,2,5,0,0,0,0,0,0,False,69
2,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,...,10,4,0,0,0,0,0,0,False,58
3,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,...,4,6,0,0,0,0,0,0,False,39
4,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,...,6,5,0,0,0,0,0,0,False,42


In [15]:
# Replace the enrolment date with the customer's tenure in (fractional) years.
# - `pd.datetime` was deprecated and later removed from pandas; use stdlib `datetime`.
# - Dividing by np.timedelta64(1, 'Y') relies on an ambiguous calendar unit that
#   recent pandas versions reject; dividing whole days by the mean Gregorian year
#   length (365.2425 days, the same value NumPy uses for 'Y') gives the same result.
# NOTE(review): tenure is relative to the day the notebook runs, so values change
# between runs; pin a reference date if exact reproducibility is required.
DAYS_PER_YEAR = 365.2425

dt = datetime.now().date()
days_as_customer = (pd.Timestamp(dt) - pd.to_datetime(df['Dt_Customer'])).dt.days
df['Time_Customer'] = days_as_customer / DAYS_PER_YEAR

df = df.drop(columns='Dt_Customer')

In [16]:
# Preview the frame after the date-derived features were added.
df.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Age,Time_Customer
0,Graduation,Single,58138.0,0,0,58,635,88,546,172,...,7,0,0,0,0,0,0,True,66,10.584748
1,Graduation,Single,46344.0,1,1,38,11,1,6,2,...,5,0,0,0,0,0,0,False,69,9.0789
2,Graduation,Together,71613.0,0,0,26,426,49,127,111,...,4,0,0,0,0,0,0,False,58,9.623743
3,Graduation,Together,26646.0,1,0,26,11,4,20,10,...,6,0,0,0,0,0,0,False,39,9.150085
4,PhD,Married,58293.0,1,0,94,173,43,118,46,...,5,0,0,0,0,0,0,False,42,9.210319


In [17]:
# Confirm the row/column count after feature engineering.
df.shape

(2216, 26)

In [18]:
# Remove the rows whose Marital_Status is one of the labels below, then rebuild a
# contiguous index.
# NOTE(review): 'Alone' is dropped together with the junk labels; confirm that
# discarding it (rather than mapping it to 'Single') is intended.
invalid_marital_status = ['YOLO', 'Absurd', 'absurd', 'Alone']
index_to_drop = df.index[df['Marital_Status'].isin(invalid_marital_status)]
df = df.drop(index=index_to_drop).reset_index(drop=True)
print(df.shape)

(2209, 26)


In [19]:
# Sort the feature columns alphabetically and place the target last.
# Built explicitly instead of temporarily renaming the target to 'z_Response',
# which only worked because every other column name happens to sort before 'z'.
cols = sorted(df.columns.drop('Response')) + ['Response']
df = df[cols]

df.columns

Index(['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4',
       'AcceptedCmp5', 'Age', 'Complain', 'Education', 'Income', 'Kidhome',
       'Marital_Status', 'MntFishProducts', 'MntFruits', 'MntGoldProds',
       'MntMeatProducts', 'MntSweetProducts', 'MntWines',
       'NumCatalogPurchases', 'NumDealsPurchases', 'NumStorePurchases',
       'NumWebPurchases', 'NumWebVisitsMonth', 'Recency', 'Teenhome',
       'Time_Customer', 'Response'],
      dtype='object')

In [20]:
# Final dtype check before handing the frame to PyCaret.
df.dtypes

AcceptedCmp1             int64
AcceptedCmp2             int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
Age                      int64
Complain                 int64
Education               object
Income                 float64
Kidhome                  int64
Marital_Status          object
MntFishProducts          int64
MntFruits                int64
MntGoldProds             int64
MntMeatProducts          int64
MntSweetProducts         int64
MntWines                 int64
NumCatalogPurchases      int64
NumDealsPurchases        int64
NumStorePurchases        int64
NumWebPurchases          int64
NumWebVisitsMonth        int64
Recency                  int64
Teenhome                 int64
Time_Customer          float64
Response                  bool
dtype: object

In [21]:
# Initialise the PyCaret classification experiment on the prepared frame.
s = setup(
    data=df,
    target='Response',
    # Columns to be treated as categorical.
    categorical_features=['Education', 'Marital_Status'],
    # Preprocessing switches: scale numeric features, drop outliers and
    # rebalance the target classes.
    normalize=True,
    remove_outliers=True,
    fix_imbalance=True,
    # Fixed seed so the experiment is repeatable.
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Response
2,Target type,Binary
3,Original data shape,"(2209, 26)"
4,Transformed data shape,"(3233, 34)"
5,Transformed train set shape,"(2570, 34)"
6,Transformed test set shape,"(663, 34)"
7,Numeric features,23
8,Categorical features,2
9,Preprocess,True


In [22]:
# List the estimators (ID, name, reference class) available in the PyCaret model library.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [23]:
# Compare the candidate estimators and keep the top-ranked one.
# NOTE(review): ranking uses PyCaret's default sort metric (presumably Accuracy);
# with ~15% positives a metric such as AUC or F1 may be a better choice — confirm.
best_model = compare_models()

In [38]:
# Save the threshold plot for the selected model as a PNG in the working directory.
# NOTE(review): the execution count of this cell is out of sequence with the cells
# around it; re-run the notebook top to bottom before sharing.
plot_model(best_model, plot = 'threshold',save=True)


'Threshold.png'

In [39]:
# Save the feature-importance plot for the selected model as a PNG.
plot_model(best_model, plot = 'feature',save=True)


'Feature Importance.png'

In [40]:
# Save the AUC plot for the selected model as a PNG.
plot_model(best_model, plot = 'auc',save=True)


'AUC.png'

In [41]:
# Save the learning-curve plot for the selected model as a PNG.
plot_model(best_model, plot = 'learning',save=True)


'Learning Curve.png'

In [42]:
# Save the confusion matrix as a PNG; 'percent' is passed through to the plot
# (presumably to show percentages instead of raw counts — confirm).
plot_model(best_model, plot = 'confusion_matrix', plot_kwargs = {'percent' : True},save=True)


'Confusion Matrix.png'

In [26]:
# Train and cross-validate the estimator registered under the PyCaret ID 'gbc'.
mdl_gbc = create_model('gbc')

In [27]:
# Train and cross-validate the estimator registered under the PyCaret ID 'lightgbm'
# (per-fold metrics are shown in the output table).
mdl_lightgbm = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8581,0.893,0.3913,0.5294,0.45,0.3706,0.3762
1,0.8903,0.9407,0.5652,0.65,0.6047,0.5413,0.5431
2,0.8774,0.8822,0.4348,0.625,0.5128,0.4453,0.4549
3,0.8903,0.9325,0.3913,0.75,0.5143,0.4593,0.4903
4,0.8839,0.9226,0.5652,0.619,0.5909,0.5234,0.5241
5,0.8452,0.8826,0.4167,0.5,0.4545,0.3652,0.3672
6,0.8831,0.9084,0.4783,0.6471,0.55,0.4846,0.4919
7,0.8896,0.8671,0.3913,0.75,0.5143,0.4589,0.4899
8,0.8896,0.8978,0.3478,0.8,0.4848,0.4336,0.481
9,0.9286,0.9456,0.6522,0.8333,0.7317,0.6912,0.6981


In [28]:
# Hyper-parameter search for the 'gbc' model (10 candidates x 10 folds per the log output).
tuned_gbc = tune_model(mdl_gbc)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8516,0.8534,0.4348,0.5,0.4651,0.3795,0.3807
1,0.8645,0.9236,0.4783,0.55,0.5116,0.4334,0.4348
2,0.9032,0.9201,0.6087,0.7,0.6512,0.5953,0.5973
3,0.929,0.9493,0.7391,0.7727,0.7556,0.7141,0.7143
4,0.8839,0.9262,0.6522,0.6,0.625,0.5564,0.5571
5,0.8452,0.8031,0.4167,0.5,0.4545,0.3652,0.3672
6,0.8896,0.8868,0.5217,0.6667,0.5854,0.5228,0.528
7,0.8571,0.7765,0.3043,0.5385,0.3889,0.315,0.3315
8,0.8896,0.8686,0.4783,0.6875,0.5641,0.5032,0.5141
9,0.9091,0.9227,0.6522,0.7143,0.6818,0.6289,0.6298


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [29]:
# Hyper-parameter search for the 'lightgbm' model (10 candidates x 10 folds per the log output).
tuned_lightgbm = tune_model(mdl_lightgbm)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8516,0.8725,0.3478,0.5,0.4103,0.3285,0.3356
1,0.9032,0.9486,0.5217,0.75,0.6154,0.5621,0.5742
2,0.9097,0.9234,0.6087,0.7368,0.6667,0.615,0.6187
3,0.9032,0.948,0.6087,0.7,0.6512,0.5953,0.5973
4,0.8903,0.8972,0.6522,0.625,0.6383,0.5737,0.5739
5,0.8774,0.8728,0.5417,0.619,0.5778,0.5065,0.508
6,0.8831,0.9004,0.5652,0.619,0.5909,0.5229,0.5236
7,0.8961,0.8521,0.4348,0.7692,0.5556,0.5018,0.5281
8,0.8961,0.9001,0.4783,0.7333,0.5789,0.5227,0.5382
9,0.9286,0.9562,0.6522,0.8333,0.7317,0.6912,0.6981


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [30]:
# Persist the preprocessing pipeline together with the tuned LightGBM model.
# NOTE(review): the model is saved without a visible hold-out evaluation or a
# finalize step in this notebook, and the plots above were produced for
# `best_model`, which may not be this estimator — confirm before deploying.
save_model(tuned_lightgbm, './pickle_lightgbm_pycaret')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\mathe\AppData\Local\Temp\joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['AcceptedCmp1', 'AcceptedCmp2',
                                              'AcceptedCmp3', 'AcceptedCmp4',
                                              'AcceptedCmp5', 'Age', 'Complain',
                                              'Income', 'Kidhome',
                                              'MntFishProducts', 'MntFruits',
                                              'MntGoldProds', 'MntMeatProducts',
                                              'MntSweetProducts', 'MntWines'...
                                 boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, feature_fraction=0.4,
                                 importance_type='split', learning_rate=0.1,
                                 max_depth=-1, min_child_sa

In [43]:
import os
import shutil

# Directory where plot_model(..., save=True) wrote its PNG files.
model_path = '.'

# All generated figures are collected under <model_path>/images.
images_path = os.path.join(model_path, 'images')

# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(images_path, exist_ok=True)

for filename in os.listdir(model_path):
    file_path = os.path.join(model_path, filename)
    # Only regular PNG files are moved; directories that merely end in '.png' are skipped.
    if not (filename.endswith('.png') and os.path.isfile(file_path)):
        continue

    destination = os.path.join(images_path, filename)
    # shutil.move() into a directory raises shutil.Error when a file with the same
    # name is already there (which happens every time the notebook is re-run), so
    # remove the stale copy first and move to an explicit destination path.
    if os.path.exists(destination):
        os.remove(destination)
    shutil.move(file_path, destination)