# Grid Search modelos finales no basados en Redes Neuronales

In [142]:
import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import datetime
from dateutil.parser import parse
from sklearn.tree import  DecisionTreeClassifier
import plotly.graph_objects as go
import umap
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import ml_metrics


In [2]:
Data=pd.read_csv("consolidation_30May2022.csv").drop(columns=["Unnamed: 0"]).fillna("")
Data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,pref_B-B,pref_D-E,pref_C-D,pref_A-A,Duracion_Campaña,B,C,lec_B-B,lec_E-E,lec_D-E,...,Renta,Recibe_sueldo_en_cuenta,Segmento_consumidor,Meses_antiguedad,Comuna,Ciudad,Estado_civil,Principalidad,Profesion,Target
0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,5.0,0.0,0.0,...,R1,0,A,Mayor a 10 años,331.0,13.0,D,B,P164,
1,0.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0,0.0,1.0,...,R4,0,A,Mayor a 10 años,91.0,13.0,D,B,P85,E-E B-B D-E
2,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,0.0,...,R5,0,A,Mayor a 10 años,34.0,5.0,B,C,P143,D-E A-A
3,0.0,0.0,0.0,0.0,1.2,7.0,3.0,5.0,0.0,0.0,...,R9,0,A,Mayor a 10 años,331.0,13.0,B,C,P1,E-E
4,0.0,0.0,0.0,0.0,1.0,4.0,1.0,1.0,0.0,0.0,...,R2,0,B,Mayor a 10 años,37.0,5.0,D,F,P210,D-E


In [3]:
# Según criterio Gini (Todas variables númricas)
select_variables=['E-F',
 'C-C',
 'D-F',
 'A-K',
 'num_comunicaciones',
 'Length',
 'pref_A-A',
 'Positivo',
 'Recency',
 'A-G',
 'Monto',
 'Periodicity',
 'Frequency',
 'lec_B-B',
 'G-K']

# Train, Test Split

In [17]:
X=Data[select_variables]
X.head()

Unnamed: 0,E-F,C-C,D-F,A-K,num_comunicaciones,Length,pref_A-A,Positivo,Recency,A-G,Monto,Periodicity,Frequency,lec_B-B,G-K
0,0,0,0,274,22.0,371,0.0,36,652.012626,18,180754.821871,91.0,396,5.0,47
1,3,0,2,69,15.0,371,0.0,23,649.634615,16,154439.534968,102.0,416,8.0,266
2,0,0,2,51,26.0,371,0.0,22,629.587342,20,128719.371341,107.0,395,1.0,272
3,15,0,0,465,34.0,371,0.0,54,690.118911,20,142752.939212,69.0,698,5.0,144
4,1,0,3,27,24.0,371,0.0,23,613.438503,17,127120.6405,112.0,374,1.0,284


In [18]:
Y=Data["Target"]
Y.head()

0               
1    E-E B-B D-E
2        D-E A-A
3            E-E
4            D-E
Name: Target, dtype: object

In [92]:
# Generación de conjuntos de train y test
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.33, 
    shuffle=True,
    
) 

In [93]:
# Sobremuestreo: Aumentar numero de datos de la clase minoritaria
# ROS(Duplica clases):
ros=RandomOverSampler(random_state=0) # Random_state=0

In [94]:
# ROS:
Xtrain, Ytrain=ros.fit_resample(Xtrain,Ytrain)

In [95]:
# Visualización
px.histogram(Ytrain, x="Target",title="Distribution Plot Target")

In [96]:
# Preprocesamiento de datos
preprocessing_transformer = ColumnTransformer(
    transformers=[('MinMax', MinMaxScaler(),select_variables),
    ])

# Naive

In [60]:
pipe_naive = Pipeline(
    [
        ("preprocesamiento", preprocessing_transformer), 
     ("clf",MultinomialNB())
    ]
)

In [59]:
# Get params
pipe_naive[1].get_params().keys()

dict_keys(['alpha', 'class_prior', 'fit_prior'])

In [61]:
# Parametros grid clf
param_grid_clf_naive = {'clf__alpha': [0.1,0.5,1,10,50,100,1000]} 

In [69]:
# Clasificación
gs_pipe_naive= GridSearchCV(pipe_naive, param_grid_clf_naive, scoring='f1_weighted',cv=5,
                            refit=True,
                             verbose=5
                            )

In [70]:
# Grid
gs_pipe_naive.fit(Xtrain, Ytrain)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END ....................clf__alpha=0.1;, score=0.300 total time=   1.4s
[CV 2/5] END ....................clf__alpha=0.1;, score=0.299 total time=   1.3s
[CV 3/5] END ....................clf__alpha=0.1;, score=0.302 total time=   1.4s
[CV 4/5] END ....................clf__alpha=0.1;, score=0.299 total time=   1.4s
[CV 5/5] END ....................clf__alpha=0.1;, score=0.300 total time=   1.3s
[CV 1/5] END ....................clf__alpha=0.5;, score=0.300 total time=   1.3s
[CV 2/5] END ....................clf__alpha=0.5;, score=0.299 total time=   1.5s
[CV 3/5] END ....................clf__alpha=0.5;, score=0.301 total time=   1.3s
[CV 4/5] END ....................clf__alpha=0.5;, score=0.298 total time=   1.4s
[CV 5/5] END ....................clf__alpha=0.5;, score=0.299 total time=   1.4s
[CV 1/5] END ......................clf__alpha=1;, score=0.299 total time=   1.3s
[CV 2/5] END ......................clf__alpha=1;,

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocesamiento',
                                        ColumnTransformer(transformers=[('MinMax',
                                                                         MinMaxScaler(),
                                                                         ['E-F',
                                                                          'C-C',
                                                                          'D-F',
                                                                          'A-K',
                                                                          'num_comunicaciones',
                                                                          'Length',
                                                                          'pref_A-A',
                                                                          'Positivo',
                                                                         

In [71]:
# Best Params
gs_pipe_naive.best_params_

{'clf__alpha': 0.1}

In [72]:
# Best score
gs_pipe_naive.best_score_

0.2998864389686343

In [73]:
# Generación de predicciones
Ypred = gs_pipe_naive.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

              precision    recall  f1-score   support

                   0.58      0.62      0.60      3862
         A-A       0.31      0.38      0.35       424
     A-A B-B       0.07      0.05      0.06       262
     A-A D-E       0.06      0.04      0.05       316
         B-B       0.41      0.38      0.40      1686
     B-B C-D       0.58      0.22      0.32       537
     B-B D-E       0.19      0.24      0.21       796
 B-B D-E E-E       0.10      0.04      0.06       483
     B-B E-E       0.27      0.13      0.18       752
 B-B E-E D-E       0.14      0.08      0.10       386
         C-D       0.41      0.68      0.51       598
     C-D B-B       0.39      0.42      0.40       403
 C-D B-B D-E       0.21      0.29      0.24       224
     C-D D-E       0.36      0.45      0.40       407
 C-D D-E B-B       0.09      0.15      0.11       164
     C-D E-E       0.56      0.57      0.56       421
 C-D E-E D-E       0.46      0.46      0.46       189
         D-E       0.39    

In [None]:
# Save model

# Decision Tree

In [156]:
pipe_decision_tree = Pipeline(
    [("preprocesamiento", preprocessing_transformer),
     ("clf", DecisionTreeClassifier(random_state=0,criterion='entropy',max_depth=12))]
)


In [148]:
# Get params
pipe_decision_tree[1].get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

In [149]:
# Parametros grid clf
param_grid_clf_tree = {
                  'clf__criterion':  ['gini', 'entropy'],
                  'clf__max_depth':[2,4,6,8,10,12]
                 } 

In [150]:
# Clasificación
gs_pipe_decision_tree= GridSearchCV(pipe_decision_tree, param_grid_clf_tree, scoring='accuracy',cv=5,
                            refit=True,
                             verbose=5
                            )

In [157]:
# Grid
pipe_decision_tree.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('MinMax', MinMaxScaler(),
                                                  ['E-F', 'C-C', 'D-F', 'A-K',
                                                   'num_comunicaciones',
                                                   'Length', 'pref_A-A',
                                                   'Positivo', 'Recency', 'A-G',
                                                   'Monto', 'Periodicity',
                                                   'Frequency', 'lec_B-B',
                                                   'G-K'])])),
                ('clf',
                 DecisionTreeClassifier(criterion='entropy', max_depth=12,
                                        random_state=0))])

In [151]:
# Grid
gs_pipe_decision_tree.fit(Xtrain, Ytrain)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END clf__criterion=gini, clf__max_depth=2;, score=0.122 total time=   1.1s
[CV 2/5] END clf__criterion=gini, clf__max_depth=2;, score=0.121 total time=   0.9s
[CV 3/5] END clf__criterion=gini, clf__max_depth=2;, score=0.121 total time=   0.7s
[CV 4/5] END clf__criterion=gini, clf__max_depth=2;, score=0.121 total time=   0.7s
[CV 5/5] END clf__criterion=gini, clf__max_depth=2;, score=0.122 total time=   0.7s
[CV 1/5] END clf__criterion=gini, clf__max_depth=4;, score=0.263 total time=   1.0s
[CV 2/5] END clf__criterion=gini, clf__max_depth=4;, score=0.262 total time=   1.1s
[CV 3/5] END clf__criterion=gini, clf__max_depth=4;, score=0.261 total time=   1.1s
[CV 4/5] END clf__criterion=gini, clf__max_depth=4;, score=0.265 total time=   1.1s
[CV 5/5] END clf__criterion=gini, clf__max_depth=4;, score=0.263 total time=   1.2s
[CV 1/5] END clf__criterion=gini, clf__max_depth=6;, score=0.328 total time=   1.4s
[CV 2/5] END cl

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocesamiento',
                                        ColumnTransformer(transformers=[('MinMax',
                                                                         MinMaxScaler(),
                                                                         ['E-F',
                                                                          'C-C',
                                                                          'D-F',
                                                                          'A-K',
                                                                          'num_comunicaciones',
                                                                          'Length',
                                                                          'pref_A-A',
                                                                          'Positivo',
                                                                         

In [152]:
# Best Params
gs_pipe_decision_tree.best_params_

{'clf__criterion': 'entropy', 'clf__max_depth': 12}

In [153]:
# Best score
gs_pipe_decision_tree.best_score_

0.5280587243553744

In [154]:
# Generación de predicciones
Ypred = gs_pipe_decision_tree.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

              precision    recall  f1-score   support

                   0.43      0.73      0.54      2374
         A-A       0.37      0.14      0.20      1357
     A-A B-B       0.44      0.06      0.11      1578
     A-A D-E       0.15      0.03      0.05      1004
         B-B       0.20      0.33      0.25       927
     B-B C-D       0.38      0.16      0.22       450
     B-B D-E       0.15      0.23      0.18       701
 B-B D-E E-E       0.06      0.03      0.04       417
     B-B E-E       0.30      0.11      0.16       976
 B-B E-E D-E       0.20      0.08      0.11       538
         C-D       0.44      0.67      0.53       670
     C-D B-B       0.46      0.36      0.41       567
 C-D B-B D-E       0.22      0.29      0.25       228
     C-D D-E       0.23      0.39      0.29       317
 C-D D-E B-B       0.18      0.17      0.17       300
     C-D E-E       0.47      0.52      0.49       379
 C-D E-E D-E       0.46      0.38      0.42       211
         D-E       0.17    

In [158]:
# Generación de predicciones
Ypred = pipe_decision_tree.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

              precision    recall  f1-score   support

                   0.43      0.73      0.54      2374
         A-A       0.37      0.14      0.20      1357
     A-A B-B       0.44      0.06      0.11      1578
     A-A D-E       0.15      0.03      0.05      1004
         B-B       0.20      0.33      0.25       927
     B-B C-D       0.38      0.16      0.22       450
     B-B D-E       0.15      0.23      0.18       701
 B-B D-E E-E       0.06      0.03      0.04       417
     B-B E-E       0.30      0.11      0.16       976
 B-B E-E D-E       0.20      0.08      0.11       538
         C-D       0.44      0.67      0.53       670
     C-D B-B       0.46      0.36      0.41       567
 C-D B-B D-E       0.22      0.29      0.25       228
     C-D D-E       0.23      0.39      0.29       317
 C-D D-E B-B       0.18      0.17      0.17       300
     C-D E-E       0.47      0.52      0.49       379
 C-D E-E D-E       0.46      0.38      0.42       211
         D-E       0.17    

In [None]:
# Save model

# Random Forest

In [127]:
pipe_random_forest = Pipeline(
    [
        ("preprocesamiento", preprocessing_transformer), 
     ("clf",RandomForestClassifier(random_state=1,max_depth=50,n_estimators=700))
    ]
)

In [115]:
# Get params
pipe_random_forest[1].get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [124]:
# Parametros grid clf
param_grid_clf_forest = {
                    #'clf__criterion':  ['gini', 'entropy'],
                  'clf__max_depth':[50,100], # Maximum tree depth for base learners.
                  'clf__n_estimators': [700,1400],
                  #'clf__max_features': ['auto', 'sqrt']
                 }

In [125]:
# Clasificación
gs_pipe_forest= GridSearchCV(pipe_random_forest, param_grid_clf_forest, scoring='accuracy',cv=3,
                            refit=True,
                             verbose=10
                            )

In [None]:
# Grid
gs_pipe_forest.fit(Xtrain, Ytrain)

In [None]:
# Best Params
gs_pipe_forest.best_params_

{'clf__criterion': 'entropy', 'clf__max_depth': 12}

In [None]:
# Best score
gs_pipe_forest.best_score_

0.5280587243553744

In [None]:
# Generación de predicciones
Ypred = gs_pipe_forest.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

In [129]:
# Entrenamiento solo pipeline
pipe_random_forest.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('MinMax', MinMaxScaler(),
                                                  ['E-F', 'C-C', 'D-F', 'A-K',
                                                   'num_comunicaciones',
                                                   'Length', 'pref_A-A',
                                                   'Positivo', 'Recency', 'A-G',
                                                   'Monto', 'Periodicity',
                                                   'Frequency', 'lec_B-B',
                                                   'G-K'])])),
                ('clf',
                 RandomForestClassifier(max_depth=50, n_estimators=700,
                                        random_state=1))])

In [130]:
# Generación de predicciones
Ypred = pipe_random_forest.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

              precision    recall  f1-score   support

                   0.79      0.65      0.71      4913
         A-A       0.25      0.51      0.33       241
     A-A B-B       0.03      0.27      0.05        22
     A-A D-E       0.00      0.00      0.00         9
         B-B       0.54      0.41      0.46      2026
     B-B C-D       0.16      0.26      0.19       111
     B-B D-E       0.17      0.28      0.21       624
 B-B D-E E-E       0.03      0.10      0.04        51
     B-B E-E       0.10      0.20      0.13       180
 B-B E-E D-E       0.03      0.06      0.04       101
         C-D       0.76      0.61      0.68      1288
     C-D B-B       0.45      0.45      0.45       454
 C-D B-B D-E       0.25      0.33      0.29       225
     C-D D-E       0.48      0.42      0.45       607
 C-D D-E B-B       0.19      0.23      0.21       243
     C-D E-E       0.63      0.63      0.63       425
 C-D E-E D-E       0.51      0.60      0.55       150
         D-E       0.54    

In [144]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))

In [145]:
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
Ytest_map

64414         [E-E]
61364            []
54820         [B-B]
3605          [E-E]
32813    [B-B, D-E]
            ...    
46120    [A-A, B-B]
36047    [C-D, D-E]
919           [D-E]
5518     [D-E, B-B]
8756          [E-E]
Name: Target, Length: 21364, dtype: object

In [146]:
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.7434066134098899

In [None]:
# Save model

# XGBoost

In [40]:
pipe_xgb = Pipeline(
    [("preprocesamiento", preprocessing_transformer),
     ("clf", xgb.XGBClassifier(seed=1,  eval_metric='mlogloss'))]
)

In [None]:
# Get params
pipe_xgb[1].get_params().keys()

In [39]:
# Parametros grid clf
param_grid_clf_xgb = {'clf__learning_rate': [0.1,0.5,0.05,1], # Boosting learning rate
                  'clf__gamma':[0.1,1,2,10], # Minimum loss reduction required to make a further partition on a leaf node of the tree.
                  'clf__max_depth':[5,10,50,100], # Maximum tree depth for base learners.
                  'clf__min_child_weight':[40,5,100], # Minimum sum of instance weight(hessian) needed in a child.
                  'clf__colsample_bytree':[0.5,0.8], # Subsample ratio of columns when constructing each tree.
                  'clf__n_estimators': [100,500,1000] # Number of gradient boosted trees.
                 } 

In [None]:
# Clasificación
gs_pipe_xgb= GridSearchCV(pipe_xgb, param_grid_clf_xgb, scoring='accuracy',cv=5,
                            refit=True,
                             verbose=5
                            )

In [41]:
# Entrenamiento de pipeline
gs_pipe_xgb.fit(Xtrain, Ytrain)





Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('MinMax', MinMaxScaler(),
                                                  ['E-F', 'C-C', 'D-F', 'A-K',
                                                   'num_comunicaciones',
                                                   'Length', 'pref_A-A',
                                                   'Positivo', 'Recency', 'A-G',
                                                   'Monto', 'Periodicity',
                                                   'Frequency', 'lec_B-B',
                                                   'G-K'])])),
                ('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=...
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0

In [None]:
# Best Params
gs_pipe_xgb.best_params_

{'clf__criterion': 'entropy', 'clf__max_depth': 12}

In [None]:
# Best score
gs_pipe_xgb.best_score_

0.5280587243553744

In [42]:
# Generación de predicciones
Ypred = gs_pipe_xgb.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

              precision    recall  f1-score   support

                   0.58      0.71      0.64      3341
         A-A       0.39      0.22      0.28       942
     A-A B-B       0.18      0.06      0.09       531
     A-A D-E       0.11      0.07      0.08       343
         B-B       0.47      0.38      0.42      1936
     B-B C-D       0.32      0.27      0.29       244
     B-B D-E       0.28      0.26      0.27      1084
 B-B D-E E-E       0.05      0.06      0.06       184
     B-B E-E       0.25      0.15      0.19       574
 B-B E-E D-E       0.17      0.11      0.13       348
         C-D       0.59      0.64      0.61       907
     C-D B-B       0.51      0.42      0.46       539
 C-D B-B D-E       0.24      0.26      0.25       277
     C-D D-E       0.40      0.40      0.40       499
 C-D D-E B-B       0.16      0.16      0.16       301
     C-D E-E       0.58      0.61      0.59       413
 C-D E-E D-E       0.43      0.58      0.49       137
         D-E       0.36    

In [None]:
# Save model