# Grid Search modelos finales no basados en Redes Neuronales

In [None]:
import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import datetime
from dateutil.parser import parse
from sklearn.tree import  DecisionTreeClassifier
import plotly.graph_objects as go
import umap
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import ml_metrics
import pickle


In [90]:
Data=pd.read_csv("cleaned_01June2022.csv").drop(columns=["Unnamed: 0"]).fillna("")
Data.head()


Columns (40) have mixed types.Specify dtype option on import or set low_memory=False.



Unnamed: 0,pref_B-B,pref_D-E,pref_C-D,pref_A-A,Duracion_Campaña,B,C,lec_B-B,lec_E-E,lec_D-E,...,Renta,Recibe_sueldo_en_cuenta,Segmento_consumidor,Meses_antiguedad,Comuna,Ciudad,Estado_civil,Principalidad,Profesion,Target
0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,5.0,0.0,0.0,...,R1,0,A,Mayor a 10 años,331.0,13.0,D,B,P164,
1,0.0,0.0,0.0,0.0,1.0,2.0,9.0,14.0,1.0,1.0,...,R9,0,B,Mayor a 10 años,37.0,5.0,D,E,P164,D-E C-D E-E B-B
2,0.0,0.0,0.0,0.0,1.0,2.0,2.0,5.0,0.0,0.0,...,R9,1,B,Mayor a 10 años,19.0,4.0,B,D,P114,E-E A-A B-B
3,0.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0,0.0,1.0,...,R4,0,A,Mayor a 10 años,91.0,13.0,D,B,P85,E-E B-B D-E
4,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,0.0,...,R5,0,A,Mayor a 10 años,34.0,5.0,B,C,P143,D-E A-A


In [91]:
# datos númericos
numerical=list(Data.select_dtypes(exclude=["category","object"]).columns)
numerical.remove("Sexo")
numerical.remove('Recibe_sueldo_en_cuenta')


# datos categorical
categorical=list(Data.select_dtypes(include=["category","object"]).columns)+['Sexo','Recibe_sueldo_en_cuenta']
categorical.remove("Target")

In [92]:
for i in categorical:
    Data[i]=Data[i].astype(str)

In [105]:
# Según criterio Gini (Todas variables númricas) - Version antigua
select_variables=['E-F',
 'C-C',
 'D-F',
 'A-K',
 'num_comunicaciones',
 'Length',
 'pref_A-A',
 'Positivo',
 'Recency',
 'A-G',
 'Monto',
 'Periodicity',
 'Frequency',
 'lec_B-B',
 'G-K']

In [123]:
# Según criterio Gini (Todas variables númricas) - Version nueva
select_variables=['E-F',
 'C-C',
 'D-F',
 'A-K',
 'pref_A-A',
 'num_comunicaciones',
 'Length',
 'Sin Info',
 'F-I',
 'Monto',
 'B',
 'lec_B-B',
 'Comuna',
 'Edad',
 'Ciudad',
 'Profesion',
 'Renta',
 'F-H',
 'lec_A-A',
 'Principalidad',
 'A-L',
 'lec_C-D',
 'Estado_civil']

# Train, Test Split

In [124]:
X=Data[select_variables]
X.head()

Unnamed: 0,E-F,C-C,D-F,A-K,pref_A-A,num_comunicaciones,Length,Sin Info,F-I,Monto,...,Edad,Ciudad,Profesion,Renta,F-H,lec_A-A,Principalidad,A-L,lec_C-D,Estado_civil
0,0,0,0,274,0.0,22.0,371,18,2,180754.821871,...,Mayor a 70,13.0,P164,R1,19,0.0,B,0,0.0,D
1,3,6,14,148,0.0,21.0,371,19,2,303094.160627,...,Mayor a 70,5.0,P164,R9,5,0.0,E,0,3.0,D
2,10,0,0,257,0.0,20.0,371,37,5,144339.196858,...,Mayor a 70,4.0,P114,R9,19,0.0,D,1,0.0,B
3,3,0,2,69,0.0,15.0,371,16,16,154439.534968,...,Mayor a 70,13.0,P85,R4,17,0.0,B,0,1.0,D
4,0,0,2,51,0.0,26.0,371,20,13,128719.371341,...,Mayor a 70,5.0,P143,R5,15,0.0,C,0,1.0,B


In [125]:
Y=Data["Target"]
Y.head()

0                   
1    D-E C-D E-E B-B
2        E-E A-A B-B
3        E-E B-B D-E
4            D-E A-A
Name: Target, dtype: object

In [126]:
# Generación de conjuntos de train y test
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.33, 
    shuffle=True,
    
) 

In [127]:
# Sobremuestreo: Aumentar numero de datos de la clase minoritaria
# ROS(Duplica clases):
ros=RandomOverSampler(random_state=0) # Random_state=0

In [128]:
# ROS:
Xtrain, Ytrain=ros.fit_resample(Xtrain,Ytrain)

In [129]:
numerical1=list(filter(lambda x:x  in numerical,select_variables))
categorical1=list(filter(lambda x:x  in categorical,select_variables))

In [130]:
# Preprocesamiento de datos
preprocessing_transformer = ColumnTransformer(
    transformers=[('OneHotEncoder', OneHotEncoder(handle_unknown="ignore"),categorical1 ),
        ('MinMax', MinMaxScaler(),numerical1),
    ])

# Naive

In [113]:
pipe_naive = Pipeline(
    [
        ("preprocesamiento", preprocessing_transformer), 
     ("clf",MultinomialNB())
    ]
)

In [114]:
# Grid
pipe_naive.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  []),
                                                 ('MinMax', MinMaxScaler(),
                                                  ['E-F', 'C-C', 'D-F', 'A-K',
                                                   'num_comunicaciones',
                                                   'Length', 'pref_A-A',
                                                   'Positivo', 'Recency', 'A-G',
                                                   'Monto', 'Periodicity',
                                                   'Frequency', 'lec_B-B',
                                                   'G-K'])])),
                ('clf', MultinomialNB())])

In [115]:
# Generación de predicciones
Ypred = pipe_naive.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

                 precision    recall  f1-score   support

                      0.58      0.60      0.59      3801
            A-A       0.20      0.42      0.27       227
        A-A B-B       0.16      0.06      0.09       498
    A-A B-B D-E       0.16      0.02      0.04       567
        A-A C-D       0.35      0.06      0.10       307
        A-A D-E       0.03      0.02      0.03       281
    A-A D-E B-B       0.09      0.02      0.03       510
        A-A E-E       0.05      0.02      0.03       136
            B-B       0.36      0.36      0.36      1544
        B-B A-A       0.22      0.14      0.17       249
    B-B A-A D-E       0.03      0.09      0.04        22
        B-B C-D       0.31      0.18      0.23       373
    B-B C-D D-E       0.07      0.09      0.08        74
B-B C-D D-E E-E       0.00      0.00      0.00        80
    B-B C-D E-E       0.05      0.03      0.04        64
        B-B D-E       0.08      0.21      0.12       422
    B-B D-E A-A       0.12    

In [116]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6947627931769723

# Decision Tree

In [131]:
pipe_decision_tree = Pipeline(
    [("preprocesamiento", preprocessing_transformer),
     ("clf", DecisionTreeClassifier(random_state=0,criterion='entropy',max_depth=12))]
)


In [134]:
# Grid
pipe_decision_tree.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Comuna', 'Edad', 'Ciudad',
                                                   'Profesion', 'Renta',
                                                   'Principalidad',
                                                   'Estado_civil']),
                                                 ('MinMax', MinMaxScaler(),
                                                  ['E-F', 'C-C', 'D-F', 'A-K',
                                                   'pref_A-A',
                                                   'num_comunicaciones',
                                                   'Length', 'Sin Info', 'F-I',
                                                   'Monto', 'B', 'lec_B-B',
                                                   'F-H', 'lec_A-A', '

In [135]:
# Generación de predicciones
Ypred = pipe_decision_tree.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

                 precision    recall  f1-score   support

                      0.44      0.73      0.55      2449
            A-A       0.33      0.12      0.17      1245
        A-A B-B       0.17      0.04      0.06       931
    A-A B-B D-E       0.03      0.01      0.01       434
        A-A C-D       0.19      0.05      0.08       213
        A-A D-E       0.09      0.04      0.05       529
    A-A D-E B-B       0.05      0.02      0.02       382
        A-A E-E       0.06      0.01      0.02       422
            B-B       0.12      0.36      0.18       525
        B-B A-A       0.18      0.04      0.07       592
    B-B A-A D-E       0.31      0.02      0.03      1216
        B-B C-D       0.24      0.19      0.21       290
    B-B C-D D-E       0.09      0.04      0.05       206
B-B C-D D-E E-E       0.03      0.01      0.01       119
    B-B C-D E-E       0.17      0.04      0.07       148
        B-B D-E       0.00      0.08      0.01        50
    B-B D-E A-A       0.16    

In [136]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6782906666302189

# Random Forest

In [137]:
pipe_random_forest = Pipeline(
    [
        ("preprocesamiento", preprocessing_transformer), 
     ("clf",RandomForestClassifier(random_state=1,max_depth=50,n_estimators=700))
    ]
)

In [None]:
# Get params
pipe_random_forest[1].get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [None]:
# Parametros grid clf
param_grid_clf_forest = {
                    #'clf__criterion':  ['gini', 'entropy'],
                  'clf__max_depth':[50,100], # Maximum tree depth for base learners.
                  'clf__n_estimators': [700,1400],
                  #'clf__max_features': ['auto', 'sqrt']
                 }

In [None]:
# Clasificación
gs_pipe_forest= GridSearchCV(pipe_random_forest, param_grid_clf_forest, scoring='accuracy',cv=3,
                            refit=True,
                             verbose=10
                            )

In [None]:
# Grid
gs_pipe_forest.fit(Xtrain, Ytrain)

In [None]:
# Best Params
gs_pipe_forest.best_params_

In [None]:
# Best score
gs_pipe_forest.best_score_

In [None]:
# Generación de predicciones
Ypred = gs_pipe_forest.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

In [None]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6782906666302189

# XGBoost

In [11]:
pipe_xgb = Pipeline(
    [("preprocesamiento", preprocessing_transformer),
     ("clf", xgb.XGBClassifier(seed=1,  eval_metric='mlogloss',learning_rate= 0.5,n_estimators= 500))]
)



In [None]:
pipe_xgb[1].get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='mlogloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, seed=1, subsample=None,
              tree_method=None, validate_parameters=None, verbosity=None)>

In [None]:
# Get params
pipe_xgb[1].get_params().keys()

In [None]:
# Parametros grid clf
param_grid_clf_xgb = {'clf__learning_rate': [0.1,0.5,0.05,1], # Boosting learning rate
                  #'clf__gamma':[0.1,1,2,10], # Minimum loss reduction required to make a further partition on a leaf node of the tree.
                  #'clf__max_depth':[5,10,50,100], # Maximum tree depth for base learners.
                 # 'clf__min_child_weight':[40,5,100], # Minimum sum of instance weight(hessian) needed in a child.
                 # 'clf__colsample_bytree':[0.5,0.8], # Subsample ratio of columns when constructing each tree.
                  'clf__n_estimators': [100,500,1000] # Number of gradient boosted trees.
                 } 

In [None]:
# Clasificación
gs_pipe_xgb= GridSearchCV(pipe_xgb, param_grid_clf_xgb, scoring='accuracy',cv=2,
                            refit=True,
                             verbose=10
                            )

In [None]:
# Entrenamiento de pipeline
gs_pipe_xgb.fit(Xtrain, Ytrain)

In [None]:
# Best Params
gs_pipe_xgb.best_params_

{'clf__learning_rate': 0.5, 'clf__n_estimators': 500}

In [None]:
# Best score
gs_pipe_xgb.best_score_

0.9692194943202639

In [28]:
# Generación de predicciones
Ypred = gs_pipe_xgb.predict(Xtest)
# Métricas de evaluación 
print(classification_report(Ypred, Ytest))

              precision    recall  f1-score   support

                   0.76      0.63      0.69      5025
         A-A       0.23      0.38      0.28       292
     A-A B-B       0.08      0.19      0.11        77
     A-A D-E       0.00      0.00      0.00        37
         B-B       0.45      0.39      0.42      1798
     B-B C-D       0.21      0.23      0.22       171
     B-B D-E       0.19      0.23      0.21       925
 B-B D-E E-E       0.03      0.15      0.05        46
     B-B E-E       0.11      0.19      0.14       229
 B-B E-E D-E       0.04      0.08      0.06       106
         C-D       0.68      0.62      0.65      1122
     C-D B-B       0.44      0.44      0.44       444
 C-D B-B D-E       0.26      0.27      0.27       281
     C-D D-E       0.40      0.37      0.39       537
 C-D D-E B-B       0.16      0.19      0.18       271
     C-D E-E       0.57      0.55      0.56       467
 C-D E-E D-E       0.43      0.58      0.49       137
         D-E       0.54    

In [None]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6782906666302189