# Experimentos Finales Modelos no RN

In [62]:
import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import datetime
from dateutil.parser import parse
from sklearn.tree import  DecisionTreeClassifier
import plotly.graph_objects as go
import umap
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import ml_metrics
import pickle
from sklearn.metrics import accuracy_score


In [63]:
# Read the cleaned CSV, drop its "Unnamed: 0" column and replace every
# missing value with an empty string; then preview the first rows.
Data = (
    pd.read_csv("cleaned_01June2022.csv")
    .drop(columns=["Unnamed: 0"])
    .fillna("")
)
Data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,pref_B-B,pref_D-E,pref_C-D,pref_A-A,Duracion_Campaña,B,C,lec_B-B,lec_E-E,lec_D-E,...,Renta,Recibe_sueldo_en_cuenta,Segmento_consumidor,Meses_antiguedad,Comuna,Ciudad,Estado_civil,Principalidad,Profesion,Target
0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,5.0,0.0,0.0,...,R1,0,A,Mayor a 10 años,331.0,13.0,D,B,P164,
1,0.0,0.0,0.0,0.0,1.0,2.0,9.0,14.0,1.0,1.0,...,R9,0,B,Mayor a 10 años,37.0,5.0,D,E,P164,D-E C-D E-E B-B
2,0.0,0.0,0.0,0.0,1.0,2.0,2.0,5.0,0.0,0.0,...,R9,1,B,Mayor a 10 años,19.0,4.0,B,D,P114,E-E A-A B-B
3,0.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0,0.0,1.0,...,R4,0,A,Mayor a 10 años,91.0,13.0,D,B,P85,E-E B-B D-E
4,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,0.0,...,R5,0,A,Mayor a 10 años,34.0,5.0,B,C,P143,D-E A-A


In [64]:
# Numerical features: every column whose dtype is neither "category" nor
# "object", minus the two flag columns that are treated as categorical.
numerical = [*Data.select_dtypes(exclude=["category", "object"]).columns]
numerical.remove("Sexo")
numerical.remove('Recibe_sueldo_en_cuenta')


# Categorical features: the "category"/"object" columns plus the two flag
# columns; the label column "Target" is not a feature and is taken out.
categorical = [
    *Data.select_dtypes(include=["category", "object"]).columns,
    'Sexo',
    'Recibe_sueldo_en_cuenta',
]
categorical.remove("Target")

In [65]:
# Cast all categorical columns to str in a single column-wise assignment.
Data[categorical] = Data[categorical].astype(str)

In [66]:
# Selection 1 - Gini only, visual cut-off
select_variables_1 = [
    'E-F', 'C-C', 'D-F', 'A-K',
    'num_comunicaciones', 'Length', 'pref_A-A',
    'Positivo', 'Recency', 'A-G',
    'Monto', 'Periodicity', 'Frequency',
    'lec_B-B', 'G-K',
]

In [68]:
# Selection 2 - "Corr Kill" criterion
select_variables_2 = [
    'E-F', 'C-C', 'D-F', 'A-K',
    'pref_A-A', 'num_comunicaciones', 'Length',
    'Sin Info', 'F-I', 'Monto', 'B', 'lec_B-B',
    'Comuna', 'Edad', 'Ciudad', 'Profesion', 'Renta',
    'F-H', 'lec_A-A', 'Principalidad',
    'A-L', 'lec_C-D', 'Estado_civil',
]

# Train, Test Split

In [69]:
# Feature matrix: an independent copy of Data without the "Target" column.
X = Data.drop("Target", axis=1).copy()
X.head(5)

Unnamed: 0,pref_B-B,pref_D-E,pref_C-D,pref_A-A,Duracion_Campaña,B,C,lec_B-B,lec_E-E,lec_D-E,...,Sexo,Renta,Recibe_sueldo_en_cuenta,Segmento_consumidor,Meses_antiguedad,Comuna,Ciudad,Estado_civil,Principalidad,Profesion
0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,5.0,0.0,0.0,...,1,R1,0,A,Mayor a 10 años,331.0,13.0,D,B,P164
1,0.0,0.0,0.0,0.0,1.0,2.0,9.0,14.0,1.0,1.0,...,1,R9,0,B,Mayor a 10 años,37.0,5.0,D,E,P164
2,0.0,0.0,0.0,0.0,1.0,2.0,2.0,5.0,0.0,0.0,...,1,R9,1,B,Mayor a 10 años,19.0,4.0,B,D,P114
3,0.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0,0.0,1.0,...,1,R4,0,A,Mayor a 10 años,91.0,13.0,D,B,P85
4,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,0.0,...,1,R5,0,A,Mayor a 10 años,34.0,5.0,B,C,P143


In [70]:
# Label vector: the "Target" column of Data.
Y = Data.loc[:, "Target"]
Y.head(5)

0                   
1    D-E C-D E-E B-B
2        E-E A-A B-B
3        E-E B-B D-E
4            D-E A-A
Name: Target, dtype: object

In [71]:
# Build the train and test sets, holding out 33% of the rows for testing.
# The shuffle is seeded so the split is identical on every Restart & Run All;
# without a seed the metrics computed from Xtest/Ytest change between runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.33,
    shuffle=True,
    random_state=0,
)

In [72]:
# Oversampling: increase the number of rows of the minority class.
# ROS (RandomOverSampler) does this by duplicating rows.
ros=RandomOverSampler(random_state=0) # seed fixed at 0

In [73]:
# ROS: resample the training split and rebind Xtrain / Ytrain to the result.
# The test split is not resampled.
Xtrain, Ytrain=ros.fit_resample(Xtrain,Ytrain)

In [74]:
# Display the (rows, columns) shape of the current Xtrain.
Xtrain.shape

(615606, 43)

In [75]:
# Split feature selection 1 into its numerical and categorical members,
# keeping the order in which they appear in select_variables_1.
numerical1 = [name for name in select_variables_1 if name in numerical]
categorical1 = [name for name in select_variables_1 if name in categorical]

In [76]:
# Split feature selection 2 into its numerical and categorical members,
# keeping the order in which they appear in select_variables_2.
numerical2 = [name for name in select_variables_2 if name in numerical]
categorical2 = [name for name in select_variables_2 if name in categorical]

In [77]:
# Data preprocessing for selection 1: one-hot encode the categorical columns
# (handle_unknown="ignore") and min-max scale the numerical columns.
preprocessing_transformer1 = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', OneHotEncoder(handle_unknown="ignore"), categorical1),
        ('MinMax', MinMaxScaler(), numerical1),
    ]
)

In [78]:
# Data preprocessing for selection 2: one-hot encode the categorical columns
# (handle_unknown="ignore") and min-max scale the numerical columns.
preprocessing_transformer2 = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', OneHotEncoder(handle_unknown="ignore"), categorical2),
        ('MinMax', MinMaxScaler(), numerical2),
    ]
)

# Exp 1

In [79]:
# Selection-1 preprocessing followed by a multinomial Naive Bayes classifier.
pipe_naive1 = Pipeline(
    steps=[
        ("preprocesamiento", preprocessing_transformer1),
        ("clf", MultinomialNB()),
    ]
)

In [80]:
# Train the pipeline on the training split.
pipe_naive1.fit(X=Xtrain, y=Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  []),
                                                 ('MinMax', MinMaxScaler(),
                                                  ['E-F', 'C-C', 'D-F', 'A-K',
                                                   'num_comunicaciones',
                                                   'Length', 'pref_A-A',
                                                   'Positivo', 'Recency', 'A-G',
                                                   'Monto', 'Periodicity',
                                                   'Frequency', 'lec_B-B',
                                                   'G-K'])])),
                ('clf', MultinomialNB())])

In [81]:
# Generate predictions for the test split.
Ypred = pipe_naive1.predict(X=Xtest)

In [82]:
# Exact-match accuracy on the test split. Arguments are passed in the
# documented (y_true, y_pred) order, matching the mapk call on the same data.
accuracy_score(Ytest, Ypred)

0.31917336394948337

In [83]:
# Break each space-separated label string into a list of items, then compute
# mean average precision at 5 (actual lists first, predicted lists second).
Ytest_map = pd.Series(Ytest).str.split(" ")
Ypred_map = pd.Series(Ypred).str.split(" ")
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6991530443022981

# Exp 2

In [84]:
# Selection-2 preprocessing followed by a multinomial Naive Bayes classifier.
pipe_naive2 = Pipeline(
    steps=[
        ("preprocesamiento", preprocessing_transformer2),
        ("clf", MultinomialNB()),
    ]
)

In [85]:
# Train the pipeline on the training split.
pipe_naive2.fit(X=Xtrain, y=Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Comuna', 'Edad', 'Ciudad',
                                                   'Profesion', 'Renta',
                                                   'Principalidad',
                                                   'Estado_civil']),
                                                 ('MinMax', MinMaxScaler(),
                                                  ['E-F', 'C-C', 'D-F', 'A-K',
                                                   'pref_A-A',
                                                   'num_comunicaciones',
                                                   'Length', 'Sin Info', 'F-I',
                                                   'Monto', 'B', 'lec_B-B',
                                                   'F-H', 'lec_A-A', '

In [86]:
# Generate predictions for the test split.
Ypred = pipe_naive2.predict(X=Xtest)

In [87]:
# Exact-match accuracy on the test split. Arguments are passed in the
# documented (y_true, y_pred) order, matching the mapk call on the same data.
accuracy_score(Ytest, Ypred)

0.07671805806134165

In [88]:
# Break each space-separated label string into a list of items, then compute
# mean average precision at 5 (actual lists first, predicted lists second).
Ytest_map = pd.Series(Ytest).str.split(" ")
Ypred_map = pd.Series(Ypred).str.split(" ")
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.5310307437173108

# Exp 3

In [89]:
# Selection-1 preprocessing followed by a random forest with 700 trees,
# a maximum depth of 50 and a fixed seed.
pipe_random_forest1 = Pipeline(
    steps=[
        ("preprocesamiento", preprocessing_transformer1),
        (
            "clf",
            RandomForestClassifier(
                random_state=1,
                max_depth=50,
                n_estimators=700,
            ),
        ),
    ]
)

In [90]:
# Train the pipeline on the training split.
# NOTE(review): the saved output of this cell is
# "MemoryError: could not allocate 19398656 bytes", so the cells that use the
# fitted model have no recorded results.
pipe_random_forest1.fit(X=Xtrain, y=Ytrain)

MemoryError: could not allocate 19398656 bytes

In [None]:
# Generate predictions for the test split.
Ypred = pipe_random_forest1.predict(X=Xtest)

In [None]:
# Exact-match accuracy on the test split. Arguments are passed in the
# documented (y_true, y_pred) order, matching the mapk call on the same data.
accuracy_score(Ytest, Ypred)

In [None]:
# Break each space-separated label string into a list of items, then compute
# mean average precision at 5 (actual lists first, predicted lists second).
Ytest_map = pd.Series(Ytest).str.split(" ")
Ypred_map = pd.Series(Ypred).str.split(" ")
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

In [None]:
# Save the model: the context manager closes the file handle even if
# pickling raises, instead of leaving it open until garbage collection.
with open('pipe_random_forest1.pkl', "wb") as model_file:
    pickle.dump(pipe_random_forest1, model_file)

In [None]:
# Load the model: the context manager closes the file handle once the
# pipeline has been read. pickle.load can execute arbitrary code, so only
# load files written by this notebook.
with open('pipe_random_forest1.pkl', 'rb') as model_file:
    pipe_random_forest1 = pickle.load(model_file)

# Exp 4

In [None]:
# Selection-2 preprocessing followed by a random forest with 700 trees,
# a maximum depth of 50 and a fixed seed.
pipe_random_forest2 = Pipeline(
    steps=[
        ("preprocesamiento", preprocessing_transformer2),
        (
            "clf",
            RandomForestClassifier(
                random_state=1,
                max_depth=50,
                n_estimators=700,
            ),
        ),
    ]
)

In [None]:
# Train the pipeline on the training split.
pipe_random_forest2.fit(X=Xtrain, y=Ytrain)

In [None]:
# Generate predictions for the test split.
Ypred = pipe_random_forest2.predict(X=Xtest)

In [None]:
# Exact-match accuracy on the test split. Arguments are passed in the
# documented (y_true, y_pred) order, matching the mapk call on the same data.
accuracy_score(Ytest, Ypred)

In [None]:
# Break each space-separated label string into a list of items, then compute
# mean average precision at 5 (actual lists first, predicted lists second).
Ytest_map = pd.Series(Ytest).str.split(" ")
Ypred_map = pd.Series(Ypred).str.split(" ")
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

In [None]:
# Save the model: the context manager closes the file handle even if
# pickling raises, instead of leaving it open until garbage collection.
with open('pipe_random_forest2.pkl', "wb") as model_file:
    pickle.dump(pipe_random_forest2, model_file)

In [None]:
# Load the model: the context manager closes the file handle once the
# pipeline has been read. pickle.load can execute arbitrary code, so only
# load files written by this notebook.
with open('pipe_random_forest2.pkl', 'rb') as model_file:
    pipe_random_forest2 = pickle.load(model_file)

# Exp 5

In [None]:
# Selection-1 preprocessing followed by an XGBoost classifier.
# The seed is passed as random_state, the scikit-learn-API parameter of
# XGBClassifier, rather than through the raw `seed` keyword; this also matches
# how the random forest pipelines are seeded.
# NOTE(review): Target is a string column; some xgboost releases only accept
# integer-encoded class labels - confirm against the installed version.
pipe_xgb1 = Pipeline(
    steps=[
        ("preprocesamiento", preprocessing_transformer1),
        (
            "clf",
            xgb.XGBClassifier(
                random_state=1,
                eval_metric='mlogloss',
                learning_rate=0.5,
                n_estimators=500,
            ),
        ),
    ]
)


In [None]:
# Train the pipeline on the training split.
pipe_xgb1.fit(X=Xtrain, y=Ytrain)

In [None]:
# Generate predictions for the test split.
Ypred = pipe_xgb1.predict(X=Xtest)

In [None]:
# Exact-match accuracy on the test split. Arguments are passed in the
# documented (y_true, y_pred) order, matching the mapk call on the same data.
accuracy_score(Ytest, Ypred)

In [None]:
# Break each space-separated label string into a list of items, then compute
# mean average precision at 5 (actual lists first, predicted lists second).
Ytest_map = pd.Series(Ytest).str.split(" ")
Ypred_map = pd.Series(Ypred).str.split(" ")
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

In [None]:
# Save the model: the context manager closes the file handle even if
# pickling raises, instead of leaving it open until garbage collection.
with open('pipe_xgb1.pkl', "wb") as model_file:
    pickle.dump(pipe_xgb1, model_file)

In [None]:
# Load the model: the context manager closes the file handle once the
# pipeline has been read. pickle.load can execute arbitrary code, so only
# load files written by this notebook.
with open('pipe_xgb1.pkl', 'rb') as model_file:
    pipe_xgb1 = pickle.load(model_file)

# Exp 6

In [None]:
# Selection-2 preprocessing followed by an XGBoost classifier.
# The seed is passed as random_state, the scikit-learn-API parameter of
# XGBClassifier, rather than through the raw `seed` keyword; this also matches
# how the random forest pipelines are seeded.
# NOTE(review): Target is a string column; some xgboost releases only accept
# integer-encoded class labels - confirm against the installed version.
pipe_xgb2 = Pipeline(
    steps=[
        ("preprocesamiento", preprocessing_transformer2),
        (
            "clf",
            xgb.XGBClassifier(
                random_state=1,
                eval_metric='mlogloss',
                learning_rate=0.5,
                n_estimators=500,
            ),
        ),
    ]
)


In [None]:
# Train the pipeline on the training split.
pipe_xgb2.fit(X=Xtrain, y=Ytrain)

In [None]:
# Generate predictions for the test split.
Ypred = pipe_xgb2.predict(X=Xtest)

In [None]:
# Exact-match accuracy on the test split. Arguments are passed in the
# documented (y_true, y_pred) order, matching the mapk call on the same data.
accuracy_score(Ytest, Ypred)

In [None]:
# Break each space-separated label string into a list of items, then compute
# mean average precision at 5 (actual lists first, predicted lists second).
Ytest_map = pd.Series(Ytest).str.split(" ")
Ypred_map = pd.Series(Ypred).str.split(" ")
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

In [None]:
# Save the model: the context manager closes the file handle even if
# pickling raises, instead of leaving it open until garbage collection.
with open('pipe_xgb2.pkl', "wb") as model_file:
    pickle.dump(pipe_xgb2, model_file)

In [None]:
# Load the model: the context manager closes the file handle once the
# pipeline has been read. pickle.load can execute arbitrary code, so only
# load files written by this notebook.
with open('pipe_xgb2.pkl', 'rb') as model_file:
    pipe_xgb2 = pickle.load(model_file)