# Experimentos Finales Modelos no RN

In [3]:
import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import datetime
from dateutil.parser import parse
from sklearn.tree import  DecisionTreeClassifier
import plotly.graph_objects as go
import umap
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import ml_metrics
import pickle
from sklearn.metrics import accuracy_score
import json
import warnings
warnings.filterwarnings("ignore")


In [4]:
Data=pd.read_csv("../Datos/cleaned_05June2022.csv").drop(columns=["Unnamed: 0"]).fillna("")
Data.head()

Unnamed: 0,Duracion_Campaña,B,C,lec_B-B,lec_E-E,lec_D-E,lec_C-D,lec_A-A,num_comunicaciones,Length,...,Renta,Recibe_sueldo_en_cuenta,Segmento_consumidor,Meses_antiguedad,Comuna,Ciudad,Estado_civil,Principalidad,Profesion,Target
0,1.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,22.0,371,...,R1,0,A,Mayor a 10 años,331.0,13.0,D,B,P164,
1,1.0,6.0,6.0,8.0,0.0,1.0,1.0,0.0,15.0,371,...,R4,0,A,Mayor a 10 años,91.0,13.0,D,B,P85,E-E B-B D-E
2,1.0,2.0,3.0,1.0,0.0,0.0,1.0,0.0,26.0,371,...,R5,0,A,Mayor a 10 años,34.0,5.0,B,C,P143,D-E A-A
3,1.2,7.0,3.0,5.0,0.0,0.0,2.0,2.0,34.0,371,...,R9,0,A,Mayor a 10 años,331.0,13.0,B,C,P1,E-E
4,1.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,24.0,371,...,R2,0,B,Mayor a 10 años,37.0,5.0,D,F,P210,D-E


In [5]:
# datos númericos
numerical=list(Data.select_dtypes(exclude=["category","object"]).columns)
numerical.remove("Sexo")
numerical.remove('Recibe_sueldo_en_cuenta')


# datos categorical
categorical=list(Data.select_dtypes(include=["category","object"]).columns)+['Sexo','Recibe_sueldo_en_cuenta']
categorical.remove("Target")

In [6]:
for i in categorical:
    Data[i]=Data[i].astype(str)

In [7]:
# Select 
select_variables_1=['E-F',
 'D-F',
 'C-C',
 'A-K',
 'A-G',
 'num_comunicaciones',
 'Recency',
 'Positivo',
 'B',
 'Monto',
 'Periodicity',
 'Profesion',
 'lec_B-B',
 'Renta',
 'F-I',
 'Edad',
 'lec_A-A',
 'Comuna',
 'Sin Info',
 'Ciudad',
 'Duracion_Campaña',
 'Principalidad']

# Train, Test Split

In [8]:
X=Data.drop(columns=["Target"]).copy()
X.head()

Unnamed: 0,Duracion_Campaña,B,C,lec_B-B,lec_E-E,lec_D-E,lec_C-D,lec_A-A,num_comunicaciones,Length,...,Sexo,Renta,Recibe_sueldo_en_cuenta,Segmento_consumidor,Meses_antiguedad,Comuna,Ciudad,Estado_civil,Principalidad,Profesion
0,1.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,22.0,371,...,1,R1,0,A,Mayor a 10 años,331.0,13.0,D,B,P164
1,1.0,6.0,6.0,8.0,0.0,1.0,1.0,0.0,15.0,371,...,1,R4,0,A,Mayor a 10 años,91.0,13.0,D,B,P85
2,1.0,2.0,3.0,1.0,0.0,0.0,1.0,0.0,26.0,371,...,1,R5,0,A,Mayor a 10 años,34.0,5.0,B,C,P143
3,1.2,7.0,3.0,5.0,0.0,0.0,2.0,2.0,34.0,371,...,1,R9,0,A,Mayor a 10 años,331.0,13.0,B,C,P1
4,1.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,24.0,371,...,0,R2,0,B,Mayor a 10 años,37.0,5.0,D,F,P210


In [9]:
Y=Data["Target"]
Y.head()

0               
1    E-E B-B D-E
2        D-E A-A
3            E-E
4            D-E
Name: Target, dtype: object

In [10]:
# Generación de conjuntos de train y test
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.33, 
    shuffle=True,
    
) 

In [11]:
# Sobremuestreo: Aumentar numero de datos de la clase minoritaria
# ROS(Duplica clases):
ros=RandomOverSampler(random_state=0) # Random_state=0

In [12]:
# ROS:
Xtrain, Ytrain=ros.fit_resample(Xtrain,Ytrain)

In [13]:
Xtrain.shape

(281330, 39)

In [12]:
numerical1=list(filter(lambda x:x  in numerical,select_variables_1))
categorical1=list(filter(lambda x:x  in categorical,select_variables_1))

In [13]:
# Preprocesamiento de datos
preprocessing_transformer1 = ColumnTransformer(
    transformers=[('OneHotEncoder', OneHotEncoder(handle_unknown="ignore"),categorical1 ),
        ('MinMax', MinMaxScaler(),numerical1),
    ])

# Model 0

In [14]:
tags_to_naive=['E-F',
 'D-F',
 'C-C',
 'A-K',
 'A-G',
 'num_comunicaciones',
 'Recency',
 'Positivo',
 'B',
 'Monto',
 'Periodicity',
 'lec_B-B',
 'F-I',
 'lec_A-A',
 'Sin Info',
 'Duracion_Campaña',
 ]

In [15]:
# Preprocesamiento de datos
preprocessing_transformer2 = ColumnTransformer(
    transformers=[
        ('MinMax', MinMaxScaler(),tags_to_naive),
    ])

In [16]:
pipe_naive = Pipeline(
    [
        ("preprocesamiento", preprocessing_transformer2), 
     ("clf",MultinomialNB())
    ]
)

In [17]:
# Fit
pipe_naive.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('MinMax', MinMaxScaler(),
                                                  ['E-F', 'D-F', 'C-C', 'A-K',
                                                   'A-G', 'num_comunicaciones',
                                                   'Recency', 'Positivo', 'B',
                                                   'Monto', 'Periodicity',
                                                   'lec_B-B', 'F-I', 'lec_A-A',
                                                   'Sin Info',
                                                   'Duracion_Campaña'])])),
                ('clf', MultinomialNB())])

In [18]:
# Generación de predicciones
Ypred = pipe_naive.predict(Xtest)

In [19]:
accuracy_score(Ypred, Ytest)

0.2863466915191053

In [20]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6473128300714509

In [21]:
# Guardar modelo:
pickle.dump(pipe_naive, open('../Modelos/pipe_naive.pkl', "wb"))

In [None]:
# Leer modelo:
pipe_naive= pickle.load(open('../Modelos/pipe_naive.pkl', 'rb'))

# Model 1

In [22]:
pipe_tree = Pipeline(
    [
        ("preprocesamiento", preprocessing_transformer1), 
     ("clf",DecisionTreeClassifier(random_state=0))
    ]
)

In [23]:
# Fit
pipe_tree.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Profesion', 'Renta', 'Edad',
                                                   'Comuna', 'Ciudad',
                                                   'Principalidad']),
                                                 ('MinMax', MinMaxScaler(),
                                                  ['E-F', 'D-F', 'C-C', 'A-K',
                                                   'A-G', 'num_comunicaciones',
                                                   'Recency', 'Positivo', 'B',
                                                   'Monto', 'Periodicity',
                                                   'lec_B-B', 'F-I', 'lec_A-A',
                                                   'Sin Info',
                                                   '

In [24]:
# Generación de predicciones
Ypred = pipe_tree.predict(Xtest)

In [25]:
accuracy_score(Ypred, Ytest)

0.31183597390493945

In [26]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6510523454488971

In [27]:
# Guardar modelo:
pickle.dump(pipe_tree, open('../Modelos/pipe_tree.pkl', "wb"))

In [17]:
# Leer modelo:
pipe_tree= pickle.load(open('../Modelos/pipe_tree.pkl', 'rb'))

# Model 2

In [28]:
pipe_random_forest1= Pipeline(
    [
        ("preprocesamiento", preprocessing_transformer1), 
     ("clf",RandomForestClassifier(random_state=1,n_estimators=100))
    ]
)

In [29]:
# Fit
pipe_random_forest1.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocesamiento',
                 ColumnTransformer(transformers=[('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Profesion', 'Renta', 'Edad',
                                                   'Comuna', 'Ciudad',
                                                   'Principalidad']),
                                                 ('MinMax', MinMaxScaler(),
                                                  ['E-F', 'D-F', 'C-C', 'A-K',
                                                   'A-G', 'num_comunicaciones',
                                                   'Recency', 'Positivo', 'B',
                                                   'Monto', 'Periodicity',
                                                   'lec_B-B', 'F-I', 'lec_A-A',
                                                   'Sin Info',
                                                   '

In [30]:
# Generación de predicciones
Ypred = pipe_random_forest1.predict(Xtest)

In [31]:
accuracy_score(Ypred, Ytest)

0.4391891891891892

In [32]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.6990292016154085

In [33]:
# Guardar modelo:
pickle.dump(pipe_random_forest1, open('../Modelos/pipe_random_forest1.pkl', "wb"))

In [24]:
# Leer modelo:
pipe_random_forest1= pickle.load(open('../Modelos/pipe_random_forest1.pkl', 'rb'))

# Model 3: Fue entrenado en GPU de la nube de google

In [None]:
# save to BIN
#model_xgb1.save_model("./pipe_xgb1.bin")

In [14]:
tf = open("../Modelos/myDictionary.json", "r")
dic_target = json.load(tf)

In [16]:
# Load
model_xgb1 = xgb.XGBClassifier()
model_xgb1.load_model("../Modelos/model_xgb1.bin")

In [17]:
# Leer pipeline:
pipe_xgb1= pickle.load(open('../Modelos/pipe_xgb1.pkl', 'rb'))

In [18]:
model_xgb1

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              seed=1, subsample=1, tree_method='auto', validate_parameters=1, ...)

-----------

In [19]:
# Preprocesamiento
Xtest1=pipe_xgb1.transform(Xtest)

In [20]:
inv_map = {v: k for k, v in dic_target.items()}

In [25]:
Ypred=pd.DataFrame(model_xgb1.predict(Xtest1)).replace(dic_target)[0]
Ypred

0            D-E B-B
1            B-B E-E
2            B-B A-A
3                D-E
4        D-E B-B E-E
            ...     
21455            B-B
21456        B-B D-E
21457            E-E
21458        D-E A-A
21459            C-D
Name: 0, Length: 21460, dtype: object

In [26]:
Ytest.replace(dic_target,inplace=True)
Ytest

30410        D-E B-B
49736        E-E B-B
34584        B-B A-A
37201            D-E
11055    D-E B-B E-E
            ...     
53153            B-B
3884         B-B D-E
38421            E-E
30413            D-E
62741            C-D
Name: Target, Length: 21460, dtype: object

In [27]:
accuracy_score(Ypred, Ytest)

0.5970177073625349

In [28]:
Ypred_map=pd.Series(Ypred).apply( lambda x: x.split(" "))
Ytest_map=pd.Series(Ytest).apply( lambda x: x.split(" "))
ml_metrics.mapk(Ytest_map, Ypred_map, 5)

0.7977335093714404

In [52]:
# Guardar pipeline pre-process:
#pickle.dump(pipe_xgb1, open('pipe_xgb1.pkl', "wb"))