En este notebook la parte del proyecto que se desarrolla 
es la seleccion de modelo y posteriormente, la mejora del mismo

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import MinMaxScaler

# Métodos de Validación
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Modelos de Clasificación

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Métricas para Clasificación
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

#from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# OverSampling y UnderSampling
from imblearn.over_sampling import SMOTE



In [2]:
#Leemos el df resultante de la limpieza de datos ""dfbank_clean.csv""
df=pd.read_csv("dfbank_clean.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,education,age,job,balance,duration,campaign,previous,y,Marital_single,Default_yes,Housing_yes,Loan_yes,Contact_telephone
0,0,1.0,58.0,4.0,2143.0,261.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,0.0,44.0,9.0,29.0,151.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,2,0.0,33.0,2.0,2.0,76.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,3,0.0,47.0,1.0,1506.0,92.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,4,0.0,33.0,11.0,1.0,198.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44658,41506,,48.0,0.0,178.0,228.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44659,41515,,54.0,0.0,66.0,167.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
44660,41516,,36.0,0.0,1224.0,482.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
44661,41522,,70.0,5.0,324.0,78.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
#Eliminar la columna Unnamed
df.drop("Unnamed: 0", axis =1, inplace = True)


In [5]:
#Aislar la variable objetivo del df
X = df.drop('y', axis=1)
y = np.array(df["y"])

In [6]:
#Aqui vemos como la variable objetivo necesita un oversampling ya que la clase 
#minoritaria esta muy desbalanceada en relacion a clase mayoritaria
from collections import Counter

conteo = Counter(y)

for valor, cantidad in conteo.items():
    print(f"El valor {valor} aparece {cantidad/len(y)*100} veces.")

El valor 0.0 aparece 85.2808812663726 veces.
El valor 1.0 aparece 7.733470658039093 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.002238989767816761 veces.
El valor nan aparece 0.0022

In [7]:
# Se realiza el oversampling
oversampling = SMOTE(sampling_strategy = 0.45) #queremos 80% de la clase mayoritaria y 20%de la minoritaria
X_balanceado, y_balanceado = oversampling.fit_resample(X, y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
#Comparamos como queda la proporcion  y_balanceada

conteo = Counter(y_balanceado)

for valor, cantidad in conteo.items():
    print(f"El valor {valor} aparece {cantidad/len(y)*100} veces.")


In [None]:
# Normalizar y se preparan los datos train y test
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X_balanceado)

y_scaler = MinMaxScaler()
y = y_scaler.fit_transform(y_balanceado.reshape(-1,1))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [None]:
#Tenemos una lista con los modelos que vamos a probar y evaluar las mejores metricas

modelos = [LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier(),
           RandomForestClassifier(), SVC(), NearestCentroid(), GaussianNB()]



In [None]:
%%time
datos_modelos = list()

for model in modelos:
    model.fit(X_train, y_train.ravel())
    yhat = model.predict(X_test)
    
    # Metricas
    Jaccard_index = jaccard_score(y_test, yhat, average = "macro")
    Accuracy = accuracy_score(y_test, yhat)
    Precisión = precision_score(y_test, yhat, average = "macro")
    Sensibilidad = recall_score(y_test, yhat, average = "macro")
    F1_score = f1_score(y_test, yhat, average = "macro")
    Roc_auc = roc_auc_score(y_test, yhat, average = "macro")
    
    datos_modelos.append([str(model), Jaccard_index, Accuracy,Precisión, Sensibilidad,F1_score,
                          Roc_auc])
    
df_modelo = pd.DataFrame(data = datos_modelos, columns = ["modelo", "Jaccard_index", "Accuracy", "Precisión", "Sensibilidad","F1_score","Roc_auc"])

df_modelo.sort_values("Roc_auc", ascending = False)

In [None]:
%%time
#El que mejores metricas ofrece es el Random Forest

model = RandomForestClassifier()

model.fit(X_train, y_train.ravel())

In [None]:
%%time
#Calculamos los parametros 

# Max depth

print(Counter([x.get_depth() for x in model.estimators_]))
sns.histplot([x.get_depth() for x in model.estimators_])
plt.show()

# N Leaves
from collections import Counter
print(Counter([x.get_n_leaves() for x in model.estimators_]))
sns.histplot([x.get_n_leaves() for x in model.estimators_])
plt.show()

In [None]:
%%time
# GridSearch 

model = RandomForestClassifier()

params = {"n_estimators"           : [100, 150], # Numero de arboles
          "criterion"              : ["gini", "entropy"], # Es la función para medir la calidad de una división/split.
          "max_depth"              : range(30, 45, 3), # La profundidad máxima del árbol.
          "max_features"           : ["sqrt", "log2", None], # El número de características (atributos) a considerar en cada split
          "max_leaf_nodes"         : range(3900, 4300, 50), # Maximo de nodos hoja del arbol
          "min_samples_split"      : [10, 15, 20, None], # El número mínimo de muestras requeridas para llegar a nodo hoja.
         } 

scorers = ["accuracy", "roc_auc"]

grid_solver = GridSearchCV(estimator  = model     , 
                           param_grid = params    , 
                           scoring    = scorers   ,
                           cv         = 5         ,
                           refit      = "roc_auc" ,
                           n_jobs     = -1        ,
                           verbose    = 3)

model_result = grid_solver.fit(X, y.ravel())

print(model_result.cv_results_["mean_test_roc_auc"].mean())
print(model_result.cv_results_["mean_test_accuracy"].mean())

print("*"*100)

print(model_result.best_score_)
print(model_result.best_params_)

In [None]:
import pickle

In [None]:
best_model = RandomForestClassifier(**model_result.best_params_)
model.fit(X, y.ravel())

with open("rfc_model_final.pkl", "bw") as file:
    pickle.dump(model, file)

In [None]:
import pickle


In [None]:
#Guardamos el mejor modelo resultante en un archivo pickle
best_model = RandomForestClassifier(**model_result.best_params_)
model.fit(X, y.ravel())

with open("rfc_model_final.pkl", "bw") as file:
    pickle.dump(model, file)

El modelo de Random Forest que mejores paramétro da es:
* criterion: gini
* max_depht: 36
* max_features: sqrt
* max_leave_node: 3950
* min_sample_spli: 10
* n_estimator: 150
