# Random Forest

In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from tqdm import tqdm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score, roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the SMOTE-balanced dataset.
# Forward slashes keep the path portable: the previous 'datos\df_...' form only
# resolved on Windows and relied on '\d' being an unrecognised escape sequence.
df_balanceado = pd.read_csv('datos/df_codificado_balanceado.csv', index_col=0)
# Load the dataset without balancing.
df_sin_balanceo = pd.read_csv('datos/df_codificado_completo.csv', index_col=0)

In [3]:
# Split the balanced frame into predictors (every column except "Claim")
# and the target column "Claim".
X1 = df_balanceado.drop(columns="Claim")
y1 = df_balanceado["Claim"]

In [4]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): df_balanceado was already SMOTE-balanced when it was loaded, so the
# test partition may contain synthetic rows — confirm the test metrics are not optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

Se toman como punto de partida los parámetros del decision tree, porque dieron buenas métricas. Eran estos:

In [5]:
# Parameter set displayed for reference; the inline notes record the values
# that the decision tree used.
dict(
    ccp_alpha=0.0,
    class_weight=None,
    criterion='gini',
    max_depth=None,  # was 34 in the decision tree
    max_features=None,  # was 5 in the decision tree
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    random_state=0,
    splitter='best',
)

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 0,
 'splitter': 'best'}

In [6]:

# Hyperparameter grid explored by the random forest search.
param = dict(
    max_depth=list(range(18, 25, 2)),  # 18, 20, 22, 24: slightly shallower than the decision tree
    max_features=list(range(1, 6)),  # 1 through 5
    min_samples_leaf=[1, 2],  # raised slightly with respect to the decision tree
    min_samples_split=[2, 4],  # raised slightly with respect to the decision tree
)

In [7]:
# Exhaustive 10-fold cross-validated search over `param` for a seeded random forest.
# verbose=0 keeps the search silent. The previous verbose=-1 is outside the
# non-negative range scikit-learn documents for this option and is rejected by
# its parameter validation in recent releases.
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param,
    cv=10,
    verbose=0,
)

In [8]:
# Run the search on the training partition. With the grid above this is
# 4 * 5 * 2 * 2 = 80 parameter combinations x 10 folds = 800 fits, so it is slow.
gs_rf.fit(X=x_train, y=y_train)

In [9]:
# Keep the best forest found by the grid search and display it.
bosque = gs_rf.best_estimator_
bosque

In [10]:
# Optional visualisation of every tree in the forest. It is switched off by
# default because it draws one large figure per estimator; set the flag to
# True to run it.
VISUALIZAR_ARBOLES = False

if VISUALIZAR_ARBOLES:
    for arbol in tqdm(bosque.estimators_):
        fig = plt.figure(figsize=(20, 10))
        # A plain list of names is accepted across scikit-learn versions,
        # whereas a pandas Index is not always accepted.
        tree.plot_tree(arbol, feature_names=list(x_train.columns), filled=True)
        plt.show()
        plt.close(fig)  # release the figure so memory does not grow with every tree

In [11]:
# Predict with the best forest on both partitions so that train and test
# metrics can be compared afterwards.
y_pred_train_rf = bosque.predict(x_train)
y_pred_test_rf = bosque.predict(x_test)

In [12]:
def _metricas_conjunto(clases_reales, clases_predichas):
    """Compute the classification metrics for a single partition.

    Args:
        clases_reales: True class labels.
        clases_predichas: Predicted class labels for the same rows.

    Returns:
        dict mapping each results-table column name to its metric value.
    """
    # ravel() flattens the confusion matrix; unpacking into four values only
    # works when the matrix is 2x2, i.e. for a two-class problem.
    tn, fp, _fn, _tp = confusion_matrix(clases_reales, clases_predichas).ravel()

    return {
        "accuracy": accuracy_score(clases_reales, clases_predichas),
        "precision": precision_score(clases_reales, clases_predichas),
        "recall": recall_score(clases_reales, clases_predichas),
        # Specificity: share of real negatives that were predicted as negative.
        "especifidad": tn / (tn + fp),
        "f1": f1_score(clases_reales, clases_predichas),
        # The key spelling is kept unchanged so existing result tables keep
        # the same column name.
        "kapppa": cohen_kappa_score(clases_reales, clases_predichas),
    }


def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    """Build a two-row table comparing test and train metrics for one model.

    Args:
        clases_reales_test: True labels of the test partition.
        clases_predichas_test: Predicted labels of the test partition.
        clases_reales_train: True labels of the train partition.
        clases_predichas_train: Predicted labels of the train partition.
        modelo: Label stored in the "modelo" column of both rows.

    Returns:
        pandas.DataFrame with the test row first and the train row second, and
        the columns accuracy, precision, recall, especifidad, f1, kapppa, set
        and modelo.
    """
    df = pd.DataFrame([
        _metricas_conjunto(clases_reales_test, clases_predichas_test),
        _metricas_conjunto(clases_reales_train, clases_predichas_train),
    ])
    df["set"] = ["test", "train"]
    df["modelo"] = modelo
    return df

In [13]:
# Metrics table for the balanced random forest (test row first, then train).
dt_results1 = metricas(
    clases_reales_test=y_test,
    clases_predichas_test=y_pred_test_rf,
    clases_reales_train=y_train,
    clases_predichas_train=y_pred_train_rf,
    modelo="Random Forest Balanceado",
)
dt_results1

# see the note below

Unnamed: 0,accuracy,precision,recall,especifidad,f1,kapppa,set,modelo
0,0.970908,0.954296,0.989128,0.952725,0.9714,0.941817,test,Decission Tree Balanceado
1,0.991692,0.984152,0.999483,0.983897,0.991758,0.983384,train,Decission Tree Balanceado


Dada la diferencia de resultados entre los modelos balanceados y los no balanceados, no vamos a entrenar el Random Forest sobre los datos sin balancear: el Decision Tree ya nos daba muy buenas métricas y creemos que no tendría mucha utilidad. Además, consumiría muchísimos recursos y nos paralizaría en nuestro trabajo.

*NOTA: en la tabla de resultados aparece 'Decission Tree Balanceado' como nombre del modelo cuando en realidad se refiere a 'Random Forest Balanceado'. En el código ya se ha corregido la etiqueta, pero no se ha vuelto a ejecutar la celda para que el ordenador no quede bloqueado.*