# Montamos Unidad de Drive

In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the project files
# stored there can be reached through the regular file system.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab Notebooks/TP_Ind4

/content/drive/MyDrive/Colab Notebooks/TP_Ind4


In [None]:
# NOTE(review): leftover attempt. This relative path does not exist, so the cell
# fails with "[Errno 2] No such file or directory" and the working directory
# stays where it was. Candidate for removal.
%cd drive/MyDrive/TP_Ind4

[Errno 2] No such file or directory: 'drive/MyDrive/TP_Ind4'
/content/drive/MyDrive/Colab Notebooks/TP_Ind4


In [None]:
# List the current (project) folder to check which files and folders are there.
%ls

 AnalisisExploratorio.ipynb
'best model'/
 data/
 Docs.gdoc
'model tunning'/
'TP 1 - Exploración, visualización de datos y Machine Learning.pdf'


# Paquetes

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import plotly.express as px

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training set; the first CSV column is used as the row index.
train = pd.read_csv("data/Train.csv", index_col=0)

In [None]:
# Quick sanity check of the loaded frame: (rows, columns).
train.shape

(33908, 35)

In [None]:
# Split the training frame into the response (y) and the explanatory variables (X).
y_train = train['Subscription']
X_train = train.drop(columns='Subscription')

## Optimization Function

Generamos la función de profit para optimizar los hiperparámetros.

In [None]:
# NOTE(review): `plot_confusion_matrix` was removed from sklearn.metrics in
# scikit-learn 1.2, so this import raises ImportError on current versions. It is
# not used in the visible cells; confirm and drop it (ConfusionMatrixDisplay is
# the replacement if a plot is ever needed).
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, make_scorer

def profit_mean(y_true, y_pred):
  """Return the average profit per observation for binary predictions.

  The positive class is the label ``1``. Each outcome contributes:

  * true positive  (actual 1, predicted 1): +250
  * false positive (actual not 1, predicted 1): -50
  * false negative (actual 1, predicted not 1): -25
  * true negative: 0

  The outcome counts are computed directly from the labels instead of indexing
  a confusion matrix, so the function also works when a fold (or the
  predictions) contains a single class; a confusion matrix built from the
  observed labels would be 1x1 in that case and indexing it would fail.

  Parameters
  ----------
  y_true : array-like
      Actual class labels.
  y_pred : array-like
      Predicted class labels, same length as ``y_true``.

  Returns
  -------
  float
      Total profit divided by the number of observations.

  Raises
  ------
  ValueError
      If the inputs are empty or their lengths differ.
  """
  actual = np.asarray(y_true).ravel()
  predicted = np.asarray(y_pred).ravel()

  if actual.size != predicted.size:
    raise ValueError(
        f"y_true and y_pred must have the same length "
        f"(got {actual.size} and {predicted.size}).")
  n = actual.size
  if n == 0:
    raise ValueError("profit_mean requires at least one observation.")

  actual_pos = actual == 1
  predicted_pos = predicted == 1

  true_pos = np.count_nonzero(actual_pos & predicted_pos)
  false_pos = np.count_nonzero(~actual_pos & predicted_pos)
  false_neg = np.count_nonzero(actual_pos & ~predicted_pos)

  prof = 250 * true_pos - 50 * false_pos - 25 * false_neg
  return prof / n

In [None]:
# Wrap profit_mean as a scikit-learn scorer; a larger profit is better, so the
# score is used as-is (no sign flip).
profit_mean_score = make_scorer(profit_mean, greater_is_better=True)

## A tener en cuenta
Cuando tuneamos un modelo:
* **Siempre** incluir todos sus **hiperparámetros en el grid**, aunque sea con el valor default. Así el dataframe que generamos con los resultados tiene siempre las mismas columnas y, si después queremos tunear un hiperparámetro que no habíamos tuneado, **no se agrega una nueva columna** (lo que desalinearía las filas que se van agregando al CSV de resultados).
* **Siempre** incluir el *train score* para ver si tenemos overfitting, comparando con el resultado de test.

In [None]:
# Hyperparameter grid explored by the search. Every hyperparameter is listed,
# fixed ones as single-value lists, so the results table keeps the same columns
# from run to run.
rf_grid = dict(
    n_estimators=[75],
    criterion=['gini'],
    max_depth=[9, 12, 14, 15, 16, 17, 18],
    max_features=[0.5, 0.9],
    ccp_alpha=[0],                              # disabled: 0.001, 0.01, 0.1
    max_leaf_nodes=list(range(260, 341, 20)),   # disabled: 20, 40
    min_samples_leaf=[1],                       # disabled: 200, 400
    # Weight applied to class 1 only.
    class_weight=[{1: weight} for weight in (5, 6, 7)],
)

In [None]:
# Base estimator: only the settings that are not tuned are fixed here (a fixed
# seed and n_jobs=-1); every other hyperparameter comes from the grid.
rf = RandomForestClassifier(
    random_state=45,
    n_jobs=-1,
)

In [None]:
from pathlib import Path

# Accumulated log with the cross-validation results of every tuning run.
cv_results_path = Path('model tunning/RandomForest/RF_CVResults.csv')

time_start = time.time()

# Exhaustive 5-fold grid search scored by mean profit. Train scores are kept to
# compare against the test scores (overfitting check), and the best
# configuration is refit on the whole training set.
rf_cv = GridSearchCV(rf, rf_grid, cv=5, scoring=profit_mean_score, verbose=4,
                     n_jobs=-1, return_train_score=True, refit=True)

rf_cv.fit(X_train, y_train.values.ravel())

# Build and preview a DataFrame with the results of the search.
rf_cv_results = pd.DataFrame(rf_cv.cv_results_)
print(rf_cv_results.head(10))

# Append this run to the accumulated CSV. The header is written only when the
# file is new or empty: appending without a header to a missing file would
# produce a header-less CSV whose first data row is later read as column names.
# NOTE: rank_test_score is relative to this run only, so after several appends
# runs must be compared through mean_test_score, not through the rank.
cv_results_path.parent.mkdir(parents=True, exist_ok=True)
write_header = (not cv_results_path.exists()
                or cv_results_path.stat().st_size == 0)
rf_cv_results.to_csv(cv_results_path, mode='a', header=write_header)

# Report the total wall-clock time of the search (including the CSV write).
time_finish = time.time()
print(f'Duración {round(time_finish - time_start, 5)} segundos')
print(f'Duración {round((time_finish - time_start)/60, 5)} minutos')

Fitting 5 folds for each of 210 candidates, totalling 1050 fits
   mean_fit_time  std_fit_time  ...  mean_train_score  std_train_score
0       4.574513      0.044673  ...         17.918338         0.188972
1       4.479654      0.066794  ...         17.923315         0.190334
2       4.541423      0.068420  ...         17.923683         0.190140
3       4.467522      0.123478  ...         17.923683         0.190140
4       4.469275      0.036941  ...         17.923683         0.190140
5       7.123837      0.081057  ...         18.387806         0.146834
6       7.192114      0.060721  ...         18.390571         0.148372
7       7.232675      0.109002  ...         18.390939         0.148871
8       7.393360      0.077536  ...         18.390939         0.148871
9       7.294203      0.101107  ...         18.390939         0.148871

[10 rows x 28 columns]
Duración 3660.71208 segundos
Duración 61.01187 minutos


In [None]:
# Best configurations of this run, ordered by rank (rank 1 has the highest
# mean test score).
rf_cv_results.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
180,4.684155,0.209446,0.227016,0.009722,0,{1: 7},gini,16,0.5,260,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.913447,15.961368,15.574314,15.679841,17.125055,16.050805,0.555974,1,20.144879,20.460997,20.447172,20.69064,20.465772,20.441892,0.173813
181,4.650534,0.098834,0.226519,0.006463,0,{1: 7},gini,16,0.5,280,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.762312,15.86184,15.555883,15.55449,17.117682,15.970442,0.585858,2,20.45731,20.76329,20.635184,20.918273,20.714602,20.697732,0.151615
182,4.661699,0.079258,0.23187,0.003156,0,{1: 7},gini,16,0.5,300,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.66647,15.813919,15.662784,15.53237,17.158236,15.966756,0.60237,3,20.691403,21.095996,20.995539,21.21779,20.966196,20.993385,0.174828
170,4.675483,0.230407,0.22951,0.005103,0,{1: 7},gini,15,0.5,260,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.931879,16.186228,15.559569,15.163693,16.984958,15.965265,0.615925,4,20.098798,20.541178,20.431505,20.617835,20.482361,20.434335,0.178878
191,4.643491,0.092776,0.229122,0.005611,0,{1: 7},gini,17,0.5,280,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.740195,15.990858,15.570628,15.491815,16.999705,15.95864,0.547914,5,20.487724,20.972499,20.721817,20.989236,20.723818,20.779019,0.185902


# Results

In [4]:
# Load the accumulated cross-validation results from the CSV that the tuning
# cell appends to; the first column is used as the row index.
results = pd.read_csv('model tunning/RandomForest/RF_CVResults.csv', index_col = 0)

In [5]:
# Replace missing values with the string 'None', then show the table size.
results = results.fillna('None')
results.shape

(2466, 28)

In [6]:
# Best 25 configurations across all stored runs, by mean test score, after
# dropping rows that are exact duplicates.
(
    results
    .sort_values('mean_test_score', ascending=False)
    .drop_duplicates()
    .head(25)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
180,4.684155,0.209446,0.227016,0.009722,0.0,{1: 7},gini,16,0.5,260,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.913447,15.961368,15.574314,15.679841,17.125055,16.050805,0.555974,1,20.144879,20.460997,20.447172,20.69064,20.465772,20.441892,0.173813
184,4.574586,0.089586,0.230368,0.001308,0.0,{1: 7},gini,16,0.5,240,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.887644,15.994544,15.625922,15.598732,17.080814,16.037531,0.543088,1,19.926454,20.121839,20.193726,20.35426,20.146902,20.148636,0.137404
201,4.368174,0.133338,0.207231,0.038502,0.0,{1: 7},gini,18,0.5,160,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.92082,15.983486,15.611177,15.609792,16.970211,16.019097,0.499895,2,18.790091,18.958748,19.165192,19.097209,19.193055,19.040859,0.149281
62,6.46249,0.133688,0.188197,0.050379,0.0,{1: 6},gini,14,0.9,100,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 6}, 'crit...",15.946623,15.802861,15.765998,15.624539,16.94809,16.017622,0.476395,1,17.698887,17.975374,18.198407,18.063184,18.142441,18.015659,0.175327
63,6.747654,0.104193,0.208022,0.042358,0.0,{1: 6},gini,14,0.9,120,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 6}, 'crit...",15.758626,15.828664,15.968741,15.672467,16.859608,16.017621,0.432029,2,18.09795,18.402087,18.633414,18.492646,18.524901,18.4302,0.181848
151,4.296204,0.05592,0.229004,0.003235,0.0,{1: 7},gini,12,0.5,160,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.824978,16.068269,15.762312,15.510249,16.878042,16.00877,0.469514,3,18.550468,18.661985,18.715439,18.794928,18.829948,18.710554,0.099382
190,4.223843,0.191409,0.228057,0.002181,0.0,{1: 7},gini,17,0.5,140,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",15.865526,15.92082,15.777057,15.484442,16.970211,16.003611,0.506182,4,18.508073,18.657377,18.77258,18.827183,18.792163,18.711475,0.11656
55,6.858063,0.179867,0.164715,0.053457,0.0,{1: 6},gini,13,0.9,120,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 6}, 'crit...",15.869213,15.850781,15.839723,15.609792,16.800619,15.994026,0.414279,3,18.092421,18.401165,18.514525,18.480665,18.432742,18.384304,0.151043
171,4.378197,0.091072,0.147938,0.043282,0.0,{1: 7},gini,15,0.5,160,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 7}, 'crit...",16.001917,16.0867,15.566942,15.517623,16.789559,15.992548,0.458448,5,18.793777,18.942159,19.09054,19.052973,19.084307,18.992751,0.112891
105,6.892743,0.04815,0.20976,0.041396,0.0,{1: 6},gini,15,0.9,140,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 6}, 'crit...",15.810233,15.784429,15.732822,15.661407,16.970211,15.99182,0.49183,6,18.584568,18.793777,18.932943,19.002286,18.960814,18.854878,0.152214


In [None]:
# Hyperparameter columns of the results table; the plotting cell relies on
# this order when it attaches them to each point.
params_list = [
    'param_ccp_alpha',
    'param_class_weight',
    'param_criterion',
    'param_max_depth',
    'param_max_features',
    'param_max_leaf_nodes',
    'param_min_samples_leaf',
    'param_n_estimators',
]

# Keep only the hyperparameters plus the mean train/test scores for plotting.
results_plot = results[[*params_list, "mean_train_score", "mean_test_score"]]
results_plot.head()

Unnamed: 0,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,mean_train_score,mean_test_score
0,0.0,{1: 1},gini,5,,,1,50,7.191699,6.549357
1,0.0,{1: 1},gini,5,,,1,75,7.181932,6.591377
2,0.0,{1: 1},gini,5,,,1,100,7.148753,6.506598
3,0.0,{1: 1},gini,5,,10.0,1,50,6.559661,6.32447
4,0.0,{1: 1},gini,5,,10.0,1,75,6.431191,6.190274


In [None]:
# Columns attached to every point, in this order, so the hover box can show them.
hover_columns = params_list + ["mean_train_score", "mean_test_score"]

# Plotly placeholder for each column, derived from its position in
# hover_columns so the template stays correct if params_list changes.
hover_field = {column: f"%{{customdata[{position}]}}"
               for position, column in enumerate(hover_columns)}

# Label shown in the hover box for each hyperparameter, in display order.
hover_labels = {
    'param_ccp_alpha': 'Alpha',
    'param_class_weight': 'Class Weight',
    'param_n_estimators': 'Trees',
    'param_max_depth': 'Depth',
    'param_max_features': 'Max Features',
    'param_criterion': 'Criterion',
    'param_max_leaf_nodes': 'Max Leaf Nodes',
    'param_min_samples_leaf': 'Min Samples Leaf',
}

hover_template = (
    f"<b>Train Profit: {hover_field['mean_train_score']}</b> <br> "
    f"<b>Test Profit: {hover_field['mean_test_score']}</b> <br><br> "
    + " <br> ".join(f"{label}: {hover_field[column]}"
                    for column, label in hover_labels.items()
                    if column in hover_field)
)

# One scatter plot per hyperparameter: mean test score against its value,
# coloured by class weight.
for param in params_list:
  (px.scatter(data_frame=results_plot,
              x=param,
              y="mean_test_score",
              color="param_class_weight",
              custom_data=hover_columns,
              title=f"Mean test profit vs. {param}")
   .update_traces(hovertemplate=hover_template)
   .update_layout(template="plotly_white")
   .show())