# Montamos Unidad de Drive

In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab Notebooks/TP_Ind4

/content/drive/MyDrive/Colab Notebooks/TP_Ind4


In [None]:
%cd drive/MyDrive/TP_Ind4

/content/drive/.shortcut-targets-by-id/17FeAVfVLm-bhQ0cBKXS7bsPk0E6kXi_t/TP_Ind4


In [None]:
# List the contents of the current working directory.
%ls

 AnalisisExploratorio.ipynb
 [0m[01;34mdata[0m/
 Docs.gdoc
[01;34m'model tunning'[0m/
'TP 1 - Exploración, visualización de datos y Machine Learning.pdf'


# Paquetes

In [5]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training set; the first CSV column is used as the index.
train = pd.read_csv("data/Train.csv", index_col=0)

In [None]:
# Dimensions of the training set as (rows, columns).
train.shape

(33908, 35)

In [None]:
# Split the training set into the response column and the explanatory variables.
y_train = train['Subscription']
X_train = train.drop(columns='Subscription')

## Optimization Function

Generamos la función de profit para optimizar los hiperparámetros.

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, make_scorer

def profit_mean(y_true, y_pred):
  """Return the average profit per observation implied by a set of predictions.

  The confusion matrix of `y_true` against `y_pred` is read as
  [true class, predicted class]: every observation counted in cell [1, 1]
  adds 250, every one in cell [0, 1] subtracts 50 and every one in cell
  [1, 0] subtracts 25. The total is divided by the number of observations.

  Args:
    y_true: observed labels.
    y_pred: predicted labels, aligned with `y_true`.

  Returns:
    Total profit divided by `len(y_true)`.
  """
  cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
  hits, false_alarms, misses = cm[1,1], cm[0,1], cm[1,0]
  total_profit = 250 * hits - 50 * false_alarms - 25 * misses
  return total_profit / len(y_true)

In [None]:
# Wrap profit_mean as a scikit-learn scorer; larger values are treated as better.
profit_mean_score = make_scorer(profit_mean, greater_is_better=True)

## A tener en cuenta
Cuando tuneamos un modelo:
* **Siempre** incluir todos sus **hiperparámetros en el grid**, aunque sea con el valor default. Así el dataframe de resultados contiene toda la información y mantiene siempre las mismas columnas: si después queremos tunear un hiperparámetro que antes no habíamos tuneado, **no aparece una columna nueva** y los resultados se pueden seguir agregando al mismo CSV.
* **Siempre** incluir el *train score* para ver si tenemos overfitting, comparando con el resultado de test.

In [None]:
# Grid of hyper-parameter values explored by the search.
# Every hyper-parameter is listed, even when only one value is tried, so the
# results table keeps the same columns from one run to the next.
gbm_grid = {
    'n_estimators': [200],
    'criterion': ['friedman_mse'],
    'max_depth': [5, 10, 14, 16, 18, 20, None],
    'max_features': [None, 0.7],
    'ccp_alpha': [0, 0.01, 0.1],
    'max_leaf_nodes': [None, 10, 40],
    'min_samples_leaf': [1, 200, 400],
    # Early stopping is disabled (n_iter_no_change is None), so the model never
    # uses validation_fraction. It must still lie strictly between 0 and 1,
    # because recent scikit-learn versions reject 0 when validating parameters.
    'validation_fraction': [0.1],
    'n_iter_no_change': [None],
}

In [None]:
# Fixed hyper-parameters of the model being tuned.
# `loss` is left at its default (the log-loss, formerly spelled 'deviance'):
# the 'deviance' spelling was deprecated and later removed from scikit-learn,
# while the default selects the same loss on old and new versions alike.
gbm = GradientBoostingClassifier(random_state = 65,
                                 verbose=2)

In [None]:
# File where the cross-validation results of every run are accumulated.
results_path = Path('model tunning/GradientBoosting/GBM_CVResults.csv')

time_start = time.time()

# Exhaustive grid search: 5-fold cross-validation scored with the custom profit
# metric, keeping the train scores as well so over-fitting can be inspected.
gbm_cv = GridSearchCV(gbm, gbm_grid, cv = 5, scoring=profit_mean_score, verbose=4,
                      n_jobs=-1, return_train_score=True, refit=True)

gbm_cv.fit(X_train, y_train.values.ravel())

# Build and show a DataFrame with the grid-search results.
gbm_cv_results = pd.DataFrame(gbm_cv.cv_results_)
print(gbm_cv_results.head(10))

# Append the results of this run to the CSV. The header row is written only
# when the file is new (or empty); otherwise a first run would produce a
# header-less file and later reads would take the first data row as column names.
results_path.parent.mkdir(parents=True, exist_ok=True)
write_header = not results_path.exists() or results_path.stat().st_size == 0
gbm_cv_results.to_csv(results_path, mode='a', header=write_header)

# Report the total duration of the cross-validation (including the CSV write).
time_finish = time.time()
print(f'Duración {round(time_finish - time_start, 5)} segundos')
print(f'Duración {round((time_finish - time_start)/60, 5)} minutos')

Fitting 5 folds for each of 378 candidates, totalling 1890 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



      Iter       Train Loss   Remaining Time 
         1           0.6676           19.77s
         2           0.6309           18.98s
         3           0.6035           18.35s
         4           0.5818           18.99s
         5           0.5647           19.19s
         6           0.5498           19.11s
         7           0.5370           18.74s
         8           0.5264           18.45s
         9           0.5173           18.29s
        10           0.5087           18.16s
        11           0.5014           17.93s
        12           0.4955           17.80s
        13           0.4898           17.76s
        14           0.4844           17.63s
        15           0.4800           17.37s
        16           0.4760           17.12s
        17           0.4724           17.02s
        18           0.4685           16.84s
        19           0.4646           16.72s
        20           0.4620           16.59s
        21           0.4593           16.55s
        2

In [None]:
# Show the first five candidates of this run ordered by test-score rank (rank 1 first).
gbm_cv_results.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,param_n_iter_no_change,param_validation_fraction,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
22,11.83062,2.786615,0.045398,0.009547,0,friedman_mse,14.0,0.7,40,200,1000,15,0.1,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.127101,8.644205,8.017546,8.232562,9.150568,8.634396,0.458533,1,11.270552,11.582983,11.328615,11.93829,12.405537,11.705195,0.422013
106,22.51337,7.005014,0.052701,0.012058,0,friedman_mse,,,40,200,1000,15,0.1,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.407254,8.378797,8.094957,8.033476,8.980976,8.579092,0.532835,2,11.307417,12.266829,12.547925,12.386184,13.016552,12.304981,0.560096
19,11.092825,3.286507,0.039681,0.007723,0,friedman_mse,14.0,0.7,20,200,1000,15,0.1,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.101298,9.031259,7.958567,7.893379,8.892494,8.575399,0.534895,3,10.493622,10.094559,11.895414,10.859107,10.011243,10.670789,0.683169
46,14.568896,3.751795,0.052786,0.010199,0,friedman_mse,16.0,0.7,40,200,1000,15,0.1,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.16765,8.596284,7.737393,8.155139,9.187435,8.56878,0.56647,4,10.678869,12.255769,12.515668,12.544697,12.472813,12.093563,0.71459
34,16.482888,3.623722,0.042597,0.006155,0,friedman_mse,16.0,,40,200,1000,15,0.1,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.536272,8.109702,7.965939,8.424274,8.726589,8.552555,0.557616,5,11.152584,11.247512,12.053012,12.334574,11.905113,11.738559,0.461811


# Results

In [6]:
# Load the saved cross-validation results; the first CSV column is used as the index.
results = pd.read_csv('model tunning/GradientBoosting/GBM_CVResults.csv', index_col = 0)

In [7]:
# Replace every missing value with the literal string 'None'.
results = results.fillna('None')

In [8]:
# Up to 25 distinct result rows, ordered from highest to lowest mean test score.
results.sort_values('mean_test_score', ascending = False).drop_duplicates().head(25)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,param_n_iter_no_change,param_validation_fraction,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
124,34.779542,0.336656,0.097006,0.007554,0.0,friedman_mse,,0.7,40.0,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.609997,9.281923,8.279269,8.910928,9.062085,9.02884,0.442416,1,14.759825,14.678722,15.203126,14.983227,14.710436,14.867067,0.198966
106,33.286096,0.387797,0.094705,0.006075,0.0,friedman_mse,20.0,0.7,40.0,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.149219,9.348275,8.242406,8.804011,9.508185,9.010419,0.450251,2,14.434491,14.488867,15.007742,14.907657,14.604453,14.688642,0.228641
97,50.245693,0.746555,0.093396,0.001046,0.0,friedman_mse,20.0,,40.0,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",8.758478,9.263492,8.62946,8.715529,9.014157,8.876223,0.232199,3,14.205928,14.500848,14.63817,14.321525,14.104951,14.354284,0.193557
64,34.87459,0.087348,0.103638,0.000808,0.0,friedman_mse,16.0,0.7,,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.307726,9.086553,8.330876,8.453768,9.128447,8.861474,0.392131,4,15.000369,15.059353,15.727531,15.573967,14.545471,15.181338,0.425309
79,48.804668,0.736197,0.090664,0.002276,0.0,friedman_mse,18.0,,40.0,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.097611,9.407254,7.954881,8.52013,9.253797,8.846735,0.537568,5,14.239106,14.376429,14.372742,14.317838,14.049655,14.271154,0.121385
115,54.994102,0.843053,0.101678,0.0067,0.0,friedman_mse,,,40.0,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",8.651578,9.396196,8.371424,8.516443,9.220616,8.831251,0.403384,6,14.745079,14.713743,14.806827,14.786928,14.460685,14.702652,0.125241
52,29.462326,0.198298,0.080903,0.000843,0.0,friedman_mse,14.0,0.7,40.0,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.009142,8.832203,8.117075,8.940422,9.036278,8.787024,0.342306,7,13.785667,14.22989,14.113765,14.314152,13.729863,14.034668,0.235536
100,39.348423,0.18041,0.120865,0.005973,0.0,friedman_mse,20.0,0.7,,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",8.662636,9.112356,8.109702,8.512756,9.45657,8.770804,0.469527,8,15.93121,16.251935,16.66851,16.38681,15.71497,16.190687,0.335906
88,32.103833,0.353568,0.090627,0.004091,0.0,friedman_mse,18.0,0.7,40.0,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",8.666323,9.27455,8.190799,8.357912,9.33122,8.764161,0.46592,9,14.245558,14.479651,14.703605,14.622885,14.229366,14.456213,0.192545
82,37.382884,0.230606,0.110559,0.000861,0.0,friedman_mse,18.0,0.7,,200,200,,0.0,"{'ccp_alpha': 0, 'criterion': 'friedman_mse', ...",9.130787,9.053377,8.198172,8.357912,9.076832,8.763416,0.400301,10,15.595738,15.898953,16.309998,16.449478,15.399786,15.930791,0.402014


In [None]:
# Columns of the results table that hold the hyper-parameter values.
params_list = [
    'param_ccp_alpha',
    'param_criterion',
    'param_max_depth',
    'param_max_features',
    'param_max_leaf_nodes',
    'param_min_samples_leaf',
    'param_n_estimators',
    'param_n_iter_no_change',
    'param_validation_fraction',
]
print(len(params_list))

# Keep only the hyper-parameter columns plus the mean train/test scores.
results_plot = results[[*params_list, "mean_train_score", "mean_test_score"]]
results_plot.head()

9


Unnamed: 0,param_ccp_alpha,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,param_n_iter_no_change,param_validation_fraction,mean_train_score,mean_test_score
0,0.0,friedman_mse,2,,,1,1000,6,0.1,7.055671,6.490372
1,0.0,friedman_mse,2,,,200,1000,6,0.1,6.941578,6.654777
2,0.0,friedman_mse,2,,,400,1000,6,0.1,6.89421,6.227138
3,0.0,friedman_mse,2,,10.0,1,1000,6,0.1,7.055671,6.490372
4,0.0,friedman_mse,2,,10.0,200,1000,6,0.1,6.941578,6.654777


In [None]:
# Display the list of hyper-parameter column names.
params_list

['param_ccp_alpha',
 'param_criterion',
 'param_max_depth',
 'param_max_features',
 'param_max_leaf_nodes',
 'param_min_samples_leaf',
 'param_n_estimators',
 'param_n_iter_no_change',
 'param_validation_fraction']

In [None]:
# One scatter plot per hyper-parameter: its value on the x axis against the
# mean test score, coloured by max depth. The hover box reads its values from
# `custom_data` by position, so indices 0-8 are the hyper-parameter columns and
# 9-10 are the mean train / test scores appended after them.
for param in params_list:
    hover_columns = params_list + ["mean_train_score", "mean_test_score"]
    fig = px.scatter(
        data_frame=results_plot,
        x=param,
        y="mean_test_score",
        color="param_max_depth",
        custom_data=hover_columns,
    )
    fig.update_traces(hovertemplate='<b>Train Profit: %{customdata[9]}</b> <br> <b>Test Profit: %{customdata[10]}</b> <br><br> Alpha: %{customdata[0]} <br> Criterion: %{customdata[1]} <br> Depth: %{customdata[2]}, <br> Max Features: %{customdata[3]} <br> Max Leaf Node: %{customdata[4]} <br> Min Sample Leaf: %{customdata[5]},  <br> Trees: %{customdata[6]}')
    fig.update_layout(template="plotly_white")
    fig.show()