# Montamos Unidad de Drive

In [1]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab Notebooks/TP_Ind4

/content/drive/MyDrive/Colab Notebooks/TP_Ind4


In [None]:
# NOTE(review): leftover cell. This relative path does not resolve from the
# project directory, so the command fails with Errno 2 and the working
# directory stays unchanged. Candidate for removal.
%cd drive/MyDrive/TP_Ind4

[Errno 2] No such file or directory: 'drive/MyDrive/TP_Ind4'
/content/drive/MyDrive/Colab Notebooks/TP_Ind4


In [None]:
# List the contents of the current working directory (sanity check of the project folder).
%ls

 AnalisisExploratorio.ipynb
[0m[01;34m'best model'[0m/
 [01;34mdata[0m/
 Docs.gdoc
[01;34m'model tunning'[0m/
'TP 1 - Exploración, visualización de datos y Machine Learning.pdf'


# Paquetes

In [17]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from joblib import dump, load

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [38]:
# Load the training set; the first CSV column is used as the row index.
train = pd.read_csv("data/Train_full.csv", index_col=0)

In [21]:
# (rows, columns) of the training set as loaded.
train.shape

(33908, 41)

In [39]:
# Keep only the rows where Poutcome_unknown == 0.
train = train.loc[train["Poutcome_unknown"] == 0]

# Columns handled by this notebook's model, plus the target (Subscription).
P_vars = [
    "Pdays",
    "Previous",
    "Poutcome_failure",
    "Poutcome_other",
    "Poutcome_success",
    "Poutcome_unknown",
    "Subscription",
]

# Everything outside P_vars goes to train_base.
train_base = train.drop(columns=P_vars)
# Poutcome_unknown is constant (0) after the filter above, so it is dropped here.
train = train[P_vars].drop(columns="Poutcome_unknown")

In [40]:
# (rows, columns) of the training frame at this point of the notebook.
train.shape

(6202, 6)

In [41]:
# Split the frame into the response (Subscription) and the explanatory variables.
y_train = train['Subscription']
X_train = train.drop(columns=['Subscription'])

## Optimization Function

Generamos la función de profit para optimizar los hiperparámetros.

In [42]:
from sklearn.metrics import confusion_matrix, make_scorer

# plot_confusion_matrix was removed in scikit-learn 1.2. Import it only when it
# exists so this cell (and make_scorer with it) keeps working on newer versions.
try:
  from sklearn.metrics import plot_confusion_matrix
except ImportError:
  plot_confusion_matrix = None

def profit_mean(y_true, y_pred):
  """Return the average profit per observation for binary 0/1 predictions.

  Each true positive adds 250, each false positive subtracts 50 and each
  false negative subtracts 25; true negatives contribute nothing. The total
  is divided by the number of observations.

  The counts are taken directly against the labels 0 and 1 instead of
  indexing a confusion matrix inferred from the data. A fold in which only
  one class appears would otherwise produce a 1x1 matrix and fail on
  ``mat[1, 1]``.

  Parameters
  ----------
  y_true : array-like
      Observed labels, encoded as 0 (negative) / 1 (positive).
  y_pred : array-like
      Predicted labels with the same encoding and length as ``y_true``.

  Returns
  -------
  float
      Mean profit per observation.

  Raises
  ------
  ValueError
      If the inputs are empty or have different lengths.
  """
  actual = np.asarray(y_true).ravel()
  predicted = np.asarray(y_pred).ravel()

  if actual.shape != predicted.shape:
    raise ValueError("y_true and y_pred must have the same length")
  n = actual.size
  if n == 0:
    raise ValueError("profit_mean requires at least one observation")

  true_pos = np.count_nonzero((actual == 1) & (predicted == 1))
  false_pos = np.count_nonzero((actual == 0) & (predicted == 1))
  false_neg = np.count_nonzero((actual == 1) & (predicted == 0))

  prof = 250 * true_pos - 50 * false_pos - 25 * false_neg
  return prof / n

In [43]:
# Wrap profit_mean as a scikit-learn scorer; a larger mean profit is better.
profit_mean_score = make_scorer(profit_mean, greater_is_better=True)

# Modelo Base

In [44]:
# Load the stored base model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts from a trusted source.
rf_base = load('best model/rf_base.joblib')

In [51]:
# Base-model probability for the second class in rf_base.classes_ (presumably
# Subscription == 1; confirm), indexed like train_base so it aligns with X_train.
train_base_pred = pd.Series(
    rf_base.predict_proba(train_base)[:, 1],
    index=train_base.index,
    name="BasePrediction",
)

# assign() overwrites an existing BasePrediction column, so re-running this cell
# does not append a duplicate column the way pd.concat(..., axis=1) would.
X_train = X_train.assign(BasePrediction=train_base_pred)

In [52]:
# Preview the first rows of the feature matrix, including BasePrediction.
X_train.head(10)

Unnamed: 0,Pdays,Previous,Poutcome_failure,Poutcome_other,Poutcome_success,BasePrediction
12868,200,4,1,0,0,0.031113
9890,185,1,0,0,1,0.86269
4156,272,2,1,0,0,0.473695
18063,79,3,0,0,1,0.062203
29288,119,1,1,0,0,0.413813
3216,120,1,0,1,0,0.909972
55,294,2,0,0,1,0.040382
24093,245,1,0,1,0,0.088923
15597,137,20,1,0,0,0.880793
20965,329,1,1,0,0,0.088328


In [37]:
# Mean profit and confusion matrix of the base model on this subset of rows.
# The labels are predicted once and reused for both metrics.
base_pred_labels = rf_base.predict(train_base)
print(profit_mean(train["Subscription"], base_pred_labels))
print(confusion_matrix(train["Subscription"], base_pred_labels))

36.367300870686876
[[3837  944]
 [ 300 1121]]


## A tener en cuenta
Cuando tuneamos un modelo:
* **Siempre** incluir todos sus **hiperparámetros en el grid**, aunque sea con el valor default. Esto hace que el dataframe que generamos con los resultados tenga toda la información y, si después queremos tunear un hiperparámetro que no habíamos tuneado, **no se agrega una nueva columna**.
* **Siempre** incluir el *train score* para ver si tenemos overfitting, comparando con el resultado de test.

In [71]:
# Hyperparameter grid for the search. Single-valued entries are listed on
# purpose so every hyperparameter shows up as a column in the results table.
rf_grid = dict(
    n_estimators=[75],
    criterion=['gini'],
    max_depth=[4, 5, 7, 8],
    max_features=[0.5, None],            # left out of this run: 0.9
    ccp_alpha=[0],                       # left out of this run: 0.001, 0.01, 0.1
    max_leaf_nodes=[10, 30, 50, None],   # left out of this run: 20, 40
    min_samples_leaf=[1],                # left out of this run: 200, 400
    # Weight applied to class 1, from 5 to 10 inclusive.
    class_weight=[{1: w} for w in range(5, 11)],
)

In [72]:
# Fixed (non-tuned) settings of the estimator: n_jobs=-1 and a fixed random_state.
rf = RandomForestClassifier(n_jobs=-1, random_state=45)

In [73]:
# 5-fold grid search over rf_grid, scored with the profit scorer. Train scores
# are kept so over-fitting can be checked against the test scores.
time_start = time.time()

rf_cv = GridSearchCV(
    rf,
    rf_grid,
    cv=5,
    scoring=profit_mean_score,
    verbose=4,
    n_jobs=-1,
    return_train_score=True,
    refit=True,
)
rf_cv.fit(X_train, y_train.values.ravel())

# Build and show a DataFrame with the grid-search results.
rf_cv_results = pd.DataFrame(rf_cv.cv_results_)
print(rf_cv_results.head(10))

# Append this run to the cumulative results file. The header row is written
# only when the file is new or empty; otherwise the first run would create a
# header-less CSV and a later pd.read_csv would consume a data row as header.
results_path = Path('model tunning/RandomForest/RF_Prev_CVResults.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
write_header = (not results_path.exists()) or results_path.stat().st_size == 0
rf_cv_results.to_csv(results_path, mode='a', header=write_header)

# Report the total wall-clock duration of the cross-validation.
time_finish = time.time()
print(f'Duración {round(time_finish - time_start, 5)} segundos')
print(f'Duración {round((time_finish - time_start)/60, 5)} minutos')

Fitting 5 folds for each of 192 candidates, totalling 960 fits
   mean_fit_time  std_fit_time  ...  mean_train_score  std_train_score
0       0.463408      0.002040  ...         42.806767         0.193297
1       0.464162      0.003282  ...         42.931725         0.218707
2       0.449371      0.011319  ...         42.931725         0.218707
3       0.441924      0.034464  ...         42.927693         0.244929
4       0.578775      0.046633  ...         42.604194         0.115519
5       0.539218      0.040348  ...         42.742256         0.090338
6       0.553453      0.017613  ...         42.742256         0.090338
7       0.545009      0.043117  ...         42.742256         0.090338
8       0.421100      0.039395  ...         42.809780         0.118039
9       0.475084      0.039966  ...         43.434582         0.138129

[10 rows x 28 columns]
Duración 370.3787 segundos
Duración 6.17298 minutos


In [74]:
# Best candidates of this run, ordered by their rank on the mean test score.
rf_cv_results.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
125,0.752589,0.009037,0.105151,0.001369,0,{1: 8},gini,8,,30,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 8}, 'crit...",43.432716,41.659952,42.056452,41.350806,43.58871,42.417727,0.921354,1,44.678492,44.628099,44.775292,45.405079,44.442765,44.785945,0.327925
12,0.537927,0.039906,0.111053,0.007142,0,{1: 5},gini,5,,10,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 5}, 'crit...",42.908944,41.760677,41.995968,41.572581,43.810484,42.409731,0.837211,2,43.030639,43.091111,43.384724,43.369609,42.780129,43.131242,0.226311
16,0.363105,0.010944,0.10642,0.001296,0,{1: 5},gini,7,0.5,10,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 5}, 'crit...",43.09025,41.639807,42.016129,42.016129,43.28629,42.409721,0.653313,3,42.970167,42.728281,42.951431,43.077388,42.588674,42.863188,0.178141
24,0.443858,0.047174,0.106483,0.002099,0,{1: 5},gini,8,0.5,10,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 5}, 'crit...",43.09025,41.639807,41.794355,42.056452,43.28629,42.373431,0.681344,4,42.960089,42.728281,42.951431,43.132809,42.598751,42.874272,0.188353
153,0.519158,0.045082,0.105659,0.001318,0,{1: 9},gini,8,0.5,30,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 9}, 'crit...",42.767929,42.062853,42.076613,41.391129,43.568548,42.373414,0.73938,5,44.073775,44.250151,44.367191,44.377267,44.200927,44.253862,0.112534


# Results

In [66]:
# Load the stored cross-validation results; the first CSV column is the row index.
results = pd.read_csv('model tunning/RandomForest/RF_Prev_CVResults.csv', index_col = 0)

In [67]:
# Replace missing values with the string 'None' (presumably the parameters
# whose grid value was None), then show the resulting dimensions.
results = results.fillna('None')
results.shape

(300, 28)

In [68]:
# Top 25 distinct rows by mean test score.
# NOTE(review): rank_test_score looks like a per-run rank (the file is appended
# run after run), so mean_test_score is the column to compare on. Confirm.
results.sort_values('mean_test_score', ascending = False).drop_duplicates().head(25)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
237,0.613556,0.04905,0.116755,0.004888,0,{1: 8},gini,9,,30.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 8}, 'crit...",43.00967,41.740532,42.137097,41.391129,43.58871,42.373427,0.812777,1,44.844789,44.789357,44.820637,45.319428,44.493148,44.853472,0.26541
112,0.355437,0.004675,0.107242,0.002416,0,{1: 5},gini,9,0.5,10.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 5}, 'crit...",43.00967,41.639807,41.794355,42.056452,43.28629,42.357315,0.664955,2,42.95001,42.728281,42.951431,43.132809,42.598751,42.872256,0.187476
225,0.531491,0.036818,0.104609,6.1e-05,0,{1: 8},gini,6,0.5,30.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 8}, 'crit...",43.392425,41.700242,42.318548,40.967742,43.366935,42.349179,0.943887,3,43.554727,43.695827,44.019549,44.064893,43.772672,43.821534,0.193809
116,0.53279,0.037108,0.109431,0.004319,0,{1: 5},gini,9,,10.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 5}, 'crit...",42.687349,41.760677,41.814516,41.653226,43.810484,42.34525,0.820517,4,43.081032,43.126386,43.414954,43.339379,42.845627,43.161476,0.201675
108,0.575517,0.01992,0.10838,0.004982,0,{1: 5},gini,6,,10.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 5}, 'crit...",42.687349,41.760677,41.814516,41.612903,43.810484,42.337186,0.827448,5,43.081032,43.126386,43.414954,43.319226,42.845627,43.157445,0.198251
104,0.46201,0.004262,0.104512,0.000292,0,{1: 5},gini,6,0.5,10.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 5}, 'crit...",42.96938,41.639807,41.794355,41.975806,43.28629,42.333128,0.665129,6,42.929853,42.748438,42.951431,43.052197,42.618904,42.860164,0.155404
63,0.501718,0.052088,0.10646,0.001661,0,{1: 4},gini,3,,,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 4}, 'crit...",43.17083,41.559226,41.653226,42.217742,42.842742,42.288753,0.63657,7,42.108446,42.51159,42.538291,42.492946,42.256147,42.381484,0.169823
62,0.556701,0.012114,0.104659,0.000412,0,{1: 4},gini,3,,50.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 4}, 'crit...",43.17083,41.559226,41.653226,42.217742,42.842742,42.288753,0.63657,7,42.108446,42.51159,42.538291,42.492946,42.256147,42.381484,0.169823
61,0.545644,0.039821,0.105906,0.001633,0,{1: 4},gini,3,,30.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 4}, 'crit...",43.17083,41.559226,41.653226,42.217742,42.842742,42.288753,0.63657,7,42.108446,42.51159,42.538291,42.492946,42.256147,42.381484,0.169823
60,0.492463,0.042058,0.106201,0.001734,0,{1: 4},gini,3,,10.0,1,75,"{'ccp_alpha': 0, 'class_weight': {1: 4}, 'crit...",43.17083,41.559226,41.653226,42.217742,42.842742,42.288753,0.63657,7,42.108446,42.51159,42.538291,42.492946,42.256147,42.381484,0.169823


In [69]:
# Hyperparameter columns of the results table, in the order the hover template indexes them.
params_list = [
    'param_ccp_alpha',
    'param_class_weight',
    'param_criterion',
    'param_max_depth',
    'param_max_features',
    'param_max_leaf_nodes',
    'param_min_samples_leaf',
    'param_n_estimators',
]

# Keep only the hyperparameters and the mean train/test scores for plotting.
results_plot = results.loc[:, params_list + ["mean_train_score", "mean_test_score"]]
results_plot.head()

Unnamed: 0,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_n_estimators,mean_train_score,mean_test_score
0,0,{1: 5},gini,3,0.5,50,1,75,42.707988,42.24847
1,0,{1: 5},gini,3,0.5,100,1,75,42.707988,42.24847
2,0,{1: 5},gini,3,0.5,150,1,75,42.707988,42.24847
3,0,{1: 5},gini,3,0.5,200,1,75,42.707988,42.24847
4,0,{1: 5},gini,6,0.5,50,1,75,44.446334,41.768956


In [70]:
# Columns exposed to the hover box; the template below refers to them by position
# (0-7 are the hyperparameters in params_list, 8 and 9 the mean train/test scores).
hover_columns = params_list + ["mean_train_score", "mean_test_score"]
hover_template = (
    '<b>Train Profit: %{customdata[8]}</b> <br> '
    '<b>Test Profit: %{customdata[9]}</b> <br><br> '
    'Alpha: %{customdata[0]} <br> '
    'Class Weight: %{customdata[1]} <br> '
    'Trees: %{customdata[7]} <br> '
    'Depth: %{customdata[3]}, <br> '
    'Max Features: %{customdata[4]} <br> '
    'Criterion: %{customdata[2]} <br> '
    'Max Leaf Node. %{customdata[5]} <br> '
    'Min Sample Leaf: %{customdata[6]}'
)

# One scatter per hyperparameter: mean test score against that hyperparameter's
# values, coloured by class weight.
for param in params_list:
  fig = px.scatter(
      data_frame=results_plot,
      x=param,
      y="mean_test_score",
      color="param_class_weight",
      custom_data=hover_columns,
  )
  fig.update_traces(hovertemplate=hover_template)
  fig.update_layout(template="plotly_white")
  fig.show()