# Montamos Unidad de Drive

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/Colab Notebooks/TP_Ind4

/content/drive/MyDrive/Colab Notebooks/TP_Ind4


In [None]:
%cd drive/MyDrive/TP_Ind4

[Errno 2] No such file or directory: 'drive/MyDrive/TP_Ind4'
/content/drive/MyDrive/Colab Notebooks/TP_Ind4


In [4]:
%ls

 AnalisisExploratorio.ipynb
[0m[01;34m'best model'[0m/
 [01;34mdata[0m/
 Docs.gdoc
[01;34m'model tunning'[0m/
'TP 1 - Exploración, visualización de datos y Machine Learning.pdf'


# Datasets

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

In [6]:
# Importamos el set de entrenamiento.
train = pd.read_csv("data/Train_full.csv", index_col=0)

In [7]:
# Lista de variables previas.
P_vars = ["Pdays", "Previous", "Poutcome_failure", "Poutcome_other", "Poutcome_success", "Poutcome_unknown"]

In [8]:
# Eliminamos las variables previas para el entrenamiento del modelo base.
train_base = train.drop(P_vars, axis = 1)

# Separamos en variables explicativas y la respuesta.
X_train_base = train_base.drop("Subscription", axis = 1)
y_train = train_base["Subscription"]

# Base Model Training

In [15]:
rf_base = RandomForestClassifier(n_estimators=250,
                                 criterion = 'gini', 
                                 max_depth = 16,
                                 max_features = 0.5, 
                                 max_leaf_nodes = 260, 
                                 n_jobs = -1,
                                 random_state = 45,
                                 class_weight = {1:7},      
                                 ccp_alpha=0)

In [16]:
rf_base.fit(X_train_base, y_train)

RandomForestClassifier(ccp_alpha=0, class_weight={1: 7}, max_depth=16,
                       max_features=0.5, max_leaf_nodes=260, n_estimators=250,
                       n_jobs=-1, random_state=45)

In [17]:
dump(rf_base, 'best model/rf_base.joblib')

['best model/rf_base.joblib']

# Prev Model Training

In [33]:
train_prev = pd.concat([train[P_vars], y_train], axis=1, ignore_index=False)
train_prev = train_prev[train_prev["Poutcome_unknown"]==0].drop("Poutcome_unknown", axis = 1)

base_pred = pd.Series(rf_base.predict_proba(X_train_base)[:,1], index=train_base.index, name="BasePrediction")
train_prev = pd.merge(left=train_prev, 
                      right=base_pred,
                      how='left',
                      left_index=True,
                      right_index=True)


In [34]:
print(train_prev.shape)
train_prev.head()

(6202, 7)


Unnamed: 0,Pdays,Previous,Poutcome_failure,Poutcome_other,Poutcome_success,Subscription,BasePrediction
12868,200,4,1,0,0,0,0.030235
9890,185,1,0,0,1,1,0.879559
4156,272,2,1,0,0,0,0.349778
18063,79,3,0,0,1,0,0.080889
29288,119,1,1,0,0,1,0.58147


In [35]:
X_train_prev = train_prev.drop("Subscription", axis = 1)
y_train_prev = train_prev["Subscription"]

In [36]:
rf_prev = RandomForestClassifier(n_estimators=150,
                            criterion = 'gini', 
                            max_depth = 3,
                            max_features = 0.5, 
                            max_leaf_nodes = 50, 
                            n_jobs = -1,
                            random_state = 45,
                            class_weight = {1:5}, 
                            ccp_alpha=0)

In [37]:
rf_prev.fit(X_train_prev, y_train_prev)

RandomForestClassifier(ccp_alpha=0, class_weight={1: 5}, max_depth=3,
                       max_features=0.5, max_leaf_nodes=50, n_estimators=150,
                       n_jobs=-1, random_state=45)

In [39]:
dump(rf_prev, 'best model/rf_prev.joblib')

['best model/rf_prev.joblib']