In [None]:
# https://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser
# Notebook-wide CSS tweaks: full-width container, full-width outputs, hidden prompts.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))
display(HTML("<style>.prompt { display:none !important; }</style>"))

# Trabajo Práctico 2: Entrenamiento y evaluación de modelos
---

## Fecha y hora de entrega máxima:
09/05/2022 18:00

## Dataset "Datos de clientes del banco"
Los datos están relacionados con campañas de marketing directo (llamadas telefónicas) de una institución bancaria portuguesa. El objetivo de la clasificación es predecir si el cliente suscribirá un depósito a plazo.

<img src="https://storage.googleapis.com/kaggle-datasets-images/864595/1473402/1f559c7d6d646d0a5f24c1847fb10225/dataset-cover.jpg?t=2020-09-08-19-15-14"></img>

In [None]:
# Import dependencies
import numpy as np
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import pandas as pd
import sklearn_pandas
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, QuantileTransformer
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from collections import defaultdict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Ignore all warnings
import warnings
warnings.filterwarnings('ignore')

## **IMPORTANDO DATASET**

In [None]:
# Tokens that the CSV uses for "no value"; read_csv turns every one of them into NaN.
missing_value_formats = ["unknown", "n.a.", "?", "NA", "n/a", "na", "--"]
# Every NaN is then replaced by the explicit string 'missing' (returns a new frame,
# no in-place mutation, so re-running the cell is safe).
dataset_original = pd.read_csv("BankCustomerData.csv", na_values=missing_value_formats).fillna('missing')
# Independent copies: plain assignment would only alias `dataset_original`, so any
# later column assignment on one name would silently modify all three.
ds = dataset_original.copy()
ds_feature_eng = dataset_original.copy()
ds

## **FEATURE ENGINEERING**

In [None]:
# Still need to finish this

Usamos 5 cuantiles para la edad...

### **Train, Validation and Test**

In [None]:
# Encode the target as integers (no -> 0, yes -> 1). The explicit astype(int) avoids
# relying on the implicit downcast of `replace`, which pandas 2.2 deprecates, and
# fails loudly if any other label is present. Re-running the cell is a no-op.
ds["term_deposit"] = ds["term_deposit"].replace(['no', 'yes'], [0, 1]).astype(int)
ds_feature_eng["term_deposit"] = ds_feature_eng["term_deposit"].replace(['no', 'yes'], [0, 1]).astype(int)
# Split the dataset into train (60%), validation (20%) and test (20%):
# first hold out 40%, then cut that hold-out in half.
train, not_train = train_test_split(ds, test_size=0.4, random_state=42)
validation, test = train_test_split(not_train, test_size=0.5, random_state=42)

## **MÉTRICA A UTILIZAR**
La métrica que utilizaremos es Accuracy, debido a que permite medir el porcentaje de casos acertados en la predicción.

## **MAPPING DE VARIABLES**

In [None]:
# Column-wise preprocessing shared by every pipeline below:
# numeric columns are standardised, categorical columns are one-hot encoded.
# handle_unknown='ignore' makes a category that was not seen while fitting encode
# as all zeros instead of raising when validation/test data is transformed.
mapper = DataFrameMapper([
    (['age'], [StandardScaler()]),
    (['loan'], [OneHotEncoder(handle_unknown='ignore')]),
    (['housing'], [OneHotEncoder(handle_unknown='ignore')]),
    (['job'], [OneHotEncoder(handle_unknown='ignore')]),
    (['education'], [OneHotEncoder(handle_unknown='ignore')]),
    (['balance'], [StandardScaler()])
])
# Fit on the training split only and show the resulting feature matrix.
mapper.fit_transform(train)

## ENTRENAMIENTO DE MODELOS
Elegimos los siguientes 6 modelos para entrenar:
- Logistic Regression
- Árbol de Decisión
- Random Forest
- Gradient Boosting Classifier
- KNN
- Neural Networks MLP

In [None]:
def evaluate_model(model, set_names=('train', 'validation'), title='', is_feature_engineering=False, show_confusion_matrix=True):
    """Display classification metrics (and optionally confusion matrices) for a fitted model.

    Each entry of ``set_names`` is looked up as a global variable holding a
    DataFrame with a ``term_deposit`` column; the whole frame is passed to
    ``model.predict``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and ideally ``predict_proba``).
    set_names : sequence of str
        Names of the global DataFrames to evaluate, one table row / plot each.
    title : str
        Optional heading displayed before the results.
    is_feature_engineering : bool
        When True, every set name is asserted to be one of
        'train', 'validation' or 'test'. It has no other effect.
    show_confusion_matrix : bool
        When True, draw one confusion-matrix heatmap per set.

    Raises
    ------
    KeyError
        If a name in ``set_names`` is not a global variable.
    """
    if title:
        display(title)
    metrics_to_show = defaultdict(list)
    if show_confusion_matrix:
        # squeeze=False always yields a 2-D array of axes, so a single set name
        # can be indexed the same way as several.
        _, axes = plt.subplots(1, len(set_names), sharey=True, figsize=(15, 3), squeeze=False)
    for i, set_name in enumerate(set_names):
        if is_feature_engineering:
            assert set_name in ['train', 'validation', 'test']
        set_data = globals()[set_name]
        y = set_data.term_deposit
        y_pred = model.predict(set_data)
        # ROC AUC ranks examples by score; feeding it hard 0/1 predictions collapses
        # the curve to a single threshold. Use the positive-class probability when
        # the model provides one, and fall back to the labels otherwise.
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(set_data)[:, 1]
        else:
            y_score = y_pred
        metrics_to_show['Accuracy'].append(accuracy_score(y, y_pred))
        metrics_to_show['Precision'].append(precision_score(y, y_pred))
        metrics_to_show['Recall'].append(recall_score(y, y_pred))
        metrics_to_show['F1'].append(f1_score(y, y_pred))
        metrics_to_show['Roc Auc'].append(roc_auc_score(y, y_score))
        if show_confusion_matrix:
            ax = axes[0, i]
            sns.heatmap(confusion_matrix(y, y_pred), ax=ax, cmap='Blues', annot=True, fmt='.0f', cbar=False)

            ax.set_title(set_name)
            ax.xaxis.set_ticklabels(['No se suscribe', 'Se suscribe'])
            ax.yaxis.set_ticklabels(['No se suscribe', 'Se suscribe'])
            ax.set_xlabel('Clase Predecida')
            ax.set_ylabel('Clase Original')

    display(pd.DataFrame(metrics_to_show, index=list(set_names)))
    if show_confusion_matrix:
        plt.tight_layout()
        plt.show()

### **Logistic Regression**

In [None]:
# Logistic regression on top of the shared feature mapper, fitted on the train split.
model_logistic_regression = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', LogisticRegression(random_state=100)),
]).fit(train, train["term_deposit"])
evaluate_model(model_logistic_regression, title='Logistic Regression')

### **Árbol de Decisión**

In [None]:
# Single decision tree capped at depth 7, fed by the shared feature mapper.
model_tree_decision = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', DecisionTreeClassifier(random_state=100, max_depth=7)),
]).fit(train, train["term_deposit"])
evaluate_model(model_tree_decision, title='Arbol de Decisión')

### **Random Forest**

In [None]:
# Random forest with library-default hyperparameters (only the seed is fixed).
model_random_forest = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', RandomForestClassifier(random_state=100)),
]).fit(train, train["term_deposit"])
evaluate_model(model_random_forest, title='Random Forest')

Hay overfitting; para reducirlo modificamos los hiperparámetros, limitando la profundidad máxima de los árboles (`max_depth=15`).

In [None]:
# Random forest with explicit limits: 100 trees, depth at most 15,
# at most 15 candidate features per split.
model_random_forest_modified = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=15,
        max_features=15,
        random_state=100,
    )),
]).fit(train, train["term_deposit"])
evaluate_model(model_random_forest_modified, title='Random Forest with Depth = 15')

### **Gradient Boosting Classifier**

In [None]:
# Gradient boosting with default hyperparameters and a fixed seed.
model_gradient_boosting = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', GradientBoostingClassifier(random_state=42)),
]).fit(train, train["term_deposit"])
evaluate_model(model_gradient_boosting, title='Gradient Boosting Classifier')

### **KNN**

In [None]:
# First KNN configuration: a small neighbourhood (n_neighbors=3).
model_knn = Pipeline([
    ('mapper', mapper),
    ('classifier', KNeighborsClassifier(n_neighbors=3)),
])
model_knn.fit(train, train.term_deposit)
# The title is built from the estimator's own n_neighbors so the heading can never
# disagree with the model being evaluated (it previously said K = 6 for a K = 3 model).
evaluate_model(
    model_knn,
    title=f"K Nearest Neighbors with K = {model_knn.named_steps['classifier'].n_neighbors}",
)

In [None]:
# Second KNN configuration with 10 neighbours.
# This rebinds `model_knn`, so cells executed afterwards see this pipeline.
model_knn = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', KNeighborsClassifier(n_neighbors=10)),
]).fit(train, train["term_deposit"])
evaluate_model(model_knn, title='K Nearest Neighbors with K = 10')

## OVERFITTING

In [None]:
def generate_curve(selected_model="DecisionTree", list_to_iterate=list(range(1, 11))):
    """Plot train vs. validation accuracy while sweeping one hyperparameter.

    For every value in ``list_to_iterate`` a fresh pipeline (shared ``mapper`` +
    classifier) is fitted on the global ``train`` frame and scored on ``train``
    and ``validation``.

    Parameters
    ----------
    selected_model : {"DecisionTree", "RandomForest", "KNN"}
        Which classifier to sweep. The swept hyperparameter is ``max_depth`` for
        the two tree-based models and ``n_neighbors`` for KNN.
    list_to_iterate : sequence of int
        Hyperparameter values to try; also used as the x axis. It is only read,
        never modified.

    Raises
    ------
    KeyError
        If ``selected_model`` is not one of the supported names.
    """
    # One entry per supported model: a factory that turns the swept value into an
    # unfitted classifier, and the x-axis label naming that hyperparameter.
    sweeps = {
        "DecisionTree": (
            lambda value: DecisionTreeClassifier(max_depth=value, random_state=100),
            'Profundidad máxima (max_depth)',
        ),
        "RandomForest": (
            lambda value: RandomForestClassifier(n_estimators=100, max_depth=value, max_features=15, random_state=100),
            'Profundidad máxima (max_depth)',
        ),
        "KNN": (
            lambda value: KNeighborsClassifier(n_neighbors=value),
            'Cantidad de vecinos',
        ),
    }
    build_classifier, x_label = sweeps[selected_model]

    train_scores = []
    validation_scores = []
    for value in list_to_iterate:
        # Only the selected pipeline is built for each value.
        model = Pipeline([
            ('mapper', mapper),
            ('classifier', build_classifier(value)),
        ])
        model.fit(train, train.term_deposit)
        train_scores.append(model.score(train, train.term_deposit))
        validation_scores.append(model.score(validation, validation.term_deposit))

    plt.plot(list_to_iterate, train_scores, color='r', label='Train')
    plt.plot(list_to_iterate, validation_scores, color='b', label='Validation')
    plt.title('Grafico ' + selected_model)
    plt.legend()
    # `score` on a classifier pipeline is mean accuracy, so label the axis as such.
    plt.ylabel('Accuracy')
    plt.xlabel(x_label)
    plt.show()

### Curva para Árbol de decisión

In [None]:
# Sweep the decision tree over the default range of values.
generate_curve(selected_model="DecisionTree")

### Curva para Random Forest

In [None]:
# Sweep the random forest over the default range of values.
generate_curve(selected_model="RandomForest")

### Curva para KNN

In [None]:
# Sweep KNN over the default range of values.
generate_curve(selected_model="KNN")

Como podemos ver en el gráfico anterior, entre un K de 4 y 8 se encuentra el valor más eficiente para el algoritmo KNN. Luego de esto podemos ver cómo ambas líneas convergen hacia 0.91 aproximadamente; es decir, se estancan y el modelo deja de aprender.

## COMPARACIÓN FINAL ENTRE LOS DISTINTOS MODELOS
Comparación entre:
- Logistic Regression
- Árbol de Decisión
- Random Forest
- Gradient Boosting Classifier
- KNN
- Neural Networks MLP

In [None]:
# Final comparison: metrics table only, on train, validation and test.
# The KNN title is read from the pipeline itself because `model_knn` is assigned in
# more than one cell; a hard-coded "K = 6" did not match the model held at this point.
final_sets = ('train', 'validation', 'test')
final_models = [
    (model_logistic_regression, 'Logistic Regression'),
    (model_tree_decision, 'Arbol de Decisión'),
    (model_random_forest_modified, 'Random Forest with Depth = 15'),
    (model_gradient_boosting, 'Gradient Boosting Classifier'),
    (model_knn, f"K Nearest Neighbors with K = {model_knn.named_steps['classifier'].n_neighbors}"),
]
for final_model, final_title in final_models:
    evaluate_model(final_model, title=final_title, set_names=final_sets, is_feature_engineering=False, show_confusion_matrix=False)

Como podemos ver arriba...

### Still need to finish the rest of the conclusions!