In [None]:
# Widen the notebook to the full browser width and hide the cell prompts.
# Source: https://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

for css_rule in (
    ".container { width:100% !important; }",
    ".output_result { max-width:100% !important; }",
    ".prompt { display:none !important; }",
):
    # One <style> element per rule, exactly as the page expects them.
    display(HTML("<style>" + css_rule + "</style>"))

# Trabajo Práctico 2: Entrenamiento y evaluación de modelos
---

## Fecha y hora de entrega máxima:
09/05/2022 18:00

## Dataset "Datos de clientes del banco"
Los datos están relacionados con campañas de marketing directo (llamadas telefónicas) de una institución bancaria portuguesa. El objetivo de la clasificación es predecir si el cliente suscribirá un depósito a plazo.

<img src="https://storage.googleapis.com/kaggle-datasets-images/864595/1473402/1f559c7d6d646d0a5f24c1847fb10225/dataset-cover.jpg?t=2020-09-08-19-15-14"></img>

In [None]:
# Import dependencies
import numpy as np
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import pandas as pd
import sklearn_pandas
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from collections import defaultdict


## **IMPORTANDO DATASET**

In [None]:
# Load the raw CSV. In this file some values are stored as the literal
# string "unknown".
dataset_original = pd.read_csv("BankCustomerData.csv")
# Work on an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so later edits to `ds` would also change
# `dataset_original`.
ds = dataset_original.copy()
# Preview the first five rows of the DataFrame.
ds.head()

In [None]:
# Read the CSV again, this time listing every placeholder string that should
# be parsed as NaN instead of being kept as ordinary text.
missing_value_formats = [
    "unknown",
    "n.a.",
    "?",
    "NA",
    "n/a",
    "na",
    "--",
]
ds = pd.read_csv(
    "BankCustomerData.csv",
    na_values=missing_value_formats,
)
# Rows containing NaN are not dropped here (a dropna()/reset_index() step was
# tried and left disabled).
# Display the first and last rows of the file.
ds

### **Train, Validation and Test**

In [None]:
# Encode the three yes/no columns as integers: 'no' -> 0, 'yes' -> 1.
for binary_column in ("term_deposit", "housing", "loan"):
    ds[binary_column] = ds[binary_column].replace(['no', 'yes'], [0, 1])

# Split the dataset into train (60%), validation (20%) and test (20%):
# first hold out 40% of the rows, then cut that hold-out in half.
train, not_train = train_test_split(
    ds,
    test_size=0.4,
    random_state=42,
)
validation, test = train_test_split(
    not_train,
    test_size=0.5,
    random_state=42,
)


## **MÉTRICA A UTILIZAR**
La métrica que utilizaremos es Accuracy, debido a que permite medir el porcentaje de casos acertados en la predicción.

## **FEATURE ENGINEERING**

In [None]:
# TODO: feature engineering is still pending for this section.

## **MAPPING DE VARIABLES**

In [None]:
# Column mapper used as the first step of every model pipeline:
#   - age, balance     -> scaled with MinMaxScaler
#   - loan, housing    -> passed through unchanged (already encoded as 0/1)
#   - job, education   -> one-hot encoded
# The target column `term_deposit` is intentionally NOT listed here: feeding it
# to the model as an input feature would leak the label the model is supposed
# to predict and make every evaluation metric meaningless.
mapper = DataFrameMapper([
    (['age'], [MinMaxScaler()]),
    (['loan'], None),
    (['housing'], None),
    # handle_unknown='ignore': a category that appears only in validation/test
    # is encoded as all zeros instead of raising at predict time.
    (['job'], [OneHotEncoder(handle_unknown='ignore')]),
    (['education'], [OneHotEncoder(handle_unknown='ignore')]),
    (['balance'], [MinMaxScaler()]),
])
# Fit on the training split only, then show the transformed training matrix.
mapper.fit(train)
mapper.transform(train)

## ENTRENAMIENTO DE MODELOS
Elegimos los siguientes 3 modelos para entrenar:
- Logistic Regression
- Neural Networks MLP
- KNN

In [None]:
def evaluate_model(model, set_names=('train', 'validation'), title='', show_cm=True):
    """Display accuracy (and optionally confusion matrices) for a fitted model.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must expose ``predict`` and accept the full data frame of each set.
    set_names : str or sequence of str
        Names of the module-level data frames to evaluate; each must be one of
        'train', 'validation' or 'test'. A single name may be given as a
        plain string.
    title : str
        Optional heading displayed before the results.
    show_cm : bool
        When true, draw one confusion-matrix heatmap per evaluated set.

    Returns
    -------
    None. The metrics table and the figure are displayed as side effects.

    Raises
    ------
    AssertionError
        If a name in ``set_names`` is not one of the allowed set names.
    """
    # Accept a bare string so that evaluate_model(m, 'test') works as expected
    # instead of iterating over the characters of the name.
    if isinstance(set_names, str):
        set_names = [set_names]
    set_names = list(set_names)

    # Validate every name up front, before any figure is created.
    for set_name in set_names:
        assert set_name in ['train', 'validation', 'test'], \
            f"unknown data set name: {set_name!r}"

    if title:
        display(title)

    # The target is encoded as 0 = 'no', 1 = 'yes'.
    class_values = [0, 1]
    class_labels = ['no', 'yes']

    metrics_to_show = defaultdict(list)
    draw_cm = show_cm and len(set_names) > 0
    if draw_cm:
        # squeeze=False always returns a 2-D array of axes, so indexing also
        # works when a single set is requested.
        fig, axes = plt.subplots(
            1, len(set_names), sharey=True, figsize=(15, 3), squeeze=False
        )

    for i, set_name in enumerate(set_names):
        # The data sets live as module-level variables named after the set.
        set_data = globals()[set_name]
        y = set_data.term_deposit
        y_pred = model.predict(set_data)
        metrics_to_show['Accuracy'].append(accuracy_score(y, y_pred))
        # Precision, recall and ROC-AUC can be appended here the same way.

        if draw_cm:
            ax = axes[0][i]
            # Pin the label order so the matrix is always 2x2 and its
            # rows/columns line up with the tick labels below.
            cm = confusion_matrix(y, y_pred, labels=class_values)
            sns.heatmap(cm, ax=ax, cmap='Blues', annot=True, fmt='.0f', cbar=False)

            ax.set_title(set_name)
            ax.xaxis.set_ticklabels(class_labels)
            ax.yaxis.set_ticklabels(class_labels)
            ax.set_xlabel('Predicted class')
            ax.set_ylabel('True class')

    display(pd.DataFrame(metrics_to_show, index=set_names))
    if draw_cm:
        plt.tight_layout()
        plt.show()

### **Logistic Regression**

In [None]:
# Logistic-regression pipeline: map/encode the columns, impute the values that
# are still missing, then classify.
model_logistic_regression = Pipeline(steps=[
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', LogisticRegression(random_state=42)),
])

# Train on the training split; the target is the `term_deposit` column.
model_logistic_regression.fit(X=train, y=train['term_deposit'])

# Report metrics on the default sets of evaluate_model.
evaluate_model(model=model_logistic_regression)

# ACA EMPIEZA LO VIEJO

#### Logistic Regression

In [None]:
# Only a handful of columns are kept, to limit overfitting; the previous TP
# suggested these may be correlated with term_deposit.
ds_some_rows = ds[[
    'loan',
    'housing',
    'job',
    'education',
    'age',
    'balance',
    'term_deposit',
]]
# The last column is the target; everything before it is a feature.
n_features = ds_some_rows.shape[1] - 1
x = ds_some_rows.iloc[:, :n_features].values
y = ds_some_rows.iloc[:, n_features].values
ds_some_rows

In [None]:
# Inspect the feature array extracted from ds_some_rows (before encoding).
x

In [None]:
# One-hot encode the first four feature columns (loan, housing, job, education)
# and pass the remaining ones (age, balance) through unchanged.
# sparse_threshold=0 forces a dense result: ColumnTransformer may otherwise
# return a scipy sparse matrix, and np.array() on a sparse matrix produces a
# 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [0, 1, 2, 3])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Always start from ds_some_rows rather than from the current `x`/`y`, so that
# re-running this cell does not try to encode an already encoded matrix.
x = np.array(ct.fit_transform(ds_some_rows.iloc[:, :-1].values))
le = LabelEncoder()
y = le.fit_transform(ds_some_rows.iloc[:, -1].values)

In [None]:
# Inspect the feature matrix produced by the ColumnTransformer step.
x

In [None]:
# Inspect the label vector produced by the LabelEncoder step.
y

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit a logistic-regression classifier on the training split.
logistic_regression = LogisticRegression(random_state=0)
logistic_regression.fit(X=x_train, y=y_train)

In [None]:
# Predict the class of every row in the test split and print the result.
y_pred = logistic_regression.predict(X=x_test)
print(y_pred)

In [None]:
# Count correct and incorrect predictions with a confusion matrix.
# scikit-learn expects the true labels first and the predictions second, so
# rows are the actual classes and columns are the predicted classes. Passing
# them the other way round transposes the matrix and swaps the meaning of the
# off-diagonal cells.
confusion_matrix_result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
confusion_matrix_result

In [None]:
# Print each score computed on the test split, one per line.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
# TODO: find out why precision and recall are zero

### Model Evaluation using Confusion Matrix
A confusion matrix is a table used to evaluate, and to visualize, the performance of a classification model. Its key idea is that the numbers of correct and incorrect predictions are summed up class by class.

The matrix is 2×2 because this is a binary classification model with two classes, 0 and 1. Diagonal values represent correct predictions, while off-diagonal values are incorrect predictions.

**In the output, 7733 and 0 are correct predictions, and 1 and 794 are incorrect predictions.**

In [None]:
# Draw the confusion matrix as an annotated heatmap.
class_names = [0, 1]  # names of the classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_names)
# Heatmap of the matrix computed earlier, with the counts written in each cell.
cm_frame = pd.DataFrame(confusion_matrix_result)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
# Put the x-axis label above the plot.
ax.xaxis.set_label_position("top")
fig.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

### ROC Curve
The Receiver Operating Characteristic (ROC) curve is a plot of the true positive rate against the false positive rate. It shows the trade-off between sensitivity and specificity.

In [None]:
# Probability of the positive class (second column of predict_proba) for each
# test row.
y_pred_proba = logistic_regression.predict_proba(x_test)[:, 1]
# ROC points and the area under the curve; the thresholds are not needed.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f"data 1, auc={auc!s}")
plt.legend(loc='lower right')
plt.show()

**The AUC score for this case is 0.60.**

An AUC score of 1 represents a perfect classifier, and 0.5 represents a worthless classifier.