# Trabajo Práctico 2: Entrenamiento y evaluación de modelos #

## 1. Métrica ##

Las métricas que vamos a utilizar son **F-score** y, además, **Recall**.

### ¿Por qué F-score? ###
Esta métrica combina precision y recall (es la media armónica de ambas). Su valor está cerca de 1 cuando precision y recall son buenos; en cambio, el F-score comienza a caer si cualquiera de los dos cae.

### ¿Por qué Recall? ###
Recall nos permite medir que no se nos escapen clientes interesados en las tarjetas de crédito. Si se le ofrece la tarjeta a un cliente que NO está interesado, su rechazo no afecta al negocio. En cambio, si se escapa un cliente con interés, la empresa pierde la posibilidad de generar ingresos.

## Librerías ##


In [None]:
%matplotlib inline
import warnings
import numpy as np
import pandas as pd 
import matplotlib
import keras
import h5py
import PIL
import seaborn as sns
import sklearn
import pytz
import plotly.graph_objects as go

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split

# One Hot Encoder
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, QuantileTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn import datasets
import plotly.express as px 
# pandas display: print floats with no decimals, which also removes scientific notation.
pd.options.display.float_format = lambda value: '%.0f' % value
# NOTE(review): 0 is documented by pandas as "auto-detect the terminal width";
# confirm it shows the expected number of columns inside a notebook.
pd.set_option('display.max_columns', 0)

# NumPy display: never use scientific notation for small numbers.
np.set_printoptions(suppress=True)

# Fonts and text sizes used by every matplotlib figure.
# NOTE(review): the family is "serif" while Roboto is registered on the sans-serif
# list — confirm which font is actually intended.
plt.rcParams.update({
    'font.family': ['serif'],
    'font.sans-serif': ['Roboto'],
    'font.size': 9,
    'axes.labelsize': 11,
    'axes.titlesize': 13,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'legend.fontsize': 11,
})

# Default figure size, no grid, and all four spines visible.
plt.rcParams.update({
    'figure.figsize': (15.0, 4.0),
    'axes.grid': False,
    'axes.spines.left': True,
    'axes.spines.right': True,
    'axes.spines.top': True,
    'axes.spines.bottom': True,
})

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

## Train ##


In [None]:
# Original (English) column names mapped to the Spanish names used in the rest of the notebook.
BETTER_COLUMN_NAMES = {
    'ID': 'id',
    'Gender': 'sexo',
    'Age': 'edad',
    'Region_Code': 'codigo_region',
    'Occupation': 'ocupacion',
    'Channel_Code': 'codigo_canal',
    'Vintage': 'antiguedad',
    'Credit_Product': 'tiene_producto_credito_activo',
    'Avg_Account_Balance': 'saldo_promedio_cuenta',
    'Is_Active': 'es_activo',
    'Is_Lead': 'esta_interesado',
}

# Load the training data, rename its columns and index the rows by customer id.
data_TC = (
    pd.read_csv('train.csv')
    .rename(columns=BETTER_COLUMN_NAMES)
    .set_index('id')
)

# Work on an independent copy: a plain assignment would only alias `data_TC`, so the
# feature-engineering columns added later would leak into the raw frame.
DATA_MODIFICADA = data_TC.copy()



## 2. Técnica feature engineering ##

- **Técnica Quantile Transformation (edades)**: Utilizaremos esta técnica para discretizar la variable en cuantiles (`pd.qcut`), lo que lleva los datos a una distribución aproximadamente uniforme y genera robustez frente a los outliers.

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Ages whose distribution is inspected with several bin counts.
x = DATA_MODIFICADA.edad

fig = make_subplots(rows=3, cols=2)

# (row, column, number of bins) for each histogram in the 3x2 grid;
# position (1, 2) is intentionally left empty, as before.
histogram_layout = [
    (1, 1, 100),
    (2, 1, 5),
    (2, 2, 8),
    (3, 1, 10),
    (3, 2, 12),
]
for row, col, nbins in histogram_layout:
    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(go.Histogram(x=x, nbinsx=nbins), row=row, col=col)

fig.show()
# TODO: review

In [None]:
number_of_quantile = 10
# Ordinal quantile of every age (0 = lowest bin); `edges` receives the bin limits used by qcut.
DATA_MODIFICADA['Quantiles_edad'], edges = pd.qcut(
    x=DATA_MODIFICADA['edad'], q=number_of_quantile, labels=False, retbins=True
)
print('values of edges: ' + str(edges))

# Avoid NaNs: the intervals built below are open on the left, so an interval starting
# exactly at the minimum age would exclude the rows that hold that minimum.
# Moving the first break one unit down keeps those rows inside the first interval.
ed = edges.copy()
ed[0] = DATA_MODIFICADA.edad.min() - 1
print('\nvalues of ed: ' + str(ed))

# Transform the array as an IntervalIndex
# NOTE(review): the int64 subtype assumes the quantile edges are whole numbers — confirm.
Interval_Index = pd.IntervalIndex.from_breaks(ed, closed='right', dtype='interval[int64]')
print('\nvalues of Interval_Index: ' + str(Interval_Index))

# Create a column displaying the interval
DATA_MODIFICADA['quantile_interval'] = pd.cut(DATA_MODIFICADA['edad'], bins=Interval_Index)

# Lookup table interval -> quantile number. Intervals are sorted in ascending order so that
# position i matches qcut label i (this assumes every bin holds at least one row).
dict_inter_quantile = pd.Series(
    DATA_MODIFICADA['quantile_interval'].unique().sort_values(ascending=True),
    name='interval',
).reset_index()
dict_inter_quantile.columns = ['Quantiles_edad', 'quantile_interval']
dict_inter_quantile = dict_inter_quantile.set_index('quantile_interval')

DATA_MODIFICADA.sample(8)

In [None]:
# List every column currently present in the working frame, one name per line.
for column_name in DATA_MODIFICADA.columns:
    print(column_name)

In [None]:
# Compare how the target splits over the raw ages and over the age quantile labels.
# (The unused `df = px.data.tips()` sample-data load was removed.)
for column, title in (
    ("edad", 'Distribución sin quantiles'),
    ("Quantiles_edad", 'Distribución por quantiles'),
):
    fig = px.histogram(DATA_MODIFICADA, x=column, color="esta_interesado", marginal='box', title=title)
    fig.update_layout(bargap=0.2)
    fig.show()

- **Técnica Quantile Transformation (saldo promedio de cuenta)**: Utilizaremos esta técnica para discretizar la variable en cuantiles (`pd.qcut`), lo que lleva los datos a una distribución aproximadamente uniforme y genera robustez frente a los outliers.

In [None]:
number_of_quantile = 20
# Ordinal quantile of every average balance (0 = lowest bin); `edges` holds the bin limits.
DATA_MODIFICADA['Quantiles_saldos'], edges = pd.qcut(
    x=DATA_MODIFICADA['saldo_promedio_cuenta'], q=number_of_quantile, labels=False, retbins=True
)
print('values of edges: ' + str(edges))

# Avoid NaNs: the intervals built below are open on the left, so an interval starting
# exactly at the minimum balance would exclude the rows that hold that minimum.
# Moving the first break one unit down keeps those rows inside the first interval.
ed = edges.copy()
ed[0] = DATA_MODIFICADA.saldo_promedio_cuenta.min() - 1
print('\nvalues of ed: ' + str(ed))

# Transform the array as an IntervalIndex
# NOTE(review): the int64 subtype assumes the quantile edges are whole numbers — confirm.
Interval_Index = pd.IntervalIndex.from_breaks(ed, closed='right', dtype='interval[int64]')
print('\nvalues of Interval_Index: ' + str(Interval_Index))

# Create a column displaying the interval
DATA_MODIFICADA['Intervalos saldos'] = pd.cut(DATA_MODIFICADA['saldo_promedio_cuenta'], bins=Interval_Index)

# Lookup table interval -> quantile number. Intervals are sorted in ascending order so that
# position i matches qcut label i (this assumes every bin holds at least one row).
dict_inter_quantile = pd.Series(
    DATA_MODIFICADA['Intervalos saldos'].unique().sort_values(ascending=True),
    name='interval',
).reset_index()
dict_inter_quantile.columns = ['Quantiles_saldos', 'Intervalos saldos']
dict_inter_quantile = dict_inter_quantile.set_index('Intervalos saldos')

DATA_MODIFICADA.sample(8)

In [None]:
# Compare how the target splits over the raw balances and over the balance quantile labels.
# (The unused `df = px.data.tips()` sample-data load was removed.)
for column, title in (
    ("saldo_promedio_cuenta", 'Distribución sin quantiles'),
    ("Quantiles_saldos", 'Distribución por quantiles'),
):
    fig = px.histogram(DATA_MODIFICADA, x=column, color="esta_interesado", marginal='box', title=title)
    fig.update_layout(bargap=0.2)
    fig.show()

- **Técnica Quantile Transformation (antigüedad)**: Utilizaremos esta técnica para discretizar la variable en cuantiles (`pd.qcut`), lo que lleva los datos a una distribución aproximadamente uniforme y genera robustez frente a los outliers.

In [None]:
number_of_quantile = 5
# Ordinal quantile of every tenure value (0 = lowest bin); `edges` holds the bin limits.
# The working frame is read here (instead of data_TC) so source and destination always match.
DATA_MODIFICADA['Quantiles_antiguedad'], edges = pd.qcut(
    x=DATA_MODIFICADA['antiguedad'], q=number_of_quantile, labels=False, retbins=True
)
print('values of edges: ' + str(edges))

# Avoid NaNs: the intervals built below are open on the left, so an interval starting
# exactly at the minimum tenure would exclude the rows that hold that minimum.
# Moving the first break one unit down keeps those rows inside the first interval.
ed = edges.copy()
ed[0] = DATA_MODIFICADA.antiguedad.min() - 1
print('\nvalues of ed: ' + str(ed))

# Transform the array as an IntervalIndex
# NOTE(review): the int64 subtype assumes the quantile edges are whole numbers — confirm.
Interval_Index = pd.IntervalIndex.from_breaks(ed, closed='right', dtype='interval[int64]')
print('\nvalues of Interval_Index: ' + str(Interval_Index))

# Create a column displaying the interval
DATA_MODIFICADA['Intervalos antiguedad'] = pd.cut(DATA_MODIFICADA['antiguedad'], bins=Interval_Index)

# Lookup table interval -> quantile number. Intervals are sorted in ascending order so that
# position i matches qcut label i (this assumes every bin holds at least one row).
dict_inter_quantile = pd.Series(
    DATA_MODIFICADA['Intervalos antiguedad'].unique().sort_values(ascending=True),
    name='interval',
).reset_index()
dict_inter_quantile.columns = ['Quantiles_antiguedad', 'Intervalos antiguedad']
dict_inter_quantile = dict_inter_quantile.set_index('Intervalos antiguedad')

DATA_MODIFICADA.sample(8)

In [None]:
# Compare how the target splits over the raw tenure and over the tenure quantile labels.
# (The unused `df = px.data.tips()` sample-data load was removed.)
for column, title in (
    ("antiguedad", 'Distribución sin quantiles'),
    ("Quantiles_antiguedad", 'Distribución por quantiles'),
):
    fig = px.histogram(DATA_MODIFICADA, x=column, color="esta_interesado", marginal='box', title=title)
    fig.update_layout(bargap=0.2)
    fig.show()

## 3. ¿Qué modelos vamos a evaluar?

- Gradient Boosting
- Regresión Logística
- Ver otros...


#### Preparación Datos ####


In [None]:
# List every column of the working frame before the unused ones are dropped.
for column_name in DATA_MODIFICADA.columns:
    print(column_name)

In [None]:
DATA_MODIFICADA.shape # Show the number of rows and columns

In [None]:
# Remove the columns that will not be used: the three raw numeric variables and the
# three helper interval columns (the quantile label columns stay).
DATA_MODIFICADA = DATA_MODIFICADA.drop(
    columns=[
        'edad',
        'antiguedad',
        'saldo_promedio_cuenta',
        'quantile_interval',
        'Intervalos saldos',
        'Intervalos antiguedad',
    ],
)

In [None]:
DATA_MODIFICADA.shape # Show the number of rows and columns

In [None]:
# Display the working frame as the cell output.
DATA_MODIFICADA

In [None]:
# List the columns that remain in the working frame, one name per line.
for column_name in DATA_MODIFICADA.columns:
    print(column_name)

## División del data set

In [None]:
#esto capaz nos convenga hacerlo en el mapper con sklearn
# Esto es para eliminar los nulos
#DATA_MODIFICADA = DATA_MODIFICADA[DATA_MODIFICADA['tiene_producto_credito_activo'].notna()]

In [None]:
from sklearn.model_selection import train_test_split
# Two chained splits with a fixed seed: 40% is first held out from training, then that
# part is halved, giving 60% train, 20% validation and 20% test.
train, not_train = train_test_split(DATA_MODIFICADA, random_state=42, test_size=0.4)
validation, test = train_test_split(not_train, random_state=42, test_size=0.5)

# Show the (rows, columns) of each split.
tuple(split.shape for split in (train, validation, test))

##### Preparando el Mapper: #####

In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# List the columns available to the mapper, one name per line.
for column_name in DATA_MODIFICADA.columns:
    print(column_name)

# Mapper SimpleImputer

In [None]:
# Column-wise preprocessing:
#   - binary categoricals        -> LabelBinarizer
#   - multi-valued categoricals  -> one-hot encoding
#   - credit-product flag        -> missing values filled with the most frequent one, then binarized
#   - quantile labels            -> standardized
# handle_unknown='ignore' lets the encoders transform a split that contains a category
# absent from the training split (it is encoded as all zeros) instead of raising.
mapper = DataFrameMapper([
    (['sexo'], [LabelBinarizer()]),
    (['codigo_region'], [OneHotEncoder(handle_unknown='ignore')]),
    (['ocupacion'], [OneHotEncoder(handle_unknown='ignore')]),
    (['codigo_canal'], [OneHotEncoder(handle_unknown='ignore')]),
    (['tiene_producto_credito_activo'], [SimpleImputer(strategy='most_frequent'),
                                         LabelBinarizer()]),
    (['es_activo'], [LabelBinarizer()]),
    (['Quantiles_edad'], [StandardScaler()]),
    (['Quantiles_saldos'], [StandardScaler()]),
    (['Quantiles_antiguedad'], [StandardScaler()])
], df_out=True)  # df_out=True -> output is a DataFrame that keeps the column names



In [None]:
# Fit the mapper on the training split.
mapper.fit(train)

In [None]:
# Refit the mapper on the training split and show the transformed frame rounded to 2 decimals.
mapper.fit_transform(train).round(2)

In [None]:
# Names of the features produced by the fitted mapper.
mapper.transformed_names_

In [None]:
# Training split after applying the fitted mapper.
mapper.transform(train)

# Pipeline SimpleImputer

In [None]:
# pipe_si = Pipeline([
#     ('imputer', IterativeImputer(random_state=94)),
#     ('mapper', mapper),
# ])
# # Lo entrenamos con train
# pipe_si.fit(train)

#### Programemos una función para evaluar un modelo...

In [None]:
from sklearn import metrics

In [None]:
from collections import defaultdict

import seaborn as sns


def evaluate_model(model, set_names=('train', 'validation'), title='', show_cm=True):
    """Display classification metrics, and optionally confusion matrices, for a fitted model.

    For every requested data set the model predicts on the whole frame and the
    accuracy, precision, recall and F1 scores are collected into one table
    (one row per data set), which is displayed at the end.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``. It receives the complete data
        set, target column included.
    set_names
        Names of the data sets to evaluate. Each must be 'train', 'validation'
        or 'test' and is looked up as a module-level variable holding a frame
        with an ``esta_interesado`` column.
    title
        Optional text displayed before the results.
    show_cm
        When true, one confusion-matrix heatmap is drawn per data set.

    Raises
    ------
    ValueError
        If ``set_names`` contains a name other than the three allowed ones.
    """
    valid_names = ('train', 'validation', 'test')
    set_names = list(set_names)
    unknown = [name for name in set_names if name not in valid_names]
    if unknown:
        # Explicit exception instead of `assert`, which is stripped under `python -O`.
        raise ValueError(
            'Unknown data set name(s) {}; expected any of {}'.format(unknown, valid_names)
        )

    if title:
        display(title)

    final_metrics = defaultdict(list)

    # Nothing to draw when no data set was requested (subplots rejects zero columns).
    draw_cm = show_cm and bool(set_names)
    if draw_cm:
        # squeeze=False always returns a 2-D array of axes, so indexing also works
        # when a single data set is requested.
        _, axes = plt.subplots(1, len(set_names), sharey=True, figsize=(15, 3), squeeze=False)

    for i, set_name in enumerate(set_names):
        set_data = globals()[set_name]  # HACK: data sets are read from notebook-level variables

        y = set_data.esta_interesado
        y_pred = model.predict(set_data)
        final_metrics['Accuracy'].append(metrics.accuracy_score(y, y_pred))
        final_metrics['Precision'].append(metrics.precision_score(y, y_pred))
        final_metrics['Recall'].append(metrics.recall_score(y, y_pred))
        final_metrics['F1'].append(metrics.f1_score(y, y_pred))

        if draw_cm:
            ax = axes[0, i]
            sns.heatmap(metrics.confusion_matrix(y, y_pred), ax=ax, cmap='Blues', annot=True, fmt='.0f', cbar=False)

            ax.set_title(set_name)
            ax.xaxis.set_ticklabels(['no esta interesado', 'esta interesado'])
            ax.yaxis.set_ticklabels(['no esta interesado', 'esta interesado'])
            ax.set_xlabel('Predicted class')
            ax.set_ylabel('True class')

    display(pd.DataFrame(final_metrics, index=set_names))
    if draw_cm:
        plt.tight_layout()
        plt.show()

#### Gradient Boosting ####

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Feature mapper followed by a gradient-boosting classifier with a fixed seed.
gbc_model = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', GradientBoostingClassifier(random_state=94)),
])

# The whole training frame is passed as X; the target is its `esta_interesado` column.
gbc_model.fit(train, train['esta_interesado'])

evaluate_model(gbc_model, title='Gradient Boosting')

#### Regresión Logística ####

In [None]:
from sklearn.linear_model import LogisticRegression
# Stand-alone logistic regression estimator with default settings.
# NOTE(review): it is never fitted here (the fit call below is commented out) —
# confirm whether this variable is still needed.
regresion_logistica = LogisticRegression()
#regresion_logistica.fit(X,y)

In [None]:
# Feature mapper followed by a logistic regression with a fixed seed.
lr_model = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', LogisticRegression(random_state=94)),
])

# The whole training frame is passed as X; the target is its `esta_interesado` column.
lr_model.fit(train, train['esta_interesado'])

# Predictions for the validation split, shown as the cell output.
y_pred = lr_model.predict(validation)

y_pred

In [None]:
# Per-class report comparing the validation targets against the predictions stored in `y_pred`.
print(metrics.classification_report(validation.esta_interesado, y_pred))

# Mapper En progreso (eliminar columna o probar con otros imputer)

In [None]:
# mapper_ii = DataFrameMapper([
#     ('clmn_drpr', 'drop', ['tiene_producto_credito_activo'])])
#     (['sexo'], [LabelBinarizer()]), 
#     (['codigo_region'], [OneHotEncoder()]),
#     (['ocupacion'], [OneHotEncoder()]),
#     (['codigo_canal'], [OneHotEncoder()]),
#     (['tiene_producto_credito_activo'], [KNNImputer(n_neighbors=10), 
#                                          LabelBinarizer()]),
#     (['es_activo'], [LabelBinarizer()]),
#     (['Quantiles_edad'], [StandardScaler()]),
#     (['Quantiles_saldos'], [StandardScaler()]),
#     (['Quantiles_antiguedad'], [StandardScaler()])
# ], df_out=True, default=False) # df_out=True → Es lo que muestra el nombre de la columna



In [None]:
# mapper_ii = DataFrameMapper(
#     drop_cols=['tiene_producto_credito_activo'],
#     [
#     (['sexo'], [LabelBinarizer()]), 
#     (['codigo_region'], [OneHotEncoder()]),
#     (['ocupacion'], [OneHotEncoder()]),
#     (['codigo_canal'], [OneHotEncoder()]),
#     (['es_activo'], [LabelBinarizer()]),
#     (['Quantiles_edad'], [StandardScaler()]),
#     (['Quantiles_saldos'], [StandardScaler()]),
#     (['Quantiles_antiguedad'], [StandardScaler()])
# ], df_out=True, default=False) # df_out=True → Es lo que muestra el nombre de la columna



In [None]:
# Sample transformado
mapper_ii.fit(train)

In [None]:
np.round(mapper_ii.fit_transform(train), 2)

In [None]:
# Nombres de los faetures
mapper.transformed_names_

In [None]:
# Sample transformado
mapper.transform(train)

# Pipeline SimpleImputer

In [None]:
# pipe_si = Pipeline([
#     ('imputer', IterativeImputer(random_state=94)),
#     ('mapper', mapper),
# ])
# # Lo entrenamos con train
# pipe_si.fit(train)

#### Programemos una función para evaluar un modelo...

In [None]:
from collections import defaultdict

import seaborn as sns


def evaluate_model(model, set_names=('train', 'validation'), title='', show_cm=True):
    """Display classification metrics, and optionally confusion matrices, for a fitted model.

    For every requested data set the model predicts on the whole frame and the
    accuracy, precision, recall and F1 scores are collected into one table
    (one row per data set), which is displayed at the end.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``. It receives the complete data
        set, target column included.
    set_names
        Names of the data sets to evaluate. Each must be 'train', 'validation'
        or 'test' and is looked up as a module-level variable holding a frame
        with an ``esta_interesado`` column.
    title
        Optional text displayed before the results.
    show_cm
        When true, one confusion-matrix heatmap is drawn per data set.

    Raises
    ------
    ValueError
        If ``set_names`` contains a name other than the three allowed ones.
    """
    valid_names = ('train', 'validation', 'test')
    set_names = list(set_names)
    unknown = [name for name in set_names if name not in valid_names]
    if unknown:
        # Explicit exception instead of `assert`, which is stripped under `python -O`.
        raise ValueError(
            'Unknown data set name(s) {}; expected any of {}'.format(unknown, valid_names)
        )

    if title:
        display(title)

    final_metrics = defaultdict(list)

    # Nothing to draw when no data set was requested (subplots rejects zero columns).
    draw_cm = show_cm and bool(set_names)
    if draw_cm:
        # squeeze=False always returns a 2-D array of axes, so indexing also works
        # when a single data set is requested.
        _, axes = plt.subplots(1, len(set_names), sharey=True, figsize=(15, 3), squeeze=False)

    for i, set_name in enumerate(set_names):
        set_data = globals()[set_name]  # HACK: data sets are read from notebook-level variables

        y = set_data.esta_interesado
        y_pred = model.predict(set_data)
        final_metrics['Accuracy'].append(metrics.accuracy_score(y, y_pred))
        final_metrics['Precision'].append(metrics.precision_score(y, y_pred))
        final_metrics['Recall'].append(metrics.recall_score(y, y_pred))
        final_metrics['F1'].append(metrics.f1_score(y, y_pred))

        if draw_cm:
            ax = axes[0, i]
            sns.heatmap(metrics.confusion_matrix(y, y_pred), ax=ax, cmap='Blues', annot=True, fmt='.0f', cbar=False)

            ax.set_title(set_name)
            ax.xaxis.set_ticklabels(['no esta interesado', 'esta interesado'])
            ax.yaxis.set_ticklabels(['no esta interesado', 'esta interesado'])
            ax.set_xlabel('Predicted class')
            ax.set_ylabel('True class')

    display(pd.DataFrame(final_metrics, index=set_names))
    if draw_cm:
        plt.tight_layout()
        plt.show()

#### Gradient Boosting ####

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Feature mapper followed by a gradient-boosting classifier with a fixed seed.
# NOTE(review): this section is headed as the in-progress mapper, yet the pipeline
# is built on `mapper` — confirm which mapper is intended.
gbc_model = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', GradientBoostingClassifier(random_state=94)),
])

# The whole training frame is passed as X; the target is its `esta_interesado` column.
gbc_model.fit(train, train['esta_interesado'])

evaluate_model(gbc_model, title='Gradient Boosting')

#### Regresión Logística ####

In [None]:
from sklearn.linear_model import LogisticRegression
# Stand-alone logistic regression estimator with default settings.
# NOTE(review): it is never fitted here (the fit call below is commented out) —
# confirm whether this variable is still needed.
regresion_logistica = LogisticRegression()
#regresion_logistica.fit(X,y)

In [None]:
# Feature mapper followed by a logistic regression with a fixed seed.
# NOTE(review): this section is headed as the in-progress mapper, yet the pipeline
# is built on `mapper` — confirm which mapper is intended.
lr_model = Pipeline(steps=[
    ('mapper', mapper),
    ('classifier', LogisticRegression(random_state=94)),
])

# The whole training frame is passed as X; the target is its `esta_interesado` column.
lr_model.fit(train, train['esta_interesado'])

# Predictions for the validation split, shown as the cell output.
y_pred = lr_model.predict(validation)

y_pred

In [None]:
from sklearn import metrics

In [None]:
# Per-class report comparing the validation targets against the predictions stored in `y_pred`.
print(metrics.classification_report(validation.esta_interesado, y_pred))