# TP1

In [3]:
import numpy as np
import pandas as pd
import pydotplus
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from sklearn import preprocessing
from sklearn.externals.six import StringIO  
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

## Lectura del dataset

Se reemplaza el atributo "Data of birth" por "Age", encodeamos "Social Class". También encodeamos "Presence of Children" para que tome 2 valores: 0 si no hay hijos, 1 en caso contrario

In [5]:
def readCsv():
    df = pd.read_csv("FullData2.csv")
    
    #Agregamos atributo edad
    df['Date of birth'] = pd.to_datetime('today').year - pd.to_datetime(df['Date of birth'],format='%Y-%m-%d').dt.year
    df = df.rename(columns={'Date of birth': 'Age' })
    
    #Encode social class
    socialClassEncoder = preprocessing.LabelEncoder()
    socialClassEncoder.fit(["AB", "C1", "C2", "D ", "E "])
    
    df['Social Class'] = socialClassEncoder.transform(df['Social Class'])
    
    #Agrupamos en 2 clases "Presence of Children"
    df['Presence of Children'] = df['Presence of Children'].apply(lambda x: 1 if x > 1 else 0)
    
    return df

df = readCsv()

## 1. Partición de datos 

Particionamos el conjunto entre datos de entramiento y de test. Los de validación se van a ir generando en cada uno de los k-folds. 

In [6]:
def partitionate(df):
    #Seteamos Presence of Children como target
    targetColumn = "Presence of Children"
    
    #El resto de las columnas las vamos a usar como feature para clasificar
    features = list(df.columns)
    features.remove(targetColumn)
    
    #Eliminamos otras columnas que están muy relacionadas con lo que queremos predecir (deberíamos sacarlas del csv)
    features.remove("Demographic cell 1")
    features.remove("Mosaic Classification")
    features.remove("Life stage")
    features.remove("No of People")
    features.remove("Age")
    features.remove("Terminal age of education")

    return train_test_split(df[features], df[targetColumn], train_size=0.8, test_size=0.2, random_state=42)

X, test_X, y, test_y = partitionate(df)

## 2. Árboles de decisión 

Entrenar un árbol de decisión con altura 3 y el resto de los hiperparámetros con su valor en default. Estimar la performance del modelo utilizando 5-fold cross validation utilizando el Accuracy y ROC AUC

In [7]:
def kfold(max_depth = 3, criterion = "gini", round_decimals = 3):
    kf = KFold(n_splits = 5)
    
    accuracyValidation = []
    accuracyTrain = []
    rocAucValidation = []
    rocAucTrain = []
    
    for train_index, validation_index in kf.split(X):
        #Separamos los folds
        train_X, validation_X = X.iloc[train_index, :], X.iloc[validation_index, :]
        train_y, validation_y = y.iloc[train_index], y.iloc[validation_index]
    
        #Entrenamos el árbol
        dt = DecisionTreeClassifier(max_depth = max_depth, criterion = criterion)
        dt.fit(train_X, train_y)
        
        #Guardamos los resultados
        accuracyValidation.append(np.round(accuracy_score(validation_y, dt.predict(validation_X)), round_decimals))
        accuracyTrain.append(np.round(accuracy_score(train_y, dt.predict(train_X)), round_decimals))
        
        rocAucValidation.append(np.round(roc_auc_score(validation_y, dt.predict(validation_X)), round_decimals))
        rocAucTrain.append(np.round(roc_auc_score(train_y, dt.predict(train_X)), round_decimals))
                
    return accuracyValidation, accuracyTrain, rocAucValidation, rocAucTrain

def imprimir_tabla(nombres_columnas,valores_por_columna,nombres_filas):
    diccionario = {}
    
    for i in range(0,len(nombres_columnas)):
        diccionario[nombres_columnas[i]] = valores_por_columna[i]

    dataFrame = pd.DataFrame(diccionario)
    dataFrame.index = nombres_filas
    
    display(dataFrame)

    
#2.1
accuracyValidation, accuracyTrain, rocAucValidation, rocAucTrain = kfold()

columnas = ['It 1','It 2','It 3','It 4','It 5','Promedio','Desvio']
filas = ['Accuracy: Conjunto de validación','Accuracy: Conjunto de entrenamiento','ROC AUC: Conjunto de validación','ROC AUC: Conjunto de entrenamiento']
valores_por_columnas = [[],[],[],[],[],[],[]]

for valores in [accuracyValidation, accuracyTrain, rocAucValidation, rocAucTrain]:
    for i in range(0,5):
        valores_por_columnas[i].append(valores[i])
    valores_por_columnas[5].append(np.round(np.mean(valores), 3))
    valores_por_columnas[6].append(np.round(np.std(valores), 3))

imprimir_tabla(columnas,valores_por_columnas,filas)

Unnamed: 0,It 1,It 2,It 3,It 4,It 5,Promedio,Desvio
Accuracy: Conjunto de validación,0.794,0.799,0.806,0.804,0.786,0.798,0.007
Accuracy: Conjunto de entrenamiento,0.801,0.8,0.796,0.797,0.803,0.799,0.003
ROC AUC: Conjunto de validación,0.754,0.755,0.761,0.758,0.749,0.755,0.004
ROC AUC: Conjunto de entrenamiento,0.76,0.76,0.752,0.755,0.761,0.758,0.003


Entrenar árboles de decisión con las siguientes combinaciones. En todos los casos probar e informar Accuracy y ROC AUC para training y para validación con Gini y con Information Gain haciendo cross validation:
    a. Altura máxima 3
    b. Altura máxima 6
    c. Sin límite de altura máxima

In [9]:
alturas = [3,6,None]
alt_names = ['Altura 3','Altura 6','Sin limite']
criterio = ["gini","entropy"]
crit_names = ['Gini','Information Gain']

columnas = ['Acc Val','Acc Train','ROC AUC Val','ROC AUC Train']
valores_por_columnas = [[],[],[],[]]
filas = []

for i in range(0,len(alturas)):
    for j in range(0,len(criterio)):
        result = kfold(max_depth = alturas[i],criterion = criterio[j])
        for k in range(0,4):
            valores_por_columnas[k].append(np.mean(result[k]))
        
        filas.append(alt_names[i] + " " + crit_names[j])

imprimir_tabla(columnas,valores_por_columnas,filas)


Unnamed: 0,Acc Val,Acc Train,ROC AUC Val,ROC AUC Train
Altura 3 Gini,0.7978,0.7994,0.7554,0.7576
Altura 3 Information Gain,0.7988,0.7996,0.757,0.758
Altura 6 Gini,0.8046,0.8242,0.7746,0.7946
Altura 6 Information Gain,0.8026,0.8174,0.7656,0.7814
Sin limite Gini,0.782,0.999,0.767,0.999
Sin limite Information Gain,0.7794,0.999,0.7658,0.999
