# TP1

In [1]:
import numpy as np
import pandas as pd
import pydotplus
import matplotlib.pyplot as plt
from IPython.display import Image  
from sklearn import preprocessing
from sklearn.externals.six import StringIO  
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

## Lectura del dataset

Se reemplaza el atributo "Data of birth" por "Age", encodeamos "Social Class". También encodeamos "Presence of Children" para que tome 2 valores: 0 si no hay hijos, 1 en caso contrario

In [2]:
def readCsv():
    df = pd.read_csv("FullData2.csv")
    
    #Agregamos atributo edad
    df['Date of birth'] = pd.to_datetime('today').year - pd.to_datetime(df['Date of birth'],format='%Y-%m-%d').dt.year
    df = df.rename(columns={'Date of birth': 'Age' })
    
    #Encode social class
    socialClassEncoder = preprocessing.LabelEncoder()
    socialClassEncoder.fit(["AB", "C1", "C2", "D ", "E "])
    
    df['Social Class'] = socialClassEncoder.transform(df['Social Class'])
    
    #Agrupamos en 2 clases "Presence of Children"
    df['Presence of Children'] = df['Presence of Children'].apply(lambda x: 1 if x > 1 else 0)
    
    return df

df = readCsv()

## 1. Partición de datos 

Particionamos el conjunto entre datos de entramiento y de test. Los de validación se van a ir generando en cada uno de los k-folds. 

In [3]:
def partitionate(df):
    #Seteamos Presence of Children como target
    targetColumn = "Presence of Children"
    
    #El resto de las columnas las vamos a usar como feature para clasificar
    features = list(df.columns)
    features.remove(targetColumn)
    
    #Eliminamos otras columnas que están muy relacionadas con lo que queremos predecir (deberíamos sacarlas del csv)
    features.remove("Demographic cell 1")
    features.remove("Mosaic Classification")
    features.remove("Life stage")
    features.remove("No of People")
    features.remove("Age")
    features.remove("Terminal age of education")

    return train_test_split(df[features], df[targetColumn], train_size=0.8, test_size=0.2, random_state=42)

X, test_X, y, test_y = partitionate(df)

## 2. Árboles de decisión 

Entrenar un árbol de decisión con altura 3 y el resto de los hiperparámetros con su valor en default. Estimar la performance del modelo utilizando 5-fold cross validation utilizando el Accuracy y ROC AUC

In [4]:
def kfold(max_depth = 3, criterion = "gini", round_decimals = 3):
    kf = KFold(n_splits = 5)
    
    accuracyValidation = []
    accuracyTrain = []
    rocAucValidation = []
    rocAucTrain = []
    
    for train_index, validation_index in kf.split(X):
        #Separamos los folds
        train_X, validation_X = X.iloc[train_index, :], X.iloc[validation_index, :]
        train_y, validation_y = y.iloc[train_index], y.iloc[validation_index]
    
        #Entrenamos el árbol
        dt = DecisionTreeClassifier(max_depth = max_depth, criterion = criterion)
        dt.fit(train_X, train_y)
        
        #Guardamos los resultados
        accuracyValidation.append(np.round(accuracy_score(validation_y, dt.predict(validation_X)), round_decimals))
        accuracyTrain.append(np.round(accuracy_score(train_y, dt.predict(train_X)), round_decimals))
        
        rocAucValidation.append(np.round(roc_auc_score(validation_y, dt.predict(validation_X)), round_decimals))
        rocAucTrain.append(np.round(roc_auc_score(train_y, dt.predict(train_X)), round_decimals))
                
    return accuracyValidation, accuracyTrain, rocAucValidation, rocAucTrain

def showResults(scores, label):
    print(label)
    print("Values: ", scores, " Promedio = ", np.round(np.mean(scores), 3), " Desvio = ", np.round(np.std(scores), 3),"\n")
    

#2.1
accuracyValidation, accuracyTrain, rocAucValidation, rocAucTrain = kfold()

showResults(accuracyValidation, "Accuracy: Conjunto de validación")
showResults(accuracyTrain, "Accuracy: Conjunto de entrenamiento")
showResults(rocAucValidation, "ROC AUC: Conjunto de validación")
showResults(rocAucTrain, "ROC AUC: Conjunto de entrenamiento")

Accuracy: Conjunto de validación
Values:  [0.794, 0.799, 0.806, 0.804, 0.786]  Promedio =  0.798  Desvio =  0.007 

Accuracy: Conjunto de entrenamiento
Values:  [0.801, 0.8, 0.796, 0.797, 0.803]  Promedio =  0.799  Desvio =  0.003 

ROC AUC: Conjunto de validación
Values:  [0.754, 0.755, 0.761, 0.758, 0.749]  Promedio =  0.755  Desvio =  0.004 

ROC AUC: Conjunto de entrenamiento
Values:  [0.76, 0.76, 0.752, 0.755, 0.761]  Promedio =  0.758  Desvio =  0.003 



Entrenar árboles de decisión con las siguientes combinaciones. En todos los casos probar e informar Accuracy y ROC AUC para training y para validación con Gini y con Information Gain haciendo cross validation:
    a. Altura máxima 3
    b. Altura máxima 6
    c. Sin límite de altura máxima

In [5]:
def showSummary(results, label):
    print(label)
    print('Acc Val: {:.3f}, Acc Train: {:.3f}, ROC AUC Val: {:.3f}, ROC AUC Train: {:.3f}\n'.format(
            np.mean(results[0]), np.mean(results[1]), np.mean(results[2]), np.mean(results[3])))

#Altura 3 Gini
showSummary(kfold(max_depth = 3, criterion = "gini"), "Áltura 3 Gini")
showSummary(kfold(max_depth = 3, criterion = "entropy"), "Áltura 3 Information Gain")
showSummary(kfold(max_depth = 6, criterion = "gini"), "Áltura 6 Gini")
showSummary(kfold(max_depth = 6, criterion = "entropy"), "Áltura 6 Information Gain")
showSummary(kfold(max_depth = None, criterion = "gini"), "Sin límite de áltura Gini")
showSummary(kfold(max_depth = None, criterion = "entropy"), "Sin límite áltura Information Gain")

Áltura 3 Gini
Acc Val: 0.798, Acc Train: 0.799, ROC AUC Val: 0.755, ROC AUC Train: 0.758

Áltura 3 Information Gain
Acc Val: 0.799, Acc Train: 0.800, ROC AUC Val: 0.757, ROC AUC Train: 0.758

Áltura 6 Gini
Acc Val: 0.804, Acc Train: 0.824, ROC AUC Val: 0.774, ROC AUC Train: 0.795

Áltura 6 Information Gain
Acc Val: 0.803, Acc Train: 0.817, ROC AUC Val: 0.766, ROC AUC Train: 0.781

Sin límite de áltura Gini
Acc Val: 0.784, Acc Train: 0.999, ROC AUC Val: 0.771, ROC AUC Train: 0.999

Sin límite áltura Information Gain
Acc Val: 0.774, Acc Train: 0.999, ROC AUC Val: 0.761, ROC AUC Train: 0.999

