#### Parte II - Construcción de funciones

El objetivo de esta parte del trabajo es generar código flexible y modularizado.

In [1]:
#Importamos las librerias necesarias
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
import statistics

# Set the working directory for the notebook.
# A raw string is used because the path mixes '/' and '\': in a plain literal,
# '\D' and '\B' are invalid escape sequences, which recent Python versions warn
# about. The resulting path is byte-for-byte the same as before.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable base dir.
os.chdir(r'C:/Users/Usuario\Desktop/MAESTRIA\Big Data/TPs/BigData/TP3')

1. Función evalua_metodo(X_train, X_test, y_train, y_test, model, k=5, landa=1)

In [None]:
def evalua_metodo(X_train, X_test, y_train, y_test, model, k=5, landa=1):
    '''
    Fit a classifier on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices used for fitting and for evaluation.
    y_train, y_test : labels matching X_train / X_test.
        # assumes a binary target (AUC is computed with roc_auc_score) — TODO confirm
    model : estimator class or estimator instance
        - KNeighborsClassifier (the class): instantiated with n_neighbors=k.
        - LogisticRegression (the class): instantiated with C=1/landa.
        - any other class (e.g. LinearDiscriminantAnalysis): instantiated
          with its default arguments.
        - an already-built estimator instance: used as given; note that it is
          fitted in place.
    k : int, default 5
        Number of neighbours; only used when model is KNeighborsClassifier.
    landa : float, default 1
        Regularization strength; only used when model is LogisticRegression.
        Must be non-zero because C is computed as 1/landa.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'modelo' (the `model` argument as received),
        'accuracy', 'auc' and 'ecm' (mean squared error of the predicted labels).
    '''
    if model is KNeighborsClassifier:
        estimator = model(n_neighbors=k)
    elif model is LogisticRegression:
        estimator = model(C=1/landa)
    elif isinstance(model, type):
        # Any other estimator class (LinearDiscriminantAnalysis included) is
        # built with its defaults; previously an unknown class crashed on .fit.
        estimator = model()
    else:
        # Already an instance: fit it as configured by the caller.
        estimator = model

    fitted = estimator.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)

    # AUC needs a continuous score for the positive class. Feeding it the hard
    # 0/1 predictions reduces the ROC curve to a single threshold, so prefer
    # probabilities, then decision values, and only then fall back to labels.
    y_score = y_pred
    if hasattr(fitted, 'predict_proba'):
        proba = fitted.predict_proba(X_test)
        if proba.shape[1] == 2:
            y_score = proba[:, 1]
    elif hasattr(fitted, 'decision_function'):
        decision = fitted.decision_function(X_test)
        if getattr(decision, 'ndim', 1) == 1:
            y_score = decision

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)
    ecm = mean_squared_error(y_test, y_pred)

    df_metrics = pd.DataFrame({'modelo': [model], 'accuracy': [accuracy], 'auc':[auc], 'ecm':[ecm]})

    return df_metrics

2. Función cross_validation(model, k, x, y, standard=False)

In [None]:
def cross_validation(model, k, x, y, standard=False, random_state=None):
    '''
    Estimate the mean squared error (ECM) of `model` by shuffled k-fold CV.

    Parameters
    ----------
    model : estimator class or instance, forwarded unchanged to evalua_metodo
        (with that function's default k and landa).
    k : int
        Number of folds passed to KFold as n_splits.
    x : pandas.DataFrame
        Features; rows are selected positionally with .iloc and, when
        standardizing, its index and columns are reused.
    y : pandas object supporting .iloc
        Labels aligned positionally with x.
    standard : bool, default False
        When truthy, each fold's features are standardized with a
        StandardScaler fitted on that fold's training rows only.
    random_state : int or None, default None
        Seed forwarded to KFold so the shuffled folds can be reproduced.
        None keeps the previous (unseeded) behaviour.

    Returns
    -------
    float
        Average over folds of the 'ecm' value reported by evalua_metodo.
    '''
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

    fold_ecm = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if standard:
            # Fit the scaler on the training rows only and reuse it on the
            # test rows, so no test information leaks into the scaling.
            sc = StandardScaler()
            x_train = pd.DataFrame(sc.fit_transform(x_train), index=x_train.index, columns=x_train.columns)
            x_test = pd.DataFrame(sc.transform(x_test), index=x_test.index, columns=x_test.columns)

        metrics = evalua_metodo(x_train, x_test, y_train, y_test, model)
        # Take the scalar out of the one-row frame. The non-standardized path
        # used to append the whole Series, which statistics.mean cannot average.
        fold_ecm.append(metrics['ecm'].iloc[0])

    return statistics.mean(fold_ecm)

3. Función evalua_config(landa, k, x, y, l1_ratio=1)

    Toma como input una serie de valores de lambda y el K para CV, y devuelve el lambda que minimiza el ECM por CV, usando elastic net para la regresión logística.
    Con l1_ratio=1 se hace Lasso.
    Con l1_ratio=0 se hace Ridge.
    Con 0 < l1_ratio < 1 se usa una combinación de ambos (elastic net).

In [None]:
def evalua_config(landa, k, x, y, l1_ratio=1):
    '''
    Choose the lambda with the lowest cross-validated ECM for an elastic-net
    logistic regression.

    Parameters
    ----------
    landa : iterable of numbers
        Candidate regularization strengths. Each must be non-zero because the
        model is built with C = 1/lambda.
    k : int
        Number of folds handed to cross_validation.
    x, y : features and labels handed to cross_validation.
    l1_ratio : float, default 1
        Elastic-net mix passed to LogisticRegression (1 = Lasso, 0 = Ridge,
        values in between combine both).

    Returns
    -------
    The element of `landa` whose cross-validated ECM is smallest; on ties the
    one seen first wins.
    '''
    def _cv_error(strength):
        # One elastic-net logit per candidate, scored with standardized folds.
        candidate_model = LogisticRegression(
            C=1/strength, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01)
        return cross_validation(candidate_model, k, x, y, standard=True)

    cv_error_by_lambda = {strength: _cv_error(strength) for strength in landa}

    return min(cv_error_by_lambda, key=lambda strength: cv_error_by_lambda[strength])

4. Función evalua_multiples_metodos(k_cv, k_knn, landa_max, x_train, x_test, y_train, y_test, x, y)

In [None]:
def evalua_multiples_metodos(k_cv, k_knn, landa_max, x_train, x_test, y_train, y_test, x, y):
    '''
    Evaluate logistic regression, k-nearest neighbours and linear discriminant
    analysis on the same train/test split and collect their metrics.

    Parameters
    ----------
    k_cv : int
        Number of CV folds used by evalua_config to pick lambda.
    k_knn : int
        Number of neighbours for the k-nearest-neighbours model.
    landa_max : float
        Exclusive upper bound of the lambda grid, which starts at 0.01 and
        advances in steps of 10.
    x_train, x_test, y_train, y_test : split used to fit and score each model.
    x, y : data handed to evalua_config for the lambda search.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the order listed above, with columns
        "Modelo", "Hiperparametro", "Precisión" (accuracy), "AUC" and "ECM".
    '''
    def _row(label, hyperparameter, metrics):
        # Flatten the one-row metrics frame into a summary-table row.
        return [label, hyperparameter, metrics['accuracy'][0], metrics['auc'][0], metrics['ecm'][0]]

    lambda_grid = np.arange(0.01, landa_max, 10).tolist()

    summary = pd.DataFrame(columns=["Modelo", "Hiperparametro", "Precisión", "AUC", "ECM"])

    # Logistic regression: lambda is tuned by cross-validation first.
    # NOTE(review): evalua_config tunes lambda with an elastic-net/saga model on
    # standardized folds, while the final fit below uses LogisticRegression with
    # only C set — confirm that this mismatch is intended.
    best_lambda = evalua_config(lambda_grid, k_cv, x, y)
    logit_metrics = evalua_metodo(x_train, x_test, y_train, y_test, LogisticRegression, landa=best_lambda)
    summary.loc[len(summary)] = _row('regresion_logistica', best_lambda, logit_metrics)

    # K-nearest neighbours with the caller-supplied number of neighbours.
    knn_metrics = evalua_metodo(x_train, x_test, y_train, y_test, KNeighborsClassifier, k=k_knn)
    summary.loc[len(summary)] = _row('k_vecinos_cercanos', k_knn, knn_metrics)

    # Linear discriminant analysis has no hyperparameter to report.
    lda_metrics = evalua_metodo(x_train, x_test, y_train, y_test, LinearDiscriminantAnalysis)
    summary.loc[len(summary)] = _row('analisis_discriminante', "NA", lda_metrics)

    return summary