# K-Nearest Neighbors

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import preprocessing # LabelEncoder
from sklearn.preprocessing import MinMaxScaler # Escala los datos
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import scikitplot as skplt 
from string import ascii_uppercase 
# import seaborn as sns
import qgrid
import time
#from sklearn.externals import joblib # Para guardar el modelo
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

from scipy import stats #Para la moda

from sklearn.neighbors import KNeighborsClassifier

In [14]:
def loadCSV(pathSamples, pathMatrix):
    """Load the feature matrix and the target labels from two CSV files.

    Parameters
    ----------
    pathSamples : str
        Path of a comma-separated file that contains an ``Eligible`` column.
    pathMatrix : str
        Path of a comma-separated file; all of its rows and columns are
        used as features.

    Returns
    -------
    X : numpy.ndarray
        Every value of the matrix file.
    Y : pandas.Series
        The ``Eligible`` column of the samples file, cast to ``int``.
    """
    # The matrix file is read first, then the samples file.
    features = pd.read_csv(pathMatrix).values
    labels = pd.read_csv(pathSamples)['Eligible'].astype(int)
    return features, labels

In [4]:
def classification_error(y_est, y_real):
    """Return the fraction of predictions that differ from the true labels.

    The two sequences are compared position by position (stopping at the
    shorter one) and the number of mismatches is divided by the number of
    elements in ``y_est``.
    """
    mismatches = sum(
        1 for predicted, actual in zip(y_est, y_real) if predicted != actual
    )
    return mismatches / np.size(y_est)

In [5]:
def plot_roc(Xtest, Ytest, probs, xlabel):
    """Print ROC AUC values and plot a model's ROC curve next to a no-skill baseline.

    Parameters
    ----------
    Xtest
        Not used by this function; kept so existing callers keep working.
    Ytest
        True labels of the evaluated samples.
    probs
        2-D array of per-class probabilities; column 1 is taken as the score
        of the positive class.
    xlabel : str
        Name of the model. It labels both the printed AUC line and the
        curve's legend entry.
    """
    # Baseline that assigns the same score (0) to every sample.
    ns_probs = [0] * len(Ytest)

    positive_probs = probs[:, 1]
    ns_auc = roc_auc_score(Ytest, ns_probs)
    auc = roc_auc_score(Ytest, positive_probs)

    print('No Skill: ROC AUC=%.3f' % (ns_auc))
    # Report the AUC under the caller-supplied model name instead of a
    # hard-coded "Logistic", so the text matches the plotted legend.
    print('%s: ROC AUC=%.3f' % (xlabel, auc))

    ns_fpr, ns_tpr, _ = roc_curve(Ytest, ns_probs)
    fpr, tpr, _ = roc_curve(Ytest, positive_probs)

    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label=xlabel)

    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # show the plot
    plt.show()

In [6]:
def model_KNN(k, impresion = False):
    """Evaluate a k-nearest-neighbours classifier over repeated random hold-out splits.

    Reads the module-level ``X`` (features) and ``Y`` (labels). For every test
    fraction in ``[0.05, 0.10, 0.15, 0.20, 0.25, 0.3]`` the data are split 10
    times; each time the features are min-max scaled (scaler fitted on the
    training part only) and a ``KNeighborsClassifier(k)`` is fitted and scored.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    impresion : bool, default False
        When true, also draw the learning curve (train/test accuracy against
        the number of training samples) and, for the last split that was
        evaluated, the ROC curve and the normalised confusion matrix.

    Returns
    -------
    tuple of 11 str
        Mean and standard deviation of accuracy, recall, precision, F-score
        and classification error (in that order), followed by the elapsed
        time in seconds.

        NOTE: the per-repetition arrays are overwritten for every test
        fraction, so these statistics describe only the 10 repetitions of the
        last fraction (0.3). Recall, precision and F-score are stored per
        class (two columns) and averaged over classes and repetitions. The
        elapsed time covers the whole run, including any plotting.
    """
    tiempo_i = time.time()
    repeticiones = 10
    accuracy_list_train = np.zeros([repeticiones])
    accuracy_list = np.zeros([repeticiones])
    # One row per repetition, one column per class (two classes expected).
    precision_list = np.zeros([repeticiones, 2])
    recall_list = np.zeros([repeticiones, 2])
    f_list = np.zeros([repeticiones, 2])
    errores = np.zeros(repeticiones)
    knn = KNeighborsClassifier(k)
    list_acc_mean = []
    list_acc_train_mean = []
    list_acc_std = []
    list_acc_train_std = []
    list_train_sizes = []
    list_percentage = [0.05, 0.10, 0.15, 0.20, 0.25, 0.3]

    for i in list_percentage:
        for j in range(repeticiones):
            # Validation methodology: random hold-out split (modify here).
            Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=i)
            scaler = MinMaxScaler()  # scales each feature to [0, 1]
            Xtrain = scaler.fit_transform(Xtrain)
            Xtest = scaler.transform(Xtest)

            knn.fit(Xtrain, Ytrain)
            pred = knn.predict(Xtest)
            pred_train = knn.predict(Xtrain)

            # accuracy on the test and on the training part
            accuracy_list[j] = accuracy_score(Ytest, pred, normalize=True)
            accuracy_list_train[j] = accuracy_score(Ytrain, pred_train, normalize=True)

            # per-class recall, precision and F-score on the test part
            recall_list[j] = recall_score(Ytest, pred, average=None)
            precision_list[j] = precision_score(Ytest, pred, average=None)
            f_list[j] = f1_score(Ytest, pred, average=None)

            errores[j] = classification_error(pred, Ytest)

        list_acc_mean.append(np.mean(accuracy_list))
        list_acc_std.append(np.std(accuracy_list))
        list_acc_train_mean.append(np.mean(accuracy_list_train))
        list_acc_train_std.append(np.std(accuracy_list_train))
        # Every repetition of a given fraction has the same training size, so
        # the last split is representative. Using the real size keeps the
        # learning-curve axis correct for datasets that are not 10000 rows.
        list_train_sizes.append(Xtrain.shape[0])

    if impresion:

        x = list_train_sizes
        sneg = [m-s for m,s in zip(list_acc_mean,list_acc_std)]
        spos = [m+s for m,s in zip(list_acc_mean,list_acc_std)]

        sneg_train = [m-s for m,s in zip(list_acc_train_mean,list_acc_train_std)]
        spos_train = [m+s for m,s in zip(list_acc_train_mean,list_acc_train_std)]

        fig1, ax1 = plt.subplots()

        # Shaded bands are mean +/- one standard deviation.
        ax1.fill_between(x,sneg_train,spos_train,alpha=.4)
        ax1.plot(x, list_acc_train_mean, marker = 'v', label = "train")
        ax1.fill_between(x,sneg,spos,alpha=.4)
        ax1.plot(x, list_acc_mean, marker = 'p', label = "test")

        ax1.set_xlabel("Número de muestras para train")
        ax1.set_ylabel("Accuracy")

        ax1.legend(loc="upper right", title="Curva de aprendizaje", frameon=False)
        plt.show()

        # ROC curve, computed on the last split evaluated above
        knn_probs = knn.predict_proba(Xtest)

        plot_roc(Xtest, Ytest, knn_probs, "KNN")

        skplt.metrics.plot_confusion_matrix(Ytest, pred, normalize=True)

    return str(np.mean(accuracy_list)), str(np.std(accuracy_list)), str(np.mean(recall_list)), str(np.std(recall_list)), str(np.mean(precision_list)), str(np.std(precision_list)),  str(np.mean(f_list)), str(np.std(f_list)), str(np.mean(errores)), str(np.std(errores)), str(time.time()-tiempo_i)

# LSA 10k_1Col_NoCarEsp

In [15]:
# Load CSV: labels come from the samples file, features from the matrix file.
ruta_muestras = "../../Dataset/10k_1Col_NoCarEsp_LSA.csv"
ruta_matriz = "../../Tables/docsTopicsLSA1200.csv"
X, Y = loadCSV(pathSamples=ruta_muestras, pathMatrix=ruta_matriz)

In [None]:
# Fill a results table with one row per value of the hyperparameter k (1..20).
# Setting kept from the original cell so later cells see the same pandas
# configuration; the row assignment below no longer depends on it.
pd.options.mode.chained_assignment = None

randn = np.random.randn  # not used in this cell; kept in case other cells rely on it

# Result columns, in the same order as the values returned by model_KNN.
columnas = [
    "Eficiencia",
    "Int_Eficiencia",
    "Sensibilidad",
    "Int_Sensibilidad",
    "Precision",
    "Int_Precision",
    "F-Score",
    "Int_F-Score",
    "Error_Prueba",
    "Int_error",
    "Tiempo de ejecución",
]

df_types = pd.DataFrame({
    'Numero de vecinos' : pd.Series([str(n) for n in range(1, 21)])})
for columna in columnas:
    df_types[columna] = ""
df_types.set_index(['Numero de vecinos'], inplace=True)

for k in df_types.index:
    # Write the whole row through a single label-based .loc assignment.
    # Chained indexing (df[col][i] = value) may write to a temporary copy
    # and relied on positional integer keys against a string index.
    df_types.loc[k, columnas] = list(model_KNN(int(k), impresion = False))

qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget.get_changed_df()

In [None]:
# Call the function with the configuration chosen as best (k = 7), with plots
# enabled, and print every returned value next to its label.
resultados = model_KNN(7, impresion = True)
Acc, IntAcc, Sen, IntSen, Pre, IntPre, f, IntF, error, stdError, tiempo = resultados

etiquetas = (
    'Eficiencia', ' Int_Eficiencia', ' Sensibilidad', ' Int_Sensibilidad',
    ' Precision', ' Int_Precision', ' F-Score', ' Int_F-Score',
    ' Error_Prueba', ' Int_Error', ' Tiempo ejecución',
)

# Interleave label, value, label, value, ... and let print join them with spaces.
partes = []
for etiqueta, valor in zip(etiquetas, resultados):
    partes.extend((etiqueta, valor))
print(*partes)