# K-Nearest Neighbors

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import preprocessing # LabelEncoder
from sklearn.preprocessing import MinMaxScaler # Escala los datos
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import scikitplot as skplt 
from string import ascii_uppercase 
# import seaborn as sns
import qgrid
import time
#from sklearn.externals import joblib # Para guardar el modelo
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


from scipy import stats #Para la moda

from sklearn.neighbors import KNeighborsClassifier

In [2]:
def loadCSV(pathSamples, pathMatrix):
    """Read the feature matrix and the target labels from two CSV files.

    Parameters
    ----------
    pathSamples : str
        Path to a comma-separated file that contains an ``Eligible`` column;
        that column is cast to ``int`` and returned as the target.
    pathMatrix : str
        Path to a comma-separated file whose values (all rows, all columns)
        are returned as the feature matrix.

    Returns
    -------
    X : numpy.ndarray
        The values of the matrix file, as exposed by ``DataFrame.values``.
    Y : pandas.Series
        The ``Eligible`` column of the samples file converted to integers.
    """
    # The matrix file is read first, then the samples file.
    matrix_frame = pd.read_table(pathMatrix, sep=',')
    samples_frame = pd.read_table(pathSamples, sep=',')

    labels = samples_frame['Eligible'].astype(int)
    features = matrix_frame.values[:, :]
    return features, labels

In [3]:
def classification_error(y_est, y_real):
    """Return the misclassification rate of ``y_est`` with respect to ``y_real``.

    The two sequences are compared pairwise (stopping at the shorter one) and
    the number of differing pairs is divided by the number of elements in
    ``y_est``.

    Parameters
    ----------
    y_est : iterable
        Predicted labels.
    y_real : iterable
        Reference labels.

    Returns
    -------
    float
        Number of mismatching pairs divided by ``np.size(y_est)``.
    """
    mismatches = sum(
        1 for estimated, actual in zip(y_est, y_real) if estimated != actual
    )
    return mismatches / np.size(y_est)

In [4]:
def plot_roc(Xtest, Ytest, probs, xlabel):
    """Print the ROC AUC of a model and plot its ROC curve next to a baseline.

    The baseline ("No Skill") assigns the same constant score to every
    sample. Both AUC values are printed and both curves are drawn on the
    current pyplot figure, which is then shown.

    Parameters
    ----------
    Xtest : any
        Not used by this function; kept so existing callers keep working.
    Ytest : array-like
        True labels of the evaluated samples.
    probs : numpy.ndarray
        Two-dimensional array of per-class scores; only column index 1 is
        used (presumably the positive class -- confirm against the
        estimator's class order).
    xlabel : str
        Name of the model; used as the legend label of its curve and as the
        prefix of the printed AUC line.
    """
    # Constant score for every sample: the reference "no skill" classifier.
    ns_probs = [0 for _ in range(len(Ytest))]

    # Keep only the score column that is compared against the true labels.
    probs = probs[:, 1]
    ns_auc = roc_auc_score(Ytest, ns_probs)
    auc = roc_auc_score(Ytest, probs)

    print('No Skill: ROC AUC=%.3f' % (ns_auc))
    # Report the AUC under the caller-supplied model name; the previous
    # hard-coded 'Logistic' prefix was wrong for any other classifier.
    print('%s: ROC AUC=%.3f' % (xlabel, auc))

    ns_fpr, ns_tpr, _ = roc_curve(Ytest, ns_probs)
    fpr, tpr, _ = roc_curve(Ytest, probs)

    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label= xlabel)

    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # show the plot
    plt.show()

In [5]:
def model_KNN(k, impresion = False, random_state = None):
    """Evaluate a k-nearest-neighbours classifier on repeated random hold-out splits.

    The function reads the module-level ``X`` (features) and ``Y`` (labels),
    so both must exist before it is called. For every test fraction in
    ``list_percentage`` (currently only 0.2) it draws 10 random train/test
    splits, rescales the features with a ``MinMaxScaler`` fitted on the
    training part only, fits the classifier and records accuracy, per-class
    recall / precision / F1 and the misclassification rate on the test part.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    impresion : bool, optional
        When truthy, also draw the learning curve (train vs. test accuracy)
        and, for the last split only, the ROC curve and the normalised
        confusion matrix.
    random_state : int, numpy.random.RandomState or None, optional
        Controls the random splits. ``None`` (the default) keeps the previous
        behaviour of unseeded splits. An int seeds one generator that is
        shared by all splits, so the repeats differ from each other but the
        whole run is reproducible.

    Returns
    -------
    tuple of 11 str
        ``str()`` of, in order: mean and std of test accuracy, mean and std
        of recall, mean and std of precision, mean and std of F1, mean and
        std of the test error, and the elapsed wall-clock seconds (which
        include plotting time when ``impresion`` is set).

    Notes
    -----
    * Recall, precision and F1 are stored per class in ``(10, 2)`` arrays, so
      exactly two classes are expected. Their mean and std are taken over
      all 20 entries, i.e. the std mixes between-class differences with
      run-to-run variation.
    * The accuracy/recall/precision/F1/error arrays are overwritten for each
      test fraction, so the returned values describe the last fraction in
      ``list_percentage``.
    """
    n_repeats = 10

    tiempo_i = time.time()
    accuracy_list_train = np.zeros([n_repeats])
    accuracy_list = np.zeros([n_repeats])
    precision_list = np.zeros([n_repeats,2])
    recall_list = np.zeros([n_repeats,2])
    f_list = np.zeros([n_repeats,2])
    errores = np.zeros(n_repeats)
    knn = KNeighborsClassifier(k)
    list_acc_mean = []
    list_acc_train_mean = []
    list_acc_std = []
    list_acc_train_std = []
    list_train_sizes = []
    #list_percentage = [0.05,0.10,0.15,0.20,0.25,0.3]
    list_percentage = [0.2]

    # One generator shared by every split: successive calls advance it, so the
    # repeats get different (but reproducible) partitions. None is passed
    # through untouched to keep the original unseeded behaviour.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    for i in list_percentage:
        for j in range(n_repeats):
            Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=i, random_state=rng)
            # Scale to [0, 1]; fit on the training part only to avoid leaking
            # test-set statistics into the model.
            scaler = MinMaxScaler()
            Xtrain = scaler.fit_transform(Xtrain)
            Xtest = scaler.transform(Xtest)

            knn.fit(Xtrain, Ytrain)
            pred = knn.predict(Xtest)
            pred_train = knn.predict(Xtrain)

            # accuracy on the test and on the training part
            accuracy_list[j] = accuracy_score(Ytest, pred, normalize=True)
            accuracy_list_train[j] = accuracy_score(Ytrain, pred_train, normalize=True)

            # per-class recall, precision and F1 (average=None -> one value per class)
            recall_list[j] = recall_score(Ytest, pred, average=None)
            precision_list[j] = precision_score(Ytest, pred, average=None)
            f_list[j] = f1_score(Ytest, pred, average=None)

            errores[j] = classification_error(pred, Ytest)

        list_acc_mean.append(np.mean(accuracy_list))
        list_acc_std.append(np.std(accuracy_list))
        list_acc_train_mean.append(np.mean(accuracy_list_train))
        list_acc_train_std.append(np.std(accuracy_list_train))
        # Actual number of training samples for this test fraction (the same
        # for every repeat); replaces the former hard-coded 10000-sample
        # assumption on the learning-curve x-axis.
        list_train_sizes.append(Xtrain.shape[0])

    if impresion:

        x = list_train_sizes
        sneg = [m-s for m,s in zip(list_acc_mean,list_acc_std)]
        spos = [m+s for m,s in zip(list_acc_mean,list_acc_std)]

        sneg_train = [m-s for m,s in zip(list_acc_train_mean,list_acc_train_std)]
        spos_train = [m+s for m,s in zip(list_acc_train_mean,list_acc_train_std)]

        fig1, ax1 = plt.subplots()

        # Shaded bands are mean +/- one standard deviation over the repeats.
        ax1.fill_between(x,sneg_train,spos_train,alpha=.4)
        ax1.plot(x, list_acc_train_mean, marker = 'v', label = "train")
        ax1.fill_between(x,sneg,spos,alpha=.4)
        ax1.plot(x, list_acc_mean, marker = 'p', label = "test")

        ax1.set_xlabel("Número de muestras para train")
        ax1.set_ylabel("Accuracy")

        ax1.legend(loc="upper right", title="Curva de aprendizaje", frameon=False)
        plt.show()

        # ROC curve and confusion matrix: both use the classifier, test set and
        # predictions left over from the last split of the loop above.
        knn_probs = knn.predict_proba(Xtest)

        plot_roc(Xtest, Ytest, knn_probs, "KNN")

        skplt.metrics.plot_confusion_matrix(Ytest, pred, normalize=True)

    return (
        str(np.mean(accuracy_list)), str(np.std(accuracy_list)),
        str(np.mean(recall_list)), str(np.std(recall_list)),
        str(np.mean(precision_list)), str(np.std(precision_list)),
        str(np.mean(f_list)), str(np.std(f_list)),
        str(np.mean(errores)), str(np.std(errores)),
        str(time.time()-tiempo_i),
    )

# LSA 10k_1Col_NoCarEsp

In [6]:
# Load the feature matrix X and the labels Y from the CSV files
X, Y = loadCSV(
    pathSamples="../../Dataset/10k_1Col_NoCarEsp_LSA.csv",
    pathMatrix="../../Tables/docsTopicsLSA1200.csv",
)

In [7]:
# Fill a results table with one row per value of the hyperparameter k
# (number of neighbours), evaluating model_KNN for k = 1..20.

# Process-wide pandas option; the assignments below no longer need it, but it
# is left in place because other cells may rely on the warning being silenced.
pd.options.mode.chained_assignment = None

# Not used in this cell; kept because it is a module-level name.
randn = np.random.randn

# The index holds k as strings ('1'..'20'); each metric column starts out as
# empty strings and is filled with the strings returned by model_KNN.
df_types = pd.DataFrame({
    'Numero de vecinos' : pd.Series([str(n) for n in range(1, 21)])})
for _column in [
    "Eficiencia", "Int_Eficiencia",
    "Sensibilidad", "Int_Sensibilidad",
    "Precision", "Int_Precision",
    "F-Score", "Int_F-Score",
    "Error_Prueba", "Int_error",
    "Tiempo de ejecución",
]:
    df_types[_column] = ""
df_types = df_types.set_index(['Numero de vecinos'])

for k in df_types.index:
    print("Entrenando k: ", k)
    Acc, IntAcc, Sen, IntSen, Pre, IntPre, f, IntF, error, stdError, tiempo = model_KNN(int(k))
    print("Terminado k: ", k)
    # Write by row label with .loc: the former df[col][i] chained, positional
    # assignment relies on deprecated behaviour and does not update the frame
    # when pandas copy-on-write is active.
    df_types.loc[k, "Eficiencia"] = Acc
    df_types.loc[k, "Int_Eficiencia"] = IntAcc
    df_types.loc[k, "Sensibilidad"] = Sen
    df_types.loc[k, "Int_Sensibilidad"] = IntSen
    df_types.loc[k, "Precision"] = Pre
    df_types.loc[k, "Int_Precision"] = IntPre
    df_types.loc[k, "F-Score"] = f
    df_types.loc[k, "Int_F-Score"] = IntF
    df_types.loc[k, "Error_Prueba"] = error
    df_types.loc[k, "Int_error"] = stdError
    df_types.loc[k, "Tiempo de ejecución"] = tiempo

qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget.get_changed_df()

Entrenando k:  1
Terminado k:  1
Entrenando k:  2
Terminado k:  2
Entrenando k:  3
Terminado k:  3
Entrenando k:  4
Terminado k:  4
Entrenando k:  5
Terminado k:  5
Entrenando k:  6
Terminado k:  6
Entrenando k:  7
Terminado k:  7
Entrenando k:  8
Terminado k:  8
Entrenando k:  9
Terminado k:  9
Entrenando k:  10
Terminado k:  10
Entrenando k:  11
Terminado k:  11
Entrenando k:  12
Terminado k:  12
Entrenando k:  13
Terminado k:  13
Entrenando k:  14
Terminado k:  14
Entrenando k:  15
Terminado k:  15
Entrenando k:  16
Terminado k:  16
Entrenando k:  17
Terminado k:  17
Entrenando k:  18
Terminado k:  18
Entrenando k:  19
Terminado k:  19
Entrenando k:  20
Terminado k:  20


Unnamed: 0_level_0,Eficiencia,Int_Eficiencia,Sensibilidad,Int_Sensibilidad,Precision,Int_Precision,F-Score,Int_F-Score,Error_Prueba,Int_error,Tiempo de ejecución
Numero de vecinos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.6980999999999999,0.0108990825301949,0.6979712911667239,0.0303568590389075,0.6986092655710706,0.0162540960623963,0.6977638576153925,0.014842059620319,0.3019,0.0108990825301949,478.5493469238281
2,0.65655,0.0081622607162476,0.6575825315515988,0.2184265839604201,0.6943733567350261,0.0879058648641242,0.6398058052715577,0.0782628256968929,0.34345,0.0081622607162476,2093.186415910721
3,0.69695,0.0077085990945177,0.696967130616607,0.0483181615877056,0.6987132779782037,0.0215985353139892,0.6962193703437569,0.0168673503153453,0.30305,0.0077085990945177,2067.862370967865
4,0.6710999999999999,0.0097154516107075,0.670038055278354,0.1779109618364518,0.6947430387940604,0.0677704295439758,0.659769375716647,0.0633975347223383,0.3289,0.0097154516107075,2240.062573194504
5,0.6952499999999999,0.008140178130729,0.6957512466400226,0.0674346398282272,0.6990467181617659,0.0310696022636389,0.6940169731905288,0.021530362772706,0.30475,0.008140178130729,2120.530597686768
6,0.6717500000000001,0.0078206457533889,0.6727589508135199,0.1705357106241202,0.6953274305656081,0.071046323306262,0.6622899841910384,0.0573465894428376,0.32825,0.0078206457533889,2187.9998264312744
7,0.69145,0.006412682745934,0.691831374895556,0.0706038113549049,0.695542881359228,0.0313196269584902,0.6900276385393582,0.0220082437098893,0.30855,0.006412682745934,2167.0649077892303
8,0.66815,0.0137114003661186,0.6660317179211026,0.176608703095773,0.6900612142329388,0.0630328194744027,0.6563827011972994,0.0659000075912133,0.33185,0.0137114003661187,2187.5677711963654
9,0.6903,0.0118241278748159,0.6902412911577016,0.0787356327437936,0.6949260412294495,0.0316982352556649,0.6883289194855899,0.027844727158008,0.3097,0.0118241278748159,2233.980273246765
10,0.6789999999999999,0.0064691575958543,0.6764855007104593,0.1478950629524115,0.6940903852027327,0.0512226521001114,0.6705543917423873,0.0534674516051346,0.321,0.0064691575958543,2191.166218519211


In [None]:
# Run the evaluation once more for the chosen model (k = 7), this time with
# the plots enabled, and print every returned statistic next to its label.
Acc, IntAcc, Sen, IntSen, Pre, IntPre, f, IntF, error, stdError, tiempo = model_KNN(7, impresion = True)
_etiquetas = (
    'Eficiencia', ' Int_Eficiencia',
    ' Sensibilidad', ' Int_Sensibilidad',
    ' Precision', ' Int_Precision',
    ' F-Score', ' Int_F-Score',
    ' Error_Prueba', ' Int_Error',
    ' Tiempo ejecución',
)
_valores = (Acc, IntAcc, Sen, IntSen, Pre, IntPre, f, IntF, error, stdError, tiempo)
# Interleave label, value, label, value, ... exactly as the positional print did.
print(*(parte for par in zip(_etiquetas, _valores) for parte in par))