In [None]:
# Class of k-Nearest Neighbour Classifier
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error


class kNN:
    """k-nearest-neighbour classifier based on the Minkowski distance.

    The classifier memorises the training data in ``fit`` and, for every test
    instance, predicts the most frequent class among the ``k`` training
    instances with the smallest Minkowski distance.
    """

    def __init__(self, k = 3, exp = 2):
        """Create a classifier.

        Parameters
        ----------
        k : int
            Number of neighbours used for the local class estimation.
        exp : int or float
            Exponent of the Minkowski distance (2 gives the Euclidean distance).
        """
        self.k = k
        self.exp = exp

    def fit(self, X_train, Y_train):
        """Store the training data; no other work is done here.

        Parameters
        ----------
        X_train : table of input attributes; the n-th row is the n-th instance.
        Y_train : output vector; the n-th element is the class of the n-th
            row of ``X_train``.
        """
        self.X_train = X_train
        self.Y_train = Y_train

    def get_discrete_classification(self, X_test):
        """Predict a class for every row of ``X_test``.

        Parameters
        ----------
        X_test : table of input attributes (rows are instances) with the same
            columns, in the same order, as the training data.

        Returns
        -------
        list
            The n-th element is the prediction for the n-th row of ``X_test``.

        Raises
        ------
        ValueError
            If the stored training inputs and outputs differ in length.
        """
        # Work on plain positional arrays so that neither the row index nor
        # the column labels of the pandas objects influence the result.
        train_matrix = np.asarray(self.X_train, dtype=float)
        test_matrix = np.asarray(X_test, dtype=float)
        labels = pd.Series(np.asarray(self.Y_train).ravel())

        if len(labels) != len(train_matrix):
            raise ValueError(
                "X_train and Y_train must contain the same number of instances"
            )

        Y_pred_test = []
        for test_instance in test_matrix:
            # Distances from this test instance to all training instances at once.
            distances = self.minkowski_distance(test_instance, train_matrix)

            # Positions of the k closest training instances; a stable sort keeps
            # equally distant neighbours in training order, so runs are repeatable.
            nearest = np.argsort(distances, kind='stable')[:self.k]

            # value_counts() orders classes by descending count, so the first
            # index entry is the majority class among the neighbours.
            votes = labels.iloc[nearest].value_counts()
            Y_pred_test.append(votes.index[0])

        return Y_pred_test

    def minkowski_distance(self, x1, x2):
        """Minkowski distance between ``x1`` and ``x2`` using exponent ``self.exp``.

        Elements are matched by position, never by label. Besides two vectors
        of equal length, any pair of shapes that NumPy can broadcast is
        accepted (e.g. one vector against a matrix of row vectors); the
        distance is then taken along the last axis.

        The sum is evaluated relative to the largest coordinate difference,
        ``m * (sum((|d_i| / m) ** exp)) ** (1 / exp)``, which is algebraically
        the same as the textbook formula but does not overflow for large
        exponents such as 100 or 10000.

        Returns
        -------
        A scalar for two vectors, otherwise an array of distances.
        """
        diffs = np.abs(np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float))

        # initial=0.0 makes the maximum well defined for zero-length vectors.
        largest = diffs.max(axis=-1, keepdims=True, initial=0.0)
        # Avoid 0/0 when both vectors are identical.
        scale = np.where(largest > 0, largest, 1.0)

        scaled_sum = ((diffs / scale) ** self.exp).sum(axis=-1)
        return largest[..., 0] * scaled_sum ** (1.0 / self.exp)

    @staticmethod
    def normalize(df):
        """Min-max scale every column of ``df`` to the range [0, 1].

        Each column is scaled with its own minimum and maximum. Scaling all
        columns with one global minimum/maximum would multiply every distance
        by the same factor and therefore leave the neighbours unchanged.
        Constant columns are mapped to 0 instead of producing NaN.
        """
        column_mins = df.min()  # works for negative values as well
        column_ranges = df.max() - column_mins
        # A zero range would divide by zero; dividing by 1 yields 0 for that column.
        column_ranges = column_ranges.replace(0, 1)
        return (df - column_mins) / column_ranges

    @staticmethod
    def read_data(file_name, target = 'class', test_size = 0.34, random_state = 10):
        """Read a CSV file and split it into training and test parts.

        Parameters
        ----------
        file_name : path of the CSV file.
        target : name of the column holding the class (default ``'class'``).
        test_size, random_state : forwarded to ``train_test_split``; the
            defaults reproduce the original fixed split.

        Returns
        -------
        tuple
            ``(x_train, x_test, y_train, y_test)``.
        """
        data = pd.read_csv(file_name)
        y = data[target]
        x = data.drop([target], axis = 1)
        xtr, xt, ytr, yt = train_test_split(
            x, y, test_size=test_size, random_state=random_state
        )
        return xtr, xt, ytr, yt


In [None]:
##################################################
# Hold-out testing: training/test set creation and accuracy over a range of k
##################################################
def hold_out_k_range(filename):
    """Plot training and hold-out accuracy of kNN for a range of k values.

    The data set in ``filename`` is split with ``kNN.read_data``; for every
    odd k from 1 to 31 a classifier (default Minkowski exponent) is fitted on
    the training part and scored on both the training and the test part.

    The curves are drawn on a new figure so that several experiments run from
    the same cell do not end up on top of each other. Nothing is returned.
    """
    xtr, xt, ytr, yt = kNN.read_data(filename)
    # range for the values of parameter k for kNN: 1, 3, ..., 31
    k_range = list(range(1, 32, 2))

    tr_acc = np.zeros(len(k_range))
    t_acc = np.zeros(len(k_range))

    for i, k in enumerate(k_range):
        clf = kNN(k)
        clf.fit(xtr, ytr)
        yp_tr = clf.get_discrete_classification(xtr)
        yp_t = clf.get_discrete_classification(xt)
        tr_acc[i] = accuracy_score(ytr, yp_tr)
        t_acc[i] = accuracy_score(yt, yp_t)

    #########################################
    # Plot of training and test accuracies
    #########################################
    plt.figure()  # own figure: do not draw into whatever axes are current
    plt.plot(k_range,tr_acc,'ro-',k_range,t_acc,'bv--')
    plt.legend(['Training Accuracy','Test Accuracy'])
    plt.xlabel('k')
    plt.ylabel('Accuracy')

In [None]:
##################################################
# Hold-out testing: training/test set creation and accuracy over a range of Minkowski exponents
##################################################
def hold_out_exp_range(filename):
    """Plot training and hold-out accuracy of kNN for several Minkowski exponents.

    The data set in ``filename`` is split with ``kNN.read_data``; for every
    exponent in ``[2, 100, 10000]`` a 3-nearest-neighbour classifier is fitted
    on the training part and scored on both the training and the test part.

    The curves are drawn on a new figure so that several experiments run from
    the same cell do not end up on top of each other. Nothing is returned.
    """
    xtr, xt, ytr, yt = kNN.read_data(filename)
    # range for the values of parameter exp for kNN
    exp_range = [2,  100, 10000]

    tr_acc = np.zeros(len(exp_range))
    t_acc = np.zeros(len(exp_range))

    for i, exp in enumerate(exp_range):
        clf = kNN(k = 3, exp = exp)
        clf.fit(xtr, ytr)
        yp_tr = clf.get_discrete_classification(xtr)
        yp_t = clf.get_discrete_classification(xt)
        tr_acc[i] = accuracy_score(ytr, yp_tr)
        # Test accuracy belongs in t_acc; writing it to tr_acc would overwrite
        # the training accuracy and leave the test curve at zero.
        t_acc[i] = accuracy_score(yt, yp_t)

    #########################################
    # Plot of training and test accuracies
    #########################################
    plt.figure()  # own figure: do not draw into whatever axes are current
    plt.plot(exp_range,tr_acc,'ro-',exp_range,t_acc,'bv--')
    plt.legend(['Training Accuracy','Test Accuracy'])
    plt.xlabel('exp')
    plt.ylabel('Accuracy')


In [None]:
def mean_abs_err():
    """Return the mean absolute error of a small hard-coded example.

    Four fixed reference values are compared with four fixed predictions
    using ``mean_absolute_error``.
    """
    reference = [3, -0.5, 2, 7]
    estimate = [2.5, 0.0, 2, 8]
    error = mean_absolute_error(reference, estimate)
    return error

# Lab 2
## Task B
**Test the kNN classifier on the diabetes and glass classification data sets for the case when the data
is not normalized and the case when the data is normalized.**

In [None]:
# Run both hold-out experiments (k sweep, then exponent sweep) on the
# diabetes data set as stored on disk.
filename = 'data/diabetes.csv'
for run_experiment in (hold_out_k_range, hold_out_exp_range):
    run_experiment(filename=filename)

**Indicate whether the training and hold-out accuracy rates improve with normalization.**




