In [1]:
import os
import csv
import collections
import pandas as pd
import numpy as np
    
# Both CSV files are looked up in the current working directory of the kernel.
file_dir_train, file_dir_test = (
    os.path.join(os.getcwd(), csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Load the two splits into DataFrames; later cells slice them into features/labels.
df_train = pd.read_csv(file_dir_train)
df_test = pd.read_csv(file_dir_test)

In [2]:
class KNearestNeighbor:
    """Brute-force k-nearest-neighbour classifier with majority voting.

    ``train`` only stores the training data. All distance computation happens
    at prediction time, where every test row is compared against every
    training row.
    """

    def __init__(self, k):
        """Store the number of neighbours that vote on each prediction.

        Args:
            k: Number of nearest training samples consulted per test sample.

        Raises:
            ValueError: If ``k`` is smaller than 1. A zero or negative ``k``
                would otherwise select no neighbours or slice the wrong ones.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def train(self, X, y):
        """Memorise the training set; no computation is performed here.

        Args:
            X: 2-D array of training features, one sample per row.
            y: 1-D NumPy array of labels aligned with the rows of ``X``.
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X_test, distance=0):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: 2-D array of test features, one sample per row.
            distance: Metric name, either ``"Euclidean"`` or ``"Manhattan"``
                (case-sensitive). The default ``0`` is kept for backward
                compatibility but is not a valid metric.

        Returns:
            1-D float array holding the predicted label of each test row.

        Raises:
            ValueError: If ``distance`` is not one of the supported names.
        """
        if distance == "Euclidean":
            distances = self.compute_euclidean_distance(X_test)
        elif distance == "Manhattan":
            distances = self.compute_manhattan_distance(X_test)
        else:
            # Previously this fell through and failed on an unbound local.
            raise ValueError(
                f"Unsupported distance metric {distance!r}; "
                "expected 'Euclidean' or 'Manhattan'"
            )

        return self.predict_labels(distances)

    def _pairwise_distances(self, X_test, order):
        """Return the (num_test, num_train) matrix of vector-norm distances.

        ``order`` is forwarded to ``np.linalg.norm`` as ``ord`` (1 for
        Manhattan, 2 for Euclidean).
        """
        # Cast to float before subtracting so unsigned-integer features
        # (e.g. uint8 pixel values) cannot wrap around.
        test = np.asarray(X_test, dtype=np.float64)
        train = np.asarray(self.X_train, dtype=np.float64)

        num_test = test.shape[0]
        num_train = train.shape[0]
        distances = np.zeros((num_test, num_train))

        # One vectorised pass per test row: broadcasting the row against the
        # whole training matrix avoids a Python-level loop over training rows
        # while keeping the temporary at (num_train, num_features).
        for i in range(num_test):
            distances[i, :] = np.linalg.norm(test[i, :] - train, ord=order, axis=1)

        return distances

    def compute_euclidean_distance(self, X_test):
        """Return the L2 distance between each test row and each training row.

        Returns:
            Float array of shape (num_test, num_train).
        """
        return self._pairwise_distances(X_test, 2)

    def compute_manhattan_distance(self, X_test):
        """Return the L1 distance between each test row and each training row.

        Returns:
            Float array of shape (num_test, num_train).
        """
        return self._pairwise_distances(X_test, 1)

    def predict_labels(self, distances):
        """Turn a distance matrix into labels by majority vote of the k nearest.

        Args:
            distances: Array of shape (num_test, num_train), where entry
                ``[i, j]`` is the distance from test row ``i`` to training
                row ``j``.

        Returns:
            1-D float array of length num_test with the winning label per row.
            Labels are cast to ``int`` before voting and must be non-negative,
            as required by ``np.bincount``.
        """
        num_test = distances.shape[0]
        y_pred = np.zeros(num_test)

        for i in range(num_test):
            # Training indices ordered from closest to farthest.
            y_indices = np.argsort(distances[i, :])
            k_closest_classes = self.y_train[y_indices[: self.k]].astype(int)
            # bincount counts the votes per label; argmax returns the first
            # maximum, so a tie is resolved in favour of the smallest label.
            y_pred[i] = np.argmax(np.bincount(k_closest_classes))

        return y_pred


if __name__ == "__main__":
    # Neighbour counts to evaluate.
    K = [1, 3, 5, 10, 15]

    # Column 0 of each frame is used as the label; the remaining columns are
    # used as the feature vector.
    X_train = np.array(df_train.iloc[:, 1:])
    y_train = np.array(df_train.iloc[:, 0])
    X_test = np.array(df_test.iloc[:, 1:])
    y_test = np.array(df_test.iloc[:, 0])

    # Distance metrics to evaluate; names must match those `predict` accepts.
    metric = ["Euclidean", "Manhattan"]

    # NOTE(review): `accuracy` is initialised but never filled in this cell;
    # kept as-is in case a later cell references the name.
    accuracy = []

    for k in K:
        KNN = KNearestNeighbor(k=k)
        KNN.train(X_train, y_train)
        for m in metric:
            # NOTE(review): each `predict` call recomputes the full distance
            # matrix, although the matrix does not depend on `k`.
            y_pred = KNN.predict(X_test, m)
            # Fraction of test samples whose predicted label equals the true
            # label; np.mean reduces the boolean mask without a Python loop.
            acc = np.mean(y_pred == y_test)
            print(f"k= {k}, metric = {m}, Accuracy: {acc}")

k= 1, metric = Euclidean, Accuracy: 0.9386666666666666
k= 1, metric = Manhattan, Accuracy: 0.9286666666666666
k= 3, metric = Euclidean, Accuracy: 0.9376666666666666
k= 3, metric = Manhattan, Accuracy: 0.9273333333333333
k= 5, metric = Euclidean, Accuracy: 0.9376666666666666
k= 5, metric = Manhattan, Accuracy: 0.925
k= 10, metric = Euclidean, Accuracy: 0.931
k= 10, metric = Manhattan, Accuracy: 0.9166666666666666
k= 15, metric = Euclidean, Accuracy: 0.9223333333333333
k= 15, metric = Manhattan, Accuracy: 0.9073333333333333
