Use this notebook to run experiments and plot results easily. You do not need to submit it, but you may use some of the generated plots in your report.

In [None]:
#Part I – Implementing a kNN from scratch: 

In [None]:
from scipy.spatial import distance_matrix
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
class KNN:
    '''
    k nearest neighboors algorithm class
    __init__() initialize the model
    train() trains the model
    predict() predict the class for a new point
    '''

    def __init__(self, k):
        if k <= 0:
            raise Exception("Sorry, no numbers below or equal to zero. Start again!")
        
        # empty initialization of X and y
        self.X = []
        self.y = []
        # k is the parameter of the algorithm representing the number of neighborhoods
        self.k = k
        
    def train(self, X, y):
        self.X = X
        self.y = y
       
    def predict(self, X_new, p):
        if self.X is None or self.y is None:
            raise Exception("Sorry, the model is not trained. Call train() with training data")
            
        distance = self.minkowski_dist(X_new, p)
        y_hat = []
        
        for row in distance:
            k_indice = np.argsort(row)[:self.k]
            k_nearest = [self.y[i] for i in k_indice]
            prediction = np.bincount(k_nearest).argmax()
            y_hat.append(prediction)

        return np.array(y_hat)
    
    def minkowski_dist(self, X_new, p):
        '''
        INPUT : 
        - X_new : is a MxD numpy array containing the coordinates of points for which the distance to the training set X will be estimated
        - p : parameter of the Minkowski distance
        
        OUTPUT :
        - dst : is an MxN numpy array containing the distance of each point in X_new to X
        '''
        # Verify if the training data is provided or not
        if self.X is None:
            raise Exception("Sorry, the model is not trained. Try providing training data")
        
        dst = distance_matrix(X_new, self.X, p=p)
        print(dst.shape)
        return dst


In [None]:
import pandas as pd
import numpy as np

In [None]:
training_data = pd.read_csv('training.csv')
validation_data = pd.read_csv('validation.csv')


In [None]:
X_train = training_data.drop('y', axis=1)
y_train = training_data['y']


In [None]:
X_train.shape
y_train.shape


(2800,)

X_val = validation_data.drop('y', axis=1)
y_val = validation_data['y']

In [None]:
X_val = validation_data.drop('y', axis=1)
y_val = validation_data['y']

In [None]:
X_val.shape
y_val.shape

(480,)

In [None]:
print("Training Data:")
print(X_train.head())
print(y_train.head())

Training Data:
         X1        X2
0  2.515463  1.991283
1  0.447593  0.559807
2  0.749229 -0.433780
3 -0.289287  1.536832
4  3.675082  2.146912
0    1.0
1    1.0
2    0.0
3    1.0
4    1.0
Name: y, dtype: float64


In [None]:
print("Validation data : ")
print(X_val.head())
print(y_val.head())

Validation data : 
         X1        X2
0  0.150792  0.131491
1  1.637961  0.865304
2  2.489981 -0.399073
3  3.162354  0.131336
4  0.267035 -0.319733
0    1.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: y, dtype: float64


In [None]:
k = 5
knn_instance = KNN(k)
knn_instance.train(X_train,y_train)


In [1]:
accuracies = []
k_values = list(range(1, 30))
p_values = list(range(1, 10))
best_acc = 0.0
for k in k_values:
    for p in p_values:
        knn_model = KNN(k)
        knn_model.train(X_train, y_train)
        y_pred = knn_model.predict(X_val, p)
        acc = accuracy_score(y_val, y_pred)
        accuracies.append(acc)
        print(f"accuracy is {acc} with k={k}")
        if acc > best_acc:
            best_acc = acc
            best_k = k
            best_p = p
print(f"Validation accuracy: {best_acc}, k={best_k}, p={best_p}")

NameError: name 'KNN' is not defined