# K-Nearest Neighbor Algorithm

## 1/ Import Library

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import datasets
from scipy.stats import mode

In [2]:
data = datasets.load_iris()

## 2/ Data Preprocessing

In [3]:
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [4]:
X, y = data.data, data.target

In [5]:
X.shape

(150, 4)

In [6]:
y.shape

(150,)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set (test_size=0.2); the fixed
# random_state seeds the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

In [8]:
X_train.shape

(120, 4)

In [9]:
X_test.shape

(30, 4)

## 3/ Build Knn Model


## Steps:
+ Calculate the distance from the query point to every training point
+ Sort the training points by distance (smallest first)
+ Count how often each class occurs among the k nearest neighbours
+ Choose the most frequent class as the prediction

In [17]:
from scipy.stats import mode
from sklearn.metrics import accuracy_score

In [18]:
def eucledian(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    p1, p2 : array-like
        Coordinates of the two points. Plain lists/tuples are accepted as
        well as NumPy arrays; the shapes must be broadcast-compatible.

    Returns
    -------
    numpy floating scalar
        Square root of the sum of squared coordinate differences.
    """
    # Coerce to arrays so that list/tuple inputs support element-wise
    # subtraction instead of raising TypeError.
    diff = np.asarray(p1) - np.asarray(p2)
    return np.sqrt(np.sum(diff ** 2))

In [19]:
class Knn:
    """K-nearest-neighbour classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.

    Attributes
    ----------
    k : int
        The neighbour count given at construction.
    y_pred : list
        Predictions produced by the most recent ``predict`` call.
    distance_matrix : sequence of int
        Indices (into the training set) of the k nearest neighbours of the
        last sample processed by ``predict``; empty before any prediction.
    """

    def __init__(self, k):
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
        self.distance_matrix = []
        self.y_pred = []

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        KNN does no training up front; the data is simply kept for lookup
        at prediction time.
        """
        # asarray lets callers pass lists as well as ndarrays.
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like, one sample per row
            Samples to classify; each row must have the same number of
            features as the training data.

        Returns
        -------
        list
            One predicted label per row of ``X``. The same list is stored
            in ``self.y_pred`` for use by ``accuracy``.
        """
        # Build a fresh list on every call so repeated predictions do not
        # pile up on top of earlier results.
        predictions = []
        for item in np.asarray(X):
            # Distance from this sample to every training row in one
            # vectorised step.
            distances = np.sqrt(np.sum((self.X_train - item) ** 2, axis=1))

            # Indices of the k closest training samples.
            nearest = np.argsort(distances)[:self.k]
            self.distance_matrix = nearest

            # Majority vote. np.unique returns the labels sorted and argmax
            # picks the first maximum, so ties go to the smallest label.
            classes, votes = np.unique(self.y_train[nearest], return_counts=True)
            predictions.append(classes[np.argmax(votes)])

        self.y_pred = predictions
        return self.y_pred

    def accuracy(self, y):
        """Return the fraction of stored predictions that match ``y``.

        Parameters
        ----------
        y : array-like
            True labels, in the same order as the samples passed to the
            most recent ``predict`` call.

        Raises
        ------
        ValueError
            If there are no predictions to score or ``y`` does not have
            the same shape as the stored predictions.
        """
        y_true = np.asarray(y)
        y_hat = np.asarray(self.y_pred)
        if y_hat.size == 0:
            raise ValueError("no predictions to score; call predict() first")
        if y_true.shape != y_hat.shape:
            raise ValueError(
                f"y has shape {y_true.shape} but there are predictions of "
                f"shape {y_hat.shape}"
            )
        # Score against the labels that were passed in, not a global.
        return float(np.mean(y_true == y_hat))
 

In [20]:
# k = 9: each prediction is a majority vote among the 9 nearest training samples.
model = Knn(9)

In [21]:
model.fit(X_train, y_train)

In [22]:
model.predict(X_test)

[0,
 0,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 2,
 2,
 0,
 1,
 0,
 2,
 2,
 1,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 1,
 1,
 0]

In [23]:
model.accuracy(y_test)

0.9666666666666667