### Implement Nearest Neighbour without sklearn
* Recap of nearest Neighbour
* Implement Nearest Neighbour Classifier

### Algo:
* During Training, data are stored in an optimized data structure suited for fast search.
* During prediction, for each data point we search for its nearest neighbours using a distance metric such as Euclidean, Manhattan, or Hamming distance
* Finally, decide based on majority

In [1]:
from sklearn.datasets import load_iris

In [2]:
# Load the iris dataset through sklearn.datasets.load_iris.
iris = load_iris()

In [3]:
# Display the names of the feature columns.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [4]:
# Display the names of the target classes.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
import pandas as pd

In [6]:
# Wrap the feature matrix in a DataFrame, one column per feature name.
df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [7]:
# Attach the target labels as an extra 'type' column alongside the features.
df['type'] = iris.target

In [8]:
# Preview five randomly sampled rows of the combined features + label table.
df.sample(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),type
46,5.1,3.8,1.6,0.2,0
51,6.4,3.2,4.5,1.5,1
92,5.8,2.6,4.0,1.2,1
83,6.0,2.7,5.1,1.6,1
33,5.5,4.2,1.4,0.2,0


In [9]:
def calculate_distance_between_vectors(v1,v2):
    """Placeholder for a distance computation between two vectors.

    Not implemented yet: the arguments are ignored and ``None`` is returned.
    """
    return None

def store_training_data(train_data):
    """Placeholder for the step that stores the training data.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None

def find_k_neighbours(k,data):
    """Placeholder for the k-nearest-neighbour search.

    Not implemented yet: the arguments are ignored and ``None`` is returned.
    """
    return None

def identify_class_from_neighbours(neighbours):
    """Placeholder for picking a class from a set of neighbours.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None

In [10]:
import numpy as np

## KNN Classifier

In [11]:
class MyKNN:
    """Brute-force k-nearest-neighbours classifier.

    Distances are Euclidean and the predicted class is the most frequent
    label among the ``k`` closest training rows.
    """

    def __init__(self,k=5):
        """Remember how many neighbours (``k``) take part in each vote."""
        self.k = k

    def my_fit(self, feature_data, target_data):
        """Store the training features and labels as NumPy arrays.

        No model is built; all the work happens at prediction time.
        """
        self.feature_data = np.array(feature_data)
        self.target_data = np.array(target_data)

    def calculate_distance_vector_matrix(self, one_data):
        """Return the Euclidean distance from ``one_data`` to every training row.

        ``one_data`` is broadcast against the stored feature array, so the
        result holds one distance per training row.
        """
        differences = self.feature_data - one_data
        return np.sqrt(np.sum(np.square(differences), axis=1))

    def find_k_neighbours(self,one_data_feature):
        """Return the indices of the ``k`` training rows closest to the query."""
        distances = self.calculate_distance_vector_matrix(one_data_feature)
        return distances.argsort()[:self.k]

    def find_k_neighbours_class(self, one_data_feature):
        """Return the labels of the ``k`` nearest training rows, nearest first."""
        neighbour_indices = self.find_k_neighbours(one_data_feature)
        return self.target_data[neighbour_indices]

    def my_predict(self, one_data_feature):
        """Return the majority label among the ``k`` nearest neighbours.

        ``np.unique`` is used for the vote instead of ``np.bincount`` so that
        labels need not be non-negative integers (strings and floats work
        too). ``np.unique`` returns labels in sorted order and ``argmax``
        picks the first maximum, so ties resolve to the smallest label, the
        same as the bincount-based vote did for integer labels.
        """
        classes = self.find_k_neighbours_class(one_data_feature)
        labels, counts = np.unique(classes, return_counts=True)
        return labels[counts.argmax()]

In [12]:
# Classifier that will vote among the 5 nearest training rows.
model = MyKNN(k=5)

In [13]:
# Split the table into model inputs (every column except the label) and the label itself.
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
feature_data = df.drop(columns=['type'])
# Bracket access is used because attribute access can be shadowed by DataFrame attributes.
target_data = df['type']

In [14]:
# "Training" only stores the features and labels on the model as NumPy arrays.
model.my_fit(feature_data, target_data)

* Can we calculate the Euclidean distance between one_data & comp_data in one shot?
* Below is the Euclidean distance between one_data & all the comp_data

In [16]:
# A single query point with four values, one per feature column.
one_data = np.array([1,2,3,4])

In [17]:
# Labels of the k (here 5) training rows closest to one_data.
model.find_k_neighbours_class(one_data)

array([2, 1, 1, 1, 1])

In [18]:
# Majority label among the nearest neighbours; a plain list is accepted as the query.
model.my_predict([4,4,4,4])

2

 # KNN Regressor

In [21]:
class MyKNN_regressor:
    """Brute-force k-nearest-neighbours regressor.

    Distances are Euclidean and the prediction is the mean target value of
    the ``k`` closest training rows.
    """

    def __init__(self,k=5):
        """Remember how many neighbours (``k``) are averaged per prediction."""
        self.k = k

    def my_fit(self, feature_data, target_data):
        """Store the training features and target values as NumPy arrays.

        No model is built; all the work happens at prediction time.
        """
        self.feature_data = np.array(feature_data)
        self.target_data = np.array(target_data)

    def calculate_distance_vector_matrix(self, one_data):
        """Return the Euclidean distance from ``one_data`` to every training row.

        ``one_data`` is broadcast against the stored feature array, so the
        result holds one distance per training row.
        """
        differences = self.feature_data - one_data
        return np.sqrt(np.sum(np.square(differences), axis=1))

    def find_k_neighbours(self,one_data_feature):
        """Return the indices of the ``k`` training rows closest to the query."""
        distances = self.calculate_distance_vector_matrix(one_data_feature)
        return distances.argsort()[:self.k]

    def find_k_neighbours_values(self, one_data_feature):
        """Return the target values of the ``k`` nearest training rows."""
        neighbour_indices = self.find_k_neighbours(one_data_feature)
        return self.target_data[neighbour_indices]

    def my_predict(self, one_data_feature):
        """Return the mean target value of the ``k`` nearest neighbours."""
        values = self.find_k_neighbours_values(one_data_feature)
        # Average the local `values`; the earlier code referenced an undefined
        # name `value`, which raised NameError on every call.
        return np.mean(values)