### Writing our very own K-nearest neighbors classifier from scratch

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split 
# Load the iris dataset; its .data and .target attributes are read in the next cell.
iris = datasets.load_iris()

In [2]:
# Features (X) and target labels (Y) taken from the loaded iris data
X, Y = iris.data, iris.target

In [3]:
# Hold out half of the samples for testing. The fixed random_state makes the
# shuffle, and therefore every result derived from it, reproducible on re-run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

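K-NN algorithm: a new point is classified by looking at the training points nearest to it, measured here with the Euclidean distance. The notebook below runs it with K = 1, i.e. it predicts the label of the single closest training point.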

In [4]:
from scipy.spatial import distance
def euc(a, b):
    """Return the Euclidean (straight-line) distance between points a and b."""
    separation = distance.euclidean(a, b)
    return separation

In [10]:
import random

class ScrappyKNN():
    """
    A bare-bones k-nearest-neighbours classifier.

    Fitting only stores the training data. To classify a row, the Euclidean
    distance (``euc``) from that row to every training row is measured, the
    ``k`` closest training rows are selected, and the label that occurs most
    often among them is returned. With the default ``k=1`` this is simply the
    label of the single closest training row.
    """

    def __init__(self, k=1):
        """
        Create an unfitted classifier.

        k -- how many nearest training rows vote on each prediction; must be
             a positive whole number. Defaults to 1.

        Raises ValueError if k is not a positive whole number.
        """
        if int(k) != k or k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))
        self.k = int(k)

    def fit(self, X_train, y_train):
        """
        Store the training rows and their labels on the instance.

        X_train -- indexable sequence of feature rows.
        y_train -- indexable sequence of labels; y_train[i] belongs to X_train[i].

        Raises ValueError if the two sequences differ in length.
        """
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train has {} rows but y_train has {} labels".format(
                    len(X_train), len(y_train)
                )
            )

        # storing data in the class
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """
        Predict a label for every row of X_test.

        Each row is handed to the closest method, and the labels it returns
        are collected, in order, into a list.
        """
        return [self.closest(row) for row in X_test]

    def closest(self, row):
        """
        Predict the label for a single row of feature values.

        The Euclidean distance from the row to every training row is measured
        and the k nearest training rows are selected. The most common label
        among them is returned; if several labels are equally common, the one
        backed by the nearest training row wins. Equally distant training rows
        are ranked in the order they appear in the training data.

        Raises AttributeError if fit has not been called yet, and ValueError
        if the training set is empty or holds fewer than k rows.
        """
        if not hasattr(self, "X_train"):
            raise AttributeError(
                "ScrappyKNN has no training data yet; call fit() before predicting"
            )

        n_train = len(self.X_train)
        if n_train == 0:
            raise ValueError("cannot predict from an empty training set")
        if self.k > n_train:
            raise ValueError(
                "k={} exceeds the {} available training rows".format(self.k, n_train)
            )

        # distance from the row to every training row, in training order
        distances = [euc(row, self.X_train[i]) for i in range(n_train)]

        if self.k == 1:
            # Single linear scan; min() keeps the first of several equally
            # close rows, and labels do not need to be hashable on this path.
            best_index = min(range(n_train), key=distances.__getitem__)
            return self.y_train[best_index]

        # sorted() is stable, so equally distant rows keep their training order
        nearest = sorted(range(n_train), key=distances.__getitem__)[:self.k]

        # Count the labels of the k nearest rows. The dict remembers insertion
        # order, so labels are recorded from nearest supporter to farthest.
        votes = {}
        for i in nearest:
            label = self.y_train[i]
            votes[label] = votes.get(label, 0) + 1

        # max() returns the first label with the top count, i.e. the tied
        # label whose supporter is closest to the row.
        return max(votes, key=votes.get)

In [6]:
# Create an instance of the ScrappyKNN class defined above.
my_classifier = ScrappyKNN()

In [9]:
# Store the training half in the classifier, then predict a label for every test row.
my_classifier.fit(X_train, Y_train)
predictions = my_classifier.predict(X_test)
# Bare last expression: the notebook displays the full list of predicted labels.
predictions

[2,
 2,
 0,
 0,
 1,
 0,
 1,
 2,
 0,
 0,
 1,
 1,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 0,
 2,
 0,
 1,
 2,
 1,
 0,
 2,
 1,
 2,
 2,
 2,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 1,
 0,
 0,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 2,
 0]

In [11]:
from sklearn.metrics import accuracy_score
# Compare the predictions against the true test labels and print the accuracy score.
print(accuracy_score(Y_test, predictions))

0.96


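***Pros:***
- Relatively simple to understand

***Cons:***
- Computationally intensive: every prediction measures the distance to every training point
- Hard to represent relationships between features, because every feature contributes to the distance in the same way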