In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn import datasets
from sklearn.model_selection import train_test_split
from collections import Counter
%matplotlib inline

In [65]:
class simple_knn():
    
    def __init__(self):
        pass
    
    def train(self,X,y):
        self.X_train = X
        self.y_train = y
    
    def distance(self,instance1,instance2):
        instance1 = np.array(instance1)
        instance2=np.array(instance2)
        return np.linalg.norm(instance1-instance2)
    
    def get_neighbors(self,k,test_instance):
        distances = []
        for index in range(self.X_train.shape[0]): # loop over every training data point
            #dist = self.distance(test_instance,self.X_train[index])
            dist = self.distance(test_instance,self.X_train.iloc[index])
            distances.append((self.X_train.iloc[index],dist,self.y_train.iloc[index]))
        distances.sort(key = lambda x: x[1])
        neighbors = distances[:k]
        return neighbors
    
    def vote(self,neighbors):
        class_counter = Counter()
        for neighbor in neighbors:
            class_counter[neighbor[2]] += 1
        return class_counter.most_common(1)[0][0]            

In [4]:
iris = datasets.load_iris()

In [5]:
knn_train_x=[[0,0,1],[0,1,0],[1,0,0]]
knn_train_y = ['Apple','Banana','Orange']
knn_test_x=[[.99,0,0],[.5,.25,0],[0,0,1]]

In [6]:
knn_example = simple_knn()

In [7]:
knn_example.train(knn_train_x,knn_train_y)

In [8]:
for points in knn_test_x:
    neighbors = knn_example.get_neighbors(1,points)
    label = knn_example.vote(neighbors)
    print('The neighbors are: %s. The predicted label is: %s. The point was %s.' %(neighbors[0][2],label,points))

The neighbors are: Orange. The predicted label is: Orange. The point was [0.99, 0, 0].
The neighbors are: Orange. The predicted label is: Orange. The point was [0.5, 0.25, 0].
The neighbors are: Apple. The predicted label is: Apple. The point was [0, 0, 1].


In [9]:
for points in knn_test_x:
    neighbors = knn_example.get_neighbors(2,points)
    label = knn_example.vote(neighbors)
    print('The neighbors are: %s. The predicted label is: %s. The point was %s.' %(neighbors[0][2],label,points))

The neighbors are: Orange. The predicted label is: Orange. The point was [0.99, 0, 0].
The neighbors are: Orange. The predicted label is: Orange. The point was [0.5, 0.25, 0].
The neighbors are: Apple. The predicted label is: Apple. The point was [0, 0, 1].


Let's take a look at the data set

In [10]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [15]:
#separate x and y labels
training = iris['data']
training_labels = iris['target']

#create x and y dataframes
training_df = pd.DataFrame(training,columns=['sepal length','sepal width','petal length','petal width'])
labels_df = pd.DataFrame(training_labels,columns=['Iris Species'])

#join them together horizontally
training_df = pd.concat([training_df,labels_df],axis=1)

In [34]:
#separate x and y dataframes
labels_df = training_df['Iris Species']
training_df.drop('Iris Species',axis=1,inplace=True)

#create training and testing sets
x_train,x_test,y_train,y_test = train_test_split(training_df,labels_df,test_size=.33,random_state=42)

In [62]:
num_test = x_test.shape[0]

In [66]:
iris_knn = simple_knn()
iris_knn.train(x_train,y_train)

In [79]:
correct,wrong = 0,0
for index in range(num_test):
    flower = x_test.iloc[index]
    neighbors = iris_knn.get_neighbors(3,flower)
    labels = iris_knn.vote(neighbors)
    if labels == y_test.iloc[index]:
        correct += 1
    else:
        print('The algorithm incorrectly classified row %s. The perdicted label was %s and the actual label is %s.' % (index,labels,y_test.iloc[index]))
        wrong += 1
print('The algorithm guessed %s correct and %s wrong.' % (correct,wrong))

The algorithm incorrectly classified row 46. The perdicted label was 1 and the actual label is 2.
The algorithm guessed 49 correct and 1 wrong.
