In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
# Load the dataset from 'iris.csv' (path is relative to the working directory).
# NOTE(review): the functions below document that the last column holds the
# class label - confirm the CSV is laid out that way.
df = pd.read_csv('iris.csv')

In [2]:
def distance(vec1, vec2):
    """ input  - vec1, vec2 are equal-length sequences of numbers (list, tuple,
                 numpy array or pandas Series)
        return - the euclidean distance between the two vectors"""
    delta = np.array(vec1) - np.array(vec2)  # component-wise difference
    squared_total = np.sum(delta**2)
    return np.sqrt(squared_total)

def mode_knn(lab, dist):
    """ input  - lab holds the labels of the selected neighbours and dist holds
                 the distance of each of those neighbours (same order / index)
        return - the most frequent label; when several labels are equally
                 frequent, the one whose neighbours have the smallest total
                 distance is returned"""
    labels = pd.Series(lab)
    gaps = pd.Series(dist)

    candidates = labels.mode()
    if len(candidates) != 1:
        # tie on frequency: compare the summed distance of each tied label
        totals = {cand: gaps[labels == cand].sum() for cand in candidates}
        return min(totals, key=totals.get)

    return candidates.iloc[0]

# K-Nearest Neighbours

In [3]:
def knn(data, obj, k):
    """ Predict the label of obj from its k nearest neighbours in data.

        input  - data is a pandas dataframe whose last column holds the labels
                 and whose other columns are numeric attributes; it is left
                 unmodified (no column is added and rows are not reordered).
                 obj is the sample to classify: a Series / sequence (or a
                 single-row dataframe) holding the attributes in the same
                 order as data; entries after the attributes (for example its
                 own label) are ignored.
                 k (int / iterable of int) is the algorithmic parameter for
                 the k nearest neighbour algorithm.
        return - the predicted label when k is an int, otherwise a list with
                 one predicted label per value of k, in the same order"""

    attributes_data = data.shape[1] - 1  # last column being the label is omitted

    if isinstance(obj, pd.DataFrame):
        obj = obj.iloc[0]  # a single-row dataframe is treated as that row
    query = pd.Series(obj).iloc[:attributes_data].to_numpy(dtype=float)
    features = data.iloc[:, :attributes_data].to_numpy(dtype=float)

    # euclidean distance of obj to every entry in data, computed in one shot
    dist = np.sqrt(((features - query) ** 2).sum(axis=1))

    # Rank the neighbours nearest-first on local objects instead of sorting the
    # caller's dataframe in place; the stable sort keeps the original row order
    # among equidistant entries, so a prediction never depends on earlier calls.
    order = np.argsort(dist, kind="stable")
    labels = data.iloc[order, -1].reset_index(drop=True)
    dist = pd.Series(dist[order])

    single_k = isinstance(k, (int, np.integer))
    k_values = [k] if single_k else k
    ans = [mode_knn(labels.iloc[:k_i], dist.iloc[:k_i]) for k_i in k_values]

    return ans[0] if single_k else ans

# K-Fold Cross-Validation

In [4]:
def kfold_CV(data, k, knn_k, random_state=None):
    """ Estimate KNN accuracy with k-fold cross-validation.

        input  - data is a pandas dataframe with the last column holding the
                 labels; it is shuffled into a new frame, the caller's frame
                 is not changed.
                 k (int >= 2) is the number of folds.
                 knn_k (int / iterable of int) is the algorithmic parameter
                 handed to the prediction algorithm (knn).
                 random_state (optional) is forwarded to DataFrame.sample to
                 make the shuffle reproducible; None keeps the unseeded shuffle.
        output - the fraction of entries predicted correctly over all k
                 test / train splits: a float when knn_k is an int, otherwise
                 a numpy array with one accuracy per value of knn_k
        raises - ValueError when k is smaller than 2 (no train / test split)"""

    if k < 2:
        raise ValueError("k must be at least 2 to form a train/test split")

    # shuffling the entries as test and train are to be picked at random
    data = data.sample(frac=1, random_state=random_state)

    single_k = isinstance(knn_k, (int, np.integer))
    knn_k_values = [knn_k] if single_k else list(knn_k)

    accu = np.zeros(len(knn_k_values))
    row = data.shape[0]

    for i in range(k):  # loop for selecting different test and train splits
        start, stop = i * row // k, (i + 1) * row // k
        test = data.iloc[start:stop]
        # pd.concat replaces DataFrame.append (removed in pandas 2.0) and
        # returns a fresh frame, so the shuffled data is never touched by knn
        train = pd.concat([data.iloc[:start], data.iloc[stop:]])

        for j in range(test.shape[0]):  # loop for each element of test set
            sample = test.iloc[j]
            test_results = knn(train, sample, knn_k_values)

            truth = sample.iloc[-1]
            for k_i, predicted in enumerate(test_results):  # each value of k of KNN
                if truth == predicted:
                    accu[k_i] += 1

    # every entry is tested exactly once, so divide by the number of entries
    accu = accu / row
    return accu[0] if single_k else accu

In [5]:
# odd neighbourhood sizes 1, 3, ..., 19 to compare
k_vals = range(1,21,2)
# kfold_CV shuffles with an unseeded DataFrame.sample, which draws from NumPy's
# global random state; seeding it here makes the reported accuracies repeatable
np.random.seed(0)
accuracy = kfold_CV(df,5,k_vals)  # 5-fold cross-validation

In [6]:
# Build the results table in one go: accuracy as a percentage (2 decimals) per k.
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every row.
accuracy_list = pd.DataFrame(
    [{"k": k_i, "Accuracy": round(acc * 100, 2)} for k_i, acc in zip(k_vals, accuracy)],
    columns=['k', 'Accuracy'],
)
accuracy_list

Unnamed: 0,k,Accuracy
0,1.0,96.0
1,3.0,96.0
2,5.0,95.33
3,7.0,95.33
4,9.0,96.0
5,11.0,95.33
6,13.0,96.67
7,15.0,98.0
8,17.0,98.0
9,19.0,96.67
