In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
df = pd.read_csv('iris.csv')

In [2]:
def knn(data, obj, k):
    """ input  - data is assumed to be a pandas dataframe with last column having the labels
                 obj is the single row dataframe with attributes for which label is to be predicted
                 k is algorithmic parameter for k nearest neighbour algorithm. 
        return - function returns the predicted label"""
    
    entries_data = data.shape[0]
    attributes_data = data.shape[1]-1 # last column being label is omited
    
    dist = [] # list to hold euclidean distances of obj with each entry in data 

    for entry in range(entries_data):
        d = 0
        for attribute in range(attributes_data):
            d += (data.iloc[entry,attribute] - obj.iloc[attribute])**2
        dist.append(d**0.5)
        
    data['dist'] = dist
    data.sort_values(by="dist",inplace=True)
    ans = data.iloc[:,-2][0:k].mode()[0]
    data.drop('dist', axis = 1,inplace=True)
    return ans

In [3]:
def kfold_CV(data, k, knn_k):
    """ input  - data is assumed to be a pandas dataframe with last column having the labels
                 k is algorithmic parameter for k fold cross validation algorithm and knn_k is 
                 algothmic parameter for prediction algorithm used 
        output - function returns the average accuracy for k trials with diffrent test and train splits"""
    
    data = data.sample(frac = 1) # shuffling the entries as test and train are to be picked at random
    accu = 0
    row, col = data.shape
    for i in range(k): #loop for selecting different test and train splits
        test  = data.iloc[i*row//k : (i+1)*row//k].copy()
        train = data.iloc[ :i*row//k].append(data.iloc[(i+1)*row//k: ]).copy()
        
        test_elements = test.shape[0]   
        for j in range(test_elements): #loop for each element of test set
            if test.iloc[j,-1] == knn(train, test.iloc[j], knn_k):
                accu +=1
                
    return(accu/test_elements/k)


In [4]:
for i in range(1,21,2):
    print("for k = " + str(i) + " accuracy = " + str(round(kfold_CV(df,5,i),2)))

for k = 1 accuracy = 0.95
for k = 3 accuracy = 0.96
for k = 5 accuracy = 0.97
for k = 7 accuracy = 0.96
for k = 9 accuracy = 0.98
for k = 11 accuracy = 0.97
for k = 13 accuracy = 0.96
for k = 15 accuracy = 0.97
for k = 17 accuracy = 0.96
for k = 19 accuracy = 0.95
