In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
# Load the CSV file 'iris.csv' (relative path, resolved against the current
# working directory) into a DataFrame. The knn / kfold_CV functions in this
# notebook assume the last column of the frame they receive holds the label.
df = pd.read_csv('iris.csv')

# K-Nearest Neighbours

In [2]:
def knn(data, obj, k):
    """Predict the label of `obj` with the k-nearest-neighbours rule.

    input  - data : pandas DataFrame whose last column holds the labels and whose
                    other columns are numeric attributes. It is NOT modified.
             obj  : the sample to classify - a Series / list / array whose leading
                    entries are the attributes in the same column order as `data`
                    (extra trailing entries, e.g. a label, are ignored), or a
                    single-row DataFrame.
             k    : int or iterable of int (each >= 1) - neighbourhood size(s).
    return - for an iterable k: a list with one predicted label per value of k,
             in the same order; for an int k: the single predicted label.
             When several labels are equally frequent among the k nearest
             entries, the first value returned by Series.mode() is used.
    raises - ValueError if any k is smaller than 1, if `data` has no rows, or if
             `obj` has fewer entries than `data` has attribute columns.
    """
    single_k = isinstance(k, (int, np.integer))
    k_values = [k] if single_k else list(k)
    if any(k_i < 1 for k_i in k_values):
        raise ValueError("every k must be at least 1")
    if data.shape[0] == 0:
        raise ValueError("data must contain at least one row")

    attributes_data = data.shape[1] - 1  # last column is the label, so it is omitted

    if isinstance(obj, pd.DataFrame):
        obj = obj.iloc[0]
    query = pd.Series(obj).iloc[:attributes_data].to_numpy(dtype=float)
    if query.shape[0] != attributes_data:
        # Without this check a too-short obj could be silently broadcast.
        raise ValueError("obj must provide a value for every attribute column of data")

    features = data.iloc[:, :attributes_data].to_numpy(dtype=float)

    # Euclidean distance of obj to every entry, computed in one vectorised step.
    dist = np.sqrt(((features - query) ** 2).sum(axis=1))

    # Order the labels from nearest to farthest without touching the caller's
    # frame; a stable sort keeps equal distances in their original row order.
    nearest_first = np.argsort(dist, kind="stable")
    labels = data.iloc[:, -1].iloc[nearest_first].reset_index(drop=True)

    ans = [labels.iloc[:k_i].mode()[0] for k_i in k_values]
    return ans[0] if single_k else ans

# K-Fold Cross-Validation

In [3]:
def kfold_CV(data, k, knn_k, random_state=None):
    """Estimate k-NN accuracy with k-fold cross-validation.

    input  - data : pandas DataFrame with the labels in the last column. A shuffled
                    copy is used internally; the caller's frame is not reordered
                    by this function.
             k    : number of folds (int >= 1). Fold i is rows
                    [i*rows//k, (i+1)*rows//k) of the shuffled data.
             knn_k : int or iterable of int - neighbourhood size(s) passed on to
                    the prediction function `knn`.
             random_state : optional seed forwarded to DataFrame.sample so the
                    shuffle (and therefore the result) can be reproduced. The
                    default None keeps the shuffle unseeded.
    output - for an iterable knn_k: numpy array with, for each value of knn_k, the
             fraction of entries predicted correctly when each entry is tested
             exactly once; for an int knn_k: that fraction as a float.
    raises - ValueError if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    single_knn_k = isinstance(knn_k, (int, np.integer))
    knn_k_values = [knn_k] if single_knn_k else list(knn_k)

    # Shuffle the entries, as test and train are to be picked at random.
    data = data.sample(frac=1, random_state=random_state)

    accu = np.zeros(len(knn_k_values))
    row = data.shape[0]

    for i in range(k):  # loop for selecting different test and train splits
        start, stop = i * row // k, (i + 1) * row // k
        test = data.iloc[start:stop].copy()
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        train = pd.concat([data.iloc[:start], data.iloc[stop:]]).copy()

        for j in range(test.shape[0]):  # loop for each element of test set
            test_results = knn(train, test.iloc[j], knn_k_values)
            actual = test.iloc[j, -1]

            for k_i in range(len(knn_k_values)):  # loop for each value of k of KNN
                if actual == test_results[k_i]:
                    accu[k_i] += 1

    accu = accu / row
    return float(accu[0]) if single_knn_k else accu

In [4]:
# Evaluate odd neighbourhood sizes 1, 3, ..., 19 with 5-fold cross-validation
# and report the accuracy of each as a percentage rounded to two decimals.
k_vals = range(1,21,2)
accuracy = kfold_CV(df,5,k_vals)
for k_val, acc in zip(k_vals, accuracy):
    print("For k = ",k_val," Accuracy = ",round((acc*100),2))

For k =  1  Accuracy =  96.0
For k =  3  Accuracy =  97.33
For k =  5  Accuracy =  96.67
For k =  7  Accuracy =  97.33
For k =  9  Accuracy =  98.0
For k =  11  Accuracy =  96.0
For k =  13  Accuracy =  96.67
For k =  15  Accuracy =  97.33
For k =  17  Accuracy =  96.0
For k =  19  Accuracy =  96.67
