In [401]:
import csv
import random
import math
import operator
import numpy as np
from typing import Callable,List

# Handle Data

In [402]:
def loadDataset(filename : str, split: float=0.80, trainingSet : list =None , testSet: list =None, readHeader: bool=False, shuffle: bool=True, nfeatures: int=4):
    """Read a CSV file and distribute its rows between a training and a test list.

    The first ``nfeatures`` columns of every row are converted to ``float``;
    any remaining columns (e.g. a class label) are kept as the strings read
    from the file.

    Args:
        filename: Path of the CSV file to read.
        split: Fraction of the rows that goes to ``trainingSet``; rows are
            appended to it while it holds fewer than ``len(rows) * split``
            items, the rest go to ``testSet``.
        trainingSet: List extended in place with the training rows. A new
            list is created when omitted.
        testSet: List extended in place with the test rows. A new list is
            created when omitted.
        readHeader: Accepted for interface compatibility; currently ignored.
        shuffle: When true, the rows are shuffled with ``random.shuffle``
            before being split.
        nfeatures: Number of leading columns converted to ``float``.

    Returns:
        The tuple ``(trainingSet, testSet)`` - the same list objects that
        were passed in, when they were passed in.
    """
    # Fresh lists per call: a mutable default would be shared by every call
    # and silently accumulate rows from previous loads.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    with open(filename, 'r', newline='') as csvfile:
        # csv.reader yields an empty list for blank lines (data files often
        # end with one); such rows cannot be converted and are dropped.
        dataset = [row for row in csv.reader(csvfile) if row]

    if shuffle:
        random.shuffle(dataset)

    splitnb = len(dataset) * split
    for row in dataset:
        for y in range(nfeatures):
            row[y] = float(row[y])

        if len(trainingSet) < splitnb:
            trainingSet.append(row)
        else:
            testSet.append(row)

    return trainingSet, testSet

# Similarity

In [403]:
def euclideanDistance(instance1: list, instance2: list, length: int=0):
    """Return the Euclidean distance between the leading components of two instances.

    Args:
        instance1: First sequence of numeric components.
        instance2: Second sequence of numeric components.
        length: Number of leading components to compare. ``0`` means
            "as many as the shorter instance has".

    Returns:
        The square root of the summed squared component differences.
    """
    span = length if length != 0 else min(len(instance1), len(instance2))
    head1, head2 = instance1[:span], instance2[:span]
    squared_error = sum(math.pow(a - b, 2) for a, b in zip(head1, head2))
    return math.sqrt(squared_error)

# Neighbors

In [404]:
def getNeighbors(trainingSet: list, testInstance: list, k: int =1, fdistance: Callable=euclideanDistance):
    """Return the ``k`` training instances closest to ``testInstance``.

    Args:
        trainingSet: Instances to search. Each is passed to ``fdistance``
            unchanged.
        testInstance: Instance whose neighbours are wanted. Its last element
            is excluded from the distance computation.
        k: Number of neighbours to return. If fewer than ``k`` training
            instances exist, all of them are returned; a non-positive ``k``
            yields an empty list.
        fdistance: Callable ``fdistance(a, b, length)`` returning a sortable
            distance computed over the first ``length`` components.

    Returns:
        A list of training instances ordered from nearest to farthest.
        Instances at equal distance keep their order from ``trainingSet``.
    """
    # Compare every component except the last one of the test instance.
    length = len(testInstance)-1

    distances = [(candidate, fdistance(testInstance, candidate, length))
                 for candidate in trainingSet]
    distances.sort(key=operator.itemgetter(1))

    # Slicing, unlike indexing over range(k), cannot run past the end of the
    # list when k is larger than the training set.
    return [candidate for candidate, _ in distances[:max(k, 0)]]

# Response

In [405]:
def getResponse(neighbors: list):
    """Return the most frequent label among ``neighbors``.

    The label of a neighbour is its last element. When several labels share
    the highest count, the one that appears first in ``neighbors`` is
    returned.

    Args:
        neighbors: Non-empty list of instances.

    Returns:
        The winning label.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1

    # The sort is stable, so tied labels stay in first-seen order.
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    return ranked[0][0]

# Accuracy

In [406]:
def getAccuracy(testSet : list, predictions :list):
    """Return the percentage of test instances whose label was predicted correctly.

    Args:
        testSet: Instances whose last element is the true label.
        predictions: Predicted labels, paired positionally with ``testSet``.

    Returns:
        The number of matching pairs divided by ``len(testSet)``, times 100.
    """
    hits = 0
    for row, guess in zip(testSet, predictions):
        if row[-1] == guess:
            hits += 1

    total = float(len(testSet))
    return (hits / total) * 100.0

# Main

In [407]:
def predict(filename: str, split: float=0.80, k: int=1, fdistance: Callable=euclideanDistance):
    """Run one k-nearest-neighbours train/test experiment and return its accuracy.

    The file is loaded with ``loadDataset`` into a fresh training and test
    list, every test instance is classified by majority vote among its ``k``
    nearest training instances, and the predictions are scored with
    ``getAccuracy``.

    Args:
        filename: Path of the CSV dataset.
        split: Fraction of the rows used for training.
        k: Number of neighbours that vote on each prediction.
        fdistance: Distance function handed to ``getNeighbors``.

    Returns:
        The accuracy on the test split, as a percentage.
    """
    trainingSet = []
    testSet = []

    loadDataset(filename, split, trainingSet, testSet)

    # Pass the caller's metric through; hard-coding euclideanDistance here
    # would make the fdistance argument a no-op.
    prediction = [getResponse(getNeighbors(trainingSet, x, k, fdistance))
                  for x in testSet]

    return getAccuracy(testSet, prediction)

In [408]:
# Repeat the experiment 100 times with the Euclidean metric and report the mean accuracy.
matacc = [
    predict(filename='../input/irisdata/iris.data.txt', split=0.70, k=5, fdistance=euclideanDistance)
    for i in range(100)
]
acc = sum(matacc) / 100
print(f'Mean Accuracy of the Model = {acc:.2f} %')

Mean Accuracy of the Model = 96.27 %


# Another distance metric

In [409]:
def manhattanDistance(instance1: list, instance2: list, length: int=0):
    """Return the Manhattan (L1) distance between the leading components of two instances.

    Args:
        instance1: First sequence of numeric components.
        instance2: Second sequence of numeric components.
        length: Number of leading components to compare. ``0`` means
            "as many as the shorter instance has".

    Returns:
        The sum of the absolute component differences.
    """
    span = length if length != 0 else min(len(instance1), len(instance2))
    head1, head2 = instance1[:span], instance2[:span]
    return sum(abs(a - b) for a, b in zip(head1, head2))

In [410]:
# Repeat the experiment 100 times, requesting the Manhattan metric, and report the mean accuracy.
matacc = [
    predict(filename='../input/irisdata/iris.data.txt', split=0.70, k=5, fdistance=manhattanDistance)
    for i in range(100)
]
acc = sum(matacc) / 100
print(f'Mean Accuracy of the Model = {acc:.2f} %')


Mean Accuracy of the Model = 96.51 %
