## Importing Modules

In [1]:
import csv
import random
import math
import operator

## 1. Handle Data

In [2]:
# The first thing we need to do is load our data file. 

In [6]:
# Dump the raw file: each CSV record is printed with its fields joined by ', '.
with open('iris.data.txt', 'r') as csvfile:
    for row in csv.reader(csvfile):
        print(', '.join(row))

5.1, 3.5, 1.4, 0.2, Iris-setosa
4.9, 3.0, 1.4, 0.2, Iris-setosa
4.7, 3.2, 1.3, 0.2, Iris-setosa
4.6, 3.1, 1.5, 0.2, Iris-setosa
5.0, 3.6, 1.4, 0.2, Iris-setosa
5.4, 3.9, 1.7, 0.4, Iris-setosa
4.6, 3.4, 1.4, 0.3, Iris-setosa
5.0, 3.4, 1.5, 0.2, Iris-setosa
4.4, 2.9, 1.4, 0.2, Iris-setosa
4.9, 3.1, 1.5, 0.1, Iris-setosa
5.4, 3.7, 1.5, 0.2, Iris-setosa
4.8, 3.4, 1.6, 0.2, Iris-setosa
4.8, 3.0, 1.4, 0.1, Iris-setosa
4.3, 3.0, 1.1, 0.1, Iris-setosa
5.8, 4.0, 1.2, 0.2, Iris-setosa
5.7, 4.4, 1.5, 0.4, Iris-setosa
5.4, 3.9, 1.3, 0.4, Iris-setosa
5.1, 3.5, 1.4, 0.3, Iris-setosa
5.7, 3.8, 1.7, 0.3, Iris-setosa
5.1, 3.8, 1.5, 0.3, Iris-setosa
5.4, 3.4, 1.7, 0.2, Iris-setosa
5.1, 3.7, 1.5, 0.4, Iris-setosa
4.6, 3.6, 1.0, 0.2, Iris-setosa
5.1, 3.3, 1.7, 0.5, Iris-setosa
4.8, 3.4, 1.9, 0.2, Iris-setosa
5.0, 3.0, 1.6, 0.2, Iris-setosa
5.0, 3.4, 1.6, 0.4, Iris-setosa
5.2, 3.5, 1.5, 0.2, Iris-setosa
5.2, 3.4, 1.4, 0.2, Iris-setosa
4.7, 3.2, 1.6, 0.2, Iris-setosa
4.8, 3.1, 1.6, 0.2, Iris-setosa
5.4, 3.4

In [7]:
# We need to split the data into a training dataset and a test dataset.

In [8]:
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a CSV file and randomly partition its rows into train/test lists.

    Every non-blank record has all of its columns except the last converted
    to ``float``; the last column is kept unchanged as the class label. Each
    record is then appended to ``trainingSet`` when ``random.random() < split``
    and to ``testSet`` otherwise.

    Args:
        filename: Path of the CSV file to read.
        split: Threshold compared against ``random.random()``; larger values
            send more rows to the training set.
        trainingSet: List that receives the training rows (mutated in place).
            A fresh list is created when omitted.
        testSet: List that receives the test rows (mutated in place).
            A fresh list is created when omitted.

    Returns:
        The ``(trainingSet, testSet)`` pair, i.e. the same list objects that
        were passed in (or the freshly created ones).

    Raises:
        ValueError: If a feature field cannot be parsed as a float.
    """
    # Fresh lists per call: a mutable default would be shared between calls
    # and silently accumulate rows from earlier invocations.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is the form the csv module documents for files given to a reader.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Skip blank records explicitly instead of assuming the file ends
            # with exactly one empty line (which dropped a real sample when
            # it did not, and crashed when there were several).
            if not any(field.strip() for field in row):
                continue
            sample = [float(value) for value in row[:-1]]
            sample.append(row[-1])
            if random.random() < split:
                trainingSet.append(sample)
            else:
                testSet.append(sample)

    return trainingSet, testSet

In [9]:
# test this function out with our iris dataset

In [11]:
# Start from two empty containers; loadDataset fills them in place.
trainingSet, testSet = [], []
loadDataset('iris.data.txt', 0.66, trainingSet, testSet)

# Report how many rows landed in each partition.
print('Train: ' + repr(len(trainingSet)))
print('Test: ' + repr(len(testSet)))

Train: 99
Test: 50


## 2. Similarity

In [12]:
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` positions.

    Only indices ``0 .. length - 1`` of the two sequences are read, so any
    trailing entries (for example a class label) are left out of the result.
    """
    squared_total = 0
    for idx in range(length):
        gap = instance1[idx] - instance2[idx]
        squared_total += gap ** 2
    return math.sqrt(squared_total)
                        

In [13]:
# test this function with some sample data

In [15]:
# Two toy samples: three numeric features followed by a label.
data1, data2 = [2, 2, 2, 'a'], [4, 4, 4, 'b']

# Compare the three numeric features only (the label at index 3 is skipped).
distance = euclideanDistance(data1, data2, 3)
print('Distance: ' + repr(distance))

Distance: 3.4641016151377544


## 3. Neighbors

In [16]:
# getNeighbors function that returns k most similar neighbors from the training set for a given test instance

In [17]:
def getKNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    Distances are computed with ``euclideanDistance`` over the first
    ``len(testInstance) - 1`` positions, i.e. the last entry of the test
    instance is not compared. Rows are returned nearest first.
    """
    feature_count = len(testInstance) - 1
    scored = [
        (candidate, euclideanDistance(testInstance, candidate, feature_count))
        for candidate in trainingSet
    ]
    # Stable sort on the distance component keeps ties in training-set order.
    scored.sort(key=lambda pair: pair[1])
    return [scored[rank][0] for rank in range(k)]

In [18]:
#  test out this function 

In [20]:
# Two labelled training rows: three numeric features followed by the class.
trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']]

# getKNeighbors compares the first len(testInstance) - 1 entries, so the test
# row needs a trailing label slot as well; without it the third feature would
# be silently left out of the distance.
testInstance = [5, 5, 5, '?']

k = 1

# Pass k itself rather than repeating the literal, so changing k above takes effect.
neighbors = getKNeighbors(trainSet, testInstance, k)

print(neighbors)

[[4, 4, 4, 'b']]


## 4. Response

In [22]:
# function for getting the majority voted response from a number of neighbors. It assumes the class is the last attribute for each neighbor.

In [23]:
def getResponse(neighbors):
    """Return the class label that occurs most often among ``neighbors``.

    The label of each neighbor is read from its last element. When several
    labels share the top count, the one that was seen first wins because the
    sort is stable.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner, _votes = ranked[0]
    return winner

In [24]:
# test out this function with some test neighbors

In [25]:
# Three toy neighbors; the last element of each row is its class label.
neighbors = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]

# Majority vote over the labels: 'a' appears twice, 'b' once.
response = getResponse(neighbors)

print(response)

a


## 5. Accuracy

In [32]:
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label matches the prediction.

    The true label of each row is its last element; ``predictions[i]`` is
    compared against the label of ``testSet[i]``.
    """
    hits = sum(
        1
        for position, row in enumerate(testSet)
        if row[-1] == predictions[position]
    )
    fraction = hits / float(len(testSet))
    return fraction * 100.0

In [27]:
# test this function with a test dataset and predictions

In [33]:
# Three labelled rows and a prediction of 'a' for every one of them.
testSet = [[1, 1, 1, 'a'], [2, 2, 2, 'a'], [3, 3, 3, 'b']]
predictions = ['a'] * 3

# Two of the three predictions match the true labels.
accuracy = getAccuracy(testSet, predictions)
print(accuracy)

66.66666666666666


## 6. Main

In [29]:
# We can put all the previous functions together in one main function

In [34]:
def main(filename=r'iris.data.txt', split=0.67, k=3, seed=None):
    """Run the complete k-nearest-neighbours pipeline and print the results.

    The dataset is loaded and randomly split, every test row is classified by
    a majority vote among its ``k`` nearest training rows, each prediction is
    printed next to the actual label, and the overall accuracy is printed.

    Args:
        filename: Path of the CSV dataset handed to ``loadDataset``.
        split: Train/test split threshold handed to ``loadDataset``.
        k: Number of neighbours used for each vote.
        seed: Optional value passed to ``random.seed`` before splitting so a
            run can be reproduced. ``None`` leaves the generator untouched.
    """
    if seed is not None:
        random.seed(seed)

    # 1. Handle Data: load the file and split it into train and test rows.
    trainingSet = []
    testSet = []
    loadDataset(filename, split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))

    # Classify every test row with the k-NN model.
    predictions = []
    for testInstance in testSet:
        # 2./3. Similarity + Neighbors: the k closest training rows.
        neighbors = getKNeighbors(trainingSet, testInstance, k)
        # 4. Response: majority vote among those neighbours.
        resp = getResponse(neighbors)
        predictions.append(resp)
        print(' predicted result=' + repr(resp) + ', actual result=' + repr(testInstance[-1]))

    # 5. Accuracy: check the performance of the model on the test rows.
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy value : ' + repr(accuracy) + '%')

In [35]:
# Run the whole pipeline: load and split the data, classify each test row,
# then print every prediction and the overall accuracy.
main()

Train set: 106
Test set: 43
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-setosa'
 predicted result='Iris-setosa', actual result='Iris-seto

## 7. Another distance metric


#### The Manhattan distance between two points $(x_1, y_1)$ and $(x_2, y_2)$ is $|x_1 - x_2| + |y_1 - y_2|$



In [36]:
import math 
def manhattan(a, b):
    """Return the Manhattan distance between two coordinate vectors.

    ``a`` and ``b`` hold the coordinates of the two points. Coordinates are
    paired with ``zip``, so only as many positions as the shorter vector has
    are taken into account.
    """
    gaps = (abs(left - right) for left, right in zip(a, b))
    return sum(gaps)


# Two sample points given by their 2D coordinates.
a, b = [2, 4], [5, 5]

# Show the Manhattan distance between the two points.
print(" the Distance between the 2 points a and b : ", manhattan(a, b))



    

 the Distance between the 2 points a and b :  4
