KNN (K-Nearest Neighbors) is a classification method. When a new item arrives, we find its K closest neighbors; if 
most of those neighbors belong to class A, we classify the new item as class A. 

The key problem is how to choose a proper value of K. Usually, we use cross-validation to choose it: for each candidate K
(K = 1, 2, 3, ...), we run cross-validation and calculate the distortion for that K. We then choose the K with the minimum 
distortion.

In [2]:
import csv
import random
import math
import operator

def load_file(filename):
    """Read a CSV file into a list of rows with numeric leading columns.

    Blank lines in the file are dropped. In every remaining row, all columns
    except the last one of the first row's width are converted to ``float``;
    the remaining trailing column(s) are left as the strings read from the file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows (lists). Returns an empty list when the file holds
        no non-blank rows.

    Raises:
        ValueError: If a leading column cannot be parsed as a float.
        IndexError: If a row is shorter than the first non-blank row.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filename, newline='') as file:
        # Filter blank rows up front instead of popping while iterating,
        # which skipped rows and ran past the end of the shrinking list.
        rows = [row for row in csv.reader(file) if row]

    if not rows:
        return []

    # The column count is taken from the first row; the last column is not converted.
    feature_count = len(rows[0]) - 1
    for row in rows:
        for col in range(feature_count):
            row[col] = float(row[col])

    return rows


def train_test_split(dataset, ratio):
    """Randomly partition *dataset* into a training set and a test set.

    Rows are drawn at random, without replacement, until the training set
    holds at least ``ratio * len(dataset)`` rows; whatever is left becomes
    the test set. The caller's *dataset* list is left unmodified.

    Args:
        dataset: List of rows to split.
        ratio: Fraction of the rows to place in the training set.

    Returns:
        A ``(train_set, test_set)`` tuple of lists.

    Raises:
        ValueError: From ``random.randrange`` if more rows are requested
            than the dataset contains (``ratio`` greater than 1).
    """
    # Work on a shallow copy: popping from the caller's list would silently
    # shrink the dataset they passed in.
    test_set = list(dataset)
    train_set = []

    train_len = ratio * len(dataset)

    while len(train_set) < train_len:
        idx = random.randrange(len(test_set))
        train_set.append(test_set.pop(idx))

    return train_set, test_set

def EuclidDist(instance1, instance2):
    """Return the Euclidean distance between two instances.

    Only the first ``len(instance1) - 1`` positions are compared; the final
    element of *instance1* is skipped (presumably the class label — confirm
    against the data layout).
    """
    squared_sum = 0.0
    # Accumulate squared per-coordinate differences over the compared positions.
    for axis in range(len(instance1) - 1):
        delta = instance1[axis] - instance2[axis]
        squared_sum += delta ** 2

    return math.sqrt(squared_sum)


def find_neighbors(train_set, test_instance, K):
    """Find the K training rows closest to *test_instance*.

    Args:
        train_set: List of training rows.
        test_instance: The row whose neighbors are wanted.
        K: Number of neighbors to return.

    Returns:
        A list of ``[distance, train_index]`` pairs sorted by ascending
        distance, where ``train_index`` is the row's position in
        *train_set*. If K exceeds the number of training rows, all rows
        are returned; if K is zero or negative, the list is empty.
    """
    dist = [
        [EuclidDist(test_instance, sample), tr_idx]
        for tr_idx, sample in enumerate(train_set)
    ]
    # Sort on distance only; the sort is stable, so ties keep training order.
    dist.sort(key=lambda pair: pair[0])

    # Slicing (rather than indexing 0..K-1) avoids an IndexError when K is
    # larger than the training set; max() keeps a negative K from slicing
    # from the end of the list.
    return dist[:max(K, 0)]

def predictClass(neighbors, train_set):
    """Return the majority class among the given neighbors.

    Each neighbor's last element is used as an index into *train_set*, and
    the last element of that training row is counted as its class. When
    several classes share the highest count, the one encountered first
    wins.
    """
    tally = {}
    for neighbor in neighbors:
        label = train_set[neighbor[-1]][-1]
        tally[label] = tally.get(label, 0) + 1

    # Stable descending sort on the vote count; first entry is the winner.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner, _count = ranked[0]
    return winner

def getAccuracy(test_set, predictions):
    """Return the fraction of test rows whose class was predicted correctly.

    The last element of each row in *test_set* is compared with the entry
    at the same position in *predictions*.
    """
    hits = sum(
        1
        for position, row in enumerate(test_set)
        if row[-1] == predictions[position]
    )
    return hits / float(len(test_set))


if __name__ == '__main__':
    # Demo run: classify a held-out part of iris.csv with K nearest
    # neighbors and report how many predictions were right.
    K = 3
    ratio = 0.7

    dataset = load_file('iris.csv')
    train_set, test_set = train_test_split(dataset, ratio)

    predictions = []
    for sample in test_set:
        nearest = find_neighbors(train_set, sample, K)
        predicted = predictClass(nearest, train_set)
        predictions.append(predicted)
        # Show each prediction next to the label stored in the row.
        print('>predicted: ' + repr(predicted) + ', actual value: ' + repr(sample[-1]))

    accuracy = getAccuracy(test_set, predictions)
    print('The accuracy is ' + repr(accuracy))




>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-setosa', actual value: 'Iris-setosa'
>predicted: 'Iris-versicolor', actual value: 'Iris-versicolor'
>predicted: 'Iris-versicolor', actual value: 'Iris-versicolor'
>predicted: 'Iris-versicolor', actual value: 'Iris-versicolor'
>predicted: 'Iris-versicolor', actual value: 'Iris-versicolor'
>predicted: 'Iris-versicolor', actual value: 'Iris-versicolor'
>predicted: 'Iris-versicolor', actual value: 'Iris-versicolor'
>predicted: 'Iris

In [None]:
if __name__ == '__main__': if we do not add this line, then when this Python file is imported by another Python program,
    the commands below this line will be executed at import time. With this line, the commands below it are executed
    only when this file is run directly as a script.