In [0]:
# author: Josef Lorenz Rumberger   Matr#: 587961
# code blocks partly taken from: 
# https://www.kdnuggets.com/2016/01/implementing-your-own-knn-using-python.html
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split
import numpy
import math
from operator import itemgetter
from collections import Counter
from sklearn.metrics import classification_report, accuracy_score

def get_distance(data1, data2, len):
   return math.sqrt(sum([(a - b) ** 2 for a, b in zip(data1[:len], data2[:len])]))

def get_neighbours(training_set, test_instance, k, dim):
    distances = [_get_tuple_distance(training_instance, test_instance, dim) for training_instance in training_set]
 
    # index 1 is the calculated distance between training_instance and
    # test_instance
    sorted_distances = sorted(distances, key=itemgetter(1))
 
    # extract only training instances
    sorted_training_instances = [tuple[0] for tuple in sorted_distances]
 
    # select first k elements
    return sorted_training_instances[:k]
 
def _get_tuple_distance(training_instance, test_instance, dim):
    return (training_instance, get_distance(test_instance, training_instance, dim))

def get_majority_vote(neighbours):
    # index 2 is the class
    classes = [neighbour[2] for neighbour in neighbours]
    count = Counter(classes)
    return count.most_common()[0][0]


In [3]:
def main():
    #sample dataset: plants measured by sepal length, sepal width, petal length
    #and petal width.
    #Labels: three types of Iris (coded as 0, 1, 2)
    iris = load_iris()

    #we skip the class '2', so we only have 2 classes.  (0 and 1).
    iris_indices = numpy.where(iris.target < 2)[0]
    iris_data = iris.data[iris_indices]
    iris_target = iris.target[iris_indices]

    #remove petal length and petal width so we only have two dimensions
    iris_data = numpy.delete(iris_data, numpy.s_[2:4], axis=1)  

    # split the data set
    X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_target, test_size=0.4, random_state=1)
    
    # Look at the splitted sets
    X_train.shape, y_train.shape
    X_test.shape, y_test.shape
    # reformat train/test datasets for convenience
    train = numpy.hstack([X_train , numpy.atleast_2d(y_train).T])
    test = numpy.hstack([X_test , numpy.atleast_2d(y_test).T])

    predictions = []

    k = 5

    for x in range(len(X_test)):
        print( 'Classifying test instance number ' + str(x) + ":")
        neighbours = get_neighbours(training_set = train, test_instance = test[x], k=5, dim = 2)
        # neighbours is a different set than get_majority_vote(neighbours) expects
        majority_vote = get_majority_vote(neighbours)
        predictions.append(majority_vote)
        print('Predicted label=' + str(majority_vote) + ', Actual label=' + str(test[x][2]))
 
    # summarize performance of the classification
    print( '\nThe overall accuracy of the model is: ' + str(accuracy_score(y_test, predictions)) + "\n")
    report = classification_report(y_test, predictions, target_names = iris.target_names)
    print ('A detailed classification report: \n\n' + report)
    print(X_test[1])#, predictions)
if __name__ == "__main__":
    main()

Classifying test instance number 0:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 1:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 2:
Predicted label=0.0, Actual label=0.0
Classifying test instance number 3:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 4:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 5:
Predicted label=0.0, Actual label=0.0
Classifying test instance number 6:
Predicted label=0.0, Actual label=0.0
Classifying test instance number 7:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 8:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 9:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 10:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 11:
Predicted label=0.0, Actual label=0.0
Classifying test instance number 12:
Predicted label=1.0, Actual label=1.0
Classifying test instance number 13

  .format(len(labels), len(target_names))
