In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
from random import shuffle
from sklearn.metrics import accuracy_score
from math import sqrt

In [ ]:
# Load the headerless chips file; its three columns are named x, y and type.
df = pd.read_csv("chips.txt", names=["x", "y", "type"], header=None)
# Derive a plotting colour per row: truthy type -> 'red', falsy type -> 'blue'.
df['color'] = df['type'].map(lambda label: 'red' if label else 'blue')

In [ ]:
# Scatter the points at (x, y), coloured by the per-row 'color' column.
plt.scatter(x=df['x'], y=df['y'], c=df['color'])
plt.show()

In [ ]:
def cross_validation_split(data, k):
    """Shuffle ``data`` and split it into a small part and the remainder.

    Despite the name, this produces one random split rather than ``k``
    folds: the first ``int(len(data) / k)`` shuffled rows form the first
    part and all remaining rows form the second.

    Args:
        data: Rows to split. Any iterable is accepted; it is copied into a
            new list first, so the caller's object is never reordered.
        k: Divisor that determines the size of the first part.

    Returns:
        A ``(first, second)`` pair of lists. ``first`` holds
        ``int(len(data) / k)`` rows and ``second`` holds the rest.

    Raises:
        ZeroDivisionError: If ``k`` is zero.
    """
    # list() instead of data[:]: slicing a tuple yields another tuple (which
    # shuffle cannot reorder in place) and slicing a NumPy array yields a
    # view (so shuffling it would scramble the caller's data).
    rows = list(data)
    shuffle(rows)
    count = int(len(rows) / k)
    return rows[:count], rows[count:]

def accuracy(theory, predictions):
    """Return the share of rows whose label equals the matching prediction.

    Args:
        theory: Sequence of rows; the last element of each row is its label.
        predictions: Predicted labels, indexed in step with ``theory``.

    Returns:
        Number of matches divided by ``len(theory)``, as a float.
    """
    hits = sum(
        1 for idx, row in enumerate(theory) if row[-1] == predictions[idx]
    )
    return hits / float(len(theory))

def f1score(theory, practice):
    """Return the F1 score of binary predictions, with label 1 as positive.

    Args:
        theory: Sequence of rows; the last element of each row is the true
            label (expected to be 0 or 1).
        practice: Predicted labels (expected to be 0 or 1), indexed in step
            with ``theory``.

    Returns:
        The harmonic mean of precision and recall as a float. When there are
        no true positives (including empty input) precision and recall are
        zero or undefined, and 0.0 is returned instead of dividing by zero.
    """
    tp = fp = fn = 0
    # Single pass: classify every row directly instead of deriving the
    # error counts from repeated sum(practice) calls inside the loop.
    for idx, row in enumerate(theory):
        actual, predicted = row[-1], practice[idx]
        if predicted == 1:
            if actual == 1:
                tp += 1
            else:
                fp += 1  # predicted positive, actually negative
        elif actual == 1:
            fn += 1  # predicted negative, actually positive

    if tp == 0:
        # Either denominator below could be zero, and F1 is 0 regardless.
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

In [ ]:
def euclidean(a, b):
    """Return the Euclidean (L2) distance between points ``a`` and ``b``.

    Walks the coordinates of ``a``; ``b`` must supply a value at every index
    that ``a`` has.
    """
    squared_sum = 0
    for axis, coord in enumerate(a):
        squared_sum += (coord - b[axis]) ** 2
    return sqrt(squared_sum)

def manhattan(a, b):
    """Return the Manhattan (L1) distance between points ``a`` and ``b``.

    Walks the coordinates of ``a``; ``b`` must supply a value at every index
    that ``a`` has.
    """
    total = 0
    for axis, coord in enumerate(a):
        total += abs(coord - b[axis])
    return total

def KNN(learn, test, k, distance):
    """Classify each test point by majority vote among its nearest rows.

    Args:
        learn: Labelled rows. The first two elements of a row are its
            coordinates and the element at index 2 is its class label.
        test: Points to classify; only their first two elements are used.
        k: Number of neighbours that vote; capped at ``len(learn)``.
        distance: Callable taking two coordinate sequences and returning a
            sortable distance.

    Returns:
        A list of ``(point, label)`` tuples, one per element of ``test`` and
        in the same order.

    Raises:
        IndexError: If no neighbours are selected (for example ``learn`` is
            empty or ``k`` is 0).
    """
    k = min(k, len(learn))
    res = []
    for p in test:
        # Rank by distance only. Sorting (distance, row) pairs would fall
        # back to comparing the rows themselves on equal distances, which
        # biases ties by coordinate/label value and raises for rows that
        # cannot be ordered. The sort is stable, so equidistant rows keep
        # their order from ``learn``.
        nearest = sorted(learn, key=lambda row: distance(p[:2], row[:2]))[:k]

        votes = {}
        for row in nearest:
            votes[row[2]] = votes.get(row[2], 0) + 1

        # Stable descending sort: among labels with equal vote counts, the
        # one seen first (i.e. belonging to the closest neighbour) wins.
        ans = sorted(votes, key=votes.get, reverse=True)[0]
        res.append((p, ans))
    return res

In [ ]:
def k_to_score(data, distance, score_function):
    """Score KNN on one random split for every neighbour count.

    The data is split with ``cross_validation_split(data, 5)``; the first
    (smaller) part is used for testing and the second for learning. KNN is
    then run for each ``k`` from 1 up to ``len(learn) - 1`` and its
    predictions are scored with ``score_function``.

    Args:
        data: Rows passed on to ``cross_validation_split``.
        distance: Distance callable passed on to ``KNN``.
        score_function: Callable taking ``(test_rows, predicted_labels)``
            and returning a score.

    Returns:
        A dict mapping each evaluated ``k`` to its score. The best
        ``(k, score)`` pair is also pretty-printed as a side effect.
    """
    holdout, training = cross_validation_split(data, 5)
    scores = {
        neighbours: score_function(
            holdout,
            [pair[-1] for pair in KNN(training, holdout, neighbours, distance)],
        )
        for neighbours in range(1, len(training))
    }
    # Stable descending sort: the smallest k wins among equal top scores.
    ranked = sorted(scores.items(), key=lambda item: item[-1], reverse=True)
    pprint(ranked[0])
    return scores

In [ ]:
# Build (x, y, type) rows, score every k using Euclidean distance and
# accuracy, then plot score against k.
data = list(zip(df['x'], df['y'], df['type']))
rv = k_to_score(data, euclidean, accuracy)
plt.plot(list(rv), list(rv.values()))

In [ ]:
# Build (x, y, type) rows, score every k using Manhattan distance and
# accuracy, then plot score against k.
data = list(zip(df['x'], df['y'], df['type']))
rv = k_to_score(data, manhattan, accuracy)
plt.plot(list(rv), list(rv.values()))