In [None]:
import scipy.io
import numpy as np
import math
from sklearn import preprocessing
import random
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
%matplotlib inline 

def plot_num(center):
    """Show a flattened 28x28 image as a heat map with a colour bar.

    Parameters
    ----------
    center : array-like with exactly 784 elements
        Pixel values; reshaped to (28, 28) before drawing.

    A new matplotlib figure is opened on every call; nothing is returned.
    """
    image = np.reshape(center, (28, 28))
    plt.figure()
    plt.imshow(image)
    plt.colorbar()
    
def distance(instance1, instance2):
    y = int(math.sqrt(len(instance1)))
    inst = np.reshape(instance2, (y, y))
    inst0 = np.reshape(instance1, (y, y))
    d0 = np.linalg.norm(instance1 - instance2)
    inst1 = np.zeros([y,y])
    inst1[:,1:] = inst0[:,:y-1]
    inst2 = np.zeros([y,y])
    inst2[:,:y-1] = inst0[:,1:]
    inst3 = np.zeros([y,y])
    inst3[1:,:] = inst0[:y-1,:]
    inst4 = np.zeros([y,y])
    inst4[:y-1,:] = inst0[1:,:]
    d1 = np.linalg.norm(inst1 - inst)
    d2 = np.linalg.norm(inst2 - inst)
    d3 = np.linalg.norm(inst3 - inst)
    d4 = np.linalg.norm(inst4 - inst)
#     print d1, d2, d3, d4
    return min(d0,d1,d2,d3,d4)

def get_neighbors(instance,train_img, train_labels, K):
    """Return the K training samples closest to ``instance``.

    Every row of ``train_img`` is compared with ``instance`` using
    ``distance``; the results are ordered by increasing distance (ties keep
    their original training order because the sort is stable).

    Returns
    -------
    list of (image, distance, label) tuples
        At most K entries, nearest first.
    """
    scored = [
        (candidate, distance(instance, candidate), train_labels[idx])
        for idx, candidate in enumerate(train_img)
    ]
    ranked = sorted(scored, key=lambda entry: entry[1])
    return ranked[:K]

def vote(neighbors,K):
    """Majority vote over the first K neighbours, returned as a one-hot vector.

    Parameters
    ----------
    neighbors : sequence of (image, distance, label) tuples
        Each ``label`` is added element-wise into a length-10 tally, so it is
        expected to be a one-hot vector of length 10 (as produced by the
        LabelBinarizer used in this notebook).
    K : int
        Number of leading neighbours to consider. If fewer than K are
        supplied, all of them are used.

    Returns
    -------
    numpy.ndarray
        Integer one-hot vector of length 10 marking the winning class.
        When several classes share the highest count, the tie goes to the
        tied class that owns the closest neighbour.

    Raises
    ------
    ValueError
        If there is no neighbour to vote with.
    """
    considered = neighbors[:K]
    if len(considered) == 0:
        raise ValueError("vote() needs at least one neighbor")

    counts = np.array([0]*10)
    for _img, _dist, label in considered:
        counts = counts + np.asarray(label)

    tied = np.where(counts == counts.max())[0]
    winner = int(tied[0])
    if len(tied) > 1:
        # Tie-break: nearest neighbour, but only among the tied classes.
        # Start from +inf so arbitrarily large distances still take part.
        tied_classes = set(tied.tolist())
        best_dist = float('inf')
        for _img, dist, label in considered:
            cls = int(np.argmax(label))
            if cls in tied_classes and dist < best_dist:
                best_dist, winner = dist, cls

    result = np.array([0]*10)
    result[winner] = 1
    return result
    
def weighted_vote(neighbors, K):
    """Rank-weighted vote over the first K neighbours, as a one-hot vector.

    The neighbour at position ``i`` (0-based) contributes its label vector
    scaled by ``1 / (i + 1)``; the weight depends on the position in
    ``neighbors`` only, not on the stored distance. The class with the
    largest accumulated score wins (lowest index on ties, per ``argmax``).

    Parameters
    ----------
    neighbors : sequence of (image, distance, label) tuples
        ``label`` is expected to be a length-10 vector.
    K : int
        Number of leading neighbours to use; must not exceed ``len(neighbors)``.

    Returns
    -------
    numpy.ndarray
        Integer one-hot vector of length 10.
    """
    scores = np.array([0] * 10)
    for rank in range(1, K + 1):
        _image, _dist, label = neighbors[rank - 1]
        scores = scores + label * 1.0 / rank
    winner = np.argmax(scores)
    one_hot = np.zeros(10, dtype=int)
    one_hot[winner] = 1
    return one_hot

# ---- Load the MNIST .mat file ------------------------------------------------
data = scipy.io.loadmat('dataset/mnist.mat')
print(data.keys())
train_labels = data['trainY']
train_img = data['trainX']
print(len(train_img[0]))
print(train_img.shape)
print(train_labels)

# ---- One-hot encode the labels ----------------------------------------------
# The label arrays are indexed with [0] before encoding, i.e. the first row of
# 'trainY' / 'testY' is taken as the label vector.
lb = preprocessing.LabelBinarizer()
lb.fit(list(range(10)))
train_labels = lb.transform(train_labels[0])
print(train_labels[0])

# ---- Shuffle, standardise, and split into train / validation ----------------
train_img ,train_labels = shuffle(train_img, train_labels, random_state=0)
# Fit the per-feature mean/std once on the training images and reuse the SAME
# transform for the test images further down. Standardising the test set with
# its own statistics would put train and test features on different scales,
# which distorts the kNN distances between them.
scaler = preprocessing.StandardScaler().fit(train_img)
train_img = scaler.transform(train_img)
X = train_img[:55000]
y = train_labels[:55000]
Xv = train_img[55000:]      # everything after the first 55000 rows is validation
print(Xv.shape)
yv = train_labels[55000:]

# ---- Test set ----------------------------------------------------------------
test_img = data['testX']
test_img = scaler.transform(test_img)
test_labels = data['testY']
print(test_img.shape)
test_labels = lb.transform(test_labels[0])
print(test_labels[0])

In [None]:
K = 10          # number of neighbours consulted per query
interval = 100  # record/print the running accuracy every `interval` queries


def _running_accuracy(queries, query_labels, ref_img, ref_labels, K, interval,
                      voter=weighted_vote):
    """Classify every query with kNN and track the running accuracy.

    Each row of ``queries`` is classified from its K nearest rows of
    ``ref_img`` (via ``get_neighbors``) and ``voter`` (``weighted_vote`` by
    default; pass ``vote`` for the plain majority vote). A prediction counts
    as correct when it equals the query's label element-wise.

    Returns
    -------
    (int, list of float)
        Number of correct predictions, and the running accuracy sampled after
        every ``interval`` queries (i.e. after interval, 2*interval, ...).
    """
    correct = 0
    checkpoints = []
    # Iterate over the data actually supplied rather than a hard-coded count,
    # so a differently sized split neither overruns nor is silently truncated.
    for i in range(len(queries)):
        neighbors = get_neighbors(queries[i], ref_img, ref_labels, K)
        #img, dist,label
        result = voter(neighbors, K)
        hit = list(result) == list(query_labels[i])
        if hit:
            correct += 1
        acc = 1.0*correct/(i+1)
        if (i+1)%interval == 0:
            checkpoints.append(acc)
            print("%s %s %s %s %s" % (i, result, query_labels[i], hit, acc))
    return correct, checkpoints


# Validation: held-out rows classified against the training split.
count, vali_acc = _running_accuracy(Xv, yv, X, y, K, interval)

# Test: test rows classified against all training rows (train + validation).
count, test_acc = _running_accuracy(test_img, test_labels, train_img, train_labels, K, interval)

In [None]:
# Each accuracy value was recorded after (k + 1) * interval samples had been
# evaluated, so the x positions start at `interval`, not at 0.
plt.plot([(i + 1) * interval for i in range(len(vali_acc))], vali_acc)
plt.plot([(i + 1) * interval for i in range(len(test_acc))], test_acc)
plt.xlabel('samples evaluated')
plt.ylabel('running accuracy')
plt.legend(['validation accuracy','test accuracy'], loc = 'upper right')
plt.show()