In [24]:
from scipy.spatial import distance
import pickle
from sklearn.metrics import classification_report
from random import shuffle
import os.path
import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import mode

def pickle_operating(fname, item):
    # save or load the pickle file.
    file_name = '%s.pickle' % fname
    print(file_name)
    if not item:
        with open(file_name, 'rb') as fs:
            item = pickle.load(fs)
            return item
    else:
        with open(file_name, 'wb') as fs:
            pickle.dump(item, fs, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
def getNeighbors(dataset, node, k_val=10):
    '''
    distances = []
    for x in range(len(dataset)):
        d = distance.euclidean(node, dataset[x][0])
        distances.append((x, d))
    distances.sort(key=lambda y: y[1])
    neighbors = []
    for x in range(k_val):
        neighbors.append((distances[x][0], dataset[distances[x][0]][1]))
    '''
    dists = cdist(dataset[0], [node], 'euclidean')
    idxes = np.argpartition(dists, k_val, axis=0)[:k_val]
    nearest_dists = np.take(dataset[1], idxes)
    return nearest_dists

In [38]:
def saveNearestNeighbors(train_data, test_data):
    data_nn = []
    X_train, Y_train = np.array([x[0] for x in train_data]), np.array([x[1] for x in train_data])
    for x in range(len(test_data)):
        if x % 500 == 0:
            print x
        node = test_data[x][0]
        data_nn.append(getNeighbors([X_train, Y_train], node, 10))
    return data_nn

In [39]:
def getLabels(neighbors, k_val):
    label_votes = {}
    for x in range(0, k_val):
        label = neighbors[x][1]
        if label in label_votes:
            label_votes[label] += 1
        else:
            label_votes[label] = 1
    sorted_votes = sorted(label_votes.iteritems(), key=lambda y: y[1], reverse=True)
    return sorted_votes[0][0]

In [56]:
def knn_experiment(testdata, testdata_nn, k=3):
    y_classified = []
    y_true = [x[1] for x in testdata]
    for x in range(len(testdata)):
        neighbors = testdata_nn[x]
        label = mode(neighbors[:k], axis=0)[0][0][0]
        #label = getLabels(neighbors, k)
        y_classified.append(label)
        #print('%s actual= %s, predicted=%s' % (x, y_true[x], label))
    print(classification_report(y_true, y_classified))

In [57]:
dataset = pickle_operating('Caltech_data_3', None)
print(len(dataset['train']), len(dataset['test']))
from random import shuffle
shuffle(dataset['train'])
shuffle(dataset['test'])

Caltech_data_3.pickle
(320, 324)


In [58]:
data_nn = saveNearestNeighbors(dataset['train'], dataset['test'])
pickle_operating('Caltech_testdata_nn', data_nn)

0
Caltech_testdata_nn.pickle


In [59]:
testdata_nn = pickle_operating('Caltech_testdata_nn', None)
knn_experiment(dataset['test'], testdata_nn, 3)

Caltech_testdata_nn.pickle
             precision    recall  f1-score   support

          1       0.16      0.50      0.24        36
          2       0.23      0.23      0.23        31
          3       0.45      0.61      0.52        33
          4       0.04      0.05      0.04        21
          5       0.83      0.58      0.68        43
          6       0.33      0.17      0.23        46
          7       0.31      0.12      0.18        32
          8       0.14      0.07      0.10        27
          9       0.75      0.36      0.49        25
         10       0.94      0.50      0.65        30

avg / total       0.43      0.34      0.35       324



In [60]:
dataset = pickle_operating('MNIST_data_2', None)
print(len(dataset['train']), len(dataset['test']))
shuffle(dataset['train'])
shuffle(dataset['test'])

MNIST_data_2.pickle
(60000, 10000)


In [61]:
data_nn = saveNearestNeighbors(dataset['train'], dataset['test'])
pickle_operating('MNIST_testdata_nn', data_nn)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
MNIST_testdata_nn.pickle


In [62]:
testdata_nn = pickle_operating('MNIST_testdata_nn', None)
knn_experiment(dataset['test'], testdata_nn, 3)

MNIST_testdata_nn.pickle
             precision    recall  f1-score   support

          0       0.96      0.99      0.98       980
          1       0.98      0.99      0.99      1135
          2       0.96      0.97      0.96      1032
          3       0.95      0.95      0.95      1010
          4       0.95      0.93      0.94       982
          5       0.96      0.94      0.95       892
          6       0.98      0.98      0.98       958
          7       0.97      0.95      0.96      1028
          8       0.95      0.94      0.95       974
          9       0.91      0.93      0.92      1009

avg / total       0.96      0.96      0.96     10000

