In [24]:
from scipy.spatial import distance
import pickle
from sklearn.metrics import classification_report
from random import shuffle
import os.path
import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import mode

def pickle_operating(fname, item):
    # save or load the pickle file.
    file_name = '%s.pickle' % fname
    print(file_name)
    if not item:
        with open(file_name, 'rb') as fs:
            item = pickle.load(fs)
            return item
    else:
        with open(file_name, 'wb') as fs:
            pickle.dump(item, fs, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:

def getNeighbors(dataset, node, k_val=10):
    # slow approach using array
    '''
    distances = []
    for x in range(len(dataset)):
        d = distance.euclidean(node, dataset[x][0])
        distances.append((x, d))
    distances.sort(key=lambda y: y[1])
    neighbors = []
    for x in range(k_val):
        neighbors.append((distances[x][0], dataset[distances[x][0]][1]))
    '''
    #utilizing the cdist library to speed up the process 
    dists = cdist(dataset[0], [node], 'euclidean')
    idxes = np.argpartition(dists, k_val, axis=0)[:k_val]
    nearest_dists = np.take(dataset[1], idxes)
    return nearest_dists

In [67]:
#currently save the 10 nearest neighbors for each test record 
#so can be loaded up to test with different value of k
def saveNearestNeighbors(train_data, test_data):
    data_nn = []
    X_train, Y_train = np.array([x[0] for x in train_data]), np.array([x[1] for x in train_data])
    for x in range(len(test_data)):
        node = test_data[x][0]
        data_nn.append(getNeighbors([X_train, Y_train], node, 10))
    return data_nn

In [68]:
# not used anymore
def getLabels(neighbors, k_val):
    label_votes = {}
    for x in range(0, k_val):
        label = neighbors[x][1]
        if label in label_votes:
            label_votes[label] += 1
        else:
            label_votes[label] = 1
    sorted_votes = sorted(label_votes.iteritems(), key=lambda y: y[1], reverse=True)
    return sorted_votes[0][0]

In [69]:
# run the experimentation for knn 
def knn_experiment(testdata, testdata_nn, k=3):
    y_classified = []
    y_true = [x[1] for x in testdata]
    for x in range(len(testdata)):
        neighbors = testdata_nn[x]
        label = mode(neighbors[:k], axis=0)[0][0][0]
        #label = getLabels(neighbors, k)
        y_classified.append(label)
        #print('%s actual= %s, predicted=%s' % (x, y_true[x], label))
    print(classification_report(y_true, y_classified))

In [70]:
#experimentation runs for Caltech Data after PCA
dataset = pickle_operating('Caltech_data_3', None)
print(len(dataset['train']), len(dataset['test']))
from random import shuffle
shuffle(dataset['train'])
shuffle(dataset['test'])

Caltech_data_3.pickle
(320, 324)


In [71]:
data_nn = saveNearestNeighbors(dataset['train'], dataset['test'])
pickle_operating('Caltech_testdata_nn', data_nn)

Caltech_testdata_nn.pickle


In [80]:
testdata_nn = pickle_operating('Caltech_testdata_nn', None)
knn_experiment(dataset['test'], testdata_nn, 8)

Caltech_testdata_nn.pickle
             precision    recall  f1-score   support

          1       0.18      0.47      0.26        36
          2       0.30      0.26      0.28        31
          3       0.57      0.64      0.60        33
          4       0.25      0.10      0.14        21
          5       0.77      0.63      0.69        43
          6       0.19      0.13      0.15        46
          7       0.29      0.31      0.30        32
          8       0.09      0.07      0.08        27
          9       0.62      0.32      0.42        25
         10       1.00      0.67      0.80        30

avg / total       0.43      0.37      0.38       324



In [60]:
#experimentation runs for MNIST Data after PCA
dataset = pickle_operating('MNIST_data_2', None)
print(len(dataset['train']), len(dataset['test']))
shuffle(dataset['train'])
shuffle(dataset['test'])

MNIST_data_2.pickle
(60000, 10000)


In [73]:
data_nn = saveNearestNeighbors(dataset['train'], dataset['test'])
pickle_operating('MNIST_testdata_nn', data_nn)

MNIST_testdata_nn.pickle


In [65]:
testdata_nn = pickle_operating('MNIST_testdata_nn', None)
knn_experiment(dataset['test'], testdata_nn, 3)

MNIST_testdata_nn.pickle
             precision    recall  f1-score   support

          0       0.96      0.99      0.98       980
          1       0.98      0.99      0.99      1135
          2       0.96      0.97      0.96      1032
          3       0.95      0.95      0.95      1010
          4       0.95      0.93      0.94       982
          5       0.96      0.94      0.95       892
          6       0.98      0.98      0.98       958
          7       0.97      0.95      0.96      1028
          8       0.95      0.94      0.95       974
          9       0.91      0.93      0.92      1009

avg / total       0.96      0.96      0.96     10000

