# KNN Image Classifier

* Dataset from [University of Toronto - CS](https://www.cs.toronto.edu/~kriz/cifar.html)
* [list of CS231n assignment 1](http://cs231n.stanford.edu/slides/2017/cs231n_2017_lecture2.pdf)
* This KNN image classifier is a first assignment of CS231n.

In [1]:
import numpy as np
import pickle

In [2]:
# Hyperparameters
K = 3

In [3]:
# a function that loads data from file
def unpickle(file):
    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data

In [4]:
# Manhattan distance
def L1_distance(x, y):
    return np.abs(np.sum(x - y))

# Euclidean distance
def L2_distance(x, y):
    return np.sqrt(np.abs(np.sum((x-y) ** 2)))

In [5]:
batches = {'data': [], 'label': []}

def train(data, labels):
    # Training process of KNN is just remembering all images and labels.
    batches['data'].extend(data)
    batches['label'].extend(labels)
    
def predict(item, distance=L1_distance):
    min_values = []
    min_labels = []
    
    data = batches['data']
    labels = batches['label']
    
    for index in range(len(data)):
        d = distance(item, data[index])
        
        if len(min_values) < K:
            min_values.append(d)
            min_labels.append(labels[index])
        elif max(min_values) > d:
            removed_item_index = min_values.index(max(min_values))
            min_values[removed_item_index] = d
            min_labels[removed_item_index] = labels[index]

    majority = max(min_labels, key=min_labels.count)
    
    return majority

In [6]:
data_batch_1 = unpickle('../cifar-10-batches-py/data_batch_1')
test_batch = unpickle('../cifar-10-batches-py/test_batch')

print("key of data_batch_1: ", data_batch_1.keys())
print("the number of images in data_batch_1: ", len(data_batch_1[b'labels']))
print("the number of images in test_batch: ", len(test_batch[b'labels']))

key of data_batch_1:  dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
the number of images in data_batch_1:  10000
the number of images in test_batch:  10000


In [7]:
# train data_batch_1
train(data_batch_1[b'data'], data_batch_1[b'labels'])

In [8]:
correct_l1 = 0
correct_l2 = 0

count = 0

for index in range(len(test_batch[b'data'])):
    data = test_batch[b'data'][index]
    label = test_batch[b'labels'][index]
    
    kNN_output = predict(data)
    kNN_output_l2 = predict(data, distance=L2_distance)
    
    if kNN_output == label:
        correct_l1 += 1
    if kNN_output_l2 == label:
        correct_l2 += 1
    count += 1
    
    print(f"index {index}, label is {label}, and predicted label is {kNN_output} and {kNN_output_l2}")
    
    if count == 100:
        break

index 0, label is 3, and predicted label is 6 and 4
index 1, label is 8, and predicted label is 1 and 8
index 2, label is 8, and predicted label is 1 and 8
index 3, label is 0, and predicted label is 8 and 0
index 4, label is 6, and predicted label is 6 and 4
index 5, label is 6, and predicted label is 4 and 8
index 6, label is 1, and predicted label is 2 and 6
index 7, label is 6, and predicted label is 2 and 2
index 8, label is 3, and predicted label is 4 and 7
index 9, label is 1, and predicted label is 1 and 3
index 10, label is 0, and predicted label is 0 and 8
index 11, label is 9, and predicted label is 6 and 9
index 12, label is 5, and predicted label is 8 and 8
index 13, label is 7, and predicted label is 6 and 2
index 14, label is 9, and predicted label is 6 and 9
index 15, label is 8, and predicted label is 4 and 9
index 16, label is 5, and predicted label is 3 and 0
index 17, label is 7, and predicted label is 4 and 0
index 18, label is 8, and predicted label is 0 and 1
ind

In [9]:
print("accuracy of kNN using L1 distance: ", 100 * correct_l1 / count, "%")
print("accuracy of kNN using L2 distance: ", 100 * correct_l2 / count, "%")

accuracy of kNN using L1 distance:  24.0 %
accuracy of kNN using L2 distance:  17.0 %
