<a href="https://colab.research.google.com/github/Gan4x4/CV-HSE2019/blob/master/KNN_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download and unpack archive with CIFAR10 dataset to disk  from official site: https://www.cs.toronto.edu/~kriz/cifar.html

!wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
!tar -xzf cifar-10-python.tar.gz
!ls -l

--2021-02-07 16:28:58--  https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170498071 (163M) [application/x-gzip]
Saving to: ‘cifar-10-python.tar.gz’


2021-02-07 16:29:05 (27.9 MB/s) - ‘cifar-10-python.tar.gz’ saved [170498071/170498071]

total 166516
drwxr-xr-x 2 2156 1103      4096 Jun  4  2009 cifar-10-batches-py
-rw-r--r-- 1 root root 170498071 Jun  4  2009 cifar-10-python.tar.gz
drwxr-xr-x 1 root root      4096 Feb  4 15:26 sample_data


In [3]:
# Loading CIFAR10 data using code from official site: https://www.cs.toronto.edu/~kriz/cifar.html
import numpy as np
import random

def unpickle(file,encoding = 'bytes'):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding=encoding)
    return dict

def load_train_data():
  x = np.zeros((0,3072),dtype=int) # To avoid overflow 
  y = np.array([],dtype=int)
  for i in range(1,6):
    raw = unpickle(f"/content/cifar-10-batches-py/data_batch_{i}")
    x = np.append(x,np.array(raw[b'data'],dtype=int),axis=0)
    y = np.append(y,np.array(raw[b'labels'],dtype=int),axis=0)
  return x,y

x_train, y_train = load_train_data()

test = unpickle("/content/cifar-10-batches-py/test_batch")
x_test = np.array(test[b'data'])
y_test = np.array(test[b'labels'])

# Load label names. For for convenience only.
meta = unpickle("/content/cifar-10-batches-py/batches.meta",'utf-8')
labels= meta['label_names']



In [5]:
class NearestNeighbor:
  def __init__(self):
    pass

  def train(self,x,y):
    self.train_data = x
    self.train_labels = y
  
  def predict(self,x):
    # To avoid overflow data must be int, not a byte!
    distances = np.sum(np.abs(self.train_data - x),axis = 1) # Axis 0 it's a row num in image list 
    return self.train_labels[np.argmin(distances)]


In [6]:
# Function to check model accuracy
def validate(model,x_test,y_test):
  correct = 0
  for i, sample in enumerate(x_test):
    index = model.predict(sample)
    correct += 1 if index == y_test[i] else 0
    if i > 0 and i % 100 == 0:
      print ("Accuracy {:.3f}".format(correct/i))
  
  return correct/len(x_test)  

In [7]:
# Now test accuracy and speed of model
import time

nn = NearestNeighbor()
nn.train(x_train,y_train)

start = time.perf_counter()
accuracy = validate(nn,x_test[:100],y_test[:100])   
tm = time.perf_counter() - start
total = x_test.shape[0]
print("Accuracy {:.2f} Train {:d} /test {:d} in {:.1f} sec. speed {:.2f} samples per second.".format(accuracy,len(x_train),total,tm,total/tm,) )

Accuracy 0.34 Train 50000 /test 10000 in 46.4 sec. speed 215.56 samples per second.


In [None]:
#  KNN 

from collections import Counter

class KNearestNeighbor(NearestNeighbor):
  def __init__(self,k):
    self.k = k
    pass
  
  def predict(self,x):
    distances = np.sum(np.abs(self.train_data - x),axis = 1) # L1
    sorted_distance_indexes = np.argsort(distances)
    k_nearest_images = sorted_distance_indexes[:self.k]
    most_common = Counter(self.train_labels[k_nearest_images]).most_common()
    return most_common[0][0]

knn = KNearestNeighbor(11)
knn.train( x_train,y_train)
validate(knn,x_test[:1000],y_test[:1000])  


Accuracy 0.380
Accuracy 0.410
Accuracy 0.373
Accuracy 0.375
Accuracy 0.380
Accuracy 0.383
