# Import all the required Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import cv2
from tqdm import tqdm

# Dataset and Preprocessing stage

Define Dataset path and Categories

In [None]:
# Root folder of the training split; each sub-folder is treated as one class.
dataset_path = "/kaggle/input/cifar-10-images/CIFAR-10-images-master/train"

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# class-index <-> class-name mapping reproducible across runs and machines.
# Entries that are not directories cannot be class folders and are skipped.
categories = sorted(
    entry
    for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)
print(categories)

Load and resize images, assign class labels

In [None]:
# Build the training set as [32x32 grayscale image, class index] pairs.
train_data = []
skipped = 0  # files that could not be decoded as images
for class_num, c in enumerate(categories):
    path = os.path.join(dataset_path, c)
    for file_name in os.listdir(path):
        # cv2.imread reports an unreadable / non-image file by returning None
        # rather than raising, so test for that explicitly instead of hiding
        # every possible error behind a blanket `except Exception: pass`.
        img = cv2.imread(os.path.join(path, file_name), cv2.IMREAD_GRAYSCALE)
        if img is None:
            skipped += 1
            continue
        img_resize = cv2.resize(img, (32, 32))
        train_data.append([img_resize, class_num])

print(len(train_data))
if skipped:
    # Make dropped files visible instead of silently shrinking the dataset.
    print(f"Skipped {skipped} unreadable file(s)")

Shuffle the data and then print a sample

In [None]:
# The samples were appended class by class, so randomise their order in place.
random.shuffle(train_data)

# Sanity check: print the class index of the first 11 shuffled samples.
for label in (pair[1] for pair in train_data[:11]):
    print(label)

Show some images with labels, then save the graph as a png and pdf file

In [None]:
# Preview the first 20 training samples in a 2 x 10 grid. Each tile carries
# its class index as the y-axis label; tick marks are removed.
plt.figure(figsize=(20, 5))
for position in range(1, 21):
    image, label = train_data[position - 1]
    plt.subplot(2, 10, position)
    plt.imshow(image, cmap='gray')
    plt.xticks([])
    plt.yticks([])
    plt.ylabel(f"class: {label}")

# Save the figure in both formats before showing it.
plt.savefig('images_with_labels.png')
plt.savefig('images_with_labels.pdf')
plt.show()

# Model Building and Evaluation Stage

Split the data into 5 folds

In [None]:
# Split the shuffled training data into 5 equally sized, non-overlapping folds.
# The fold size is derived from the actual number of loaded samples rather
# than hard-coded to 10000, so the folds stay equal even if some images were
# skipped during loading. With exactly 50000 samples this yields the same
# slices as [0:10000], [10000:20000], ...; any remainder samples are left out.
NUM_FOLDS = 5
fold_size = len(train_data) // NUM_FOLDS
F0, F1, F2, F3, F4 = (
    train_data[n * fold_size:(n + 1) * fold_size] for n in range(NUM_FOLDS)
)

print(len(F1), len(F4))

Define all the required functions: the L1 and L2 distance metrics and the KNN classifier.

In [None]:
def L1_distance(img1, img2):
    """Return the L1 (Manhattan) distance between two images.

    The inputs are converted to float64 before subtracting. Unsigned integer
    arrays (e.g. uint8 pixel data) would otherwise wrap around modulo 2**8 on
    subtraction, so that 0 - 1 becomes 255 and the distance is wrong.

    Args:
        img1: Array-like of pixel values.
        img2: Array-like broadcastable against ``img1``.

    Returns:
        The sum of absolute element-wise differences, as a float64 scalar.
    """
    diff = np.asarray(img1, dtype=np.float64) - np.asarray(img2, dtype=np.float64)
    return np.sum(np.abs(diff))

def L2_distance(img1, img2):
    """Return the L2 (Euclidean) distance between two images.

    The inputs are converted to float64 before subtracting. With unsigned
    integer arrays (e.g. uint8 pixel data) both the subtraction and the
    squaring would otherwise wrap around modulo 2**8 and give a wrong result.

    Args:
        img1: Array-like of pixel values.
        img2: Array-like broadcastable against ``img1``.

    Returns:
        The square root of the sum of squared element-wise differences, as a
        float64 scalar.
    """
    diff = np.asarray(img1, dtype=np.float64) - np.asarray(img2, dtype=np.float64)
    return np.sqrt(np.sum(np.square(diff)))

def KNN(test_img, train_set, k, distance):
    """Predict the label of ``test_img`` by majority vote of its k nearest neighbours.

    Args:
        test_img: Image to classify.
        train_set: Iterable of ``(image, label)`` pairs to search.
        k: Number of nearest neighbours that vote; must be at least 1.
        distance: ``"L1"`` or ``"L2"``, selecting ``L1_distance`` or
            ``L2_distance``.

    Returns:
        The most frequent label among the k nearest training images. When
        several labels share the highest count, the one that appears first
        among the neighbours ordered by increasing distance is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1, ``distance`` is not a known
            metric, or ``train_set`` is empty.
    """
    # Validate up front: a non-positive k would otherwise slice unexpectedly
    # (negative k) or fail later with an opaque "max() of empty" error.
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Resolve the metric once instead of re-testing the string for every
    # training sample.
    if distance == "L1":
        metric = L1_distance
    elif distance == "L2":
        metric = L2_distance
    else:
        raise ValueError("Invalid distance metric")

    distances_labels = [
        (metric(test_img, train_img), label) for train_img, label in train_set
    ]
    if not distances_labels:
        raise ValueError("train_set must not be empty")

    # Sort on the distance only, so labels are never compared with each other.
    distances_labels.sort(key=lambda pair: pair[0])

    # Count votes in nearest-first order; dict insertion order plus max()
    # returning the first maximal key gives the tie-break described above.
    counts = {}
    for _, label in distances_labels[:k]:
        counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get)

Now Define a list of possible values for k, distance metrics and an empty dict for average accuracy

In [None]:
# Candidate neighbourhood sizes to evaluate: k = 1 .. 5.
k_values = [1, 2, 3, 4, 5]

# Names of the distance metrics to compare.
distance_metrics = ["L1", "L2"]

# Starts empty; results are stored here by later cells.
average_accuracy = dict()

In [None]:
# Show how many candidate k values will be evaluated.
print(len(k_values))

Loop through the possible k values and the distance metrics. For each combination, take each fold in turn as the test set and the remaining folds as the training set, and store the accuracy obtained on each fold.

In [None]:
# 5-fold cross-validation over every (k, distance metric) combination.
# The fold lists are referenced directly rather than looked up by name via
# globals(), which is fragile and stops working once the code is moved into
# a function.
folds = [F0, F1, F2, F3, F4]

# A fold's train/test split does not depend on k or on the metric, so build
# the five splits once instead of rebuilding them for every combination.
splits = []
for i, test_set in enumerate(folds):
    train_set = []
    for j, fold in enumerate(folds):
        if j != i:
            train_set.extend(fold)
    splits.append((test_set, train_set))

for k in k_values:
    for distance in distance_metrics:
        accuracy_per_fold = []
        for test_set, train_set in splits:
            correct = 0
            for test_img, true_label in tqdm(test_set):
                if KNN(test_img, train_set, k, distance) == true_label:
                    correct += 1
            # Fraction of this fold's samples that were classified correctly.
            accuracy_per_fold.append(correct / len(test_set))
        # Mean accuracy over the five folds for this (k, metric) pair.
        average_accuracy[(k, distance)] = np.mean(accuracy_per_fold)

print(average_accuracy)

Then plot a graph with k on the X-axis and accuracy on the Y-axis for both L1 and L2 distances

In [None]:
# Plot mean cross-validation accuracy against k, one curve per distance metric.
plt.figure(figsize=(20, 5))
for metric_name, legend_text in (("L1", "L1 distance"), ("L2", "L2 distance")):
    curve = [average_accuracy[(k, metric_name)] for k in k_values]
    plt.plot(k_values, curve, label=legend_text)

plt.xlabel("k")
plt.ylabel("accuracy")
plt.title("KNN accuracy for different values of k and distance metrics")
plt.legend()

# Save the figure in both formats before showing it.
plt.savefig('KNN_graph.png')
plt.savefig('KNN_graph.pdf')
plt.show()

Finding the best values for k and the distance metrics

In [None]:
# Mean accuracies per metric, ordered like k_values.
l1_accuracy = np.array([average_accuracy[(k, "L1")] for k in k_values])
l2_accuracy = np.array([average_accuracy[(k, "L2")] for k in k_values])

# Best k for each metric: the k at the position of the highest accuracy
# (argmax returns the first such position).
best_k_l1 = k_values[l1_accuracy.argmax()]
best_k_l2 = k_values[l2_accuracy.argmax()]

# dm is [metric name, k]. L1 is chosen only when it is strictly better;
# on a tie the L2 result is kept.
l1_is_better = l1_accuracy.max() > l2_accuracy.max()
dm = ["L1", best_k_l1] if l1_is_better else ["L2", best_k_l2]

print("Best distance metric and k value:", dm)

# Testing Stage

Define a test set path and load some test images

In [None]:
# Root folder of the test split; it is read using the class folder names
# already stored in `categories`.
test_set_path = "/kaggle/input/cifar-10-images/CIFAR-10-images-master/test"

# Build the test set as [32x32 grayscale image, class index] pairs.
test_data = []
skipped_test = 0  # files that could not be decoded as images
for class_num, c in enumerate(categories):
    path = os.path.join(test_set_path, c)
    for file_name in os.listdir(path):
        # cv2.imread reports an unreadable / non-image file by returning None
        # rather than raising, so test for that explicitly instead of hiding
        # every possible error behind a blanket `except Exception: pass`.
        img = cv2.imread(os.path.join(path, file_name), cv2.IMREAD_GRAYSCALE)
        if img is None:
            skipped_test += 1
            continue
        img_resize = cv2.resize(img, (32, 32))
        test_data.append([img_resize, class_num])

if skipped_test:
    # Make dropped files visible instead of silently shrinking the dataset.
    print(f"Skipped {skipped_test} unreadable test file(s)")

random.shuffle(test_data)

In [None]:
# Sanity check: print the class index of the first 11 shuffled test samples.
for label in (pair[1] for pair in test_data[:11]):
    print(label)

In [None]:
# Show the class names; a class index is a position in this list.
print(categories)

Show some images with labels

In [None]:
# Preview the first 20 test samples in a 2 x 10 grid. Each tile carries its
# class index as the y-axis label; tick marks are removed.
plt.figure(figsize=(20, 5))
for position in range(1, 21):
    image, label = test_data[position - 1]
    plt.subplot(2, 10, position)
    plt.imshow(image, cmap='gray')
    plt.xticks([])
    plt.yticks([])
    plt.ylabel(f"class: {label}")
plt.show()

Define the best value of k and distance metric based on the graph

In [None]:
# dm holds [metric name, k]; unpack it into the names used below.
best_distance, best_k = dm

# Classify the first 5 test images against the full training set and show
# each image with its predicted class name as the title.
plt.figure(figsize=(20, 5))
for position in range(1, 6):
    image = test_data[position - 1][0]
    plt.subplot(1, 5, position)
    plt.imshow(image, cmap='gray')
    plt.xticks([])
    plt.yticks([])
    pred_label = KNN(image, train_data, best_k, best_distance)
    plt.title(f"Prediction: {categories[pred_label]}")

# Save the figure in both formats before showing it.
plt.savefig('predict_test.png')
plt.savefig('predict_test.pdf')
plt.show()

# Results

From the training, it was found that 'L1' is the best distance metric for this dataset.
As for k, the best value was found to be 3 for this dataset.

Save the best k values and distance into a text file

In [None]:
# Assemble the summary first (selected k, selected metric, number of k
# candidates, samples per fold), then write it to disk in a single call.
summary = (
    f"Best value of K is: {dm[1]}. \n"
    f"And best distance is: {dm[0]} \n"
    f"With the range of k being: {len(k_values)} \n"
    f"And number of images per folds being: {len(F1)}"
)
with open("best_k_value_and_distance.txt", "w") as file:
    file.write(summary)

In [None]:
# Read the saved summary back and print it to confirm what was written.
with open("best_k_value_and_distance.txt", "r") as file:
    contents = file.read()
print(contents)

# Discussion

Due to the large amount of data, only 5 candidate values of k were used.
In the end, the best distance metric was found to be 'L1' and the best k value 4.
There are many ways to improve this further, like proper feature extraction, more possible values of k and other types of distance metrics.
To increase the speed of training and testing, multicore processing can be used.

Because it took over 10 hours to train and test on Kaggle, it was not possible to include the output of each cell in the final ipynb file.