In [2]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import concurrent.futures
import os


# LOKY_MAX_CPU_COUNT is read by loky, the process backend used by joblib.
# NOTE(review): presumably set here to pin the worker count / silence loky's
# core-detection warning -- confirm.
# os.cpu_count() may return None when the count cannot be determined; fall
# back to 1 so the variable never ends up holding the string "None".
os.environ["LOKY_MAX_CPU_COUNT"] = str(os.cpu_count() or 1)


def unpickle(file):
    """Load and return the pickled object stored at path ``file``.

    The file is read in binary mode and unpickled with ``encoding='bytes'``,
    so 8-bit strings written by Python 2 are returned as ``bytes`` objects.

    NOTE: unpickling can execute arbitrary code; only use this on files from
    a trusted source.
    """
    import pickle

    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

def centroid_function(train_data, train_labels, test_data, test_labels):
    """Return the accuracy of a nearest-centroid classifier.

    A ``NearestCentroid`` model with default settings is fitted on
    ``train_data`` / ``train_labels``; its predictions for ``test_data`` are
    scored against ``test_labels`` with ``accuracy_score``.
    """
    model = NearestCentroid().fit(train_data, train_labels)
    predicted = model.predict(test_data)
    return accuracy_score(test_labels, predicted)

def knn_function(train_data, train_labels, test_data, test_labels, k):
    """Return the accuracy of a k-nearest-neighbours classifier.

    A ``KNeighborsClassifier`` using ``k`` neighbours is fitted on
    ``train_data`` / ``train_labels``; its predictions for ``test_data`` are
    scored against ``test_labels`` with ``accuracy_score``.
    """
    model = KNeighborsClassifier(n_neighbors=k).fit(train_data, train_labels)
    predicted = model.predict(test_data)
    return accuracy_score(test_labels, predicted)

def parallel_knn(k, train_data, train_labels, test_data, test_labels):
    """Return the test accuracy of a k-nearest-neighbours classifier.

    Thin wrapper around ``knn_function`` that takes ``k`` as its first
    argument.

    Args:
        k: Number of neighbours passed on to ``knn_function``.
        train_data: Training samples.
        train_labels: Labels for ``train_data``.
        test_data: Samples to classify.
        test_labels: Ground-truth labels for ``test_data``.

    Returns:
        Whatever ``knn_function`` returns for these arguments (the accuracy).
    """
    # A previous version submitted this single job to a ThreadPoolExecutor and
    # waited for it immediately. With only one task there was nothing to run
    # concurrently, so the pool only added overhead; calling the function
    # directly yields the same result and propagates the same exceptions.
    return knn_function(train_data, train_labels, test_data, test_labels, k)

if __name__ == "__main__":
    # Pickled batch files, resolved relative to the current working directory.
    # NOTE(review): the file names look like the CIFAR-10 "python version"
    # batches -- confirm.
    train_paths = ('data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4', 'data_batch_5')
    test_path = 'test_batch'

    # Load the raw batch dicts; they are indexed by bytes keys below.
    test_batch = unpickle(test_path)
    train_batches = [unpickle(path) for path in train_paths]

    # Stack all training batches into one label vector and one data matrix.
    train_labels = np.concatenate([batch[b'labels'] for batch in train_batches])
    train_data = np.concatenate([batch[b'data'] for batch in train_batches])
    # The per-batch dicts are no longer needed once their contents are copied.
    del train_batches

    # There is a single test batch, so use its arrays directly. The earlier
    # code looped over the batch dict's keys and concatenated one copy of the
    # test set per key, which multiplied the prediction work without changing
    # the measured accuracy.
    test_labels = np.asarray(test_batch[b'labels'])
    test_data = np.asarray(test_batch[b'data'])

    # Per-class partition of the training samples. Not used further in this
    # section; kept in case later code relies on `unique_labels` / `tdata`.
    unique_labels = np.unique(train_labels)
    tdata = [train_data[train_labels == label] for label in unique_labels]

    print("The accuracy of the Nearest Centroid is: " + str(centroid_function(train_data, train_labels, test_data, test_labels) * 100) + "%")

    # Evaluate k-NN for each k in turn, reporting accuracy as a percentage.
    k_list = [1, 3]

    results = []
    for k in k_list:
        result = parallel_knn(k, train_data, train_labels, test_data, test_labels)
        results.append(result)
        print(f"The accuracy of k Nearest Neighboor with k = {k} is {result * 100}%")


The accuracy of the Nearest Centroid is: 27.74%
The accuracy of k Nearest Neighboor with k = 1 is 35.39%
The accuracy of k Nearest Neighboor with k = 3 is 33.03%
