In [None]:
import os
import pickle
import threading
import numpy as np
import pandas as pd

from random import shuffle
from PIL import Image
from numpy import ndarray
from scipy import linalg, stats
from typing import List, Tuple

# Dataset locations, relative to the notebook's working directory.
# Every path is written with a trailing "/".
combined_directory = "./New folder/combined/"  # passed to save_model() further down
directory_n_1 = "./batch 2/"  # not referenced by any other visible cell
directory_n_2 = "./batch 3/"  # not referenced by any other visible cell
test_images = "./test/"  # passed to classify_unlabelled_directory() in the last cell

In [None]:
def open_directory(directory_path):
    """Return the paths and names of the subdirectories of ``directory_path``.

    Only entries that are directories are returned, so stray files in the
    dataset folder are not mistaken for label folders. Entries are sorted by
    name because ``os.listdir`` gives no ordering guarantee.

    Args:
        directory_path: Directory to scan; a trailing "/" is optional.

    Returns:
        A ``(paths, names)`` tuple of two parallel lists. Every returned path
        ends with "/".
    """
    names = sorted(
        name for name in os.listdir(directory_path)
        if os.path.isdir(os.path.join(directory_path, name))
    )
    # os.path.join inserts the separator only when directory_path lacks one.
    paths = [os.path.join(directory_path, name) + "/" for name in names]
    return paths, names

In [None]:
def open_dic(directory_path):
    """Return the paths and names of every entry in ``directory_path``.

    Entries are sorted by name so that the order of the returned paths is
    deterministic (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory_path: Directory to list; a trailing "/" is optional.

    Returns:
        A ``(paths, names)`` tuple of two parallel lists.
    """
    names = sorted(os.listdir(directory_path))
    # os.path.join inserts the separator only when directory_path lacks one.
    paths = [os.path.join(directory_path, name) for name in names]
    return paths, names

In [None]:
def open_images(image_paths):
    """Load every image in ``image_paths`` into a NumPy array.

    Args:
        image_paths: Iterable of paths to image files.

    Returns:
        A list with one ``np.ndarray`` per path, in the same order.
    """
    image_arrays = []
    for image_path in image_paths:
        # Image.open keeps the underlying file open; the context manager
        # closes it as soon as np.array has copied the pixel data out.
        with Image.open(image_path) as image:
            image_arrays.append(np.array(image))
    return image_arrays

In [None]:

def images_to_feature_vectors(image_paths):
    """Load images and flatten them into one feature matrix.

    Rows correspond to images and columns to features (pixel values).

    Args:
        image_paths: Sequence of paths to image files. All images must hold
            the same number of values.

    Returns:
        A float array of shape ``(n_images, n_features)``. An empty input
        yields an empty ``(0, 0)`` array instead of raising.

    Raises:
        ValueError: If an image's size differs from that of the first image.
    """
    images = open_images(image_paths)
    if not images:
        return np.empty((0, 0))
    # .size (rather than unpacking h, w) also accepts multi-channel images.
    n_features = images[0].size
    fvectors = np.empty((len(images), n_features))
    for i, (image_path, image) in enumerate(zip(image_paths, images)):
        if image.size != n_features:
            raise ValueError(
                "Image " + str(image_path) + " has " + str(image.size)
                + " values, expected " + str(n_features)
            )
        fvectors[i, :] = image.reshape(-1)
    return fvectors

In [None]:
def split_two(image_list, ratio=(0.7, 0.3)):
    """Split ``image_list`` into training and testing parts.

    Args:
        image_list: Sequence to split; order is preserved.
        ratio: ``(train, test)`` fractions. Only the training fraction is
            used; the test part is whatever remains.

    Returns:
        A ``(train, test)`` tuple of NumPy arrays (as produced by
        ``np.split``).
    """
    train_ratio = ratio[0]
    # A float product can land a hair below the intended integer
    # (90 * 0.7 == 62.99999999999999), so nudge it before truncating.
    split_index = int(len(image_list) * train_ratio + 1e-9)
    train, test = np.split(image_list, [split_index])
    return train, test


def split_three(image_list, ratio=(0.8, 0.1, 0.1)):
    """Split ``image_list`` into training, validation and testing parts.

    Args:
        image_list: Sequence to split; order is preserved.
        ratio: ``(train, validation, test)`` fractions; must sum to 1.

    Returns:
        A ``(train, val, test)`` tuple of NumPy arrays (as produced by
        ``np.split``).

    Raises:
        AssertionError: If the fractions do not sum to 1.
    """
    train_r, val_r, _ = ratio
    # Compare with a tolerance: 0.7 + 0.2 + 0.1 is 0.9999999999999999 in
    # floating point and would fail an exact equality check.
    assert np.isclose(np.sum(ratio), 1.0), "ratio must sum to 1"
    n_items = len(image_list)
    # The small epsilon keeps float error from truncating e.g. 8.999999999999998 to 8.
    train_end = int(n_items * train_r + 1e-9)
    val_end = int(n_items * (train_r + val_r) + 1e-9)
    train, val, test = np.split(image_list, [train_end, val_end])
    return train, val, test

In [None]:
def label_data(directory):
    """Build a labelled data set from a directory of per-label folders.

    Every image is expected to sit in a subdirectory named after its label.

    Args:
        directory: Directory containing one subdirectory per label.

    Returns:
        A dict with two parallel lists: ``"fvectors"`` (one feature vector
        per image) and ``"labels"`` (the name of the subdirectory each image
        came from).
    """
    data_fvectors = []
    data_labels = []
    subdirectory_paths, subdirectory_names = open_directory(directory)
    for subdirectory_path, label in zip(subdirectory_paths, subdirectory_names):
        # os.path.join avoids the doubled "/" that plain concatenation gave,
        # since open_directory's paths already end with a separator.
        image_paths = [
            os.path.join(subdirectory_path, image_name)
            for image_name in sorted(os.listdir(subdirectory_path))
        ]
        if not image_paths:
            # An empty label folder contributes nothing instead of crashing.
            continue
        fvectors = images_to_feature_vectors(image_paths)
        data_fvectors.extend(fvectors)
        data_labels.extend([label] * len(fvectors))

    return {"fvectors": data_fvectors, "labels": data_labels}

In [None]:
def split_train_test(directory):
    """Build randomly split training and testing data sets from ``directory``.

    Works like ``label_data`` but shuffles the images of every label and
    divides them with ``split_two`` before extracting feature vectors.

    Args:
        directory: Directory containing one subdirectory per label.

    Returns:
        A ``(train_model, test_model)`` tuple. Each is a dict with two
        parallel lists, ``"fvectors"`` and ``"labels"``.
    """
    train_model = {"fvectors": [], "labels": []}
    test_model = {"fvectors": [], "labels": []}
    subdirectory_paths, subdirectory_names = open_directory(directory)
    for subdirectory_path, label in zip(subdirectory_paths, subdirectory_names):
        image_names = os.listdir(subdirectory_path)
        shuffle(image_names)
        # os.path.join avoids the doubled "/" that plain concatenation gave.
        image_paths = [os.path.join(subdirectory_path, name) for name in image_names]
        train_paths, test_paths = split_two(image_paths)
        for model, paths in ((train_model, train_paths), (test_model, test_paths)):
            if len(paths) == 0:
                # Labels with very few images can leave one side of the split empty.
                continue
            fvectors = images_to_feature_vectors(paths)
            model["fvectors"].extend(fvectors)
            model["labels"].extend([label] * len(fvectors))

    return train_model, test_model

In [None]:
def save_pickle(data: dict, path: str = "data.pkl") -> None:
    """Save the training model to a pickle file.

    Args:
        data: The model dictionary to serialise.
        path: Destination file; defaults to ``"data.pkl"`` in the working
            directory.
    """
    # The context manager closes the file even if pickling raises.
    with open(path, "wb") as a_file:
        pickle.dump(data, a_file)


def load_pickle(path: str = "data.pkl") -> dict:
    """Load the training model from a pickle file.

    Args:
        path: File to read; defaults to ``"data.pkl"`` in the working
            directory.

    Returns:
        The unpickled model.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files that this notebook wrote itself.
    with open(path, "rb") as a_file:
        return pickle.load(a_file)

In [None]:
def cosine_similarity(training, testing):
    """Return the negated cosine similarity between test and training vectors.

    The sign is flipped so that smaller values mean "more similar", which
    lets the result be sorted like a distance.

    Args:
        training: Array-like of shape ``(n_train, n_features)``.
        testing: Array-like of shape ``(n_test, n_features)``.

    Returns:
        A float array of shape ``(n_test, n_train)``; rows correspond to test
        vectors and columns to training vectors. Pairs involving an all-zero
        vector get 0 instead of NaN.
    """
    training = np.asarray(training, dtype=float)
    testing = np.asarray(testing, dtype=float)
    tdott = np.dot(testing, training.transpose())
    modtrain = np.sqrt(np.sum(training * training, axis=1))
    modtest = np.sqrt(np.sum(testing * testing, axis=1))
    norms = np.outer(modtest, modtrain)
    # A zero norm means the dot product is zero too; dividing by 1 there
    # yields 0 rather than a 0/0 NaN.
    norms[norms == 0] = 1.0
    dist = -tdott / norms
    return dist

In [None]:
def euclidean_distance(training, testing):
    """Return the Euclidean distance between test and training vectors.

    Args:
        training: Array-like of shape ``(n_train, n_features)``.
        testing: Array-like of shape ``(n_test, n_features)``.

    Returns:
        A float array of shape ``(n_test, n_train)``; rows correspond to test
        vectors and columns to training vectors.
    """
    training = np.asarray(training, dtype=float)
    testing = np.asarray(testing, dtype=float)
    # Allocate as float: an integer array would truncate every distance.
    dist = np.zeros((len(testing), len(training)), dtype=float)
    if dist.size == 0:
        return dist
    for testrow in range(len(testing)):
        # One vectorised norm per test row instead of a Python loop per pair.
        dist[testrow, :] = np.linalg.norm(training - testing[testrow], axis=1)
    return dist

In [None]:
KNNC = 5  #K in K nearest neighbours


def classify(train_model: dict, test_fvectors, k, distance) -> np.ndarray:
    """Classify feature vectors with k-nearest-neighbours.

    Computes the distance from every test vector to every training vector,
    takes the labels of the ``k`` nearest training vectors and picks the most
    common one.

    Args:
        train_model: Dict with parallel ``"fvectors"`` and ``"labels"`` lists.
        test_fvectors: Array-like of shape ``(n_test, n_features)``.
        k: Number of neighbours to consult; must be at least 1.
        distance: ``0`` selects ``cosine_similarity``; any other value
            selects ``euclidean_distance``.

    Returns:
        A NumPy array of predicted labels in the same order as
        ``test_fvectors``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    train = np.array(train_model["fvectors"])
    train_labels = np.asarray(train_model["labels"])
    # Callers may pass a plain list of vectors; the distance functions need an array.
    test_fvectors = np.asarray(test_fvectors)
    if len(test_fvectors) == 0:
        return np.array([])

    #Compute distance
    if distance == 0:
        dist = cosine_similarity(train, test_fvectors)
    else:
        dist = euclidean_distance(train, test_fvectors)

    #Extract k nearest images (column indices into the training set)
    knearest = np.argsort(dist, axis=1)[:, 0:k]

    #Look up the labels of the k nearest neighbours for each test image in one step
    klabels = pd.DataFrame(train_labels[knearest])

    #Find the most common label per row; on a tie, mode() lists the tied labels and column 0 is taken
    labels = klabels.mode(axis='columns')
    label = np.array(labels[0].tolist())
    return label

In [None]:
def evaluate(train_model: dict, test: dict, k, distance) -> float:
    """Compute the accuracy of the classifier on a labelled test set.

    Args:
        train_model: Dict with parallel ``"fvectors"`` and ``"labels"`` lists
            used as the training data.
        test: Dict with the same keys holding the test data.
        k: Number of neighbours, forwarded to ``classify``.
        distance: Distance selector, forwarded to ``classify``.

    Returns:
        The percentage (0-100) of test vectors whose predicted label equals
        the true label.

    Raises:
        ValueError: If the test set contains no labels.
    """
    true_labels = test["labels"]
    if len(true_labels) == 0:
        raise ValueError("test set is empty; cannot compute an accuracy")
    # The test vectors are stored as a list; convert so the distance functions accept them.
    test_fvectors = np.asarray(test["fvectors"])
    output_labels = classify(train_model, test_fvectors, k, distance)
    n_of_correct_labels = sum(
        1 for predicted, actual in zip(output_labels, true_labels) if predicted == actual
    )
    score = 100.0 * n_of_correct_labels / len(true_labels)
    return score


In [None]:
def test_one(directory_of_images, k, distance):
    """Create a random train/test split and run the classifier once.

    Args:
        directory_of_images: Directory containing one subdirectory per label.
        k: Number of neighbours, forwarded to ``evaluate``.
        distance: Distance selector, forwarded to ``evaluate``.

    Returns:
        The score returned by ``evaluate`` for this split.
    """
    train, test = split_train_test(directory_of_images)
    # evaluate() takes the training model first, then the test data.
    return evaluate(train, test, k, distance)


In [None]:
def test_n_times(directory_of_images, k, n, distance):
    """Run the classifier ``n`` times and print the average accuracy.

    Each run creates a fresh training/testing split via ``test_one``.

    Args:
        directory_of_images: Directory containing one subdirectory per label.
        k: Number of neighbours, forwarded to ``test_one``.
        n: Number of runs; must be at least 1.
        distance: Distance selector, forwarded to ``test_one``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    # Use the directory that was passed in, not the global default.
    accuracy = [test_one(directory_of_images, k, distance) for _ in range(n)]
    average = sum(accuracy) / n
    final_score = "Average score for k=" + str(k) + " = " + str(round(average, 2))
    print(final_score)


In [None]:
def save_model(train_data_directory):
    """Label the images under ``train_data_directory`` and pickle the result."""
    save_pickle(label_data(train_data_directory))

In [None]:
# Build the model from the combined data set and pickle it; this runs every time the cell executes.
save_model(combined_directory)

In [None]:
def classify_unlabelled_directory(segmented_image_directory, k=3, distance=0):
    """Classify every image in a directory with the saved model.

    Args:
        segmented_image_directory: Directory holding the images to classify.
        k: Number of neighbours, forwarded to ``classify``. Defaults to 3.
        distance: Distance selector, forwarded to ``classify``. Defaults to
            0, which selects the cosine measure.

    Returns:
        The predicted labels, in the order in which ``open_dic`` lists the
        directory.
    """
    image_paths, _ = open_dic(segmented_image_directory)
    image_fvectors = images_to_feature_vectors(image_paths)
    train_model = load_pickle()
    labels = classify(train_model, image_fvectors, k, distance)
    return labels

In [None]:
def predictChars(listOfChars):
    """Translate class labels into the characters they stand for.

    Labels "1" to "9" map to the same digit; labels "10" to "26" map to
    Arabic letters. A label outside that set raises ``KeyError``.

    Returns a list of characters in the same order as ``listOfChars``.
    """
    # Labels "1".."9" are the digits themselves.
    mappings = {str(digit): str(digit) for digit in range(1, 10)}

    # Labels "10".."26" are these letters, in this order.
    arabic_letters = (
        "أ", "ب", "ج", "د", "ر", "س", "ص", "ط", "ع",
        "ف", "ق", "ل", "م", "ن", "ه", "و", "ي",
    )
    for label_number, letter in enumerate(arabic_letters, start=10):
        mappings[str(label_number)] = letter

    return [mappings[char] for char in listOfChars]

In [None]:
# Classify every image in the test directory, then print the characters the predicted labels stand for.
# classify_unlabelled_directory() loads the pickled model, so data.pkl must already exist.
predicted_chars = classify_unlabelled_directory(test_images)
print(predictChars(predicted_chars))