# Package imports and parameter specification

In [None]:
import Clusterer, Global_feature_extractor, Local_features_extractor, Norms
import Image, PCA_reduction, Distances, Encoder_NN
from Dataset_loader import load_dataset
from Accuracy import accuracy_optimised, accuracy
import os, cv2
import numpy as np

In [None]:
# Catalogue of the interchangeable modules available at each pipeline stage.
# A stage is selected elsewhere in the notebook by indexing into its list.
PIPELINE_PATHS = dict(
    # (extraction, representation) pairs for the local-patch stage.
    local_patch_extraction_representation=[
        ("SIFT", "SIFT"),
        ("A-KAZE", "A-KAZE"),
        ("SIFT", "Autoencoder"),
        ("A-KAZE", "Autoencoder"),
    ],
    codebook_generation=["MiniBatchKMeans", "KMedoids"],
    feature_encoding_and_pooling=["BoVW", "VLAD"],
    # The key spelling is kept as-is because other cells look it up by this exact name.
    dimentionality_reduction=[None, "PCA"],
)

# (dataset name, dataset-specific parameter) pairs that can be loaded.
DATASETS = [
    ("IAM", None),
    ("TrigraphSlant", False),
    ("TrigraphSlant", True),
    ("ICDAR", "en"),
    ("ICDAR", "ar"),
]

In [None]:
# One index per pipeline stage, in the order of the PIPELINE_PATHS keys:
# [local patch extraction/representation, codebook generation,
#  feature encoding and pooling, dimensionality reduction].
pipline = [0, 0, 0, 0]

# Settings of the current run. "id" doubles as the name of the directory in
# which the artefacts of the run are stored; "datasets" holds indices into DATASETS.
training_session = {
    "id": "Madoka",
    "datasets": [0],
    "training_size": 10000,
    "testing_size": 10000
}

# Create the output directory if needed. exist_ok=True makes the cell safe to
# re-run and removes the check-then-create race of exists() followed by mkdir().
os.makedirs(training_session["id"], exist_ok=True)

# Dataset preparation

In [None]:
# Pool the train/test samples of every dataset selected for this session.
train_big_set = []
test_big_set = []

for choice in training_session["datasets"]:
    train_mini_set, test_mini_set = load_dataset(
        dataset=DATASETS[choice][0],
        path="./dataset",
        size_train=training_session["training_size"],
        size_test=training_session["testing_size"],
        parametre=DATASETS[choice][1],
    )
    train_big_set += train_mini_set
    test_big_set += test_mini_set

# Each sample is a 3-tuple; transpose the pooled samples into per-field lists.
# Only the third field (the images) is kept for the training set.
_, _, images_train_set = (list(column) for column in zip(*train_big_set))
writers_test_set, images_names_test_set, images_test_set = (
    list(column) for column in zip(*test_big_set)
)

In [None]:
# Sanity check: report how many images ended up in the training and testing sets.
print("Number of training images:",len(images_train_set))
print("Number of testing images:",len(images_test_set))

# Computing local descriptors

In [None]:
# (extraction, representation) pair of the local-patch stage selected by pipline[0].
modules_chosen = PIPELINE_PATHS["local_patch_extraction_representation"][pipline[0]]

In [None]:
# Resolve the factory of the keypoint detector named by modules_chosen[0].
if modules_chosen[0] == "SIFT":
    # SIFT lives in the main cv2 namespace since OpenCV 4.4; older contrib
    # builds only expose it under cv2.xfeatures2d, so fall back to that.
    create_detector = getattr(cv2, "SIFT_create", None) or cv2.xfeatures2d.SIFT_create
elif modules_chosen[0] == "A-KAZE":
    create_detector = cv2.AKAZE_create
else:
    # Fail with an explicit message instead of a NameError further down.
    raise ValueError("Unknown local feature detector: %r" % (modules_chosen[0],))

if modules_chosen[1] == modules_chosen[0]:
    # The detector also describes the patches: ("SIFT", "SIFT") or ("A-KAZE", "A-KAZE").
    norm = Norms.Norm.hellinger_normalization
    algo = create_detector()
    local_features_extractor_descriptor = Local_features_extractor.Local_feature_extractor(algorithm=algo, norm=norm)
else:
    # The detector only locates keypoints; an Encoder_NN loaded from model_path
    # is handed to the extractor as its algorithm.
    # NOTE(review): the three '?' values below are placeholders and must be
    # replaced with real values before this branch can be used.
    shape_images = '?'
    max_key_points = '?'
    model_path = training_session["id"]+'/?.h5'
    local_features_detector = create_detector()
    encoder = Encoder_NN.Encoder_NN((shape_images*2, shape_images*2),
                                    max_key_points,
                                    local_features_detector=local_features_detector)
    encoder.set_model(model_path=model_path)
    local_features_extractor_descriptor = Local_features_extractor.Local_feature_extractor(algorithm=encoder)

In [None]:
def get_descriptors(local_features_extractor_descriptor, images_train_set, mini_size_sample=550):
    """Pool the local descriptors of a set of images.

    Every image is wrapped in ``Image.Image`` with the given extractor and its
    ``local_descriptors`` are read. Two pools are built from them:

    * all descriptors of every image, shuffled within each image;
    * at most ``mini_size_sample`` descriptors per image, drawn at random
      without replacement.

    Images that expose no descriptors (``None`` or an empty collection) are
    skipped instead of raising.

    Args:
        local_features_extractor_descriptor: extractor passed to ``Image.Image``
            as ``local_feature_extractor``.
        images_train_set: iterable of images to process.
        mini_size_sample: maximum number of descriptors kept per image in the
            sampled pool.

    Returns:
        tuple: ``(list_local_descriptors_all, list_local_descriptors)``, two
        flat lists of descriptor rows (the full pool, then the sampled pool).
    """
    list_local_descriptors = []
    list_local_descriptors_all = []
    # Images are wrapped one at a time so this function does not keep every
    # Image wrapper alive while sampling.
    for raw_image in images_train_set:
        image = Image.Image(raw_image, local_feature_extractor=local_features_extractor_descriptor)
        descriptors = image.local_descriptors
        if descriptors is None:
            continue
        descriptors = np.asarray(descriptors)
        nb_descriptors = descriptors.shape[0] if descriptors.ndim else 0
        if nb_descriptors == 0:
            continue
        # Drawing as many indices as there are rows, without replacement, is a
        # random permutation: every descriptor is kept, in shuffled order.
        list_local_descriptors_all.extend(
            descriptors[np.random.choice(nb_descriptors, nb_descriptors, replace=False)]
        )
        # Cap the per-image contribution so images with many keypoints do not
        # dominate the sampled pool.
        list_local_descriptors.extend(
            descriptors[
                np.random.choice(
                    nb_descriptors,
                    min(mini_size_sample, nb_descriptors),
                    replace=False)
            ]
        )
    return list_local_descriptors_all, list_local_descriptors

In [None]:
# Extract the training descriptors once: the full pool and the per-image capped sample.
descriptors_all, descriptors_sample = get_descriptors(
    local_features_extractor_descriptor,
    images_train_set,
)
# Sizes of the sampled pool, then of the full pool (one per line).
print(len(descriptors_sample), len(descriptors_all), sep="\n")

# Searching for the optimal value of K

In [None]:
# Name of the clustering algorithm for the codebook stage, selected by pipline[1].
clustering_algo = PIPELINE_PATHS["codebook_generation"][pipline[1]]
# Forwarded as max_no_improvement to the Clusterer calls of this notebook.
max_no_improvement = 500
# Candidate numbers of clusters for the K search: 2, 27, 52, ..., 377.
test_values=range(2, 400, 25)

In [None]:
# Run the K search on the sampled descriptor pool with the candidate values above.
# Kept as a bare expression so the cell still displays whatever the call returns.
Clusterer.Clusterer.choose_number_clusters_clustering(
    vectors=descriptors_sample,
    algo=clustering_algo,
    max_no_improvement=max_no_improvement,
    test_values=test_values,
    verbose=1,
)

# Compute the accuracy of the system as a function of K

In [None]:
# Representation half of the local-patch stage selected by pipline[0].
local_patch_representation = PIPELINE_PATHS["local_patch_extraction_representation"][pipline[0]][1]

# Autoencoder representations are scored with the angular distance and the plain
# accuracy routine; every other representation uses chi2 and the optimised routine.
distance_metric, accuracy_calculator = (
    (Distances.Distance.angular_distance, accuracy)
    if local_patch_representation == "Autoencoder"
    else (Distances.Distance.chi2_distance, accuracy_optimised)
)

In [None]:
def principal_components(images_pre, pca_model_path, percentage_variance = 0.98):
    """Build a PCA reducer from the global descriptors of ``images_pre``.

    The descriptors are first handed to ``plot_variance_nbComponents`` and then
    to ``create_new_pca_model`` together with ``pca_model_path`` (passed as
    ``path_to_save``). Finally a ``PCA_reduction`` instance is constructed from
    that same path and returned.

    Args:
        images_pre: iterable of objects exposing a ``global_descriptor`` attribute.
        pca_model_path: path given to ``create_new_pca_model`` and to the
            ``PCA_reduction`` constructor.
        percentage_variance: forwarded unchanged to both class-level calls.

    Returns:
        The ``PCA_reduction.PCA_reduction`` instance built from ``pca_model_path``.
    """
    descriptor_stack = [img.global_descriptor for img in images_pre]
    reducer_cls = PCA_reduction.PCA_reduction

    # Both class-level calls receive the same descriptors and variance setting.
    shared_arguments = {
        "vectors": descriptor_stack,
        "percentage_variance": percentage_variance,
    }
    reducer_cls.plot_variance_nbComponents(**shared_arguments)
    reducer_cls.create_new_pca_model(path_to_save=pca_model_path, **shared_arguments)

    return reducer_cls(pca_model_path)

In [None]:
# Collects one {nb_clusters: accuracy} dict per evaluated number of clusters.
# NOTE: the evaluation loop appends to this list, so re-running the loop without
# re-running this cell keeps the results of the previous run.
accuracy_values=[]

In [None]:
# Numbers of clusters to evaluate. It is empty here, so the evaluation loop
# does nothing until values are filled in.
set_nb_clusters = []
# Name of the feature encoding and pooling method, selected by pipline[2].
module_chosen = PIPELINE_PATHS["feature_encoding_and_pooling"][pipline[2]]

In [None]:
# Wrap every test image, paired with its file name, using the chosen local feature extractor.
images_pre = [
    Image.Image(
        test_image,
        image_name=test_image_name,
        local_feature_extractor=local_features_extractor_descriptor,
    )
    for test_image, test_image_name in zip(images_test_set, images_names_test_set)
]

In [None]:
# Evaluate the pipeline once per candidate number of clusters.
for nb_clusters in set_nb_clusters:
    # Train a codebook with nb_clusters clusters on the full descriptor pool;
    # the path below is handed to the clusterer as path_to_save and then to
    # the global feature extractor as clusters_centers_path.
    clusters_centers_path = training_session["id"]+"/Centers_clusters_"+str(nb_clusters)+"nb.npy"
    Clusterer.Clusterer.fit_new_trainig(vectors=descriptors_all,
                                        algo=clustering_algo,
                                        path_to_save=clusters_centers_path,
                                        nb_clusters=nb_clusters,
                                        max_no_improvement=max_no_improvement,
                                        metric=None,
                                        verbose=0)

    if module_chosen == "BoVW":
        global_feature_extractor = Global_feature_extractor.BOW(clusters_centers_path=clusters_centers_path)
    elif module_chosen == "VLAD":
        global_feature_extractor = Global_feature_extractor.VLAD(clusters_centers_path=clusters_centers_path)
    else:
        # Without this guard an unknown name would silently reuse the extractor
        # of a previous run, or fail with a NameError on a fresh kernel.
        raise ValueError("Unknown feature encoding module: %r" % (module_chosen,))

    # Plain loop: the call is made for its side effect only, so no list is built.
    for image in images_pre:
        image.set_global_descriptor(global_feature_extractor)

    # PCA is only applied on top of VLAD; with BoVW the setting is ignored.
    if (PIPELINE_PATHS["dimentionality_reduction"][pipline[3]] == "PCA") and (module_chosen == "VLAD"):
        percentage_variance = 0.95
        pca_model_path = training_session["id"]+"/pca_model_"+str(nb_clusters)+"clusters.pkl"
        # NOTE(review): the PCA model is fitted on images_pre, which this
        # notebook builds from the test images; confirm that this is intended.
        pca_instance = principal_components(images_pre, pca_model_path, percentage_variance)
        global_feature_extractor = Global_feature_extractor.VLAD(clusters_centers_path=clusters_centers_path,
                                                                 pca_instance=pca_instance)
        # Recompute the global descriptors, this time with the PCA-aware extractor.
        for image in images_pre:
            image.set_global_descriptor(global_feature_extractor)

    accuracy_values.append({nb_clusters: accuracy_calculator(X_test=images_pre,
                                                             Y_test=writers_test_set,
                                                             distance_metric=distance_metric)
                            })

In [None]:
# Show the collected {nb_clusters: accuracy} results.
print(accuracy_values)