### dependencies

In [11]:
import pickle
import pandas as pd
import numpy as np
import time
import re
import os
from tqdm import tqdm as progress_bar
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from bornrule import BornClassifier
from sklearn.model_selection import StratifiedKFold

### functions

In [12]:
def cross_val_prediction_function(classifier_dict, classifier_, histogram_data, y_data, n_splits=5, TFIDF=False):

    if TFIDF:
        tfidf = TfidfTransformer()
        histogram_data = tfidf.fit_transform(histogram_data).toarray()

    classifier = classifier_dict[classifier_]
    skf = StratifiedKFold(n_splits=n_splits)
    f1_scores = []
    computation_times = []

    for train_index, test_index in skf.split(histogram_data, y_data):
        histogram_train, histogram_test = histogram_data[train_index], histogram_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]
        
        start = time.time()
        classifier.fit(histogram_train, y_train)
        y_pred = classifier.predict(histogram_test)
        end = time.time() - start

        f1 = f1_score(y_test, y_pred, average='weighted')
        f1_scores.append(f1)
        computation_times.append(end)
    
    avg_f1_score = np.mean(f1_scores)
    avg_computation_time = np.mean(computation_times)
    results = {'y_obs': y_test, 'y_pred': y_pred}
    
    return results, avg_f1_score, avg_computation_time


def load_and_classify(classifier_dict, classifier_, file_path, n_splits=5, TFIDF=False):

    file_name = os.path.basename(file_path)
    match = re.match(r'(\w+)_(\w+)_(\w+)_(\w+)\.pkl', file_name)
    if match:
        dataset = match.group(1)
        extractor_name = match.group(2)
        distance_metric = match.group(3)
        k = int(match.group(4))
    else:
        raise ValueError("Nome del file non conforme al formato previsto.")
    
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    
    histogram_train = np.array(data['histograms_train'])
    y_train = np.array(data['y_train'])
    histogram_test = np.array(data['histograms_test'])
    y_test = np.array(data['y_test'])

    histogram_data = np.concatenate((histogram_train, histogram_test), axis=0)
    y_data = np.concatenate((y_train, y_test), axis=0).flatten()

    
    results, avg_f1_score, avg_computation_time = cross_val_prediction_function(classifier_dict, classifier_, histogram_data, y_data, n_splits, TFIDF)
    
    tuning_results = []
    tuning_results.append({
        'Params': {'classifier': classifier_, 'dataset': dataset, 'extractor': extractor_name, 'distance_metric': distance_metric, 'k': k, 'tfidf': TFIDF},
        'F1_Score': avg_f1_score * 100,
        'Comp_time': avg_computation_time
    })
    
    return tuning_results


def results_to_dataframe(results_dict):
    records = []
    for sublist in results_dict:
        for result in sublist:
            params = result['Params']
            classifier = params.get('classifier')
            dataset = params.get('dataset')
            extractor = params.get('extractor')
            distance_metric = params.get('distance_metric')
            k = params.get('k')
            tfidf = params.get('tfidf')
            f1_score = result.get('F1_Score')
            computational_time = result.get('Comp_time')
            
            records.append((dataset, classifier, extractor, distance_metric, k, round(f1_score, 3), round(computational_time, 3), tfidf))

    df = pd.DataFrame(records, columns=['Dataset', 'Classifier', 'Extractor', 'Distance_Metric', 'K', 'F1_Score(%)', 'Computational_time(s)', 'tfidf'])
    df = df.sort_values(by='F1_Score(%)', ascending=False)
    
    return df


### Run

In [13]:
file_path = 'Histogram_configurations'
file_names = [f"{file_path}/{f}" for f in os.listdir(file_path) if os.path.isfile(os.path.join(file_path, f))]

classifiers = {
    "MultinomialNB": MultinomialNB(),
    "BORN(0.5,1,1)": BornClassifier(0.5,1,1),
    "BORN(0.5,1,0)": BornClassifier(0.5,1,0),
    "BORN(1,0,0)": BornClassifier(1,0,0),
    "BORN(1,0,1)": BornClassifier(1,0,1),
    }

In [14]:
results_dict = []
for file in progress_bar(file_names):
    for classifier in classifiers:
        results_dict.append(load_and_classify(classifiers, classifier, file, TFIDF=False) )

  0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40/40 [35:57<00:00, 53.93s/it]   


In [15]:
df_tfidf_cv = results_to_dataframe(results_dict)
df_tfidf_cv.to_csv("no_tfidf_cv")

In [16]:
df_tfidf_cv

Unnamed: 0,Dataset,Classifier,Extractor,Distance_Metric,K,F1_Score(%),Computational_time(s),tfidf
165,IMAGENET,MultinomialNB,SIFT,cosine,2500,47.498,0.193,False
190,IMAGENET,MultinomialNB,SIFT,euclidean,2500,47.356,0.176,False
170,IMAGENET,MultinomialNB,SIFT,cosine,5000,46.152,0.492,False
195,IMAGENET,MultinomialNB,SIFT,euclidean,5000,46.094,0.261,False
160,IMAGENET,MultinomialNB,SIFT,cosine,1000,44.974,0.106,False
...,...,...,...,...,...,...,...,...
131,IMAGENET,"BORN(0.5,1,1)",ORB,euclidean,100,6.431,0.059,False
106,IMAGENET,"BORN(0.5,1,1)",ORB,cosine,100,5.305,0.056,False
104,IMAGENET,"BORN(1,0,1)",ORB,cosine,10,5.222,0.014,False
126,IMAGENET,"BORN(0.5,1,1)",ORB,euclidean,10,3.508,0.025,False


In [42]:
df_tfidf = results_to_dataframe(results_dict)

df_tfidf.to_csv("TFIDF_Results")

In [43]:
df_tfidf

Unnamed: 0,Dataset,Classifier,Extractor,Distance_Metric,K,F1_Score(%),Computational_time(s),tfidf
165,IMAGENET,MultinomialNB,SIFT,cosine,2500,46.050,0.068,True
160,IMAGENET,MultinomialNB,SIFT,cosine,1000,45.932,0.029,True
190,IMAGENET,MultinomialNB,SIFT,euclidean,2500,45.822,0.080,True
185,IMAGENET,MultinomialNB,SIFT,euclidean,1000,45.727,0.041,True
195,IMAGENET,MultinomialNB,SIFT,euclidean,5000,44.598,0.151,True
...,...,...,...,...,...,...,...,...
131,IMAGENET,"BORN(0.5,1,1)",ORB,euclidean,100,6.110,0.021,True
104,IMAGENET,"BORN(1,0,1)",ORB,cosine,10,5.426,0.007,True
106,IMAGENET,"BORN(0.5,1,1)",ORB,cosine,100,5.072,0.055,True
126,IMAGENET,"BORN(0.5,1,1)",ORB,euclidean,10,3.229,0.008,True


### result analysis

In [44]:
df = pd.read_csv('File_with_results/cifar_imagenet_pipeline_results')  
df = df.rename(columns={'Dataset': 'temp_col', 'Classifier': 'Dataset'})
df = df.rename(columns={'temp_col': 'Classifier'})
df = df.drop(columns=['Unnamed: 0'])
columns = list(df.columns)
col1_index = columns.index('Dataset')
col2_index = columns.index('Classifier')
columns[col1_index], columns[col2_index] = columns[col2_index], columns[col1_index]
df = df[columns]
df = df[df['Classifier'].str.startswith('BORN')].reset_index(drop=True)
df.head()

In [45]:
df['match'] = [f"{df.Dataset[i]}_{df.Classifier[i]}_{df.Extractor[i]}_{df.Distance_Metric[i]}_{df.K[i]}" for i in range(len(df))]
df = df[['match', 'F1_Score(%)']]

df_tfidf['match'] = [f"{df_tfidf.Dataset[i]}_{df_tfidf.Classifier[i]}_{df_tfidf.Extractor[i]}_{df_tfidf.Distance_Metric[i]}_{df_tfidf.K[i]}" for i in range(len(df_tfidf))]
df_tfidf = df_tfidf[['match', 'F1_Score(%)']]


In [46]:
df_merged = pd.merge(df, df_tfidf, on='match', suffixes=('_histograms', '_tfidf'))
df_merged[['dataset', 'classifier', 'extractor', 'distance_metric', 'K']] = df_merged['match'].str.split('_', expand=True)
df_merged = df_merged[['dataset', 'classifier', 'extractor', 'distance_metric', 'K', 'F1_Score(%)_histograms', 'F1_Score(%)_tfidf']]
df_merged['difference'] = df_merged['F1_Score(%)_histograms'] - df_merged['F1_Score(%)_tfidf']
df_merged = df_merged.rename(columns={'F1_Score(%)_histograms': 'f1_hist', 'F1_Score(%)_tfidf': 'f1_tfidf'})
df_merged.head()

In [19]:
# df_merged = df_merged[(df_merged.extractor =='SIFT') & (df_merged.distance_metric == 'euclidean') & (df_merged.K == '2500')].reset_index(drop=True)
# df_merged
# df_merged = df_merged[df_merged.difference <0].reset_index(drop=True)
# df_merged