### dependencies

In [1]:
import os
import pickle
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bornrule import BornClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm as progress_bar

### functions

In [2]:
#def prediction_function(classifier_dict, classifier_, histogram_train, y_train, histogram_test, y_test, TFIDF=False):
#    # Applicare TF-IDF
#    if TFIDF:
#        tfidf = TfidfTransformer()
#        histogram_train = tfidf.fit_transform(histogram_train).toarray()
#        histogram_test = tfidf.transform(histogram_test).toarray()
#
#    classifier = classifier_dict[classifier_]
#    start = time.time()
#    classifier.fit(histogram_train, y_train)
#    y_pred = classifier.predict(histogram_test)
#    end = time.time()-start
#    results = {'y_obs': y_test, 'y_pred': y_pred}
#    return results, end
#
#
#def load_and_classify(classifier_dict, classifier_, file_path, TFIDF = False):
#    # Estrazione dei parametri dal nome del file
#    file_name = os.path.basename(file_path)
#    match = re.match(r'(\w+)_(\w+)_(\w+)_(\w+)\.pkl', file_name)
#    if match:
#        dataset = match.group(1)
#        extractor_name = match.group(2)
#        distance_metric = match.group(3)
#        k = int(match.group(4))
#    else:
#        raise ValueError("Nome del file non conforme al formato previsto.")
#    
#    with open(file_path, 'rb') as f:
#        data = pickle.load(f)
#    
#    histogram_train = np.array(data['histograms_train'])
#    y_train = np.array(data['y_train'])
#    histogram_test = np.array(data['histograms_test'])
#    y_test = np.array(data['y_test'])
#
#    results, comp_time = prediction_function(classifier_dict, classifier_, histogram_train, y_train, histogram_test, y_test, TFIDF)
#    f1 = f1_score(results['y_obs'], results['y_pred'], average='weighted')
#    
#    tuning_results = []
#    tuning_results.append({
#        'Params': {'classifier': classifier_, 'dataset':dataset, 'extractor': extractor_name, 'distance_metric': distance_metric, 'k': k, 'tfidf': TFIDF},
#        'F1_Score': f1 * 100,
#        'Comp_time': comp_time
#    })
#    
#    return tuning_results
#
#
#def results_to_dataframe(results_dict):
#    records = []
#    for sublist in results_dict:
#        for result in sublist:
#            params = result['Params']
#            classifier = params.get('classifier')
#            dataset = params.get('dataset')
#            extractor = params.get('extractor')
#            distance_metric = params.get('distance_metric')
#            k = params.get('k')
#            tfidf = params.get('tfidf')
#            f1_score = result.get('F1_Score')
#            computational_time = result.get('Comp_time')
#            
#            records.append((dataset, classifier, extractor, distance_metric, k, round(f1_score, 3), round(computational_time, 3), tfidf))
#
#    df = pd.DataFrame(records, columns=['Dataset', 'Classifier', 'Extractor', 'Distance_Metric', 'K', 'F1_Score(%)', 'Computational_time(s)', 'tfidf'])
#    df = df.sort_values(by='F1_Score(%)', ascending=False)
#    
#    return df



def cross_val_prediction_function(classifier_dict, classifier_, histogram_data, y_data, n_splits=5, TFIDF=False):
    """Score one classifier with stratified k-fold cross-validation.

    Parameters
    ----------
    classifier_dict : dict
        Maps classifier names to estimator objects exposing ``fit`` and ``predict``.
    classifier_ : hashable
        Key of the estimator in ``classifier_dict`` to evaluate.
    histogram_data : array
        Feature matrix, one row per sample; must support row indexing with an index array.
    y_data : array-like
        One label per row of ``histogram_data``.
    n_splits : int, default 5
        Number of folds passed to ``StratifiedKFold``.
    TFIDF : bool, default False
        When True, each fold is re-weighted with a ``TfidfTransformer`` that is
        fitted on that fold's training rows only.

    Returns
    -------
    results : dict
        ``{'y_obs': ..., 'y_pred': ...}`` with the held-out labels and the
        matching predictions of every fold, concatenated in fold order.
    avg_f1_score : float
        Mean weighted F1 score over the folds (in [0, 1]).
    avg_computation_time : float
        Mean fit + predict time per fold, in seconds (TF-IDF time excluded).
    """
    classifier = classifier_dict[classifier_]
    y_data = np.asarray(y_data)
    skf = StratifiedKFold(n_splits=n_splits)

    f1_scores = []
    computation_times = []
    observed_per_fold = []
    predicted_per_fold = []

    for train_index, test_index in skf.split(histogram_data, y_data):
        histogram_train, histogram_test = histogram_data[train_index], histogram_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]

        if TFIDF:
            # Fit the IDF weights on the training fold only: fitting on the full
            # dataset would leak document frequencies from the held-out rows.
            tfidf = TfidfTransformer()
            histogram_train = tfidf.fit_transform(histogram_train).toarray()
            histogram_test = tfidf.transform(histogram_test).toarray()

        # perf_counter is monotonic, so the measurement is immune to clock adjustments.
        start = time.perf_counter()
        classifier.fit(histogram_train, y_train)
        y_pred = classifier.predict(histogram_test)
        elapsed = time.perf_counter() - start

        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        computation_times.append(elapsed)
        observed_per_fold.append(y_test)
        predicted_per_fold.append(np.asarray(y_pred))

    # Report the predictions of every fold, not just the last one.
    results = {
        'y_obs': np.concatenate(observed_per_fold),
        'y_pred': np.concatenate(predicted_per_fold),
    }
    avg_f1_score = np.mean(f1_scores)
    avg_computation_time = np.mean(computation_times)

    return results, avg_f1_score, avg_computation_time


def load_and_classify(classifier_dict, classifier_, file_path, n_splits=5, TFIDF=False):
    """Load one pickled histogram file and cross-validate a classifier on it.

    The file name must be ``<dataset>_<extractor>_<distance_metric>_<k>.pkl``
    with an integer ``k``. The pickle must hold a mapping with the keys
    ``'histograms_train'``, ``'y_train'``, ``'histograms_test'`` and
    ``'y_test'``; the train and test parts are concatenated and evaluated
    together by ``cross_val_prediction_function``.

    Parameters
    ----------
    classifier_dict : dict
        Maps classifier names to estimator objects.
    classifier_ : hashable
        Key of the estimator in ``classifier_dict`` to evaluate.
    file_path : str
        Path of the pickle file.
    n_splits : int, default 5
        Number of cross-validation folds.
    TFIDF : bool, default False
        Forwarded to ``cross_val_prediction_function``.

    Returns
    -------
    list of dict
        A single-element list ``[{'Params': {...}, 'F1_Score': ..., 'Comp_time': ...}]``
        where ``F1_Score`` is the mean F1 score multiplied by 100.

    Raises
    ------
    ValueError
        If the file name does not follow the expected pattern.
    """
    # Extract the experiment parameters from the file name.
    file_name = os.path.basename(file_path)
    # fullmatch rejects trailing junk such as "….pkl.bak", and \d+ guarantees
    # that k is numeric so int() below cannot fail with an unrelated message.
    match = re.fullmatch(r'(\w+)_(\w+)_(\w+)_(\d+)\.pkl', file_name)
    if match is None:
        raise ValueError("Nome del file non conforme al formato previsto.")
    dataset, extractor_name, distance_metric, k_text = match.groups()
    k = int(k_text)

    # NOTE(security): pickle.load can execute arbitrary code; only open files
    # produced by a trusted pipeline.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    histogram_train = np.asarray(data['histograms_train'])
    y_train = np.asarray(data['y_train'])
    histogram_test = np.asarray(data['histograms_test'])
    y_test = np.asarray(data['y_test'])

    # Pool the stored split: the folds are re-drawn by the cross-validation.
    histogram_data = np.concatenate((histogram_train, histogram_test), axis=0)
    y_data = np.concatenate((y_train, y_test), axis=0).flatten()

    _, avg_f1_score, avg_computation_time = cross_val_prediction_function(
        classifier_dict, classifier_, histogram_data, y_data, n_splits, TFIDF
    )

    return [{
        'Params': {'classifier': classifier_, 'dataset': dataset, 'extractor': extractor_name, 'distance_metric': distance_metric, 'k': k, 'tfidf': TFIDF},
        'F1_Score': avg_f1_score * 100,
        'Comp_time': avg_computation_time
    }]


def results_to_dataframe(results_dict):
    """Flatten nested run results into a DataFrame ordered by F1 score.

    ``results_dict`` is an iterable of lists; every inner item is a dict with
    a ``'Params'`` dict plus ``'F1_Score'`` and ``'Comp_time'`` values. Both
    numeric values are rounded to three decimals. The returned frame is sorted
    by ``'F1_Score(%)'`` in descending order and keeps the original row labels.
    """
    column_names = ['Dataset', 'Classifier', 'Extractor', 'Distance_Metric', 'K', 'F1_Score(%)', 'Computational_time(s)', 'tfidf']
    # Parameter keys, listed in the order of the leading output columns.
    leading_keys = ('dataset', 'classifier', 'extractor', 'distance_metric', 'k')

    rows = []
    for batch in results_dict:
        for entry in batch:
            settings = entry['Params']
            rows.append((
                *(settings.get(key) for key in leading_keys),
                round(entry.get('F1_Score'), 3),
                round(entry.get('Comp_time'), 3),
                settings.get('tfidf'),
            ))

    table = pd.DataFrame(rows, columns=column_names)
    return table.sort_values(by='F1_Score(%)', ascending=False)


### Run

In [3]:
# Fixed seed so the randomized estimators give repeatable scores across runs.
SEED = 42

# Directory holding the pickled histogram files (one per configuration).
file_path = 'Histograms_repository'

# Keep only regular ".pkl" files: any other entry (hidden files, notes, ...)
# would make load_and_classify raise and abort the whole run.
# Sorted because os.listdir returns entries in arbitrary order.
file_names = sorted(
    os.path.join(file_path, f)
    for f in os.listdir(file_path)
    if f.endswith('.pkl') and os.path.isfile(os.path.join(file_path, f))
)

# Estimators to evaluate, keyed by the name reported in the results table.
classifiers = {
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "LogisticRegression": LogisticRegression(),
    "MultinomialNB": MultinomialNB(),
    "KNeighbors": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=SEED),
    "BORN(0.5,1,1)": BornClassifier(0.5,1,1),
    "BORN(0.5,1,0)": BornClassifier(0.5,1,0),
    "BORN(1,0,0)": BornClassifier(1,0,0),
    "BORN(1,0,1)": BornClassifier(1,0,1),
    }

In [None]:
# Evaluate every classifier on every histogram file (raw histograms, no TF-IDF).
# Each call yields a one-element list, so results_dict ends up as a list of lists.
results_dict = []
for histogram_file in progress_bar(file_names):
    results_dict.extend(
        load_and_classify(classifiers, classifier_name, histogram_file, TFIDF=False)
        for classifier_name in classifiers
    )

In [5]:
# Flatten the per-run result lists into one table, sorted by F1 score (descending).
df = results_to_dataframe(results_dict)
# Persist the table. to_csv writes the row index as an extra first column by
# default, and the target name carries no ".csv" extension.
df.to_csv("cv_results_all_configurations_cifar_imnet")

In [6]:
# Show the results table as the cell output.
df

Unnamed: 0,Dataset,Classifier,Extractor,Distance_Metric,K,F1_Score(%),Computational_time(s),tfidf
307,IMAGENET,LogisticRegression,SIFT,cosine,5000,52.712,10.913,False
352,IMAGENET,LogisticRegression,SIFT,euclidean,5000,52.572,9.809,False
298,IMAGENET,LogisticRegression,SIFT,cosine,2500,52.202,5.172,False
343,IMAGENET,LogisticRegression,SIFT,euclidean,2500,51.645,5.455,False
280,IMAGENET,LogisticRegression,SIFT,cosine,100,49.635,0.525,False
...,...,...,...,...,...,...,...,...
230,IMAGENET,"BORN(0.5,1,1)",ORB,euclidean,10,3.508,0.007,False
255,IMAGENET,KNeighbors,ORB,euclidean,2500,3.484,1.726,False
264,IMAGENET,KNeighbors,ORB,euclidean,5000,3.321,3.450,False
210,IMAGENET,KNeighbors,ORB,cosine,2500,3.035,1.766,False


In [7]:
# NOTE(review): deliberate ZeroDivisionError — presumably a stop marker so that
# "Run All" halts before the result-analysis cells below; confirm the intent.
1/0

ZeroDivisionError: division by zero

### result analysis

In [None]:
# Load the stored pipeline results. In this file the 'Dataset' and 'Classifier'
# headers are exchanged, so swap the two labels and then swap the two columns'
# positions so the column order still reads Dataset, Classifier, ...
label_swap = {'Dataset': 'Classifier', 'Classifier': 'Dataset'}

stored_results = (
    pd.read_csv('File_with_results/cifar_imagenet_pipeline_results')
    .rename(columns=label_swap)  # both labels are exchanged in a single pass
    .drop(columns=['Unnamed: 0'])  # index column written by an earlier to_csv
)
column_order = [label_swap.get(label, label) for label in stored_results.columns]
stored_results = stored_results[column_order]

# Keep only the BORN classifier variants.
is_born = stored_results['Classifier'].str.startswith('BORN')
df = stored_results[is_born].reset_index(drop=True)


In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# NOTE(review): df_tfidf is not created in any cell visible in this notebook.
# It must be loaded before this cell runs — presumably the TF-IDF results,
# prepared like df above; confirm the source file.

# Columns that together identify one experimental configuration.
match_columns = ['Dataset', 'Classifier', 'Extractor', 'Distance_Metric', 'K']


def build_match_table(frame):
    """Return a two-column frame: the configuration key and the F1 score.

    The 'match' key joins the values of ``match_columns`` with '_'. It is
    built column-wise, so it does not depend on the frame having a default
    0..n-1 index. The input frame is left unmodified.
    """
    parts = [frame[column].astype(str) for column in match_columns]
    match_key = parts[0].str.cat(parts[1:], sep='_')
    return pd.DataFrame({'match': match_key, 'F1_Score(%)': frame['F1_Score(%)']})


df = build_match_table(df)
df_tfidf = build_match_table(df_tfidf)


In [None]:
# Pair the raw-histogram and TF-IDF scores of each configuration (inner join on the key).
score_pairs = pd.merge(df, df_tfidf, on='match', suffixes=('_histograms', '_tfidf'))

# Recover the configuration fields from the '_'-joined key; every field,
# including K, comes back as a string.
key_columns = ['dataset', 'classifier', 'extractor', 'distance_metric', 'K']
score_pairs[key_columns] = score_pairs['match'].str.split('_', expand=True)

df_merged = (
    score_pairs[key_columns + ['F1_Score(%)_histograms', 'F1_Score(%)_tfidf']]
    .rename(columns={'F1_Score(%)_histograms': 'f1_hist', 'F1_Score(%)_tfidf': 'f1_tfidf'})
    # Positive difference: the raw histograms scored higher than TF-IDF.
    .assign(difference=lambda table: table['f1_hist'] - table['f1_tfidf'])
)

In [None]:
# Optional filter, currently disabled: keep only the IMAGENET rows.
# df_merged = df_merged[(df_merged.dataset =='IMAGENET')]
# Show the merged comparison table as the cell output.
df_merged

In [None]:
# df_merged = df_merged[(df_merged.extractor =='SIFT') & (df_merged.distance_metric == 'euclidean') & (df_merged.K == '2500')].reset_index(drop=True)
# df_merged
# df_merged = df_merged[df_merged.difference <0].reset_index(drop=True)
# df_merged

In [None]:
# Grouped bar chart: raw-histogram vs TF-IDF F1 score, one pair of bars per
# row of df_merged.

# Width of a single bar; the TF-IDF bar is drawn one bar width to the right.
bar_width = 0.35

# Left-bar positions, one per merged configuration.
positions = np.arange(len(df_merged))

fig, ax = plt.subplots(figsize=(10, 6))

# Bars for the raw-histogram scores.
ax.bar(positions, df_merged['f1_hist'], bar_width, label='Original')

# Bars for the TF-IDF scores.
ax.bar(positions + bar_width, df_merged['f1_tfidf'], bar_width, label='TF-IDF')

# Axis labels, title and legend.
ax.set_xlabel('Match')
ax.set_ylabel('F1 Score (%)')
ax.set_title('Confronto degli F1 Score (%) Original e TF-IDF')
# Centre each tick between the two bars of its pair.
ax.set_xticks(positions + bar_width / 2)
ax.legend()

# Render the figure.
plt.show()
