## Evaluating Information Retrieval Models:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json

### Load Labels

In [3]:
df_questions = pd.read_csv('questions_train.csv')

df_questions.head(5)

Unnamed: 0,id,category,subcategory,question,extra_description,article_ids
0,1102,Travail,Travail et parentalité,Je suis travailleur salarié(e). Puis-je refuse...,Pendant la grossesse,"22225,22226,22227,22228,22229,22230,22231,2223..."
1,91,Argent,Dettes,Peut-on saisir tous mes revenus ?,"Procédures de récupération des dettes, Récupér...",585358545855
2,474,Famille,Situation de couples,Je suis marié(e). Nous sommes mariés. Dois-je ...,Mariage,109610971098110811091110
3,836,Logement,Location en Wallonie,Je mets un kot en location (bail de droit comm...,"Mettre un logement en location (Wallonie), Doi...",12012120301203112032120331203412035
4,1079,Travail,Maladie - incapacité de travail,Suis-je payé pendant la procédure du trajet de...,Rupture du contrat de travail pour force majeu...,"21114,21115,21116,21117,21118,21119,21120,2112..."


In [4]:
all_ids_labels = []
for article_ids in df_questions['article_ids']:
    all_ids_labels.append(article_ids)


In [5]:
len(all_ids_labels)

886

### Load Prediction

In [6]:
with open('predictions_2.json', 'r') as f:
    all_predictions = json.load(f)

In [7]:
import json
import gzip

# # Load the predictions from the JSON file
# with gzip.open('all_predictions.json.gz', 'rt', encoding='utf-8') as f:
#     all_predictions = json.load(f)

# Load the predictions from the JSON file
with gzip.open('all_predictions_RSChunk.json.gz', 'rt', encoding='utf-8') as f:
    all_predictions = json.load(f)



In [8]:
predicted_ids = []

for i in range(len(all_predictions)):
    predicted_ids.append(all_predictions[str(i)]['predicted_ids'])

print(predicted_ids[1:3])

[[5729, 13774, 5815, 5779, 5790, 22384, 13238, 18949, 8104, 5706, 5856, 5775, 5722, 5786, 5711, 156, 2123, 5778, 15822, 18942, 13236, 5980, 13770, 5762, 5745, 18925, 8837, 14331, 5777, 13238, 5956, 15771, 5974, 5810, 5730, 18950, 5854, 5791, 18926, 17696], [1094, 1120, 1121, 1094, 1119, 1103, 1145, 1102, 1111, 1120, 1112, 1123, 1096, 1124, 1125, 1118, 1104, 1099, 1108, 1123, 1114, 1101, 1120, 1113, 1146, 5386, 1125, 1148, 1227, 939, 1075, 1161, 1110, 1089, 940, 1008, 937, 1174, 1093, 934]]


### Eval Function

In [9]:
def Eval_Retrieval(all_predictions, articles_ids, top_k=20):
    # Assure que les articles_ids sont bien sous forme de liste d'ID (int) pour chaque requête
    articles_ids = [list(map(int, ids.split(','))) for ids in articles_ids]
    
    # Initialize metrics
    precisions = []
    recalls = []
    f1_scores = []
    average_precisions = []
    reciprocal_ranks = []

    # Nombre total de questions
    Q = len(all_predictions)

    # Calcul des métriques pour chaque ensemble de prédictions
    for preds, true_ids in zip(all_predictions, articles_ids):
        # Limiter les prédictions à top_k résultats
        preds = preds[:top_k]
        
        # Convertir les prédictions en set pour faciliter les calculs
        preds_set = set(preds)
        true_set = set(true_ids)

        # Calcul des True Positives (TP), False Positives (FP), et False Negatives (FN)
        tp = len(preds_set & true_set) # intersection (in both)
        fp = len(preds_set - true_set)  # Difference (in pred but not in true)
        fn = len(true_set - preds_set)  #Difference (in true but not in pred)

        # Calcul Precision, Recall, F1-Score
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

        # Calcul de l'Average Precision (AP)
        ap = 0
        relevant_count = 0
        for rank, pred in enumerate(preds, 1):  # rank starts at 1
            if pred in true_set:
                relevant_count += 1
                ap += relevant_count / rank
        ap /= len(true_set) if len(true_set) > 0 else 1
        average_precisions.append(ap)

        # Calcul Mean Reciprocal Rank (MRR)
        mrr = 0
        for rank, pred in enumerate(preds, 1):
            if pred in true_set:
                mrr = 1 / rank
                break
        reciprocal_ranks.append(mrr)

    # Calcul des métriques globales
    mean_precision = sum(precisions) / Q
    mean_recall = sum(recalls) / Q
    mean_f1 = sum(f1_scores) / Q
    mean_ap = sum(average_precisions) / Q
    mean_mrr = sum(reciprocal_ranks) / Q

    # Retourner les métriques sous forme de dictionnaire
    return {
        "mean_precision": mean_precision,
        "mean_recall": mean_recall,
        "mean_f1_score": mean_f1,
        "MAP": mean_ap,
        "MRR": mean_mrr
    }


In [12]:
metrics = Eval_Retrieval(predicted_ids, all_ids_labels, top_k=3)

format_metrics = json.dumps(metrics, indent=4)
print(format_metrics)

{
    "mean_precision": 0.19582392776523702,
    "mean_recall": 0.19493667246510607,
    "mean_f1_score": 0.1553632677653987,
    "MAP": 0.1925122599578059,
    "MRR": 0.3306997742663657
}


In [8]:
metrics = Eval_Retrieval(predicted_ids, all_ids_labels, top_k=5)

format_metrics = json.dumps(metrics, indent=4)
print(format_metrics)

{
    "mean_precision": 0.1455793829947329,
    "mean_recall": 0.2374931901131348,
    "mean_f1_score": 0.13913758057231745,
    "MAP": 0.19823666815625512,
    "MRR": 0.32904439428141463
}


In [30]:
metrics = Eval_Retrieval(predicted_ids, all_ids_labels, top_k=20)

format_metrics = json.dumps(metrics, indent=4)
print(format_metrics)


{
    "mean_precision": 0.056094808126410836,
    "mean_recall": 0.22770241505769426,
    "mean_f1_score": 0.06633374001609073,
    "MAP": 0.07571223066018752,
    "MRR": 0.1817468067896123
}


In [26]:
metrics = Eval_Retrieval(predicted_ids, all_ids_labels, top_k=5)

format_metrics = json.dumps(metrics, indent=4)
print(format_metrics)


{
    "mean_precision": 0.05146726862302483,
    "mean_recall": 0.05169027307757119,
    "mean_f1_score": 0.034377427779609115,
    "MAP": 0.028122523460075988,
    "MRR": 0.09048156508653123
}


In [26]:
metrics = Eval_Retrieval(all_predictions['predictions'], all_ids_labels, top_k=5)

format_metrics = json.dumps(metrics, indent=4)
print(format_metrics)

{
    "mean_precision": 0.1363431151241535,
    "mean_recall": 0.24542927585976157,
    "mean_f1_score": 0.13602369048546678,
    "MAP": 0.16525724725154028,
    "MRR": 0.32565838976674194
}
