In [62]:
# constants

import requests
import re
import json
import os
import random
import math

# URL of the search service that the query cells below POST to.
API_ENDPOINT = 'http://localhost:8080/search'
# Single query used to build the consolidated text file that is sent to GPT-4.
# NOTE(review): the wording differs from QUERIES[0] ("f1 score on dataset wdc");
# confirm whether the two are meant to be the same query.
QUERY = 'f1 score on wdc dataset'

# Queries sent one by one to the search service; each produces one
# query<N>-raw.json ranking file (N is the 1-based position in this list).
QUERIES = [
    "f1 score on dataset wdc",
    "precision and recall for deep learning on ImageNet dataset",
    "accuracy comparison for federated learning models",
    "f1 and accuracy on CIFAR-10 classification",
    "ROC-AUC evaluation for MIMIC-III dataset",
    "evaluation metrics for SQuAD dataset on question answering",
    "evaluation of BERT embeddings for similarity tasks",
    "metrics comparison for GPT models on summarization",
    "training convergence for federated learning models"
]

In [None]:
def extract_id(url):
    """Extract an arXiv-style paper identifier from a URL.

    The identifier is the first ``/NNNN.NNNN`` or ``/NNNN.NNNNN`` segment found
    in ``url`` (four digits, a dot, then four or five digits). Both lengths are
    accepted because older arXiv identifiers use a four-digit sequence number.
    Any trailing version suffix (e.g. ``v2``) is not part of the result.

    Args:
        url: The URL (string) to inspect. ``None`` or an empty string is
            treated as "no identifier".

    Returns:
        The identifier as a string, or ``None`` when no identifier is present.
    """
    if not url:
        return None
    # Greedy {4,5}: a five-digit sequence number is always captured in full.
    match = re.search(r'/(\d{4}\.\d{4,5})', url)
    return match.group(1) if match else None

In [None]:
# Generate a single text file that consolidates all the necessary data.
# This file will be sent to GPT-4 for ranking purposes.

file_path = "generic-metrics/query1-bis.txt"
payload = {
    "inputString": QUERY,
    "resultCount": 10,
    "resourceType": "json"
}

# Fail fast on an unreachable/slow service or an HTTP error status instead of
# trying to parse an error body as a list of tables.
response = requests.post(API_ENDPOINT, data=payload, timeout=30)
response.raise_for_status()
tables = response.json()

# Make sure the target directory exists so the cell works on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Explicit UTF-8 so non-ASCII captions are written the same way on every platform.
with open(file_path, 'wt', encoding='utf-8') as f:
    f.write(f"Query: {QUERY}\n")
    for table in tables:
        f.write(f"\nTable: {extract_id(table['link'])}:{table['tableId']}\n")
        f.write(f"Caption: {table['caption']}\n")
        # References and footnotes are optional: written only when present and non-empty.
        references = table.get('references')
        if references:
            f.write(f"References: {references}\n")
        footnotes = table.get('footnotes')
        if footnotes:
            f.write(f"Footnotes: {footnotes}\n")
        f.write("\n-----------------------------------------\n")

In [None]:
# Generates raw json ranking file from HTTP/POST to our system.
# One file per query is written to the current working directory as
# query<N>-raw.json, where N is the 1-based position of the query in QUERIES.
# NOTE(review): the evaluation cells read "<category>/query<k>-raw.json";
# presumably these files are moved/renamed by hand afterwards. Confirm.

for file_idx, query in enumerate(QUERIES, start=1):

    payload = {
        "inputString": query,
        "resultCount": 10,
        "resourceType": "json"
    }

    # Fail fast on an unreachable/slow service or an HTTP error status instead
    # of trying to parse an error body as a list of tables.
    response = requests.post(API_ENDPOINT, data=payload, timeout=30)
    response.raise_for_status()
    tables = response.json()

    # Map 1-based rank -> "<paper id>:<table id>" in the order returned by the service.
    data = {
        table_idx: f"{extract_id(table['link'])}:{table['tableId']}"
        for table_idx, table in enumerate(tables, start=1)
    }

    with open(f'query{file_idx}-raw.json', 'w') as f:
        json.dump(data, f, indent=4)

In [None]:
# Determine the position of the first relevant element (according to the truth source)

def position_of_first_relevant_result(lucene_json, truth_json):
    """Locate the ground truth's top-ranked item inside the system ranking.

    Both files are JSON objects mapping a rank (string key) to an item id.

    Args:
        lucene_json: Path to the ranking produced by the search system.
        truth_json: Path to the ground-truth ranking; its entry under key
            ``'1'`` is taken as the most relevant item.

    Returns:
        The rank (as ``int``) at which the most relevant item appears in the
        system ranking, or ``None`` if it does not appear at all.
    """
    with open(truth_json, 'r') as truth_file:
        target_id = json.load(truth_file)['1']

    with open(lucene_json, 'r') as lucene_file:
        system_ranking = json.load(lucene_file)

    # First matching entry wins; fall back to None when nothing matches.
    return next(
        (int(rank) for rank, item_id in system_ranking.items() if item_id == target_id),
        None,
    )

## Mean Reciprocal Rank (MRR)

$$
MRR = \frac{1}{|Q|}\sum_{i=1}^{|Q|}\frac{1}{rank_{i}}
$$

Dove $|Q|$ è il numero di query e $rank_{i}$ è la posizione, nel ranking del nostro sistema, del primo elemento rilevante per la query $i$

In [42]:
# Ranking files are grouped by query category, three queries per category.
# RAWS_PATH holds the rankings produced by our system, GPT_PATH the GPT-4
# rankings; entries at the same index refer to the same query.
RAWS_PATH = [
    f"{category}/query{number}-raw.json"
    for category in ("generic-metrics", "dataset-specific", "model-specific")
    for number in (1, 2, 3)
]

GPT_PATH = [
    f"{category}/query{number}-gpt.json"
    for category in ("generic-metrics", "dataset-specific", "model-specific")
    for number in (1, 2, 3)
]


def calculate_mrr(source, truth):
    """Compute the Mean Reciprocal Rank over paired ranking files.

    Args:
        source: Iterable of paths to the rankings being evaluated.
        truth: Iterable of paths to the reference rankings, aligned
            index-by-index with ``source``.

    Returns:
        The mean of ``1 / rank`` over all (source, truth) pairs, where ``rank``
        is the position in the source ranking of the reference's top item.
        A pair whose top item is not found contributes 0. Returns 0.0 when
        there are no pairs.
    """
    reciprocal_ranks = []

    for raw, gpt in zip(source, truth):
        position = position_of_first_relevant_result(raw, gpt)
        # A relevant item that was never retrieved has reciprocal rank 0
        # (instead of crashing on 1 / None).
        reciprocal_ranks.append(0.0 if position is None else 1 / position)

    if not reciprocal_ranks:
        return 0.0

    # Average over the pairs actually evaluated, not over an unrelated global list.
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

# MRR of the rankings in RAWS_PATH measured against the rankings in GPT_PATH.
mrr = calculate_mrr(RAWS_PATH, GPT_PATH)
print(mrr)

0.6208112874779541


### Habemus MRR

0.6208112874779541 è il valore calcolato confrontando i ranking di GPT-4 con i ranking del nostro sistema sulle 9 query definite. 

Confrontiamo il risultato ottenuto con una **baseline random**

In [53]:
def shuffle_json_values(input_file, output_file):
    """Write a copy of a JSON object whose values are randomly permuted.

    The keys keep their original order; only the assignment of values to keys
    changes. Randomness comes from the module-level ``random`` generator.

    Args:
        input_file: Path of the JSON object to read.
        output_file: Path where the permuted JSON object is written
            (indented by 4 spaces).
    """
    with open(input_file, 'r') as source:
        original = json.load(source)

    # Permute a copy of the values in place; the keys are left untouched.
    permuted_values = list(original.values())
    random.shuffle(permuted_values)

    # Re-attach the permuted values to the keys in their original order.
    reassigned = dict(zip(original.keys(), permuted_values))

    with open(output_file, 'w') as target:
        json.dump(reassigned, target, indent=4)

def shuffle():
    """Create a randomly permuted counterpart for every file in RAWS_PATH.

    Each output is written next to its input, with every 'raw' in the file
    name replaced by 'rnd'.
    """
    for raw_path in RAWS_PATH:
        directory, filename = os.path.split(raw_path)
        rnd_path = os.path.join(directory, filename.replace('raw', 'rnd'))
        shuffle_json_values(raw_path, rnd_path)

In [54]:
# Randomly shuffled ranking files, listed category by category with three
# queries per category.
RND_PATH = [
    f"{category}/query{number}-rnd.json"
    for category in ("generic-metrics", "dataset-specific", "model-specific")
    for number in (1, 2, 3)
]

# Random baseline for MRR: regenerate the shuffled files several times and
# score the raw rankings against each shuffled set.
N_SHUFFLES = 10      # number of independent shuffles averaged
BASELINE_SEED = 42   # fixed seed so the baseline is reproducible on re-run

random.seed(BASELINE_SEED)

results = []

for _round in range(N_SHUFFLES):
    shuffle()  # rewrites every *-rnd.json file with a fresh permutation
    results.append(calculate_mrr(RAWS_PATH, RND_PATH))

print(results)
print(f"Average: {sum(results)/len(results)}")

[0.22698412698412695, 0.1845238095238095, 0.3630952380952381, 0.1974867724867725, 0.45185185185185184, 0.5408730158730158, 0.451984126984127, 0.2887125220458554, 0.5566137566137566, 0.3900793650793651]
Average: 0.36522045855379187


daje

## Normalized Discounted Cumulative Gain (NDCG)

$$
NDCG = \frac{DCG}{IDCG}
$$

$$
DCG = \sum_{i=1}^{n}\frac{relevance_{i}}{\log_{2}(i+1)}
$$

Dove
* $relevance_{i}$ è il *relevance score* di un elemento alla posizione $i$ nel ranking
* $i$ è il rank (iniziando da 1)

--- 

* Il $DCG$ è calcolato sul ranking prodotto dal nostro sistema, usando i *relevance score* derivati dalla **ground truth**
* Il $IDCG$ è calcolato allo stesso modo del $DCG$, ma sul ranking ideale, cioè con gli stessi *relevance score* ordinati in modo decrescente

In [None]:
def assign_relevance_score(lucene_json, truth_json):
    """Attach a graded relevance score to every item of the system ranking.

    Both files are JSON objects mapping a rank (string key) to an item id.
    With ``n`` ground-truth entries, the item at ground-truth rank ``r`` gets
    relevance ``n - r + 1`` (so rank 1 scores ``n``). Items of the system
    ranking that do not appear in the ground truth get relevance 0.

    Args:
        lucene_json: Path to the ranking produced by the search system.
        truth_json: Path to the ground-truth ranking.

    Returns:
        A dict keyed like the system ranking, each value being
        ``{"item": <item id>, "relevance_score": <int>}``.
    """
    with open(lucene_json, 'r') as f1, open(truth_json, 'r') as f2:
        lucene_ranking = json.load(f1)
        truth_ranking = json.load(f2)

    n = len(truth_ranking)
    relevance_scores = {value: n - int(key) + 1 for key, value in truth_ranking.items()}

    # Default to 0 (not None) for items missing from the ground truth so the
    # scores stay numeric for the DCG/IDCG arithmetic and sorting.
    return {
        key: {"item": value, "relevance_score": relevance_scores.get(value, 0)}
        for key, value in lucene_ranking.items()
    }

In [66]:
def compute_dcg(ranking):
    """Compute the Discounted Cumulative Gain of a scored ranking.

    Args:
        ranking: Dict whose values each carry a ``"relevance_score"``; the
            iteration order of the dict is taken as the rank order (1-based).

    Returns:
        The sum over positions ``i`` of ``relevance_i / log2(i + 1)`` as a float.
    """
    total = 0.0
    for position, entry in enumerate(ranking.values(), start=1):
        discount = math.log2(position + 1)
        total += entry["relevance_score"] / discount
    return total

In [67]:
def compute_idcg(ranking):
    """Compute the ideal DCG for the items of a scored ranking.

    The relevance scores found in ``ranking`` are rearranged from highest to
    lowest, and the DCG formula is applied to that ordering.

    Args:
        ranking: Dict whose values each carry a ``"relevance_score"``.

    Returns:
        The DCG of the best possible ordering of the given scores, as a float.
    """
    ideal_scores = [entry["relevance_score"] for entry in ranking.values()]
    ideal_scores.sort(reverse=True)

    total = 0.0
    for position, score in enumerate(ideal_scores, start=1):
        total += score / math.log2(position + 1)

    return total

In [68]:
# NDCG of each system ranking against its GPT-4 ranking, plus the overall mean.
ndcg_values = []

for raw, gpt in zip(RAWS_PATH, GPT_PATH):
    ranking = assign_relevance_score(raw, gpt)
    dcg = compute_dcg(ranking)
    idcg = compute_idcg(ranking)

    # IDCG is 0 when the ranking is empty or carries no relevance at all;
    # score such a query as 0 instead of raising ZeroDivisionError.
    ndcg = dcg / idcg if idcg else 0.0
    print(f"NDCG for {raw.replace('-raw', '')}: {ndcg}")
    ndcg_values.append(ndcg)

print(f"Average on all files: {sum(ndcg_values)/len(ndcg_values)}")

NDCG for generic-metrics/query1.json: 0.9831306821489963
NDCG for generic-metrics/query2.json: 0.972737265142328
NDCG for generic-metrics/query3.json: 0.8789381085401112
NDCG for dataset-specific/query1.json: 0.9986372961784811
NDCG for dataset-specific/query2.json: 0.9133597294931859
NDCG for dataset-specific/query3.json: 0.6975149037011815
NDCG for model-specific/query1.json: 1.0
NDCG for model-specific/query2.json: 0.9320920269228142
NDCG for model-specific/query3.json: 0.9076303300494618
Average on all files: 0.9204489269085067


### Habemus NDCG
avg = 0.9204489269085067 sembra un buon risultato :D

Ma proviamolo con il **random baseline**!!!!

In [84]:
# Random baseline for NDCG: regenerate the shuffled files several times, score
# the raw rankings against each shuffled set, and average the per-shuffle means.
N_SHUFFLES = 10      # number of independent shuffles averaged
BASELINE_SEED = 42   # fixed seed so the baseline is reproducible on re-run

random.seed(BASELINE_SEED)

avg_values = []
for _round in range(N_SHUFFLES):
    shuffle()  # rewrites every *-rnd.json file with a fresh permutation
    ndcg_values = []

    for raw, gpt in zip(RAWS_PATH, RND_PATH):
        ranking = assign_relevance_score(raw, gpt)
        dcg = compute_dcg(ranking)
        idcg = compute_idcg(ranking)

        # Guard against an all-zero/empty ranking instead of dividing by zero.
        ndcg = dcg / idcg if idcg else 0.0
        ndcg_values.append(ndcg)

    avg_values.append(sum(ndcg_values)/len(ndcg_values))

print(f"Average on {N_SHUFFLES} shuffles: {sum(avg_values)/len(avg_values)}")

Average on 10 shuffles: 0.8351258677669422


Hai capito eh