In [1]:
# For every selected dataset variation, load the saved verification results of each retriever/verifier setup and score them against the gold labels
import pickle as pkl
import pandas as pd
import json, os, jsonlines

from lkae.utils.data_loading import root_dir
from lkae.utils.scoring import eval_run_custom_nofile

out_dir = './'  # directory that is searched for pickled verification runs
data = []  # one result record (dict) per scored run

split = 'train_dev_combined'
dataset_split = f'English_{split}'


# ground truth RQ3
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')
# Read through a context manager so the underlying file handle is closed as
# soon as every record has been materialised into the list.
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)


# dataset variations whose saved runs are scored
selected_variations = ["nopre-nam-bio", "nopre-nonam-nobio", "pre-nam-bio", "pre-nonam-nobio"]

# Index every experiment configuration by a "<retriever>__<verifier>"
# fingerprint, keeping the two method names alongside it.
setups = {}

with open('../config.json', 'r') as file:
    configs = json.load(file)

for config in configs['configs']:
    retriever_name = config["retriever_method"]
    verifier_name = config["verifier_method"]
    exp_fingerprint = f'{retriever_name}__{verifier_name}'

    setups[exp_fingerprint] = {
        'retriever': retriever_name,
        'verifier': verifier_name,
    }


# Score every (dataset variation, retriever/verifier setup) combination for
# which a pickled verification run exists, collecting one record per run.
for dataset_variation in selected_variations:
    for exp_fingerprint, setup in setups.items():

        run_filename = f'{out_dir}/{dataset_variation}_{exp_fingerprint}.pkl'

        # Without a results file there is nothing to score. Skipping here is
        # essential: falling through would either raise NameError (first
        # iteration) or re-score the previous run's results under this
        # combination's labels.
        if not os.path.exists(run_filename):
            print(f'file {run_filename} not found, skipping')
            continue

        # NOTE(security): unpickling can execute arbitrary code; only load
        # files produced by trusted runs.
        with open(run_filename, 'rb') as run_file:
            verification_results = pkl.load(run_file)

        macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

        # Take the labels from the stored setup instead of re-splitting the
        # fingerprint, which would break if a method name contained '__'.
        retriever_label = setup['retriever']
        verifier_label = setup['verifier']

        data.append({
            'Macro-F1': macro_f1,
            'Strict-Macro-F1': strict_macro_f1,
            'Retrieval_Method': retriever_label,
            'Verifier_Method': verifier_label,
            'DS_Settings': dataset_variation,
        })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

# Order the runs from best to worst Macro-F1
df_sorted = df_verification.sort_values(by='Macro-F1', ascending=False)

# Add an overall rank based on the Macro-F1 score: the best score gets rank 1,
# tied scores share a rank and the next distinct score gets the next integer.
# A dense rank needs no sentinel start value, so a perfect score of 1.0 is
# ranked 1 rather than 0; missing scores are ranked last.
df_sorted['Rank'] = (
    df_sorted['Macro-F1']
    .rank(method='dense', ascending=False, na_option='bottom')
    .astype(int)
)

# move rank column to the front
df_sorted = df_sorted[['Rank', 'Macro-F1', 'Strict-Macro-F1', 'Retrieval_Method', 'Verifier_Method', 'DS_Settings']]

display(df_sorted)

Unnamed: 0,Rank,Macro-F1,Strict-Macro-F1,Retrieval_Method,Verifier_Method,DS_Settings
17,1,0.858523,0.85047,rerank-nv-embed-v1,openai-4o,pre-nam-bio
24,2,0.854713,0.846412,rerank-nv-embed-v1,openai-4o,pre-nonam-nobio
10,3,0.850244,0.842376,rerank-nv-embed-v1,openai-4o,nopre-nonam-nobio
3,4,0.844351,0.84008,rerank-nv-embed-v1,openai-4o,nopre-nam-bio
2,5,0.81038,0.797802,rerank-nv-embed-v1,openai-4o-mini,nopre-nam-bio
9,6,0.803897,0.79536,rerank-nv-embed-v1,openai-4o-mini,nopre-nonam-nobio
23,7,0.793172,0.784668,rerank-nv-embed-v1,openai-4o-mini,pre-nonam-nobio
16,8,0.781789,0.768742,rerank-nv-embed-v1,openai-4o-mini,pre-nam-bio
26,9,0.774271,0.762466,rerank-nv-embed-v1,llama3-1-70b,pre-nonam-nobio
12,10,0.767206,0.759516,rerank-nv-embed-v1,llama3-1-70b,nopre-nonam-nobio


In [2]:
# Keep only the first row per verifier method. Because df_sorted is ordered
# by descending Macro-F1, that row is each verifier's best result.
df_best = df_sorted.drop_duplicates(subset='Verifier_Method', keep='first')

df_best

Unnamed: 0,Rank,Macro-F1,Strict-Macro-F1,Retrieval_Method,Verifier_Method,DS_Settings
17,1,0.858523,0.85047,rerank-nv-embed-v1,openai-4o,pre-nam-bio
2,5,0.81038,0.797802,rerank-nv-embed-v1,openai-4o-mini,nopre-nam-bio
26,9,0.774271,0.762466,rerank-nv-embed-v1,llama3-1-70b,pre-nonam-nobio
27,11,0.766809,0.751714,rerank-nv-embed-v1,llama3-1-405b,pre-nonam-nobio
18,17,0.631411,0.62089,rerank-nv-embed-v1,llama3-1-8b,pre-nam-bio
7,21,0.322321,0.322321,rerank-nv-embed-v1,transformers-roberta,nopre-nonam-nobio
1,23,0.298747,0.280012,rerank-nv-embed-v1,transformers-bart,nopre-nam-bio
