In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkls, root_dir, AuredDataset
from lkae.retrieval.retrieve import get_retriever,retrieve_evidence
from lkae.verification.verify import get_verifier, Judge, run_verifier_on_dataset
from lkae.utils.scoring import eval_run_custom_nofile

# Split to evaluate on. Possible splits: train, dev, train_dev_combined
# (test and all_combined don't have "labels", so they cannot be scored here).
split = 'train_dev_combined'

# Names derived from the split: the key into the pickled datasets and the qrels file name.
dataset_split = f'English_{split}'
qrel_filename = f'{dataset_split}_qrels.txt'

# Load every pickled dataset, then keep the variations belonging to the chosen split
# and show which variation keys are available.
datasets = load_pkls(pkl_dir)
dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

# Initialise PyTerrier only when it has not been started yet, so that
# re-executing this cell in a live kernel does not call pt.init() a second time.
if not pt.started():
    pt.init()

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


PyTerrier 0.10.1 has loaded Terrier 5.10 (built by craigm on 2024-08-22 17:33) and terrier-helper 0.0.8



In [2]:
# ground truth RQ3: one JSON object per line in the gold file of the chosen split
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')
# read inside a context manager so the file handle is closed once all lines are loaded
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)

# select a set of variations of the dataset
selected_variations = ["nopre-nam-bio", "nopre-nonam-nobio", "pre-nam-bio", "pre-nonam-nobio"]

In [3]:
# Build one experiment setup per entry in config.json.
# Only the verifier is instantiated here; the retriever is recorded by its method name
# (the line constructing a retriever from the config was disabled in the original).
setups = {}

with open('config.json', 'r') as config_file:
    configs = json.load(config_file)

for config in configs['configs']:
    # fingerprint identifies the retriever/verifier combination of this setup
    exp_fingerprint = f'{config["retriever_method"]}__{config["verifier_method"]}'
    setups[exp_fingerprint] = {
        'retriever': config["retriever_method"],
        'verifier': get_verifier(**config),
    }

display(setups)

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'rerank-nv-embed-v1__transformers-roberta': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x223b6c32650>},
 'rerank-nv-embed-v1__transformers-bart': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x2242464ecb0>},
 'rerank-nv-embed-v1__openai-4o-mini': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x22424843f70>},
 'rerank-nv-embed-v1__openai-4o': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x22426506140>},
 'rerank-nv-embed-v1__llama3-1-8b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x22426507fa0>},
 'rerank-nv-embed-v1__llama3-1-70b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3Azure

In [4]:
# Judge used to combine the per-evidence predictions of a verifier:
#   scale=False     -> ignore scaling, weigh each evidence evenly, except for the
#                      confidence score given by the verifier
#   ignore_nei=True -> ignore NEI predictions
solomon = Judge(scale=False, ignore_nei=True)

In [5]:
# For every selected variation of the dataset, run each retriever/verifier setup
# (or load its cached result), score it against the gold labels and collect the scores.
import pickle as pkl

out_dir = 'results'
# the cache files and the CSV are written into out_dir, so make sure it exists first
os.makedirs(out_dir, exist_ok=True)
data = []


for dataset_variation in selected_variations:

    for exp_fingerprint in setups:
        # get the dataset here since it is modified in place here, contrary to RQ2
        dataset: AuredDataset = dataset_variations_dict[dataset_variation]
        start = time.time()

        # retrieval is not re-run in this notebook; the retrieval results stored by the
        # RQ1 experiment for this variation and retriever are loaded instead
        data_path = f'{root_dir}/RQ1/experiment-{split}/results/{dataset_variation}_{setups[exp_fingerprint]["retriever"]}.pkl'
        # NOTE: unpickling can execute arbitrary code; only load files produced by this project
        with open(data_path, 'rb') as retrieval_file:
            retrieved_data = pkl.load(retrieval_file)
        print(f'loaded retrieval results from {data_path}')

        dataset.add_trec_list_judgements(retrieved_data)

        run_filename = f'{out_dir}/{dataset_variation}_{exp_fingerprint}.pkl'

        # check if the file already exists from a previous run
        if os.path.exists(run_filename):
            print(f'found {run_filename}, loading from file')
            with open(run_filename, 'rb') as cached_run:
                verification_results = pkl.load(cached_run)
        else:
            print(f'running {exp_fingerprint} on {dataset_variation}')
            verification_results = run_verifier_on_dataset(
                dataset=dataset,
                verifier=setups[exp_fingerprint]['verifier'],
                judge=solomon,
                blind=False,
            )
            # cache the result so a re-run of this cell can skip the verification
            with open(run_filename, 'wb') as cached_run:
                pkl.dump(verification_results, cached_run)

        macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

        retriever_label, verifier_label = exp_fingerprint.split('__')

        print(
            f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with retriever: {retriever_label} and verifier: {verifier_label}"
        )

        # NOTE: when the result was loaded from the cache this only measures the time
        # needed for loading and scoring, not the time of the verification itself
        wall_time = time.time() - start

        data.append({
            'Macro-F1': macro_f1,
            'Strict-Macro-F1': strict_macro_f1,
            'Retrieval_Method': retriever_label, 
            'Verifier_Method': verifier_label, 
            'DS_Settings': dataset_variation,
            'Time (s)': wall_time,
        })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame
display(df_verification.sort_values(by='Macro-F1', ascending=False))

loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__transformers-roberta on nopre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


result for verification run - Macro-F1: 0.2256 Strict-Macro-F1: 0.2160 with retriever: rerank-nv-embed-v1 and retriever: transformers-roberta
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__transformers-bart on nopre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


result for verification run - Macro-F1: 0.2987 Strict-Macro-F1: 0.2800 with retriever: rerank-nv-embed-v1 and retriever: transformers-bart
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__openai-4o-mini on nopre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	196686
prompt tokens:	188460
completion tokens:	8226
price estimate:	$1.06569
result for verification run - Macro-F1: 0.8104 Strict-Macro-F1: 0.7978 with retriever: rerank-nv-embed-v1 and retriever: openai-4o-mini
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__openai-4o on nopre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	196061
prompt tokens:	186204
completion tokens:	9857
price estimate:	$0.0338448
result for verification run - Macro-F1: 0.8444 Strict-Macro-F1: 0.8401 with retriever: rerank-nv-embed-v1 and retriever: openai-4o
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
found results/nopre-nam-bio_rerank-nv-embed-v1__llama3-1-8b.pkl, loading from file
result for verification run - Macro-F1: 0.5172 Strict-Macro-F1: 0.5108 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
found results/nopre-nam-bio_rerank-nv-embed-v1__llama3-1-70b.pkl, loading from file
result for verification run - Macro-F1: 0.7218 Strict-Macro-F1: 0.7090 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-70

  0%|          | 0/115 [00:00<?, ?it/s]

result for verification run - Macro-F1: 0.3122 Strict-Macro-F1: 0.3038 with retriever: rerank-nv-embed-v1 and retriever: transformers-roberta
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__transformers-bart on pre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

result for verification run - Macro-F1: 0.2190 Strict-Macro-F1: 0.2105 with retriever: rerank-nv-embed-v1 and retriever: transformers-bart
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__openai-4o-mini on pre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	359271
prompt tokens:	342828
completion tokens:	16443
price estimate:	$1.960785
result for verification run - Macro-F1: 0.7932 Strict-Macro-F1: 0.7847 with retriever: rerank-nv-embed-v1 and retriever: openai-4o-mini
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__openai-4o on pre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	357880
prompt tokens:	338316
completion tokens:	19564
price estimate:	$0.062485799999999994
result for verification run - Macro-F1: 0.8547 Strict-Macro-F1: 0.8464 with retriever: rerank-nv-embed-v1 and retriever: openai-4o
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
found results/pre-nonam-nobio_rerank-nv-embed-v1__llama3-1-8b.pkl, loading from file
result for verification run - Macro-F1: 0.6525 Strict-Macro-F1: 0.6417 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
found results/pre-nonam-nobio_rerank-nv-embed-v1__llama3-1-70b.pkl, loading from file
result for verification run - Macro-F1: 0.7315 Strict-Macro-F1: 0.7156 with retriever: rerank-nv-embed-v1 and re

Unnamed: 0,Macro-F1,Strict-Macro-F1,Retrieval_Method,Verifier_Method,DS_Settings,Time (s)
17,0.858523,0.85047,rerank-nv-embed-v1,openai-4o,pre-nam-bio,0.023078
24,0.854713,0.846412,rerank-nv-embed-v1,openai-4o,pre-nonam-nobio,1806.097465
10,0.850244,0.842376,rerank-nv-embed-v1,openai-4o,nopre-nonam-nobio,0.019208
3,0.844351,0.84008,rerank-nv-embed-v1,openai-4o,nopre-nam-bio,1822.676337
2,0.81038,0.797802,rerank-nv-embed-v1,openai-4o-mini,nopre-nam-bio,1863.546027
9,0.803897,0.79536,rerank-nv-embed-v1,openai-4o-mini,nopre-nonam-nobio,0.017215
23,0.793172,0.784668,rerank-nv-embed-v1,openai-4o-mini,pre-nonam-nobio,1796.51254
16,0.781789,0.768742,rerank-nv-embed-v1,openai-4o-mini,pre-nam-bio,0.020559
12,0.776465,0.768775,rerank-nv-embed-v1,llama3-1-70b,nopre-nonam-nobio,0.016772
27,0.773538,0.75484,rerank-nv-embed-v1,llama3-1-405b,pre-nonam-nobio,0.017048
