In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkls, root_dir, AuredDataset
from lkae.retrieval.retrieve import get_retriever,retrieve_evidence
from lkae.verification.verify import get_verifier, Judge, run_verifier_on_dataset
from lkae.utils.scoring import eval_run_custom_nofile

# possible splits: train, dev, train_dev_combined
# (test and all_combined don't have "labels")
split = 'train_dev_combined'

# Names derived from the chosen split: the key into the loaded pickles and
# the matching qrels file name.
dataset_split = 'English_' + split
qrel_filename = dataset_split + '_qrels.txt'

# Load all pickled datasets, then pick the entry for the chosen split; it maps
# each preprocessing variation name to its dataset.
datasets = load_pkls(pkl_dir)
dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

# Initialise PyTerrier only if it has not already been started in this kernel,
# so re-running the cell does not attempt a second initialisation.
if not pt.started():
    pt.init()

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


PyTerrier 0.10.1 has loaded Terrier 5.10 (built by craigm on 2024-08-22 17:33) and terrier-helper 0.0.8



In [2]:
# ground truth RQ3
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')

# Read all gold records inside a context manager so the underlying file handle
# is closed as soon as the records are in memory (the bare
# `jsonlines.open(...)` in a comprehension never closed it).
with jsonlines.open(gold_file) as reader:
    gold_list = list(reader)

# select a set of variations of the dataset
selected_variations = ["nopre-nonam-nobio", "pre-nam-bio"]

In [3]:
# Build one entry per experiment config, keyed by
# "<retriever_method>__<verifier_method>".
setups = {}

# Read the whole config file first so it is closed before the verifiers
# are constructed.
with open('config.json', 'r') as config_file:
    configs = json.load(config_file)

for config in configs['configs']:
    exp_fingerprint = f'{config["retriever_method"]}__{config["verifier_method"]}'

    # Only the retriever's *name* is stored: the run loop loads previously
    # pickled retrieval results by that name instead of building a retriever.
    verifier = get_verifier(**config)

    setups[exp_fingerprint] = {
        'retriever': config["retriever_method"],
        'verifier': verifier,
    }

display(setups)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'rerank-nv-embed-v1__llama3-8b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_hf.HFLlama3Verifier at 0x208ce322170>},
 'rerank-nv-embed-v1__llama3-70b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_hf.HFLlama3Verifier at 0x20907de7880>},
 'rerank-nv-embed-v1__transformers-roberta': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x20907dcb2b0>},
 'rerank-nv-embed-v1__transformers-bart': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x20910d76890>},
 'rerank-nv-embed-v1__openai-4o-mini': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x2092951bb50>},
 'rerank-nv-embed-v1__openai-4o': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x2092a86d

In [4]:
# scale=False     -> ignore scaling: weigh each evidence evenly, except for the
#                    confidence score given by the verifier
# ignore_nei=True -> ignore NEI predictions
solomon = Judge(scale=False, ignore_nei=True)

In [7]:
# For every selected dataset variation, run each configured verifier on the
# retrieval results stored by RQ1, score the run and collect one result row.
import pickle as pkl

out_dir = 'results'
# Create the output folder up front so a finished (and expensive) verifier run
# cannot be lost to a FileNotFoundError when its results are pickled.
os.makedirs(out_dir, exist_ok=True)
data = []


for dataset_variation in selected_variations:

    for exp_fingerprint in setups:
        # get the dataset here since it is modified in place here, contrary to RQ2
        dataset: AuredDataset = dataset_variations_dict[dataset_variation]
        start = time.time()

        # Retrieval is not re-run here: the results pickled by the RQ1
        # experiment for this variation/retriever are loaded from disk.
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by this project.
        data_path = f'{root_dir}/RQ1/experiment-{split}/results/{dataset_variation}_{setups[exp_fingerprint]["retriever"]}.pkl'
        with open(data_path, 'rb') as retrieval_file:
            retrieved_data = pkl.load(retrieval_file)
        # Report only after the load actually succeeded.
        print(f'loaded retrieval results from {data_path}')

        dataset.add_trec_list_judgements(retrieved_data)

        run_filename = f'{out_dir}/{dataset_variation}_{exp_fingerprint}.pkl'

        # check if the file already exists from a previous run
        if os.path.exists(run_filename):
            print(f'found {run_filename}, loading from file')
            with open(run_filename, 'rb') as cached_run:
                verification_results = pkl.load(cached_run)
        else:
            print(f'running {exp_fingerprint} on {dataset_variation}')
            verification_results = run_verifier_on_dataset(
                dataset=dataset,
                verifier=setups[exp_fingerprint]['verifier'],
                judge=solomon,
                blind=False,
            )
            # Cache the results; the context manager guarantees the pickle is
            # flushed and closed before the next setup starts.
            with open(run_filename, 'wb') as cached_run:
                pkl.dump(verification_results, cached_run)

        macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

        retriever_label, verifier_label = exp_fingerprint.split('__')

        print(
            f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with retriever: {retriever_label} and verifier: {verifier_label}"
        )

        # Measured from before the retrieval pickle is loaded until after
        # scoring; when the run was loaded from cache this reflects loading and
        # scoring time only, not the verifier's runtime.
        wall_time = time.time() - start

        data.append({
            'Macro-F1': macro_f1,
            'Strict-Macro-F1': strict_macro_f1,
            'Retrieval_Method': retriever_label,
            'Verifier_Method': verifier_label,
            'DS_Settings': dataset_variation,
            'Time (s)': wall_time,
        })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame
display(df_verification.sort_values(by='Macro-F1', ascending=False))

loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-8b on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

Error (429): 429; Text: {"error":"Rate limit reached. You reached PRO hourly usage limit. Use Inference Endpoints (dedicated) to scale your endpoint."}; sleeping 1 hour...
Error (4xx): 429; Text: {"error":"Rate limit reached. You reached PRO hourly usage limit. Use Inference Endpoints (dedicated) to scale your endpoint."}; retrying... (retries=0)
sleeping for 4 seconds before retrying (retries=1)
result for verification run - Macro-F1: 0.6702 Strict-Macro-F1: 0.6646 with retriever: rerank-nv-embed-v1 and retriever: llama3-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-70b on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

ERROR: unkown label " SUPPORTS" in answer: {"decision": " SUPPORTS", "confidence": 1}
Error (429): 429; Text: {"error":"Model is overloaded","error_type":"overloaded"}; sleeping 1 hour...
Error (4xx): 429; Text: {"error":"Model is overloaded","error_type":"overloaded"}; retrying... (retries=0)
sleeping for 4 seconds before retrying (retries=1)
Error (429): 429; Text: {"error":"Model is overloaded","error_type":"overloaded"}; sleeping 1 hour...
Error (4xx): 429; Text: {"error":"Model is overloaded","error_type":"overloaded"}; retrying... (retries=0)
sleeping for 4 seconds before retrying (retries=1)
ERROR: could not find the answer format in answer from model: {"decision": "NOT ENOUGH INFO", "confidence score": 1}
ERROR: could not find the answer format in answer from model: {"decision": ["REFUTES"], "confidence": 0.8}
ERROR: could not find the answer format in answer from model: {"decision": "SUPPORTS", "confidence_score": 0.8}
result for verification run - Macro-F1: 0.7547 Strict-Macr

  0%|          | 0/115 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


result for verification run - Macro-F1: 0.3223 Strict-Macro-F1: 0.3223 with retriever: rerank-nv-embed-v1 and retriever: transformers-roberta
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__transformers-bart on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


result for verification run - Macro-F1: 0.2835 Strict-Macro-F1: 0.2751 with retriever: rerank-nv-embed-v1 and retriever: transformers-bart
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__openai-4o-mini on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	178532
prompt tokens:	170298
completion tokens:	8234
price estimate:	$1.9500000000000002
result for verification run - Macro-F1: 0.8039 Strict-Macro-F1: 0.7954 with retriever: rerank-nv-embed-v1 and retriever: openai-4o-mini
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__openai-4o on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	177794
prompt tokens:	168042
completion tokens:	9752
price estimate:	$1.97298
result for verification run - Macro-F1: 0.8502 Strict-Macro-F1: 0.8424 with retriever: rerank-nv-embed-v1 and retriever: openai-4o
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-8b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

result for verification run - Macro-F1: 0.6280 Strict-Macro-F1: 0.6161 with retriever: rerank-nv-embed-v1 and retriever: llama3-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-70b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

ERROR: could not find the answer format in answer from model: {"REFUTES": 0.9}
ERROR: could not find the answer format in answer from model: REFUTES: 0.7
ERROR: could not find the answer format in answer from model: {"REFUTES", 0.85}
ERROR: could not find the answer format in answer from model: {"REFUTES": 0.9}
ERROR: could not find the answer format in answer from model: REFUTES, Confidence: 0.9

The statement does not mention a Qatari being killed in Tunisia, nor does it mention the city of Bizerte. The statement actually talks about the humanitarian situation in Palestine, which suggests that it is unrelated to the claim.
ERROR: could not find the answer format in answer from model: {"REFUTES", 0.8}
ERROR: could not find the answer format in answer from model: REFUTES, 1
ERROR: could not find the answer format in answer from model: {"REFUTES": 0.8}
ERROR: could not find the answer format in answer from model: {"REFUTES": 0.8}
ERROR: could not find the answer format in answer from mo

ValueError: could not convert string to float: '0.8}'