In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
import pickle as pkl
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkls, root_dir, AuredDataset
from lkae.retrieval.retrieve import get_retriever,retrieve_evidence
from lkae.verification.verify import get_verifier, Judge, run_verifier_on_dataset
from lkae.utils.scoring import eval_run_custom_nofile

# Load the pickled datasets from pkl_dir (presumably one entry per language/split,
# since the result is indexed by f'English_{split}' below).
datasets = load_pkls(pkl_dir)

# possible splits: train, dev, train_dev_combined
# (test, all_combined don't have "labels")
split = 'train_dev_combined'

# Key into `datasets`; also the base name of the gold .jsonl file and of the qrels file.
dataset_split = f'English_{split}'
qrel_filename = f'{dataset_split}_qrels.txt'

# Mapping from variation name (e.g. 'pre-nam-bio') to the dataset for the chosen split.
dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

# Initialise PyTerrier only if it has not been started yet in this kernel,
# so that re-running the cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


PyTerrier 0.10.1 has loaded Terrier 5.10 (built by craigm on 2024-08-22 17:33) and terrier-helper 0.0.8



In [2]:
# ground truth RQ3
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')

# Read all gold records inside a context manager so the underlying file handle is
# closed as soon as the records are in memory (the bare jsonlines.open(...) in a
# comprehension never closed it).
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)

# select a set of variations of the dataset
selected_variations = ["nopre-nam-bio", "nopre-nonam-nobio", "pre-nam-bio", "pre-nonam-nobio"]

In [3]:
# Load every experiment config and construct its verifier.
# The retriever is NOT instantiated here: only its method name is stored, because the
# experiment loop reads previously pickled retrieval results from disk instead.
# `setups` maps '<retriever_method>__<verifier_method>' -> {'retriever': str, 'verifier': obj}.

# JSON is UTF-8 by specification; be explicit so the platform default encoding is not used.
with open('config.json', 'r', encoding='utf-8') as file:
    configs = json.load(file)

# Build the verifiers only after the config file has been closed, so the handle is not
# held open while the verifier clients are being constructed.
setups = {}
for config in configs['configs']:
    exp_fingerprint = f'{config["retriever_method"]}__{config["verifier_method"]}'

    # The whole config dict is forwarded as keyword arguments.
    verifier = get_verifier(**config)

    setups[exp_fingerprint] = {
        'retriever': config["retriever_method"],
        'verifier': verifier,
    }

display(setups)

{'rerank-nv-embed-v1__llama3-1-8b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x1d91df006a0>},
 'rerank-nv-embed-v1__llama3-1-70b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x1d91df03e50>},
 'rerank-nv-embed-v1__llama3-1-405b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x1d91df00d90>}}

In [4]:
# Judge instance that is passed to run_verifier_on_dataset as `judge`.
# NOTE(review): presumably it aggregates per-evidence predictions into one verdict
# per claim — confirm against lkae.verification.verify.Judge.
solomon = Judge(
    scale=False,  # ignore scaling, weigh each evidence evenly, except for confidence score given by verifier
    ignore_nei=True, # ignore NEI predictions
)

In [5]:
# For every selected dataset variation, run each (retriever, verifier) setup, score the
# predictions against the gold labels and collect one result row per run.
# Verification results are cached as pickles in `out_dir`, so re-running the cell only
# calls the verifier for combinations that have not been computed yet.


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
    with open(path, 'rb') as fh:
        return pkl.load(fh)


def _dump_pickle(obj, path):
    """Pickle `obj` to `path` via a temporary file so an interrupted write cannot leave a truncated cache."""
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'wb') as fh:
        pkl.dump(obj, fh)
    # Move into place only after the dump succeeded; the cache check below therefore
    # never sees a half-written file.
    os.replace(tmp_path, path)


out_dir = 'results'
# The cache/CSV directory may not exist yet (e.g. on a fresh checkout).
os.makedirs(out_dir, exist_ok=True)
data = []


for dataset_variation in selected_variations:

    for exp_fingerprint in setups:
        # get the dataset here since it is modified in place here, contrary to RQ2
        # NOTE(review): the same dataset object receives add_trec_list_judgements once per
        # setup; confirm that the call replaces (rather than appends to) earlier judgements.
        dataset: AuredDataset = dataset_variations_dict[dataset_variation]
        start = time.time()

        # Retrieval is not re-run here; the pickled retrieval results are loaded from the RQ1 folder.
        data_path = f'{root_dir}/RQ1/experiment-{split}/results/{dataset_variation}_{setups[exp_fingerprint]["retriever"]}.pkl'
        retrieved_data = _load_pickle(data_path)
        print(f'loaded retrieval results from {data_path}')

        dataset.add_trec_list_judgements(retrieved_data)

        run_filename = f'{out_dir}/{dataset_variation}_{exp_fingerprint}.pkl'

        # check if the file already exists from a previous run
        if os.path.exists(run_filename):
            print(f'found {run_filename}, loading from file')
            verification_results = _load_pickle(run_filename)
        else:
            print(f'running {exp_fingerprint} on {dataset_variation}')
            verification_results = run_verifier_on_dataset(
                dataset=dataset,
                verifier=setups[exp_fingerprint]['verifier'],
                judge=solomon,
                blind=False,
            )
            _dump_pickle(verification_results, run_filename)

        macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

        retriever_label, verifier_label = exp_fingerprint.split('__')

        # The second label is the verifier (the previous message called both "retriever").
        print(
            f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with retriever: {retriever_label} and verifier: {verifier_label}"
        )

        # Covers loading, verification and scoring. When the results came from the cache
        # this is only the load/score time, not the cost of the original verifier run.
        wall_time = time.time() - start

        data.append({
            'Macro-F1': macro_f1,
            'Strict-Macro-F1': strict_macro_f1,
            'Retrieval_Method': retriever_label, 
            'Verifier_Method': verifier_label, 
            'DS_Settings': dataset_variation,
            'Time (s)': wall_time,
        })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame
display(df_verification.sort_values(by='Macro-F1', ascending=False))

loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
found results/nopre-nam-bio_rerank-nv-embed-v1__llama3-1-8b.pkl, loading from file
result for verification run - Macro-F1: 0.5172 Strict-Macro-F1: 0.5108 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
found results/nopre-nam-bio_rerank-nv-embed-v1__llama3-1-70b.pkl, loading from file
result for verification run - Macro-F1: 0.7218 Strict-Macro-F1: 0.7090 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-70b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
found results/nopre-nam-bio_rerank-nv-embed-v1__llama3-1-405b.pkl, loading from file
result for verification run - Macr

  0%|          | 0/115 [00:00<?, ?it/s]

could not json-parse response from Azure API: {"decision": "REFUTES", "confidence": 0.8}

Reasoning: The statement from the Yemeni Ministry of Foreign Affairs mentions that the Minister discussed the possibility of evacuation of Yemeni nationals, including students, from Wuhan, China, and the sending of additional financial aid. This implies that the Yemeni government is taking steps to help its citizens in China, which contradicts the claim that Saudi Arabia did not evacuate Yemeni students while Oman did. However, the statement does not directly address the specific claim about Saudi Arabia's actions, hence the confidence score is not 1., returning NOT ENOUGH INFO answer
could not json-parse response from Azure API: {"decision": "NOT ENOUGH INFO", "confidence": 0.8} 

The statement from the Chinese Embassy in Yemen only talks about the Chinese government providing food to Yemeni students in Wuhan, while the claim is about Saudi Arabia and Oman's actions regarding evacuating Yemeni st

-----total token usage for verification-----
total tokens:	173243
prompt tokens:	162877
completion tokens:	10366
price estimate:	$0.8798337
result for verification run - Macro-F1: 0.7409 Strict-Macro-F1: 0.7300 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-405b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-8b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	171340
prompt tokens:	161743
completion tokens:	9597
price estimate:	$0.05437706999999999
result for verification run - Macro-F1: 0.6158 Strict-Macro-F1: 0.6052 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-70b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	171379
prompt tokens:	161743
completion tokens:	9636
price estimate:	$0.46758268
result for verification run - Macro-F1: 0.7052 Strict-Macro-F1: 0.6923 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-70b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-405b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	344658
prompt tokens:	324620
completion tokens:	20038
price estimate:	$1.7525468
result for verification run - Macro-F1: 0.7047 Strict-Macro-F1: 0.6894 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-405b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
found results/pre-nonam-nobio_rerank-nv-embed-v1__llama3-1-8b.pkl, loading from file
result for verification run - Macro-F1: 0.6525 Strict-Macro-F1: 0.6417 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
found results/pre-nonam-nobio_rerank-nv-embed-v1__llama3-1-70b.pkl, loading from file
result for verification run - Macro-F1: 0.7315 Strict-Macro-F1: 0.7156 with retriever: rerank-nv-embed-v1 and retriever

Unnamed: 0,Macro-F1,Strict-Macro-F1,Retrieval_Method,Verifier_Method,DS_Settings,Time (s)
4,0.776465,0.768775,rerank-nv-embed-v1,llama3-1-70b,nopre-nonam-nobio,0.014027
11,0.773538,0.75484,rerank-nv-embed-v1,llama3-1-405b,pre-nonam-nobio,0.017059
5,0.740946,0.73004,rerank-nv-embed-v1,llama3-1-405b,nopre-nonam-nobio,951.188266
10,0.73147,0.715628,rerank-nv-embed-v1,llama3-1-70b,pre-nonam-nobio,0.017273
1,0.721766,0.709036,rerank-nv-embed-v1,llama3-1-70b,nopre-nam-bio,0.01506
7,0.705238,0.692306,rerank-nv-embed-v1,llama3-1-70b,pre-nam-bio,536.397683
8,0.704749,0.689438,rerank-nv-embed-v1,llama3-1-405b,pre-nam-bio,1018.201312
2,0.704116,0.688744,rerank-nv-embed-v1,llama3-1-405b,nopre-nam-bio,0.01754
9,0.652548,0.641708,rerank-nv-embed-v1,llama3-1-8b,pre-nonam-nobio,0.030309
6,0.615837,0.6052,rerank-nv-embed-v1,llama3-1-8b,pre-nam-bio,226.308516
