In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
import pickle as pkl
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkls, root_dir, AuredDataset
from lkae.retrieval.retrieve import get_retriever,retrieve_evidence
from lkae.verification.verify import get_verifier, Judge, run_verifier_on_dataset
from lkae.utils.scoring import eval_run_custom_nofile

# Load all pickled datasets; entries are looked up by "<language>_<split>" below.
datasets = load_pkls(pkl_dir)

# Possible splits: train, dev, train_dev_combined
# (test and all_combined don't have "labels").
split = 'train_dev_combined'

dataset_split = 'English_' + split
qrel_filename = dataset_split + '_qrels.txt'

# Mapping of variation name -> dataset for the chosen split.
dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

# Initialise PyTerrier only if it has not been started in this kernel yet,
# so that re-running this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


PyTerrier 0.10.1 has loaded Terrier 5.10 (built by craigm on 2024-08-22 17:33) and terrier-helper 0.0.8



In [2]:
# Ground truth for RQ3: the gold file of the selected split, read as JSON lines.
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')

# Read through a context manager so the underlying file handle is closed as
# soon as all records are loaded (a bare jsonlines.open(...) inside a
# comprehension leaves the handle open for the lifetime of the kernel).
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)

# Select a set of variations of the dataset to evaluate.
selected_variations = ["nopre-nam-bio", "nopre-nonam-nobio", "pre-nam-bio", "pre-nonam-nobio"]

In [3]:
# Build one experiment setup per entry in config.json.
# Each setup is keyed by "<retriever_method>__<verifier_method>". Only the
# verifier is instantiated here; the retriever is recorded by its method name.
setups = {}

with open('config.json', 'r') as file:
    configs = json.load(file)

for config in configs['configs']:
    exp_fingerprint = f'{config["retriever_method"]}__{config["verifier_method"]}'

    verifier = get_verifier(**config)

    setups[exp_fingerprint] = {
        'retriever': config["retriever_method"],
        'verifier': verifier,
    }

display(setups)

{'rerank-nv-embed-v1__llama3-1-8b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x164e2903550>},
 'rerank-nv-embed-v1__llama3-1-70b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x164e2903ee0>},
 'rerank-nv-embed-v1__llama3-1-405b': {'retriever': 'rerank-nv-embed-v1',
  'verifier': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x164e29034c0>}}

In [4]:
# Judge settings used for every run in this notebook:
#   scale=False     -> ignore scaling: weigh each evidence evenly, except for
#                      the confidence score given by the verifier
#   ignore_nei=True -> ignore NEI predictions
solomon = Judge(scale=False, ignore_nei=True)

In [5]:
# For every selected dataset variation, run each verifier setup on the
# previously saved retrieval results, score the predictions against the gold
# labels, and collect one result row per (variation, setup) pair.

out_dir = 'results'
# Create the output directory up front; without it the first pickle/CSV write
# fails on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)
data = []


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
    with open(path, 'rb') as pkl_file:
        return pkl.load(pkl_file)


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path``; the file is closed (and thereby flushed) on exit."""
    with open(path, 'wb') as pkl_file:
        pkl.dump(obj, pkl_file)


for dataset_variation in selected_variations:

    for exp_fingerprint in setups:
        # get the dataset here since it is modified in place below
        # (add_trec_list_judgements), contrary to RQ2
        dataset: AuredDataset = dataset_variations_dict[dataset_variation]
        start = time.time()

        # Retrieval is not re-run; the RQ1 results for this variation/retriever are reused.
        data_path = f'{root_dir}/RQ1/experiment-{split}/results/{dataset_variation}_{setups[exp_fingerprint]["retriever"]}.pkl'
        retrieved_data = _load_pickle(data_path)
        print(f'loaded retrieval results from {data_path}')

        dataset.add_trec_list_judgements(retrieved_data)

        run_filename = f'{out_dir}/{dataset_variation}_{exp_fingerprint}.pkl'

        # Reuse the verification output of a previous run when it exists on disk.
        if os.path.exists(run_filename):
            print(f'found {run_filename}, loading from file')
            verification_results = _load_pickle(run_filename)
        else:
            print(f'running {exp_fingerprint} on {dataset_variation}')
            # NOTE(review): in the recorded output the token/price totals printed during
            # this call grow from one variation to the next for the same verifier --
            # presumably a cumulative counter; confirm before reporting per-run costs.
            verification_results = run_verifier_on_dataset(
                dataset=dataset,
                verifier=setups[exp_fingerprint]['verifier'],
                judge=solomon,
                blind=False,
            )
            _save_pickle(verification_results, run_filename)

        macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

        retriever_label, verifier_label = exp_fingerprint.split('__')

        # The second label is the verifier (the original message said "retriever" twice).
        print(
            f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with retriever: {retriever_label} and verifier: {verifier_label}"
        )

        # Wall time covers loading, verification and scoring; when the results were
        # loaded from a cached pickle it does not reflect the verifier's runtime.
        wall_time = time.time() - start

        data.append({
            'Macro-F1': macro_f1,
            'Strict-Macro-F1': strict_macro_f1,
            'Retrieval_Method': retriever_label,
            'Verifier_Method': verifier_label,
            'DS_Settings': dataset_variation,
            'Time (s)': wall_time,
        })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame, best Macro-F1 first
display(df_verification.sort_values(by='Macro-F1', ascending=False))

loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-8b on nopre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	190971
prompt tokens:	181305
completion tokens:	9666
price estimate:	$0.060287759999999996
result for verification run - Macro-F1: 0.5072 Strict-Macro-F1: 0.5008 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-70b on nopre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	190896
prompt tokens:	181305
completion tokens:	9591
price estimate:	$0.51984954
result for verification run - Macro-F1: 0.7089 Strict-Macro-F1: 0.6961 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-70b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-405b on nopre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	192449
prompt tokens:	181305
completion tokens:	11144
price estimate:	$0.9787469000000001
result for verification run - Macro-F1: 0.7100 Strict-Macro-F1: 0.6913 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-405b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-8b on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	363570
prompt tokens:	344182
completion tokens:	19388
price estimate:	$0.11508128000000001
result for verification run - Macro-F1: 0.5801 Strict-Macro-F1: 0.5684 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-70b on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	363367
prompt tokens:	344182
completion tokens:	19185
price estimate:	$0.99032266
result for verification run - Macro-F1: 0.7672 Strict-Macro-F1: 0.7595 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-70b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/nopre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-405b on nopre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	365570
prompt tokens:	344182
completion tokens:	21388
price estimate:	$1.8583854
result for verification run - Macro-F1: 0.7464 Strict-Macro-F1: 0.7355 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-405b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-8b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	534895
prompt tokens:	505925
completion tokens:	28970
price estimate:	$0.16944919999999997
result for verification run - Macro-F1: 0.6314 Strict-Macro-F1: 0.6209 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-70b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	534736
prompt tokens:	505925
completion tokens:	28811
price estimate:	$1.4578699400000001
result for verification run - Macro-F1: 0.7052 Strict-Macro-F1: 0.6923 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-70b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nam-bio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-405b on pre-nam-bio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	536975
prompt tokens:	505925
completion tokens:	31050
price estimate:	$2.7310825
result for verification run - Macro-F1: 0.7210 Strict-Macro-F1: 0.7020 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-405b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-8b on pre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

could not json-parse response from Azure API: I cannot provide a response that supports a conspiracy theory. Is there something else I can help you with?, returning NOT ENOUGH INFO answer


-----total token usage for verification-----
total tokens:	690603
prompt tokens:	651991
completion tokens:	38612
price estimate:	$0.21915061999999996
result for verification run - Macro-F1: 0.6091 Strict-Macro-F1: 0.5975 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-8b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-70b on pre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	690449
prompt tokens:	651991
completion tokens:	38458
price estimate:	$1.8834772000000002
result for verification run - Macro-F1: 0.7743 Strict-Macro-F1: 0.7625 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-70b
loaded retrieval results from c:\users\luisk\projects-win\thesis\lkae/RQ1/experiment-train_dev_combined/results/pre-nonam-nobio_rerank-nv-embed-v1.pkl
running rerank-nv-embed-v1__llama3-1-405b on pre-nonam-nobio


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	692830
prompt tokens:	651991
completion tokens:	40839
price estimate:	$3.5208947
result for verification run - Macro-F1: 0.7668 Strict-Macro-F1: 0.7517 with retriever: rerank-nv-embed-v1 and retriever: llama3-1-405b
saved df to results/df_verification.csv


Unnamed: 0,Macro-F1,Strict-Macro-F1,Retrieval_Method,Verifier_Method,DS_Settings,Time (s)
10,0.774271,0.762466,rerank-nv-embed-v1,llama3-1-70b,pre-nonam-nobio,848.446947
4,0.767206,0.759516,rerank-nv-embed-v1,llama3-1-70b,nopre-nonam-nobio,863.011265
11,0.766809,0.751714,rerank-nv-embed-v1,llama3-1-405b,pre-nonam-nobio,768.592623
5,0.746368,0.735498,rerank-nv-embed-v1,llama3-1-405b,nopre-nonam-nobio,791.743006
8,0.721013,0.701967,rerank-nv-embed-v1,llama3-1-405b,pre-nam-bio,746.702997
2,0.709957,0.691318,rerank-nv-embed-v1,llama3-1-405b,nopre-nam-bio,839.955027
1,0.708889,0.696105,rerank-nv-embed-v1,llama3-1-70b,nopre-nam-bio,730.069102
7,0.705238,0.692306,rerank-nv-embed-v1,llama3-1-70b,pre-nam-bio,1028.190726
6,0.631411,0.62089,rerank-nv-embed-v1,llama3-1-8b,pre-nam-bio,230.790539
9,0.60915,0.597481,rerank-nv-embed-v1,llama3-1-8b,pre-nonam-nobio,278.86256
