TODO: compare multiple runs of an LLM verifier

In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkl, root_dir, AuredDataset
from lkae.verification.verify import get_verifier
from lkae.utils.scoring import eval_run_custom_nofile
from lkae.verification.verify import Judge, run_verifier_on_dataset
from lkae.utils.data_loading import AuthorityPost

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [2]:
datasets = {}

# Walk through the pkl directory and load every dataset found in one of its
# immediate subdirectories. Each dataset is stored under
# datasets[<subdirectory name>][<file name up to its first '.'>].
# Entries that are not directories, and files that are not .pkl, are skipped.
for subdir in os.listdir(pkl_dir):
    subdir_path = os.path.join(pkl_dir, subdir)
    if not os.path.isdir(subdir_path):
        continue
    datasets[subdir] = {}
    for filename in os.listdir(subdir_path):
        if not filename.endswith('.pkl'):
            continue
        variation_name = filename.split('.')[0]
        datasets[subdir][variation_name] = load_pkl(os.path.join(subdir_path, filename))

# possible splits: train, dev, train_dev_combined
# (test, all_combined don't have "labels")
split = 'dev'

dataset_split = f'English_{split}'
qrel_filename = f'{dataset_split}_qrels.txt'

dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

# ground truth RQ2
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')
# Read the gold file inside a context manager so the underlying file handle
# is closed as soon as all lines have been loaded.
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)

# select a set of variations of the dataset
selected_variations = ["pre-nonam-nobio"]

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


In [3]:
# Read config.json and build one verifier per entry of its 'configs' list.
# Each verifier is stored under the entry's 'verifier_method' value.

verifiers = {}

with open('config.json', 'r') as config_file:
    configs = json.load(config_file)

    for verifier_config in configs['configs']:
        # The whole config entry is forwarded to get_verifier as keyword arguments.
        verifiers[verifier_config['verifier_method']] = get_verifier(**verifier_config)

verifiers

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word

{'transformers-roberta': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x1e46c611510>,
 'transformers-bart': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x1e4a3fd1ba0>,
 'llama3-8b': <lkae.verification.models.llama3_hf.HFLlama3Verifier at 0x1e4a527ee30>,
 'llama3-70b': <lkae.verification.models.llama3_hf.HFLlama3Verifier at 0x1e4a57ec700>,
 'llama3-405b': <lkae.verification.models.llama3_hf.HFLlama3Verifier at 0x1e4a57dc700>,
 'openai-4o-mini': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x1e4a5804700>,
 'openai-4o': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x1e5025acdf0>}

In [4]:
# Judge settings used for every verification run below:
#   scale=False     -> ignore scaling, weigh each evidence evenly, except for
#                      the confidence score given by the verifier
#   ignore_nei=True -> ignore NEI predictions
solomon = Judge(scale=False, ignore_nei=True)

In [5]:
# For every selected variation of the dataset, run each verifier on the
# evidence attached to the items, score the predictions against the gold
# labels, and collect the scores together with the verifier's wall-clock time.

out_dir = 'results'
# DataFrame.to_csv does not create missing directories; create the output
# directory up front so the results are not lost after the expensive runs.
os.makedirs(out_dir, exist_ok=True)
data = []

for dataset_variation in selected_variations:
    dataset: AuredDataset = dataset_variations_dict[dataset_variation]

    # Copy each item's "evidence" into "retrieved_evidence" as AuthorityPost
    # objects. Items whose evidence is None are reported and left unchanged.
    for i, item in enumerate(dataset):
        evidences = item["evidence"]
        if evidences is None:
            print(f"skipped {i} because no evidence")
            continue
        # NOTE(review): the two trailing 1s are presumably rank and score
        # placeholders for AuthorityPost -- confirm against its definition.
        dataset[i]["retrieved_evidence"] = [
            AuthorityPost(ev.url, ev.post_id, ev.text, 1, 1) for ev in evidences
        ]

    for verifier_label, verifier in verifiers.items():
        start = time.perf_counter()

        verification_results = run_verifier_on_dataset(
            dataset=dataset,
            verifier=verifier,
            judge=solomon,
            blind=False,
        )

        # Stop the clock before scoring and printing so that 'Time (s)'
        # reflects the verifier run only.
        wall_time = time.perf_counter() - start

        macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

        print(
            f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with verifier {verifier_label} and ground truth file {gold_file}"
        )

        data.append({
            'Macro-F1': macro_f1,
            'Strict-Macro-F1': strict_macro_f1,
            'Verifier_Method': verifier_label,
            'DS_Settings': dataset_variation,
            'Time (s)': wall_time,
        })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame, best Macro-F1 first
display(df_verification.sort_values(by='Macro-F1', ascending=False))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


result for verification run - Macro-F1: 0.6889 Strict-Macro-F1: 0.6889 with verifier transformers-roberta and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl


  attn_output = torch.nn.functional.scaled_dot_product_attention(


result for verification run - Macro-F1: 0.6787 Strict-Macro-F1: 0.6787 with verifier transformers-bart and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
result for verification run - Macro-F1: 0.8640 Strict-Macro-F1: 0.8640 with verifier llama3-8b and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
result for verification run - Macro-F1: 1.0000 Strict-Macro-F1: 1.0000 with verifier llama3-70b and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
result for verification run - Macro-F1: 0.9722 Strict-Macro-F1: 0.9722 with verifier llama3-405b and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
-----total token usage for verification-----
total tokens:	16264
prompt tokens:	15502
completion tokens:	762
price estimate:	$0.17788
result for verification run - Macro-F1: 1.0000 Strict-Macro-F1: 1.0000 with verifier openai-4o-mini and ground truth file c:\users\luis

Unnamed: 0,Macro-F1,Strict-Macro-F1,Verifier_Method,DS_Settings,Time (s)
3,1.0,1.0,llama3-70b,pre-nonam-nobio,110.913317
5,1.0,1.0,openai-4o-mini,pre-nonam-nobio,183.76702
4,0.972174,0.972174,llama3-405b,pre-nonam-nobio,335.90203
6,0.972174,0.972174,openai-4o,pre-nonam-nobio,180.976645
2,0.863997,0.863997,llama3-8b,pre-nonam-nobio,22.043511
0,0.688889,0.688889,transformers-roberta,pre-nonam-nobio,4.249372
1,0.678664,0.678664,transformers-bart,pre-nonam-nobio,6.098403
