In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkl, load_pkls, root_dir, AuredDataset
from lkae.verification.verify import get_verifier
from lkae.utils.scoring import eval_run_custom_nofile
from lkae.verification.verify import Judge, run_verifier_on_dataset
from lkae.utils.data_loading import AuthorityPost

datasets = load_pkls(pkl_dir)

# possilbe splits: train, dev, train_dev_combined
# (test, all_combined don't have "labels")
split = 'train_dev_combined'

dataset_split = f'English_{split}'
qrel_filename = f'{dataset_split}_qrels.txt'

dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


In [2]:
# ground truth RQ2
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')
gold_list = [line for line in jsonlines.open(gold_file)]

# select a set of variations of the dataset
selected_variations = ["pre-nonam-nobio"]

In [3]:
# load each config and construct its verifier

verifiers = {}

with open('config.json', 'r') as file:
    configs = json.load(file)

    for config in configs['configs']:
        verifier_label = get_verifier(**config)
        verifiers[config['verifier_method']] = verifier_label

verifiers

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'sbert-deberta-tasksource': <lkae.verification.models.deberta_verifier.DebertaVerifier at 0x1b6f3512590>,
 'transformers-roberta': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x1b6ccbf7a60>,
 'transformers-bart': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x1b6cd0c56c0>,
 'llama3-1-8B': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x1b6d1465780>,
 'llama3-1-70b': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x1b6d14657b0>,
 'llama3-1-405b': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x1b6d29b4850>,
 'openai-4o-mini': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x1b6d29b54e0>,
 'openai-4o': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x1b6d26f72b0>}

In [4]:
solomon = Judge(
    scale=False,  # ignore scaling, weigh each evidence evenly, except for confidence score given by verifier
    ignore_nei=True, # ignore NEI predictions
)

In [5]:
# then for every variation of the dataset in ds, run the experiment with each retriever and save the results
import pickle as pkl

out_dir = 'results'
data = []

for dataset_variation in selected_variations:
    dataset: AuredDataset = dataset_variations_dict[dataset_variation]
    for i, item in enumerate(dataset):
        retrieved_ev = []
        evidences = item["evidence"]
        if evidences is None:
            print(f"skipped {i} because no evidence")
            continue
        for ev in evidences:
            retrieved_ev.append(AuthorityPost(ev.url, ev.post_id, ev.text, 1, 1))
        dataset[i]["retrieved_evidence"] = retrieved_ev
        
    for verifier_label in verifiers:
        start = time.time()

        run_filename = f'{out_dir}/{dataset_variation}_{verifier_label}.pkl'

        # check if the file already exists from a previous run
        if os.path.exists(run_filename):
            print(f'found {run_filename}, loading from file')
            verification_results = pkl.load(open(run_filename, 'rb'))
        else:
            verification_results = run_verifier_on_dataset(
                dataset=dataset,
                verifier=verifiers[verifier_label],
                judge=solomon,
                blind=False,
            )
            pkl.dump(verification_results, open(run_filename, 'wb'))

        # print(verification_results)

        macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

        print(
            f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with verifier {verifier_label} and ground truth file {gold_file}"
        )

        wall_time = time.time() - start

        
        data.append({
            'Macro-F1': macro_f1,
            'Strict-Macro-F1': strict_macro_f1,
            'Verifier_Method': verifier_label, 
            'DS_Settings': dataset_variation,
            'Time (s)': wall_time,
        })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame
display(df_verification.sort_values(by='Macro-F1', ascending=False))

found results/pre-nonam-nobio_sbert-deberta-tasksource.pkl, loading from file
result for verification run - Macro-F1: 0.4958 Strict-Macro-F1: 0.4958 with verifier sbert-deberta-tasksource and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl
found results/pre-nonam-nobio_transformers-roberta.pkl, loading from file
result for verification run - Macro-F1: 0.7712 Strict-Macro-F1: 0.7712 with verifier transformers-roberta and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl
found results/pre-nonam-nobio_transformers-bart.pkl, loading from file
result for verification run - Macro-F1: 0.7058 Strict-Macro-F1: 0.7058 with verifier transformers-bart and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl


  0%|          | 0/115 [00:00<?, ?it/s]

could not json-parse response from Azure API: I cannot provide a response that contains harmful misinformation about vaccination. Can I help you with something else?, returning NOT ENOUGH INFO answer


-----total token usage for verification-----
total tokens:	91455
prompt tokens:	85930
completion tokens:	5525
price estimate:	$0.02914925
result for verification run - Macro-F1: 0.8031 Strict-Macro-F1: 0.8031 with verifier llama3-1-8B and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl


  0%|          | 0/115 [00:00<?, ?it/s]

-----total token usage for verification-----
total tokens:	91441
prompt tokens:	85930
completion tokens:	5511
price estimate:	$0.24980134000000004
result for verification run - Macro-F1: 0.9607 Strict-Macro-F1: 0.9607 with verifier llama3-1-70b and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl


  0%|          | 0/115 [00:00<?, ?it/s]

could not json-parse response from Azure API: {"decision": "REFUTES", "confidence": 0.8} 

The statement from the WHO authority account confirms the emergence of the first cases of the new Coronavirus in the UAE, but it mentions only 4 members of the same family being infected, which contradicts the claim of 75 cases in Abu Dhabi and 63 cases in Dubai. This discrepancy suggests that the claim is likely an exaggeration or false., returning NOT ENOUGH INFO answer


-----total token usage for verification-----
total tokens:	91492
prompt tokens:	85930
completion tokens:	5562
price estimate:	$0.4643282
result for verification run - Macro-F1: 0.9853 Strict-Macro-F1: 0.9853 with verifier llama3-1-405b and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl
found results/pre-nonam-nobio_openai-4o-mini.pkl, loading from file
result for verification run - Macro-F1: 0.9035 Strict-Macro-F1: 0.9035 with verifier openai-4o-mini and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl
found results/pre-nonam-nobio_openai-4o.pkl, loading from file
result for verification run - Macro-F1: 0.9377 Strict-Macro-F1: 0.9377 with verifier openai-4o and ground truth file c:\users\luisk\projects-win\thesis\lkae\data\English_train_dev_combined.jsonl
saved df to results/df_verification.csv


Unnamed: 0,Macro-F1,Strict-Macro-F1,Verifier_Method,DS_Settings,Time (s)
5,0.985261,0.985261,llama3-1-405b,pre-nonam-nobio,935.063797
4,0.960686,0.960686,llama3-1-70b,pre-nonam-nobio,326.515228
7,0.937672,0.937672,openai-4o,pre-nonam-nobio,0.011043
6,0.903537,0.903537,openai-4o-mini,pre-nonam-nobio,0.011037
3,0.803084,0.803084,llama3-1-8B,pre-nonam-nobio,125.43139
1,0.771243,0.771243,transformers-roberta,pre-nonam-nobio,0.010556
2,0.705831,0.705831,transformers-bart,pre-nonam-nobio,0.010717
0,0.495774,0.495774,sbert-deberta-tasksource,pre-nonam-nobio,0.013042
