In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkl, root_dir, AuredDataset
from lkae.verification.verify import get_verifier
from lkae.utils.scoring import eval_run_custom_nofile
from lkae.verification.verify import Judge, run_verifier_on_dataset
from lkae.utils.data_loading import AuthorityPost

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [None]:
datasets = {}

# walk through the pkl directory and load all the datasets in one of its subdirectories
# load each dataset with its subdirectory name and filename as the key
# skip non-pkl files
for subdir in os.listdir(pkl_dir):
    if not os.path.isdir(os.path.join(pkl_dir, subdir)):
        continue            
    datasets[subdir] = {}
    for filename in os.listdir(os.path.join(pkl_dir, subdir)):
        if not filename.endswith('.pkl'):
            continue
        key = os.path.join(subdir, filename)
        datasets[subdir][filename.split('.')[0]] = load_pkl(os.path.join(pkl_dir, key))

split = 'dev'

dataset_split = f'English_{split}'
qrel_filename = f'{dataset_split}_qrels.txt'

dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

# ground truth RQ2
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')
gold_list = [line for line in jsonlines.open(gold_file)]

# select a single variation of the dataset
selected_variation = "pre-nonam-nobio"
dataset: AuredDataset = dataset_variations_dict[selected_variation]

In [2]:
# load each config and construct its verifier

verifiers = {}

with open('config.json', 'r') as file:
    configs = json.load(file)

    for config in configs['configs']:
        verifier_label = get_verifier(**config)
        verifiers[config['verifier_method']] = verifier_label

verifiers

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'transformers-roberta': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x1f76d4d2b00>,
 'transformers-bart': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x1f7a8dc2530>}

In [3]:
solomon = Judge(
    scale=False,  # ignore scaling, weigh each evidence evenly, except for confidence score given by verifier
    ignore_nei=True, # ignore NEI predictions
)

for i, item in enumerate(dataset):
    retrieved_ev = []
    evidences = item["evidence"]
    if evidences is None:
        print(f"skipped {i} because no evidence")
        continue
    for ev in evidences:
        retrieved_ev.append(AuthorityPost(ev.url, ev.post_id, ev.text, 1, 1))
    dataset[i]["retrieved_evidence"] = retrieved_ev

In [4]:
# then for every variation of the dataset in ds, run the experiment with each retriever and save the results

out_dir = 'results'
data = []

for verifier_label in verifiers:
    start = time.time()

    verification_results = run_verifier_on_dataset(
        dataset=dataset,
        verifier=verifiers[verifier_label],
        judge=solomon,
        blind=False,
    )

    # print(verification_results)

    macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

    print(
        f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with config {config} and TREC FILE {gold_file}"
    )

    wall_time = time.time() - start

    
    data.append({
        'Macro-F1': macro_f1,
        'Strict-Macro-F1': strict_macro_f1,
        'Verifier_Method': verifier_label, 
        'DS_Settings': selected_variation,
        'Time (s)': wall_time,
    })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame
display(df_verification.sort_values(by='Macro-F1', ascending=False))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
key "retrieved_evidence" was empty for rumor with id AuRED_045
key "retrieved_evidence" was empty for rumor with id AuRED_025
key "retrieved_evidence" was empty for rumor with id AuRED_026
key "retrieved_evidence" was empty for rumor with id AuRED_088
key "retrieved_evidence" was empty for rumor with id AuRED_066
key "retrieved_evidence" was empty for rumor with id AuRED_053
key "retrieved_evidence" was empty for rumor with id AuRED_046
key "retrieved_evidence" was empty for rumor with id AuRED_059
key "retrieved_evidence" was empty for rumor with id AuRED_033
key "retrieved_evidence" was empty for rumor with id AuRED_001
key "retrieved_evidence" was empty for rumor with id AuRED_039
key "retrieved_evidence" was empty for rumor with id AuRED_076
key "retrieved_evidence" was empty for rumor with id AuRED_003


result for verification run - Macro-F1: 0.6993 Strict-Macro-F1: 0.6993 with config {'verifier_method': 'transformers-bart', 'model': 'facebook/bart-large-mnli'} and TREC FILE c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl


  attn_output = torch.nn.functional.scaled_dot_product_attention(
key "retrieved_evidence" was empty for rumor with id AuRED_045
key "retrieved_evidence" was empty for rumor with id AuRED_025
key "retrieved_evidence" was empty for rumor with id AuRED_026
key "retrieved_evidence" was empty for rumor with id AuRED_088
key "retrieved_evidence" was empty for rumor with id AuRED_066
key "retrieved_evidence" was empty for rumor with id AuRED_053
key "retrieved_evidence" was empty for rumor with id AuRED_046
key "retrieved_evidence" was empty for rumor with id AuRED_059
key "retrieved_evidence" was empty for rumor with id AuRED_033
key "retrieved_evidence" was empty for rumor with id AuRED_001
key "retrieved_evidence" was empty for rumor with id AuRED_039
key "retrieved_evidence" was empty for rumor with id AuRED_076
key "retrieved_evidence" was empty for rumor with id AuRED_003


result for verification run - Macro-F1: 0.6897 Strict-Macro-F1: 0.6897 with config {'verifier_method': 'transformers-bart', 'model': 'facebook/bart-large-mnli'} and TREC FILE c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
saved df to results/df_verification.csv


Unnamed: 0,Macro-F1,Strict-Macro-F1,Verifier_Method,DS_Settings,Time (s)
0,0.699346,0.699346,transformers-roberta,pre-nonam-nobio,4.848028
1,0.689724,0.689724,transformers-bart,pre-nonam-nobio,6.199241
