In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkl, root_dir, AuredDataset, AuthorityPost
from lkae.retrieval.retrieve import get_retriever,retrieve_evidence
from lkae.verification.verify import get_verifier, Judge, run_verifier_on_dataset
from lkae.utils.scoring import eval_run_custom_nofile

# import pyterrier as pt
# import pyterrier.io as ptio
# import pyterrier.pipelines as ptpipelines
# from ir_measures import R, MAP    

# if not pt.started():
#     pt.init()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [None]:
# Load every pickled dataset stored one directory level below `pkl_dir`.
# Resulting layout: datasets[<subdirectory name>][<file name up to its first '.'>]
datasets = {}

for subdir in os.listdir(pkl_dir):
    subdir_path = os.path.join(pkl_dir, subdir)
    if not os.path.isdir(subdir_path):
        continue  # plain files directly inside pkl_dir are ignored
    datasets[subdir] = {}
    for filename in os.listdir(subdir_path):
        if not filename.endswith('.pkl'):
            continue  # skip non-pkl files
        key = os.path.join(subdir, filename)
        # NOTE(review): the dict key is cut at the FIRST dot, so 'a.b.pkl' and 'a.pkl'
        # would both map to 'a' and the later one would overwrite the earlier one.
        datasets[subdir][filename.split('.')[0]] = load_pkl(os.path.join(pkl_dir, key))

split = 'dev'

dataset_split = f'English_{split}'
qrel_filename = f'{dataset_split}_qrels.txt'

dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

# ground truth RQ3
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')
# Read inside a context manager so the file handle is closed once all lines are loaded.
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)

# select a single variation of the dataset
selected_variation = "pre-nonam-nobio"
dataset: AuredDataset = dataset_variations_dict[selected_variation]

In [2]:
# Build one retriever/verifier pair per entry of config.json.
# Each pair is stored under the key '<retriever_method>__<verifier_method>'.
setups = {}

with open('config.json', 'r') as file:
    configs = json.load(file)

for config in configs['configs']:
    exp_fingerprint = f'{config["retriever_method"]}__{config["verifier_method"]}'

    # both factories receive the complete config entry as keyword arguments
    retriever = get_retriever(**config)
    verifier = get_verifier(**config)

    setups[exp_fingerprint] = {'retriever': retriever, 'verifier': verifier}

display(setups)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another tas

{'bm25__transformers-roberta': {'retriever': <lkae.retrieval.methods.bm25.BM25Retriever at 0x2bb52170bb0>,
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x2bb521705b0>},
 'bm25__transformers-bart': {'retriever': <lkae.retrieval.methods.bm25.BM25Retriever at 0x2bb3022bc70>,
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x2bba89b1a20>},
 'tfidf__transformers-roberta': {'retriever': <lkae.retrieval.methods.tfidf.TFIDFRetriever at 0x2bba8a66b60>,
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x2bba8baed10>},
 'tfidf__transformers-bart': {'retriever': <lkae.retrieval.methods.tfidf.TFIDFRetriever at 0x2bba8bafd60>,
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x2bbab2b1b40>}}

In [3]:
# Map each retriever method named in config.json to a retriever instance.
# Entries sharing a method name overwrite each other, so the last one wins.
with open('config.json', 'r') as file:
    configs = json.load(file)

retrievers = {}
for config in configs['configs']:
    retrievers[config['retriever_method']] = get_retriever(**config)

retrievers

{'bm25': <lkae.retrieval.methods.bm25.BM25Retriever at 0x2bb52014220>,
 'tfidf': <lkae.retrieval.methods.tfidf.TFIDFRetriever at 0x2bb521708e0>}

In [4]:
# Copy each item's gold evidence into its "retrieved_evidence" field as AuthorityPost objects.
# Items whose "evidence" is None are reported and left unchanged.
for i, item in enumerate(dataset):
    evidences = item["evidence"]
    if evidences is not None:
        # the two trailing 1s are passed positionally; presumably rank and score -- TODO confirm
        retrieved_ev = [
            AuthorityPost(ev.url, ev.post_id, ev.text, 1, 1) for ev in evidences
        ]
        dataset[i]["retrieved_evidence"] = retrieved_ev
    else:
        print(f"skipped {i} because no evidence")

In [5]:
# For every retriever/verifier setup: retrieve evidence for the selected dataset variation,
# run the verifier on it, score the predictions against the gold labels, and collect one
# result row per setup. The rows are saved as CSV and displayed sorted by Macro-F1.

solomon = Judge(
    scale=False,  # ignore scaling, weigh each evidence evenly, except for confidence score given by verifier
    ignore_nei=True, # ignore NEI predictions
)

out_dir = 'results'
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)
data = []

for exp_fingerprint, setup in setups.items():
    start = time.time()

    retrieved_data = retrieve_evidence(dataset, setup['retriever'])

    # NOTE(review): this mutates the shared `dataset` on every iteration; presumably it
    # replaces the previously attached evidence -- confirm in AuredDataset.
    dataset.add_trec_list_judgements(retrieved_data)

    verification_results = run_verifier_on_dataset(
        dataset=dataset,
        verifier=setup['verifier'],
        judge=solomon,
        blind=False,
    )

    # print(verification_results)

    macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

    # Report the fingerprint of the setup that was just evaluated; this loop has no
    # per-setup `config` of its own, so a global `config` would describe another run.
    print(
        f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with config {exp_fingerprint}"
    )

    wall_time = time.time() - start

    # Dedicated names keep the `retriever` / `verifier` objects of other cells intact;
    # maxsplit=1 keeps the unpacking valid if a verifier method name contains '__'.
    retriever_method, verifier_method = exp_fingerprint.split('__', 1)

    data.append({
        'Macro-F1': macro_f1,
        'Strict-Macro-F1': strict_macro_f1,
        'Retrieval_Method': retriever_method,
        'Verifier_Method': verifier_method,
        'DS_Settings': selected_variation,
        'Time (s)': wall_time,
    })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame
display(df_verification.sort_values(by='Macro-F1', ascending=False))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


result for verification run - Macro-F1: 0.4361 Strict-Macro-F1: 0.4062 with config {'retriever_method': 'tfidf', 'retriever_k': 5, 'verifier_method': 'transformers-bart', 'model': 'facebook/bart-large-mnli'} and TREC FILE c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl


  attn_output = torch.nn.functional.scaled_dot_product_attention(


result for verification run - Macro-F1: 0.4934 Strict-Macro-F1: 0.4783 with config {'retriever_method': 'tfidf', 'retriever_k': 5, 'verifier_method': 'transformers-bart', 'model': 'facebook/bart-large-mnli'} and TREC FILE c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
result for verification run - Macro-F1: 0.5417 Strict-Macro-F1: 0.5111 with config {'retriever_method': 'tfidf', 'retriever_k': 5, 'verifier_method': 'transformers-bart', 'model': 'facebook/bart-large-mnli'} and TREC FILE c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
result for verification run - Macro-F1: 0.4841 Strict-Macro-F1: 0.4553 with config {'retriever_method': 'tfidf', 'retriever_k': 5, 'verifier_method': 'transformers-bart', 'model': 'facebook/bart-large-mnli'} and TREC FILE c:\users\luisk\projects-win\thesis\lkae\data\English_dev.jsonl
saved df to results/df_verification.csv


Unnamed: 0,Macro-F1,Strict-Macro-F1,Retrieval_Method,Verifier_Method,DS_Settings,Time (s)
2,0.541667,0.511111,tfidf,transformers-roberta,pre-nonam-nobio,59.869791
1,0.493412,0.478261,bm25,transformers-bart,pre-nonam-nobio,35.641588
3,0.484127,0.455267,tfidf,transformers-bart,pre-nonam-nobio,72.035851
0,0.436147,0.40617,bm25,transformers-roberta,pre-nonam-nobio,28.889681
