In [1]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkl, root_dir, AuredDataset, AuthorityPost
from lkae.retrieval.retrieve import get_retriever,retrieve_evidence
from lkae.verification.verify import get_verifier, Judge, run_verifier_on_dataset
from lkae.utils.scoring import eval_run_custom_nofile

# import pyterrier as pt
# import pyterrier.io as ptio
# import pyterrier.pipelines as ptpipelines
# from ir_measures import R, MAP    

# if not pt.started():
#     pt.init()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [2]:
datasets = {}

# Walk the pkl directory: every sub-directory becomes a top-level key and every
# .pkl file inside it becomes an entry under that key, i.e.
#   datasets[<subdir>][<filename without ".pkl">] = load_pkl(<path>)
# Anything that is not a directory / not a .pkl file is skipped.
for subdir in os.listdir(pkl_dir):
    if not os.path.isdir(os.path.join(pkl_dir, subdir)):
        continue
    datasets[subdir] = {}
    for filename in os.listdir(os.path.join(pkl_dir, subdir)):
        if not filename.endswith('.pkl'):
            continue
        key = os.path.join(subdir, filename)
        # Strip only the trailing ".pkl". Splitting on the first "." would
        # truncate names that contain a dot (e.g. "v1.2.pkl" -> "v1") and let
        # two different files silently overwrite each other.
        variation_name = filename[:-len('.pkl')]
        datasets[subdir][variation_name] = load_pkl(os.path.join(pkl_dir, key))

split = 'dev'

dataset_split = f'English_{split}'
qrel_filename = f'{dataset_split}_qrels.txt'

dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

# ground truth RQ3: one JSON object per line of data/<dataset_split>.jsonl.
# The reader is used as a context manager so the file handle is closed as soon
# as all lines have been read.
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)

# select a single variation of the dataset
selected_variation = "pre-nonam-nobio"
dataset: AuredDataset = dataset_variations_dict[selected_variation]

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


In [3]:
# Read config.json and build one retriever/verifier pair per config entry.
# Each pair is stored under the key "<retriever_method>__<verifier_method>".
setups = {}

with open('config.json', 'r') as config_file:
    configs = json.load(config_file)

    for config in configs['configs']:
        exp_fingerprint = '{}__{}'.format(config["retriever_method"], config["verifier_method"])

        # Both factories receive the full config entry as keyword arguments.
        retriever = get_retriever(**config)
        verifier = get_verifier(**config)

        setups[exp_fingerprint] = {
            'retriever': retriever,
            'verifier': verifier,
        }

display(setups)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initializing HFSentenceTransformersRetriever with model: sentence-transformers/multi-qa-distilbert-cos-v1


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initializing HFSentenceTransformersRetriever with model: sentence-transformers/multi-qa-distilbert-cos-v1


{'bm25__transformers-roberta': {'retriever': <lkae.retrieval.methods.bm25.BM25Retriever at 0x21fe9192b30>,
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x21fe9192bf0>},
 'bm25__openai': {'retriever': <lkae.retrieval.methods.bm25.BM25Retriever at 0x21f8b12ebf0>,
  'verifier': <lkae.verification.models.open_ai.OpenaiVerifier at 0x22028a4e0e0>},
 'sent-transformers-hf__transformers-roberta': {'retriever': <lkae.retrieval.methods.sent_transformers_hf.HFSentenceTransformersRetriever at 0x2202ab75fc0>,
  'verifier': <lkae.verification.models.transformers_verifier.TransformersVerifier at 0x2202ab76860>},
 'sent-transformers-hf__openai': {'retriever': <lkae.retrieval.methods.sent_transformers_hf.HFSentenceTransformersRetriever at 0x2202ab76200>,
  'verifier': <lkae.verification.models.open_ai.OpenaiVerifier at 0x2202ac91450>}}

In [4]:
# Build one retriever per distinct retriever_method listed in config.json.
retrievers = {}

with open('config.json', 'r') as file:
    configs = json.load(file)

# Several config entries can share the same retriever_method (they only differ
# in the verifier). Because the result is keyed by retriever_method, building a
# retriever for each entry would construct duplicates that are thrown away
# immediately. Pick the config to use per method first (the last entry wins,
# matching plain dict overwrite semantics), then construct each retriever once.
last_config_by_method = {}
for retriever_config in configs['configs']:
    last_config_by_method[retriever_config['retriever_method']] = retriever_config

for retriever_method, retriever_config in last_config_by_method.items():
    retrievers[retriever_method] = get_retriever(**retriever_config)

retrievers

Initializing HFSentenceTransformersRetriever with model: sentence-transformers/multi-qa-distilbert-cos-v1
Initializing HFSentenceTransformersRetriever with model: sentence-transformers/multi-qa-distilbert-cos-v1


{'bm25': <lkae.retrieval.methods.bm25.BM25Retriever at 0x21fe9193250>,
 'sent-transformers-hf': <lkae.retrieval.methods.sent_transformers_hf.HFSentenceTransformersRetriever at 0x21f8afd5000>}

In [5]:
# For every dataset item that has evidence, store a copy of that evidence as
# the item's "retrieved_evidence". Each copy is an AuthorityPost built from the
# evidence's url, post_id and text, with the last two constructor arguments
# fixed to 1. Items whose evidence is None are reported and left untouched.
for idx, claim in enumerate(dataset):
    gold_posts = claim["evidence"]
    if gold_posts is None:
        print(f"skipped {idx} because no evidence")
        continue
    dataset[idx]["retrieved_evidence"] = [
        AuthorityPost(post.url, post.post_id, post.text, 1, 1)
        for post in gold_posts
    ]

In [6]:
# Run every retriever/verifier setup on the selected dataset variation:
# retrieve evidence, attach it to the dataset, run the verifier, score the
# predictions against the gold labels, and collect one result row per setup.
# The rows are written to <out_dir>/df_verification.csv and displayed sorted
# by Macro-F1.


solomon = Judge(
    scale=False,  # ignore scaling, weigh each evidence evenly, except for confidence score given by verifier
    ignore_nei=True, # ignore NEI predictions
)

out_dir = 'results'
# DataFrame.to_csv does not create missing directories; make sure the target
# exists so a long (and, for API-backed verifiers, paid) run is not lost at
# the very last step.
os.makedirs(out_dir, exist_ok=True)
data = []

for exp_fingerprint in setups:
    start = time.time()
    setup = setups[exp_fingerprint]

    retrieved_data = retrieve_evidence(dataset, setup['retriever'])

    dataset.add_trec_list_judgements(retrieved_data)

    verification_results = run_verifier_on_dataset(
        dataset=dataset,
        verifier=setup['verifier'],
        judge=solomon,
        blind=False,
    )

    macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

    # Report the setup that was actually run. The previous message printed
    # `config`, a loop variable left over from an earlier cell, so every run
    # was labelled with the last entry of config.json.
    print(
        f"result for verification run - Macro-F1: {macro_f1:.4f} Strict-Macro-F1: {strict_macro_f1:.4f} with config {exp_fingerprint}"
    )

    wall_time = time.time() - start

    # The fingerprint is "<retriever_method>__<verifier_method>". Split only on
    # the first separator so unpacking cannot fail with too many parts, and
    # use dedicated names so the notebook-level `retriever` / `verifier`
    # objects are not overwritten with strings.
    retriever_name, verifier_name = exp_fingerprint.split('__', 1)

    data.append({
        'Macro-F1': macro_f1,
        'Strict-Macro-F1': strict_macro_f1,
        'Retrieval_Method': retriever_name,
        'Verifier_Method': verifier_name,
        'DS_Settings': selected_variation,
        'Time (s)': wall_time,
    })

# Convert the list of dictionaries to a DataFrame
df_verification = pd.DataFrame(data)

df_verification.to_csv(f'{out_dir}/df_verification.csv')
print(f'saved df to {out_dir}/df_verification.csv')

# Display the DataFrame
display(df_verification.sort_values(by='Macro-F1', ascending=False))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


result for verification run - Macro-F1: 0.4361 Strict-Macro-F1: 0.4062 with config {'retriever_method': 'sent-transformers-hf', 'retriever_k': 5, 'retriever_model': 'sentence-transformers/multi-qa-distilbert-cos-v1', 'verifier_method': 'openai'}
-----total token usage for verification-----
total tokens:	46556
prompt tokens:	44213
completion tokens:	2343
price estimate:	$0.51242
result for verification run - Macro-F1: 0.8968 Strict-Macro-F1: 0.8827 with config {'retriever_method': 'sent-transformers-hf', 'retriever_k': 5, 'retriever_model': 'sentence-transformers/multi-qa-distilbert-cos-v1', 'verifier_method': 'openai'}
Waiting for model to warm up (for 20.0 seconds)
result for verification run - Macro-F1: 0.5628 Strict-Macro-F1: 0.5015 with config {'retriever_method': 'sent-transformers-hf', 'retriever_k': 5, 'retriever_model': 'sentence-transformers/multi-qa-distilbert-cos-v1', 'verifier_method': 'openai'}
-----total token usage for verification-----
total tokens:	45910
prompt tokens:

Unnamed: 0,Macro-F1,Strict-Macro-F1,Retrieval_Method,Verifier_Method,DS_Settings,Time (s)
1,0.896825,0.882716,bm25,openai,pre-nonam-nobio,526.545298
3,0.896825,0.879917,sent-transformers-hf,openai,pre-nonam-nobio,564.810798
2,0.562802,0.501499,sent-transformers-hf,transformers-roberta,pre-nonam-nobio,132.525914
0,0.436147,0.40617,bm25,transformers-roberta,pre-nonam-nobio,10.869552
