In [3]:
import os
import json
import jsonlines
import time
import pandas as pd
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkl, load_pkls, root_dir, AuredDataset
from lkae.verification.verify import get_verifier
from lkae.utils.scoring import eval_run_custom_nofile
from lkae.verification.verify import Judge, run_verifier_on_dataset
from lkae.utils.data_loading import AuthorityPost

# load_pkls(pkl_dir) yields a mapping that is indexed below by '<language>_<split>'
datasets = load_pkls(pkl_dir)

# possible splits: train, dev, train_dev_combined
# (test, all_combined don't have "labels")
split = 'train_dev_combined'

# Names derived from the chosen split
dataset_split = 'English_' + split
qrel_filename = dataset_split + '_qrels.txt'

# Variations of the selected split, keyed by variation name
dataset_variations_dict = datasets[dataset_split]
print(dataset_variations_dict.keys())

dict_keys(['nopre-nam-bio', 'nopre-nam-nobio', 'nopre-nonam-bio', 'nopre-nonam-nobio', 'pre-nam-bio', 'pre-nam-nobio', 'pre-nonam-bio', 'pre-nonam-nobio'])


In [4]:
# ground truth RQ2
gold_file = os.path.join(root_dir, 'data', f'{dataset_split}.jsonl')

# Read through a context manager so the underlying file handle is closed
# as soon as every record has been materialised into the list.
with jsonlines.open(gold_file) as gold_reader:
    gold_list = list(gold_reader)

# select a set of variations of the dataset
selected_variations = ["pre-nonam-nobio"]

In [5]:
# Read config-compare.json and build one verifier per entry of its 'configs' list,
# keyed by that entry's 'verifier_method' value.

with open('config-compare.json', 'r') as config_file:
    configs = json.load(config_file)

# Each config dict is expanded into keyword arguments for get_verifier
verifiers = {
    verifier_config['verifier_method']: get_verifier(**verifier_config)
    for verifier_config in configs['configs']
}

verifiers

{'openai-4o-mini-run1': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x2c37cc4d5d0>,
 'openai-4o-mini-run2': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x2c37cc4d8a0>,
 'openai-4o-mini-run3': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x2c37b8f3970>}

In [6]:
# compare outputs for different verifiers on one dataset
# first get judgements for all verifiers

# get the dataset
selected_variation = selected_variations[0]
dataset: AuredDataset = dataset_variations_dict[selected_variation]
# NOTE(review): presumably a full-slice copy so the loop below works on a list of
# items; the items themselves are still shared with the loaded dataset -- confirm
# against AuredDataset.__getitem__.
dataset = dataset[:]

# Feed each item's own "evidence" entries to the verifiers as its "retrieved_evidence"
for i, item in enumerate(dataset):
    evidences = item["evidence"]
    if evidences is None:
        # Items without evidence keep whatever "retrieved_evidence" they already had
        print(f"skipped {i} because no evidence")
        continue
    # NOTE(review): the two trailing 1s are passed positionally to AuthorityPost;
    # their meaning is not visible from here -- confirm against its definition.
    dataset[i]["retrieved_evidence"] = [
        AuthorityPost(ev.url, ev.post_id, ev.text, 1, 1) for ev in evidences
    ]

solomon = Judge(
    scale=False,  # ignore scaling, weigh each evidence evenly, except for confidence score given by verifier
    ignore_nei=True,  # ignore NEI predictions
)

# One entry per verifier label: raw predictions, verification time and F1 scores
results = {}
for verifier_label, verifier in verifiers.items():
    # perf_counter is monotonic, so the measurement is unaffected by system clock changes
    start = time.perf_counter()

    verification_results = run_verifier_on_dataset(
        dataset=dataset,
        verifier=verifier,
        judge=solomon,
        blind=False,
    )

    # Stop the clock before scoring so "time" reflects the verifier run only
    elapsed = time.perf_counter() - start

    macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

    results[verifier_label] = {
        "res_dict": verification_results,
        "time": elapsed,
        "macro_f1": macro_f1,
        "strict_f1": strict_macro_f1,
        "settings": {"verifier": verifier_label, "dataset": selected_variation},
    }

-----total token usage for verification-----
total tokens:	95516
prompt tokens:	90859
completion tokens:	4657
price estimate:	$1.0483
-----total token usage for verification-----
total tokens:	95519
prompt tokens:	90859
completion tokens:	4660
price estimate:	$1.04839
-----total token usage for verification-----
total tokens:	95522
prompt tokens:	90859
completion tokens:	4663
price estimate:	$1.04848


In [7]:
# Pretty-print the complete results mapping as indented JSON
results_json = json.dumps(results, indent=2)
print(results_json)

{
  "openai-4o-mini-run1": {
    "res_dict": [
      {
        "id": "AuRED_014",
        "label": "REFUTES",
        "claim": "' Urgent Ramallah Ministry of Health spokesman Kamal Al-Shakhra: We received 2 000 doses of the American 'Moderna' Corona vaccine and this batch will be designated for President Abbas the Fatah Central Committee and VIPs '",
        "predicted_label": "REFUTES",
        "predicted_evidence": [
          [
            "https://twitter.com/palestine_moh",
            "1357211717479116800",
            "Statement from Authority Account 'palestine moh': 'Pictures from the launch of the vaccination campaign against the Coronavirus starting with the medical and health teams in Bethlehem vaccine Palestine COVID19 '",
            0.0
          ],
          [
            "https://twitter.com/palestine_moh",
            "1356683687870488576",
            "Statement from Authority Account 'palestine moh': 'The Ministry added that the second group that will receive the va

In [8]:
import pickle as pkl

# Persist the results mapping so it can be reloaded without re-running the verifiers.
# Make sure the output directory exists, and write through a context manager so the
# file is flushed and closed even if pickling fails part-way.
os.makedirs('results', exist_ok=True)
with open('results/results.pkl', 'wb') as results_file:
    pkl.dump(results, results_file)

In [9]:
# {
#   "transformers-roberta": {
#     "res_dict": [
#       {
#         "id": "AuRED_142",
#         "label": "REFUTES",
#         "claim": "Naturalization decree in preparation: Lebanese passports for sale !",
#         "predicted_label": "REFUTES",
#         "predicted_evidence": [
#           [
#             "https://twitter.com/LBpresidency",
#             "1555986659279360001",
#             "Statement from Authority Account 'LBpresidency': ''The Information Office of the Presidency of the Republic denies a false news broadcast by the MTV station about Baabda Palace preparing a decree naturalizing 4 000 people and recalls that it had denied yesterday the false information published by the French magazine 'Liberation' about the same fabricated news ''",
#             0.5575303435325623
#           ],
#           [
#             "https://twitter.com/LBpresidency",
#             "1555424541509386240",
#             "Statement from Authority Account 'LBpresidency': ''The Information Office of the Presidency of the Republic: What was published by the French newspaper 'Liberation' about the 'selling' of Lebanese passports to non-Lebanese is false and baseless news '",
#             0.9313378930091858
#           ]
#         ]
#       },
#     ],
#     "time": 1.071157693862915,
#     "macro_f1": 0.27692307692307694,
#     "strict_f1": 0.27692307692307694,
#     "settings": {
#       "verifier": "transformers-roberta",
#       "dataset": "pre-nonam-nobio"
#     }
#   },
# }

In [10]:
# Remove the cap on displayed columns so wide frames are rendered without truncation
pd.options.display.max_columns = None

In [11]:
# Build a side-by-side table (one row per item id) holding every verifier's predicted
# label and its scored evidence texts, then display it, save it as CSV and print a summary.

# Upper bound on the number of evidence columns emitted per verifier
MAX_EVIDENCE_COLUMNS = 5

# Create a dictionary to store the comparison data, keyed by item id
comparison_data = {}

# Determine the maximum number of evidence pieces across all results.
# A single flat pass with default=0 avoids the ValueError that max() raises
# when a verifier produced an empty 'res_dict'.
max_evidence = max(
    (
        len(item['predicted_evidence'])
        for result in results.values()
        for item in result['res_dict']
    ),
    default=0,
)

# clamp to the configured number of evidence columns
max_evidence = min(max_evidence, MAX_EVIDENCE_COLUMNS)

# Iterate through each result in the results dictionary
for verifier_label, result in results.items():
    for item in result['res_dict']:
        item_id = item['id']
        if item_id not in comparison_data:
            # First time this item is seen: record the verifier-independent fields
            comparison_data[item_id] = {
                'id': item['id'],
                'claim': item['claim'],
                'label': item['label'],
            }

        comparison_data[item_id][f'{verifier_label}-pred_label'] = item['predicted_label']

        # Add evidence columns formatted as "(score) text"; index 2 of an evidence
        # entry is its text and index 3 its score. Missing slots are left empty.
        for i in range(max_evidence):
            if i < len(item['predicted_evidence']):
                ev = item['predicted_evidence'][i]
                comparison_data[item_id][f'{verifier_label}-ev_{i+1}'] = f"({ev[3]:.1f}) {ev[2]} "
            else:
                comparison_data[item_id][f'{verifier_label}-ev_{i+1}'] = ''

# Define the column order: shared fields, then all predicted labels, then all evidence
columns = ['id', 'claim', 'label']
for verifier in results.keys():
    columns.append(f'{verifier}-pred_label')

for verifier in results.keys():
    for i in range(max_evidence):
        columns.append(f'{verifier}-ev_{i+1}')

# Create the DataFrame with its columns already selected and ordered
df_comparison = pd.DataFrame(list(comparison_data.values()), columns=columns)


def labels_match(row):
    """Return 'Yes' when every verifier predicted the same label for this row, else 'No'."""
    predicted_labels = [row[f'{verifier}-pred_label'] for verifier in results.keys()]
    return 'Yes' if len(set(predicted_labels)) == 1 else 'No'


# Add a column to indicate if predicted labels match; a newly assigned column
# is appended last, so no further reordering is needed
df_comparison['Predicted Labels Match'] = df_comparison.apply(labels_match, axis=1)

# Display the DataFrame
display(df_comparison)

# Optionally, save the DataFrame to a CSV file for easier viewing in spreadsheet software
os.makedirs('results', exist_ok=True)
df_comparison.to_csv('results/verifier_comparison_detailed.csv', index=False)

# Print summary statistics
print("\nSummary Statistics:")
for verifier_label, result in results.items():
    print(f"\n{verifier_label}:")
    print(f"  Macro F1: {result['macro_f1']:.4f}")
    print(f"  Strict F1: {result['strict_f1']:.4f}")
    print(f"  Time: {result['time']:.2f} seconds")

# Print the percentage of matching predictions
matching_percentage = (df_comparison['Predicted Labels Match'] == 'Yes').mean() * 100
print(f"\nPercentage of matching predictions: {matching_percentage:.2f}%")

Unnamed: 0,id,claim,label,openai-4o-mini-run1-pred_label,openai-4o-mini-run2-pred_label,openai-4o-mini-run3-pred_label,openai-4o-mini-run1-ev_1,openai-4o-mini-run1-ev_2,openai-4o-mini-run1-ev_3,openai-4o-mini-run1-ev_4,openai-4o-mini-run1-ev_5,openai-4o-mini-run2-ev_1,openai-4o-mini-run2-ev_2,openai-4o-mini-run2-ev_3,openai-4o-mini-run2-ev_4,openai-4o-mini-run2-ev_5,openai-4o-mini-run3-ev_1,openai-4o-mini-run3-ev_2,openai-4o-mini-run3-ev_3,openai-4o-mini-run3-ev_4,openai-4o-mini-run3-ev_5,Predicted Labels Match
0,AuRED_014,' Urgent Ramallah Ministry of Health spokesman...,REFUTES,REFUTES,REFUTES,REFUTES,(0.0) Statement from Authority Account 'palest...,(0.0) Statement from Authority Account 'palest...,(0.9) Statement from Authority Account 'palest...,(0.0) Statement from Authority Account 'palest...,,(0.0) Statement from Authority Account 'palest...,(0.0) Statement from Authority Account 'palest...,(0.9) Statement from Authority Account 'palest...,(0.0) Statement from Authority Account 'palest...,,(0.0) Statement from Authority Account 'palest...,(0.0) Statement from Authority Account 'palest...,(0.9) Statement from Authority Account 'palest...,(0.0) Statement from Authority Account 'palest...,,Yes
1,AuRED_037,Macron to Sky News: After my visit to Mrs Fair...,REFUTES,REFUTES,REFUTES,REFUTES,(0.0) Statement from Authority Account 'salman...,(0.9) Statement from Authority Account 'salman...,(0.0) Statement from Authority Account 'salman...,,,(0.0) Statement from Authority Account 'salman...,(0.9) Statement from Authority Account 'salman...,(0.0) Statement from Authority Account 'salman...,,,(0.0) Statement from Authority Account 'salman...,(0.9) Statement from Authority Account 'salman...,(0.0) Statement from Authority Account 'salman...,,,Yes
2,AuRED_085,Saudi Arabia evacuated 10 students from China ...,REFUTES,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,(0.0) Statement from Authority Account 'YSUCOR...,(0.0) Statement from Authority Account 'YSUCOR...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'YSUCOR...,(0.0) Statement from Authority Account 'YSUCOR...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'YSUCOR...,(0.0) Statement from Authority Account 'YSUCOR...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'Yemen ...,(0.0) Statement from Authority Account 'Yemen ...,Yes
3,AuRED_089,The Corona epidemic has reached the Emirates w...,REFUTES,REFUTES,REFUTES,REFUTES,(0.0) Statement from Authority Account 'WHOEMR...,(0.9) Statement from Authority Account 'mohapu...,(0.0) Statement from Authority Account 'mohapu...,(0.9) Statement from Authority Account 'mohapu...,(0.9) Statement from Authority Account 'NCEMAU...,(0.0) Statement from Authority Account 'WHOEMR...,(0.9) Statement from Authority Account 'mohapu...,(0.0) Statement from Authority Account 'mohapu...,(0.9) Statement from Authority Account 'mohapu...,(0.9) Statement from Authority Account 'NCEMAU...,(0.0) Statement from Authority Account 'WHOEMR...,(0.9) Statement from Authority Account 'mohapu...,(0.0) Statement from Authority Account 'mohapu...,(0.9) Statement from Authority Account 'mohapu...,(0.9) Statement from Authority Account 'NCEMAU...,Yes
4,AuRED_135,The official spokesman for the Football Associ...,REFUTES,REFUTES,REFUTES,REFUTES,(0.9) Statement from Authority Account 'AlAhly...,(0.8) Statement from Authority Account 'AlAhly...,,,,(0.9) Statement from Authority Account 'AlAhly...,(0.8) Statement from Authority Account 'AlAhly...,,,,(0.9) Statement from Authority Account 'AlAhly...,(0.8) Statement from Authority Account 'AlAhly...,,,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,AuRED_059,Urgent ️ ️ ️ ️ Hafeez Draghi was expelled from...,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,,,,,,,,,,,,,,,,Yes
111,AuRED_033,After his appointment as Vice-President of the...,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,,,,,,,,,,,,,,,,Yes
112,AuRED_001,'Musa Abu Marzouq Al-Hamsawi congratulates the...,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,,,,,,,,,,,,,,,,Yes
113,AuRED_039,Jerusalem The Eternal Capital of Palestine Can...,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,NOT ENOUGH INFO,,,,,,,,,,,,,,,,Yes



Summary Statistics:

openai-4o-mini-run1:
  Macro F1: 0.9064
  Strict F1: 0.9064
  Time: 1121.56 seconds

openai-4o-mini-run2:
  Macro F1: 0.9035
  Strict F1: 0.9035
  Time: 1117.15 seconds

openai-4o-mini-run3:
  Macro F1: 0.9064
  Strict F1: 0.9064
  Time: 1093.95 seconds

Percentage of matching predictions: 99.13%
