In [5]:
import os
import json
import jsonlines
import time
import pandas as pd
import pickle as pkl
from IPython.display import display

from lkae.utils.data_loading import pkl_dir, load_pkl, load_pkls, root_dir, AuredDataset
from lkae.verification.verify import get_verifier
from lkae.utils.scoring import eval_run_custom_nofile
from lkae.verification.verify import Judge, run_verifier_on_dataset
from lkae.utils.data_loading import AuthorityPost

# datasets = load_pkls(pkl_dir)

# Possible splits: train, dev, train_dev_combined
# (test and all_combined don't have "labels")
split = 'dev'

# Dataset key for the chosen split, e.g. 'English_dev'.
dataset_split = 'English_' + split
# qrel_filename = f'{dataset_split}_qrels.txt'

# dataset_variations_dict = datasets[dataset_split]
# print(dataset_variations_dict.keys())

In [6]:
# Load the inverted dataset for the configured split.
# The path is derived from `dataset_split` (set in the configuration cell) so
# that changing `split` cannot silently keep evaluating on the dev file; for
# split == 'dev' this resolves to './English_dev_inverted.jsonl'.
# NOTE(review): assumes inverted files for other splits follow the same
# '<dataset_split>_inverted.jsonl' naming — confirm before switching splits.
ground_truth_fp = f'./{dataset_split}_inverted.jsonl'

dataset = AuredDataset(
    ground_truth_fp,
    preprocess=True,
    add_author_bio=False,
    add_author_name=False,
)

# Show the first record as a sanity check of the loaded structure.
dataset[0]

{'id': 'AuRED_038',
 'rumor': 'News of a missile passing over Kuwaiti airspace coming from the Iraqi side',
 'label': 'SUPPORTS',
 'evidence': [AuthorityPost(url='https://twitter.com/KuwaitArmyGHQ', post_id='1304111096949866497', text="Statement from Authority Account 'KuwaitArmyGHQ': 'The Chief of Staff confirms what is being circulated through various media outlets and confirms that the various sectors of the army are working according to the states of readiness assigned to them to preserve the country's security and territorial integrity It calls on everyone to investigate accuracy and obtain information from its official sources represented by the Directorate of Moral Guidance and Public Relations '", rank=None, score=None)],
 'timeline': [],
 'retrieved_evidence': []}

In [7]:
# Ground truth for RQ2: the gold records are read from the same JSONL file the
# dataset was loaded from.
gold_file = ground_truth_fp

# Use the reader as a context manager so the underlying file is closed once
# all records have been materialised.
with jsonlines.open(gold_file) as reader:
    gold_list = list(reader)

# Select a set of variations of the dataset.
# NOTE(review): this label is only used for cache file names and the
# "settings" entry further down; the dataset above is loaded with
# preprocess=True, so confirm that "nopre" is the intended label.
selected_variations = ["nopre-nonam-nobio"]

In [8]:
# Load each verifier config and construct its verifier.

# Read the JSON up front and close the file before any verifier is built;
# the explicit encoding avoids depending on the platform default.
with open('config.json', 'r', encoding='utf-8') as file:
    configs = json.load(file)

# Map each config's 'verifier_method' to the verifier built from that config.
# A later config with the same 'verifier_method' replaces an earlier one.
verifiers = {}
for config in configs['configs']:
    verifiers[config['verifier_method']] = get_verifier(**config)

verifiers

{'llama3-70b': <lkae.verification.models.llama3_azure_ai.Llama3AzureVerifier at 0x21ba37417b0>,
 'openai-4o-mini': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x21ba3741900>,
 'openai-4o': <lkae.verification.models.openai_verifier.OpenaiVerifier at 0x21bbdb83a90>}

In [9]:
# Compare outputs for different verifiers on one dataset.
# Step 1: feed every item's gold evidence to the verifiers by copying it into
# "retrieved_evidence", giving each post rank 1 and score 1.

for idx, entry in enumerate(dataset):
    gold_posts = entry["evidence"]
    if gold_posts is None:
        # Nothing to copy; this item's "retrieved_evidence" is left untouched.
        print(f"skipped {idx} because no evidence")
        continue
    dataset[idx]["retrieved_evidence"] = [
        AuthorityPost(post.url, post.post_id, post.text, 1, 1)
        for post in gold_posts
    ]

# Step 2: get judgements for all verifiers, reusing cached runs when present.

solomon = Judge(
    scale=False,  # ignore scaling, weigh each evidence evenly, except for confidence score given by verifier
    ignore_nei=True,  # ignore NEI predictions
)

selected_variation = selected_variations[0]

# results[verifier_label] -> predictions, timing, scores and settings of one run.
results = {}
for verifier_label in verifiers:
    start = time.time()

    # NOTE(review): the cache key only encodes the variation label and the
    # verifier, not the dataset file — confirm a cached file really belongs
    # to this dataset before trusting the scores below.
    run_filename = f'results/{selected_variation}_{verifier_label}.pkl'

    # check if the file already exists from a previous run
    if os.path.exists(run_filename):
        print(f'found {run_filename}, loading from file')
        # pickle can execute arbitrary code on load: only load cache files
        # produced by this project.
        with open(run_filename, 'rb') as cache_file:
            verification_results = pkl.load(cache_file)
    else:
        print(f'running {verifier_label} on {selected_variation}')
        verification_results = run_verifier_on_dataset(
            dataset=dataset,
            verifier=verifiers[verifier_label],
            judge=solomon,
            blind=False,
        )
        # Make sure the cache directory exists so a finished (expensive) run
        # is not lost to a FileNotFoundError; the context manager guarantees
        # the pickle is flushed and the handle closed.
        os.makedirs(os.path.dirname(run_filename), exist_ok=True)
        with open(run_filename, 'wb') as cache_file:
            pkl.dump(verification_results, cache_file)

    macro_f1, strict_macro_f1 = eval_run_custom_nofile(verification_results, gold_list)

    results[verifier_label] = {
        "res_dict": verification_results,
        # Wall-clock time of this iteration; on a cache hit this is only the
        # time to load and score, not the verifier's running time.
        "time": time.time() - start,
        "macro_f1": macro_f1,
        "strict_f1": strict_macro_f1,
        "settings": {"verifier": verifier_label, "dataset": selected_variation},
    }

found results/nopre-nonam-nobio_llama3-70b.pkl, loading from file
found results/nopre-nonam-nobio_openai-4o-mini.pkl, loading from file
found results/nopre-nonam-nobio_openai-4o.pkl, loading from file


In [10]:
# print(json.dumps(results, indent=2))

In [11]:
# Illustrative shape of `results` (one entry per verifier label; abbreviated sample):
# {
#   "transformers-roberta": {
#     "res_dict": [
#       {
#         "id": "AuRED_142",
#         "label": "REFUTES",
#         "claim": "Naturalization decree in preparation: Lebanese passports for sale !",
#         "predicted_label": "REFUTES",
#         "predicted_evidence": [
#           [
#             "https://twitter.com/LBpresidency",
#             "1555986659279360001",
#             "Statement from Authority Account 'LBpresidency': ''The Information Office of the Presidency of the Republic denies a false news broadcast by the MTV station about Baabda Palace preparing a decree naturalizing 4 000 people and recalls that it had denied yesterday the false information published by the French magazine 'Liberation' about the same fabricated news ''",
#             0.5575303435325623
#           ],
#           [
#             "https://twitter.com/LBpresidency",
#             "1555424541509386240",
#             "Statement from Authority Account 'LBpresidency': ''The Information Office of the Presidency of the Republic: What was published by the French newspaper 'Liberation' about the 'selling' of Lebanese passports to non-Lebanese is false and baseless news '",
#             0.9313378930091858
#           ]
#         ]
#       },
#     ],
#     "time": 1.071157693862915,
#     "macro_f1": 0.27692307692307694,
#     "strict_f1": 0.27692307692307694,
#     "settings": {
#       "verifier": "transformers-roberta",
#       "dataset": "pre-nonam-nobio"
#     }
#   },
# }

In [12]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [13]:
# Build a wide table with one row per item id, holding every verifier's
# predicted label and its first few evidence pieces.

# Upper bound on the number of evidence columns emitted per verifier.
MAX_EVIDENCE_COLUMNS = 5

# Largest number of evidence pieces on any item of any verifier.
# `default=0` avoids a ValueError when `results`, or one verifier's
# `res_dict`, is empty.
max_evidence = max(
    (
        len(item['predicted_evidence'])
        for result in results.values()
        for item in result['res_dict']
    ),
    default=0,
)

# clamp to the configured maximum number of evidence pieces
max_evidence = min(max_evidence, MAX_EVIDENCE_COLUMNS)

# item id -> row dict; rows are created on first sight of an id and then
# extended with the columns of each verifier.
comparison_data = {}
for verifier_label, result in results.items():
    for item in result['res_dict']:
        row = comparison_data.setdefault(
            item['id'],
            {
                'id': item['id'],
                'claim': item['claim'],
                'label': item['label'],
            },
        )

        row[f'{verifier_label}-pred_label'] = item['predicted_label']

        # One column per evidence slot: "(<4th element, one decimal>) <3rd element> ",
        # or an empty string when the item has fewer evidence pieces.
        evidence = item['predicted_evidence']
        for i in range(max_evidence):
            if i < len(evidence):
                ev = evidence[i]
                row[f'{verifier_label}-ev_{i+1}'] = f"({ev[3]:.1f}) {ev[2]} "
            else:
                row[f'{verifier_label}-ev_{i+1}'] = ''

# Column order: identifying columns, then all predicted labels, then the
# evidence columns grouped by slot (ev_1 of every verifier, ev_2, ...).
columns = ['id', 'claim', 'label']
columns += [f'{verifier}-pred_label' for verifier in results]

ev_pieces = {
    i: [f'{verifier}-ev_{i+1}' for verifier in results]
    for i in range(max_evidence)
}
for slot_columns in ev_pieces.values():
    columns.extend(slot_columns)

# Create the DataFrame in the desired column order. `reindex` (instead of
# plain selection) keeps this from raising KeyError when a verifier
# contributed no rows; such columns are filled with NaN.
df_comparison = pd.DataFrame(list(comparison_data.values())).reindex(columns=columns)

# Helper for the 'Predicted Labels Match' column
def labels_match(row):
    """Return 'Yes' when all verifiers predicted the same label for `row`, else 'No'.

    Reads the '<verifier>-pred_label' entry of `row` for every key of the
    notebook-level `results` dict.
    """
    distinct_labels = {row[f'{verifier}-pred_label'] for verifier in results}
    if len(distinct_labels) == 1:
        return 'Yes'
    return 'No'

# Flag, per row, whether every verifier predicted the same label.
match_flags = df_comparison.apply(labels_match, axis=1)
df_comparison['Predicted Labels Match'] = match_flags

# The flag column was appended last, so re-selecting the columns in their
# current order keeps 'Predicted Labels Match' at the end.
cols = list(df_comparison.columns)
df_comparison = df_comparison[cols]

# Display the DataFrame
display(df_comparison)

# Optionally, save the DataFrame to a CSV file for easier viewing in spreadsheet software
df_comparison.to_csv('results/verifier_comparison_detailed.csv', index=False)

# Print summary statistics: one short block of scores and timing per verifier.
print("\nSummary Statistics:")
for verifier_label, result in results.items():
    summary_lines = [
        f"\n{verifier_label}:",
        f"  Macro F1: {result['macro_f1']:.4f}",
        f"  Strict F1: {result['strict_f1']:.4f}",
        f"  Time: {result['time']:.2f} seconds",
    ]
    print("\n".join(summary_lines))

# Print the percentage of rows on which all verifiers agree.
matching_percentage = df_comparison['Predicted Labels Match'].eq('Yes').mean() * 100
print(f"\nPercentage of matching predictions: {matching_percentage:.2f}%")

Unnamed: 0,id,claim,label,llama3-70b-pred_label,openai-4o-mini-pred_label,openai-4o-pred_label,llama3-70b-ev_1,openai-4o-mini-ev_1,openai-4o-ev_1,llama3-70b-ev_2,openai-4o-mini-ev_2,openai-4o-ev_2,llama3-70b-ev_3,openai-4o-mini-ev_3,openai-4o-ev_3,llama3-70b-ev_4,openai-4o-mini-ev_4,openai-4o-ev_4,llama3-70b-ev_5,openai-4o-mini-ev_5,openai-4o-ev_5,Predicted Labels Match
0,AuRED_038,News of a missile passing over Kuwaiti airspac...,SUPPORTS,SUPPORTS,NOT ENOUGH INFO,NOT ENOUGH INFO,(-0.8) Statement from Authority Account 'Kuwai...,(0.0) Statement from Authority Account 'Kuwait...,(0.0) Statement from Authority Account 'Kuwait...,,,,,,,,,,,,,No
1,AuRED_064,Today the Turkish Minister of Defense visited ...,SUPPORTS,SUPPORTS,REFUTES,NOT ENOUGH INFO,(-0.8) Statement from Authority Account 'Gover...,(0.0) Statement from Authority Account 'Govern...,(0.0) Statement from Authority Account 'Govern...,(-0.8) Statement from Authority Account 'MFATu...,(0.0) Statement from Authority Account 'MFATur...,(0.0) Statement from Authority Account 'MFATur...,(0.0) Statement from Authority Account 'MFATur...,(0.0) Statement from Authority Account 'MFATur...,(0.0) Statement from Authority Account 'MFATur...,(-0.8) Statement from Authority Account 'Trabl...,(0.0) Statement from Authority Account 'Trablu...,(0.0) Statement from Authority Account 'Trablu...,(0.8) Statement from Authority Account 'NajWhe...,(0.8) Statement from Authority Account 'NajWhe...,(0.0) Statement from Authority Account 'NajWhe...,No
2,AuRED_083,Libya - Sources of the event: News of the arri...,SUPPORTS,SUPPORTS,SUPPORTS,SUPPORTS,(-0.9) Statement from Authority Account 'USEmb...,(-0.9) Statement from Authority Account 'USEmb...,(-0.9) Statement from Authority Account 'USEmb...,,,,,,,,,,,,,Yes
3,AuRED_086,Al-Ittihad Raid I League Urgently the administ...,SUPPORTS,SUPPORTS,NOT ENOUGH INFO,NOT ENOUGH INFO,(0.9) Statement from Authority Account 'spokes...,(0.0) Statement from Authority Account 'spokes...,(0.0) Statement from Authority Account 'spokes...,(-0.8) Statement from Authority Account 'Saudi...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,(-0.8) Statement from Authority Account 'Saudi...,(0.0) Statement from Authority Account 'Saudi ...,(0.0) Statement from Authority Account 'Saudi ...,No
4,AuRED_099,Qatar did not throw Iranian peas into garbage ...,SUPPORTS,SUPPORTS,REFUTES,NOT ENOUGH INFO,(-0.8) Statement from Authority Account 'albal...,(0.9) Statement from Authority Account 'albala...,(0.0) Statement from Authority Account 'albala...,(-0.8) Statement from Authority Account 'albal...,(0.0) Statement from Authority Account 'albala...,(0.0) Statement from Authority Account 'albala...,,,,,,,,,,No
5,AuRED_100,The Kingdom will not support the Global Pandem...,REFUTES,REFUTES,REFUTES,REFUTES,(0.9) Statement from Authority Account 'FahadA...,(0.9) Statement from Authority Account 'FahadA...,(0.9) Statement from Authority Account 'FahadA...,(0.9) Statement from Authority Account 'spagov...,(0.9) Statement from Authority Account 'spagov...,(0.9) Statement from Authority Account 'spagov...,,,,,,,,,,Yes
6,AuRED_104,Urgent: Libya denies the opening of new corrid...,REFUTES,REFUTES,REFUTES,REFUTES,(0.9) Statement from Authority Account 'Dabaib...,(0.9) Statement from Authority Account 'Dabaib...,(0.9) Statement from Authority Account 'Dabaib...,,,,,,,,,,,,,Yes
7,AuRED_106,Riyad Mahrez is not among the candidates to wi...,REFUTES,REFUTES,REFUTES,REFUTES,(0.9) Statement from Authority Account 'LesVer...,(0.9) Statement from Authority Account 'LesVer...,(0.9) Statement from Authority Account 'LesVer...,,,,,,,,,,,,,Yes
8,AuRED_108,Qatar renews contract with 'Al-Annabi' coach,REFUTES,REFUTES,REFUTES,REFUTES,(0.8) Statement from Authority Account 'QFA': ...,(0.0) Statement from Authority Account 'QFA': ...,(0.9) Statement from Authority Account 'QFA': ...,(0.9) Statement from Authority Account 'QFA': ...,(0.9) Statement from Authority Account 'QFA': ...,(0.9) Statement from Authority Account 'QFA': ...,,,,,,,,,,Yes
9,AuRED_109,Qatar will not host the 2024 World Endurance R...,REFUTES,REFUTES,REFUTES,REFUTES,(0.9) Statement from Authority Account 'KBKAlT...,(0.9) Statement from Authority Account 'KBKAlT...,(0.9) Statement from Authority Account 'KBKAlT...,,,,,,,,,,,,,Yes



Summary Statistics:

llama3-70b:
  Macro F1: 0.7560
  Strict F1: 0.7560
  Time: 0.00 seconds

openai-4o-mini:
  Macro F1: 0.5208
  Strict F1: 0.5208
  Time: 0.00 seconds

openai-4o:
  Macro F1: 0.7393
  Strict F1: 0.7393
  Time: 0.01 seconds

Percentage of matching predictions: 42.11%
