## data loading

In [1]:
import os, json
base_path = '../clef2024-checkthat-lab/task5'
base_data_path = os.path.join(base_path, 'data')

# os.listdir(base_data_path)
train_jsons = []

with open(os.path.join(base_data_path, 'English_train.json'), encoding='utf8') as file:
    for line in file:
        train_jsons += [json.loads(line)]

# len(train_jsons)        
dev_jsons = []

with open(os.path.join(base_data_path, 'English_dev.json'), encoding='utf8') as file:
    for line in file:
        dev_jsons += [json.loads(line)]

# len(dev_jsons)      

## NLI

In [2]:
from transformers import pipeline

# Initialize the NLI pipeline with a pre-trained model
# nli_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")

def check_evidence_with_statement(evidence, statement):
    # Define the candidate labels for NLI
    # candidate_labels = ["SUPPORTS", "REFUTES"]
    input_text = f"{evidence} [SEP] {statement}"

    # Use the NLI pipeline to predict the relationship
    # result = nli_pipeline(evidence, hypothesis=statement, candidate_labels=candidate_labels, multi_label=False)
    result = nli_pipeline(input_text)

    # Return the result
    return result

def create_json_obj(rumor_json):
    if rumor_json['evidence']:
        rumor_text = rumor_json['rumor']
        # evidence_sentences = '. '.join([e[2] for e in rumor_json['evidence']])
        # result = check_evidence_with_statement(evidence_sentences, rumor_text)

        # # print(rumor_json)

        # if result:
        #     highest_score_label = result["labels"][0]
        #     highest_score = result["scores"][0]
        #     # print(f"Label: {highest_score_label}, Score: {highest_score}")
        # else:
        #     # print("Error in processing the NLI task.")
        #     pass
        
        label_map = {
            "CONTRADICTION": "REFUTES",
            "NEUTRAL": "NOT ENOUGH INFO",
            "ENTAILMENT": "SUPPORTS"
        }
        
        predicted_evidence = []
        scores = []

        for author_account, tweet_id, evidence_text in rumor_json['evidence']:
            res = check_evidence_with_statement(evidence_text, rumor_text)
            label = label_map[res[0]['label']]
            score = res[0]['score']
            # CLEF CheckThat! task 5: score is [-1, +1] where 
            #   -1 means evidence strongly refuted
            #   +1 means evidence strongly supports
            if label == "REFUTES":
                score *= -1
            elif label == "NOT ENOUGH INFO":
                score = 0

            predicted_evidence += [[
                author_account,
                tweet_id,
                evidence_text,
                score,
            ]]

            scores += [score]

        cumsum = sum(scores) / len(scores)
        
        if cumsum > 0.3:
            pred_label = "SUPPORTS"
        elif cumsum < -0.3:
            pred_label = "REFUTES"
        else:
            pred_label = "NOT ENOUGH INFO"

        res_json = {
            "id": rumor_json['id'],
            "predicted_label": pred_label,
            "claim": rumor_json['rumor'],
            "label": rumor_json['label'],
            "predicted_evidence": predicted_evidence,
        }

        return res_json
    else:
        return None

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
from utils import write_jsonlines_from_dicts
from tqdm import tqdm

fn = 'temp-data/zeroshot-ver.jsonl'
res_dicts = []
for item in tqdm(dev_jsons):
    res = create_json_obj(item)
    if res:
        res_dicts += [res]

write_jsonlines_from_dicts(fn, res_dicts)

100%|██████████| 32/32 [00:57<00:00,  1.79s/it]

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>





In [7]:
# copied from C/lef2024-checkthat-lab/task5/scorer/verification_scorer.py for ease of use during dev only
import argparse
import jsonlines
from csv import writer
import numpy as np


def strict_f1(actual, predicted, actual_evidence, predicted_evidence, label):

    tp = 0
    fp = 0
    fn = 0
    for i in range(len(actual)):
        if actual[i] != "NOT ENOUGH INFO":
            if (actual[i] == label) & (
                (predicted[i] == label)
                & (bool(set(predicted_evidence[i]) & set(actual_evidence[i])) == True)
            ):
                tp = tp + 1
            elif (actual[i] != label) & (predicted[i] == label):
                fp = fp + 1
            elif (actual[i] == label) & (
                (predicted[i] == label)
                & (bool(set(predicted_evidence[i]) & set(actual_evidence[i])) == False)
            ):
                fp = fp + 1
            elif (predicted[i] != label) & (actual[i] == label):
                fn = fn + 1
        else:
            if (actual[i] == label) & (predicted[i] == label):
                tp = tp + 1
            elif (actual[i] != label) & (predicted[i] == label):
                fp = fp + 1
            elif (predicted[i] != label) & (actual[i] == label):
                fn = fn + 1

    try:
        precision = tp / (tp + fp)
    except:
        precision = 0
    try:
        recall = tp / (tp + fn)
    except:
        recall = 0
    try:
        f1 = 2 * (precision * recall) / (precision + recall)
    except:
        f1 = 0
    return f1


def f1(actual, predicted, label):

    tp = 0
    fp = 0
    fn = 0
    for i in range(len(actual)):
        if (actual[i] == label) & (predicted[i] == label):
            tp = tp + 1
        elif (actual[i] != label) & (predicted[i] == label):
            fp = fp + 1
        elif (predicted[i] != label) & (actual[i] == label):
            fn = fn + 1

    try:
        precision = tp / (tp + fp)
    except:
        precision = 0
    try:
        recall = tp / (tp + fn)
    except:
        recall = 0
    try:
        f1 = 2 * (precision * recall) / (precision + recall)
    except:
        f1 = 0
    return f1


def f1_macro(actual, predicted):
    # `macro` f1- unweighted mean of f1 per label
    return np.mean([f1(actual, predicted, label) for label in np.unique(actual)])


def f1_macro_strict(actual, predicted, actual_evidence, predicted_evidence):
    # `macro` f1- unweighted mean of macro-f1 per label
    return np.mean(
        [
            strict_f1(actual, predicted, actual_evidence, predicted_evidence, label)
            for label in np.unique(actual)
        ]
    )


def eval_run(pred_file, gold_file, out_file):

    gold_dict_labels = {}
    gold_dict_evidence = {}
    for line in jsonlines.open(gold_file):
        gold_dict_labels[line["id"]] = line["label"]
        temp_ev = []
        for ev in line["evidence"]:
            temp_ev.append(str(ev[1]))
        gold_dict_evidence[line["id"]] = temp_ev

    pred = [line for line in jsonlines.open(pred_file)]
    pred_labels = [line["predicted_label"] for line in pred]
    pred_evidence = []
    for line in pred:
        pred_instance = []
        for ev in line["predicted_evidence"]:
            pred_instance.append(str(ev[1]))
        pred_evidence.append(pred_instance)

    actual_labels = []
    actual_evidence = []
    for line in pred:
        actual_labels.append(gold_dict_labels[line["id"]])
        actual_instance = []
        for i in gold_dict_evidence[line["id"]]:
            actual_instance.append(i)
        actual_evidence.append(actual_instance)

    # compute macro-F1 and strict macro-F1
    macro_F1 = f1_macro(actual_labels, pred_labels)
    strict_macro_F1 = f1_macro_strict(
        actual_labels, pred_labels, actual_evidence, pred_evidence
    )

    print("Macro_F1", macro_F1)
    print("Strict Macro_F1", strict_macro_F1)

    result_list = [pred_file.split("/")[-1], macro_F1, strict_macro_F1]
    with open(out_file, "a") as f_object:
        writer_object = writer(f_object, delimiter="\t")
        writer_object.writerow(result_list)
        f_object.close()

In [8]:
task5_dir = '../clef2024-checkthat-lab/task5'

sample_submission_file = task5_dir + '/submission_samples/KGAT_zeroShot_verification_English_dev.json'

nli_submission_file = 'temp-data/zeroshot-ver.jsonl'
ground_truth_file = task5_dir + '/data/Arabic_dev.json'
out_file = 'temp-data/out.csv'

print('sample')
eval_run(sample_submission_file,ground_truth_file, out_file)

print('nli')
eval_run(nli_submission_file,ground_truth_file, out_file)

sample
Macro_F1 0.5081585081585082
Strict Macro_F1 0.5081585081585082
nli
Macro_F1 0.5098039215686274
Strict Macro_F1 0.5098039215686274
