# start to finish

## data loading

In [1]:
from utils import load_rumors_from_jsonl
import os

out_dir = './temp-data'

clef_path = '../clef2024-checkthat-lab/task5'
data_path = os.path.join(clef_path, 'data')

filepath_train = os.path.join(data_path, 'English_train.json')
filepath_dev = os.path.join(data_path, 'English_dev.json')

train_jsons = load_rumors_from_jsonl(filepath_train)
dev_jsons = load_rumors_from_jsonl(filepath_dev)

print(f'loaded {len(train_jsons)} training json objects and {len(dev_jsons)} dev objects')

loaded 96 training json objects and 32 dev objects


## Retrieval

### naive tfidf

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple

def retrieve_relevant_documents(query: str, timeline: List[List[str]]) -> List[Tuple[str, str, str, float]]:
    # Get only doc texts
    author_accounts = [t[0] for t in timeline]
    tweet_ids = [t[1] for t in timeline]
    documents = [t[2] for t in timeline]

    # Combine query and documents for TF-IDF vectorization
    combined_texts = [query] + documents
    
    # Generate TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(combined_texts)
    
    # Calculate similarity of the query to each document
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    
    # Rank documents based on similarity scores
    ranked_doc_indices = similarity_scores.argsort()[0][::-1]

    # Sort the documents according to rank
    ranked_scores = [similarity_scores[0][i] for i in ranked_doc_indices]
    ranked_authors = [author_accounts[i] for i in ranked_doc_indices]
    ranked_ids = [tweet_ids[i] for i in ranked_doc_indices]
    ranked_documents = [documents[i] for i in ranked_doc_indices]

    # Create a list of tuples of shape (author, evidence_id, evidence_text, score)
    ranked_tuples = (list(zip(ranked_authors, ranked_ids, ranked_documents, ranked_scores)))
    
    return ranked_tuples

In [3]:
from utils import RankedDocs
from typing import List

def retrieve_using_tfidf(query: str, timeline: List[List[str]], k: int = 5) -> List[RankedDocs]:
    """
    Parameters:
        - query: a rumor or claim tweet text
        - timeline: timeline surrounding the tweet
        - k: top-k results to return. defaults to 5 

    Returns: 
    List of tuples of shape [(author_account, authority_tweet_id, doc_text, rank, score), ...]
    """

    ranked_docs = retrieve_relevant_documents(query, timeline)

    res = []
    for rank, (author_account, authority_tweet_id, doc_text, score) in enumerate(ranked_docs[:k]):
        res += [(author_account, authority_tweet_id, doc_text, rank+1, score)] 

    return res

## Verification

### using NLI

In [5]:
from transformers import pipeline
from typing import Dict, TypedDict, Union

class VerificationResult(TypedDict):
    label: str
    score: float

# Initialize the NLI pipeline with a pre-trained model
# nli_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")

def check_statement_with_evidence(statement: str, evidence: str) -> VerificationResult:
    # Define the candidate labels for NLI
    # candidate_labels = ["SUPPORTS", "REFUTES"]
    input_text = f"{evidence} [SEP] {statement}"

    # Use the NLI pipeline to predict the relationship
    # result = nli_pipeline(evidence, hypothesis=statement, candidate_labels=candidate_labels, multi_label=False)
    result = nli_pipeline(input_text)

    # Return the result
    return result[0]

def factcheck_using_evidence(claim: str, evidence: List[RankedDocs]):
    """
    Predict a judgement for a rumor using the retrieved evidence.

    Parameters:
        - rumor_dict (Dict): a Python Dict of a single rumor from the dataset, extended by the key 'rerieved_evidence'
    """
    label_map = {
        "CONTRADICTION": "REFUTES",
        "NEUTRAL": "NOT ENOUGH INFO",
        "ENTAILMENT": "SUPPORTS"
    }
    
    predicted_evidence = []
    scores = []

    for author_account, tweet_id, evidence_text, rank, score in evidence:
        res = check_statement_with_evidence(claim, evidence_text)
        label = label_map[res['label']]
        score = res['score']

        # CLEF CheckThat! task 5: score is [-1, +1] where 
        #   -1 means evidence strongly refuted
        #   +1 means evidence strongly supports

        if label == "REFUTES":
            score *= -1
        elif label == "NOT ENOUGH INFO":
            score = 0 # TODO uhmmm...

        predicted_evidence += [[
            author_account,
            tweet_id,
            evidence_text,
            score,
        ]]

        scores += [score]

    cumsum = sum(scores) / len(scores)
    
    if cumsum > 0.3:
        pred_label = "SUPPORTS"
    elif cumsum < -0.3:
        pred_label = "REFUTES"
    else:
        pred_label = "NOT ENOUGH INFO"

    return pred_label, predicted_evidence

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Running start-to-end

In [10]:
from utils import write_jsonlines_from_dicts
from tqdm import tqdm

ranked_docs_by_id = {}
rumors_retrieved = []
res_jsons = []

class OutputDict(TypedDict):
    id: str
    label: str
    claim: str
    predicted_label: str
    predicted_evidence: List[List[Union[str, float]]]

for rumor_dict in tqdm(dev_jsons):

    # unpack the dict from the dataset
    rumor_id = rumor_dict['id']
    query = rumor_dict['rumor']
    label = rumor_dict['label']
    timeline = rumor_dict['timeline']
    evidence = rumor_dict['evidence']
    
    ranked_docs = retrieve_using_tfidf(query, timeline, 5)

    pred_label, pred_evidence = factcheck_using_evidence(query, ranked_docs)

    res_json = {
        "id": rumor_id,
        "label": label,
        "claim": query,
        "predicted_label": pred_label,
        "predicted_evidence": pred_evidence,
    }
    res_jsons += [res_json]

outfile_tfidf_ver = 'temp-data/zeroshot-ver-from-tfidf.jsonl'
write_jsonlines_from_dicts(outfile_tfidf_ver, res_jsons)

100%|██████████| 32/32 [02:53<00:00,  5.43s/it]


## Scoring

In [11]:
from scoring_utils import eval_run

task5_dir = '../clef2024-checkthat-lab/task5'

sample_submission_file = task5_dir + '/submission_samples/KGAT_zeroShot_verification_English_dev.json'

nli_submission_file = 'temp-data/zeroshot-ver-from-tfidf.jsonl'
ground_truth_file = task5_dir + '/data/Arabic_dev.json'
out_file = 'temp-data/out.csv'

print('sample')
eval_run(sample_submission_file,ground_truth_file, out_file)

print('nli')
eval_run(nli_submission_file,ground_truth_file, out_file)

sample
Macro_F1 0.5081585081585082
Strict Macro_F1 0.5081585081585082
nli
Macro_F1 0.24111560953666214
Strict Macro_F1 0.20797720797720798
