## Mistral

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Device the encoded prompt is moved to. Fall back to the CPU so that `device`
# is always defined, even on a machine without a CUDA GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [2]:
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2

# TODO: quantization

In [10]:
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization enabled.
# The compute dtype is the dtype the matmuls run in and has to be a
# floating-point type (float16 / bfloat16 / float32), not an integer dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [2]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the 4-bit quantized model. The generation cell further down calls
# `model_4bit.generate(...)`, so this name has to exist on a fresh kernel.
# `quantization_config` comes from the BitsAndBytesConfig cell above.
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [11]:
# Un-quantized checkpoint and its tokenizer, both fetched from the same Hub
# repo (model first, tokenizer second).
model, tokenizer = (
    loader.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    for loader in (AutoModelForCausalLM, AutoTokenizer)
)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [3]:

# Example conversation (user / assistant / user) that is fed to the chat template.
messages = [
    {"role": role, "content": content}
    for role, content in (
        ("user", "What is your favourite condiment?"),
        ("assistant", "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"),
        ("user", "Do you have mayonnaise recipes?"),
    )
]

In [4]:
# Render the conversation with the tokenizer's chat template and get the
# result back as a PyTorch tensor (`return_tensors="pt"`).
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

In [5]:
# Move the encoded prompt to the selected device. The bare name on the next
# line is the last expression of the cell, so the notebook displays the tensor.
model_inputs = encodeds.to(device)
model_inputs
# model_4bit.to(device)

tensor([[    1,   733, 16289, 28793,  1824,   349,   574, 16020,  2076,  2487,
         28804,   733, 28748, 16289, 28793,  6824, 28725,   315, 28742, 28719,
          3448, 10473,   298,   264,  1179, 11322, 19961,   302,  6138, 23598,
         18342, 28723,   661, 13633,   776,   272,  1103,  3558,   302,   686,
         16944, 15637,   423,   298,  5681,   315, 28742, 28719, 13198,   582,
           297,   272,  6132, 28808,     2,   733, 16289, 28793,  2378,   368,
           506,   993,  7136,   864, 21116, 28804,   733, 28748, 16289, 28793]],
       device='cuda:0')

In [None]:
# Sample up to 1000 new tokens, decode the whole batch and print the first
# decoded sequence.
# NOTE(review): needs `model_4bit` (the 4-bit quantized model) to exist in the
# kernel — confirm the quantized model has been loaded before running this cell.
generated_ids = model_4bit.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

## OpenAI API

In [7]:
from utils import load_rumors_from_jsonl
import os

# Output directory name (not referenced again within this cell).
out_dir = './temp-data'

# Checkout of the CLEF task 5 repository and its data folder.
clef_path = '../clef2024-checkthat-lab/task5'
data_path = os.path.join(clef_path, 'data')

# Build both split paths in one go, then load them with the same helper.
filepath_train, filepath_dev = (
    os.path.join(data_path, file_name)
    for file_name in ('English_train.json', 'English_dev.json')
)
train_jsons, dev_jsons = (
    load_rumors_from_jsonl(path) for path in (filepath_train, filepath_dev)
)

print(f'loaded {len(train_jsons)} training json objects and {len(dev_jsons)} dev objects')

loaded 96 training json objects and 32 dev objects


### retrieval

In [51]:
import os
import numpy as np
from openai import OpenAI

from clef.embedding_utils import cosine_similarity

# OpenAI client; the API key is taken from the OPENAI_API_KEY environment
# variable (the argument could be omitted, as this is the library default).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def get_embedding(text, model='text-embedding-3-small'):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Value passed as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model. Defaults to the model this
            notebook has used so far, so existing calls behave as before.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

In [54]:
def retrieve_relevant_documents_openai(rumor_id, query, timeline, k=5):
    """Rank the tweets of a timeline by embedding similarity to a rumor.

    Args:
        rumor_id: Identifier of the rumor; copied into every returned row.
        query: Rumor text that is embedded and compared against each tweet.
        timeline: Sequence of ``[account, tweet_id, text]`` entries.
        k: Maximum number of tweets to return. ``k <= 0`` yields an empty list.

    Returns:
        A list of ``[rumor_id, tweet_id, rank, cosine_similarity]`` rows, most
        similar tweet first, with ``rank`` starting at 1.
    """
    print(rumor_id, query)

    # Guard before slicing: with k == 0 the slice ``[-k:]`` is ``[0:]`` and would
    # select *every* tweet; a negative k would slice from the wrong end.
    if k <= 0 or not timeline:
        return []

    # Generate embedding for the rumor
    rumor_embedding = get_embedding(query)

    # One embedding request per tweet, compared against the rumor embedding.
    similarities = [
        cosine_similarity(rumor_embedding, get_embedding(tweet[2]))
        for tweet in timeline
    ]

    # argsort is ascending: keep the last k indices, then reverse them so the
    # highest-scoring tweet comes first.
    top_indices = np.argsort(similarities)[-k:][::-1]

    ranked = []
    for rank, index in enumerate(top_indices, start=1):
        _account, tweet_id, text = timeline[index]
        cos_sim = similarities[index]
        ranked.append([rumor_id, tweet_id, rank, cos_sim])

        print('\t', [rumor_id, tweet_id, rank, cos_sim, text])

    return ranked
        

In [55]:
from tqdm.auto import tqdm

# Collect the ranked rows of every dev rumor in one flat list.
data = []
for item in tqdm(dev_jsons):
    rumor_id, query, timeline = item['id'], item['rumor'], item['timeline']
    data.extend(retrieve_relevant_documents_openai(rumor_id, query, timeline))

from utils import write_trec_format_output

# Hand the rows to the TREC writer together with the run tag 'OPENAI'.
out_path = 'temp-data/openai-trec-dev.txt'
write_trec_format_output(out_path, data, 'OPENAI')

  0%|          | 0/32 [00:00<?, ?it/s]

AuRED_142 Naturalization decree in preparation: Lebanese passports for sale?! https://t.co/UuQ7yMbSWJ https://t.co/Jf1K1NbZJD
	 ['AuRED_142', '1555424541509386240', 1, 0.5292499614547626, '“The Information Office of the Presidency of the Republic: What was published by the French newspaper “Liberation” about the “selling” of Lebanese passports to non-Lebanese is false and baseless news.']
	 ['AuRED_142', '1555986659279360001', 2, 0.38413405162678044, '“The Information Office of the Presidency of the Republic denies a false news broadcast by the MTV station about Baabda Palace preparing a decree naturalizing 4,000 people, and recalls that it had denied yesterday the false information published by the French magazine ‘Liberation’ about the same fabricated news. "']
	 ['AuRED_142', '1556600039211925504', 3, 0.3836070343328534, 'Today, the President of the Republic, General Michel Aoun, signed 9 laws that were previously approved by the House of Representatives. Details at the following li

In [56]:
from scoring_utils import eval_run_retrieval

task5_dir = '../clef2024-checkthat-lab/task5'
golden_labels_file = task5_dir + '/data/dev_qrels.txt'
out_file = 'temp-data/out.csv'

# Run files that are scored against the same qrels file.
sample_submission_file = task5_dir + '/submission_samples/KGAT_zeroShot_evidence_English_dev.txt'
lucene_submission_file = 'temp-data/lucene-trec-dev.txt'
tfidf_submission_file = 'temp-data/tfidf-trec-dev.txt'
terrier_submission_file = 'temp-data/terrier-trec-bm25-qe.txt'
sbert_submission_file = 'temp-data/sbert-trec-dev.txt'
openai_submission_file = 'temp-data/openai-trec-dev.txt'

# (label, separator, run file); the separator only lines up the printed columns.
retrieval_runs = [
    ('sample', '\t', sample_submission_file),
    ('lucence', '', lucene_submission_file),
    ('tfidf', '\t', tfidf_submission_file),
    ('terrier', '', terrier_submission_file),
    ('sbert', '\t', sbert_submission_file),
    ('openai', '\t', openai_submission_file),
]
for run_label, separator, run_file in retrieval_runs:
    print(run_label, separator, eval_run_retrieval(run_file, golden_labels_file))

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



sample 	 {'R@5': 0.6357894736842106, 'AP': 0.5612280701754385}
lucence  {'R@5': 0.6971929824561404, 'AP': 0.663766081871345}
tfidf 	 {'R@5': 0.7235087719298245, 'AP': 0.6301754385964913}
terrier  {'R@5': 0.6859649122807018, 'AP': 0.6412280701754386}
sbert 	 {'R@5': 0.7080701754385965, 'AP': 0.6363508771929824}
openai 	 {'R@5': 0.7452631578947368, 'AP': 0.6448421052631579}


## verification

In [None]:

# Ad-hoc check of the verification prompt on a single (premise, statement) pair.
# NOTE(review): `get_completion` is defined in a later cell of this notebook, so
# on a fresh kernel that cell has to be run before this one.
a = '“The Information Office of the Presidency of the Republic denies a false news broadcast by the MTV station about Baabda Palace preparing a decree naturalizing 4,000 people, and recalls that it had denied yesterday the false information published by the French magazine ‘Liberation’ about the same fabricated news. "'
b = 'Naturalization decree in preparation: Lebanese passports for sale?! https://t.co/UuQ7yMbSWJ https://t.co/Jf1K1NbZJD'

# Same premise/statement layout that check_statement_with_evidence_openai builds.
input_message_temp = f'The premise: "{a}"\nThe statement: "{b}"'

get_completion(input_message_temp).content

'REFUTAL'

In [4]:
from typing import NamedTuple


# Outcome of one premise/statement check: the predicted label and its score.
VerificationResult = NamedTuple(
    'VerificationResult',
    [('label', str), ('score', float)],
)

import os
from openai import OpenAI


# OpenAI client used by `get_completion` below.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"), # This is the default and can be omitted
)

# System prompt sent with every request. It names the three labels that
# `check_statement_with_evidence_openai` accepts as valid replies.
system_message = 'You are a helpful assistant. You need to decide if a premise either supports the statement ("SUPPORTS"), refutes the statement ("REFUTES") or if the premise is not related to the statement ("NOT ENOUGH INFO"). No yapping.'

def get_completion(input_message, model="gpt-4-turbo-preview", temperature=0.0):
    """Send one user message (plus the system prompt) to the chat completions API.

    Args:
        input_message: Text sent as the ``user`` message.
        model: Chat model name; defaults to the model used so far.
        temperature: Sampling temperature. Defaults to 0 because the reply is
            used as a class label and should be as repeatable as possible
            between runs.

    Returns:
        The ``message`` object of the first choice; callers read ``.content``.
    """
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": input_message},
        ],
    )

    return completion.choices[0].message
    
def check_statement_with_evidence_openai(statement: str, evidence: str) -> VerificationResult:
    """Ask the chat model whether ``evidence`` supports or refutes ``statement``.

    Args:
        statement: The claim to check (sent as "The statement").
        evidence: The evidence text (sent as "The premise").

    Returns:
        A ``VerificationResult`` whose label is one of "REFUTES",
        "NOT ENOUGH INFO" or "SUPPORTS". Any reply that cannot be mapped to one
        of these labels falls back to "NOT ENOUGH INFO". The score is always 1.0.
    """
    input_text = f'The premise: "{evidence}"\nThe statement: "{statement}"'

    raw_reply = get_completion(input_text).content

    valid_labels = (
        "REFUTES",
        "NOT ENOUGH INFO",
        "SUPPORTS",
    )

    # The reply is free text: tolerate differences in case, surrounding
    # quotes/punctuation and whitespace before comparing with the label set.
    # `raw_reply or ""` also covers a missing (None) message content.
    stripped = (raw_reply or "").strip().strip("\"'`.!*").upper()
    normalized = " ".join(stripped.split())

    label = normalized if normalized in valid_labels else "NOT ENOUGH INFO"
    # VerificationResult is a tuple subclass, so `label, score = ...` still works.
    return VerificationResult(label, 1.0)

In [5]:
from typing import List
from tqdm.auto import tqdm
from clef.utils import RankedDocs

def factcheck_using_evidence_openai(claim: str, evidence: List[RankedDocs], debug: bool = True, threshold: float = 0.1):
    """Predict a verdict for ``claim`` from per-tweet OpenAI judgements.

    Every non-empty evidence text is classified with
    ``check_statement_with_evidence_openai``. The returned confidence is
    multiplied by the retrieval score, negated for "REFUTES" and zeroed for
    "NOT ENOUGH INFO". The verdict is derived from the unweighted mean over
    all tweets whose label was not "NOT ENOUGH INFO".

    Args:
        claim: The rumor text to verify.
        evidence: Entries of ``(author_account, tweet_id, text, rank, score)``.
        debug: When true, progress lines are written with ``tqdm.write``.
        threshold: The mean confidence must exceed ``+threshold`` for
            "SUPPORTS" or fall below ``-threshold`` for "REFUTES"; anything in
            between is "NOT ENOUGH INFO". Defaults to the previous fixed 0.1.

    Returns:
        ``(predicted_label, predicted_evidence)`` where ``predicted_evidence``
        is a list of ``[author_account, tweet_id, text, signed_confidence]``.
    """
    predicted_evidence = []
    confidences = []
    if debug:
        tqdm.write(f'{claim}')

    for author_account, tweet_id, evidence_text, _rank, score in evidence:
        if not evidence_text:
            # Skip only this entry. Returning here would throw away the
            # judgements already collected for the other tweets of the claim.
            if debug:
                tqdm.write('[DEBUG] evidence string empty')
            continue

        label, confidence = check_statement_with_evidence_openai(claim, evidence_text)

        # CLEF CheckThat! task 5: score is [-1, +1] where
        #   -1 means evidence strongly refutes
        #   +1 means evidence strongly supports
        confidence = confidence * score  # scale by retrieval score

        if label == "REFUTES":
            # confidence is always positive, for REFUTES make confidence negative
            confidence *= -1
        elif label == "NOT ENOUGH INFO":
            confidence *= 0  # neutral evidence carries no signed weight

        predicted_evidence.append([
            author_account,
            tweet_id,
            evidence_text,
            confidence,
        ])

        # Neutral tweets are reported but excluded from the mean.
        if label != "NOT ENOUGH INFO":
            confidences.append(confidence)
        if debug:
            tqdm.write(f'\t{confidence} {evidence_text}')

    # mean confidence, no weighting; 0 when nothing was judged decisive
    meanconf = sum(confidences) / len(confidences) if confidences else 0

    if meanconf > threshold:
        pred_label = "SUPPORTS"
    elif meanconf < -threshold:
        pred_label = "REFUTES"
    else:
        pred_label = "NOT ENOUGH INFO"

    return pred_label, predicted_evidence

### RQ2: verification on the dataset's annotated `evidence` tweets (rank and score fixed to 1)

In [8]:
# individual step RQ2


# fake scores for RQ2
from copy import deepcopy


# Work on a deep copy and modify only the copy. Iterating over `dev_jsons` and
# writing the items back would alias the two lists and add the annotated
# evidence to `dev_jsons` as well.
rq2_jsons = deepcopy(dev_jsons)

for item in rq2_jsons:
    # Wrap every annotated evidence tweet in the 5-field layout the
    # verification loop unpacks, with a fixed rank and score of 1.
    item['retrieved_evidence'] = [
        [
            author_account,  # 'author_account'
            tweet_id,        # 'authority_tweet_id'
            tweet_text,      # 'doc_text'
            1,               # 'rank'
            1,               # 'score'
        ]
        for author_account, tweet_id, tweet_text in item['evidence']
    ]
# print(json.dumps(rq2_jsons[0], indent=2))

In [9]:
from utils import write_jsonlines_from_dicts, clean_tweet_aggressive
from tqdm.auto import tqdm

res_jsons = []

for item in tqdm(rq2_jsons[:]):
    rumor = item["rumor"]
    retrieved_evidence = item["retrieved_evidence"]

    # Text cleaning is currently switched off, so the entries are copied as-is.
    # The comprehension keeps its variables local: a module-level loop variable
    # named `id` would rebind the builtin `id` for the rest of the session.
    clean_evidence = [
        [link, tweet_id, evidence, rank, score]  # [link, tweet_id, clean_tweet_aggressive(evidence), rank, score]
        for link, tweet_id, evidence, rank, score in retrieved_evidence
    ]

    clean_rumor = rumor  # clean_tweet_aggressive(rumor)

    if retrieved_evidence:  # only run fact check if we actually have retrieved evidence
        pred_label, pred_evidence = factcheck_using_evidence_openai(clean_rumor, clean_evidence)

        tqdm.write(f'label: {item["label"]}')
        tqdm.write(f'predicted: {pred_label}')
        tqdm.write('')

        res_jsons.append(
            {
                "id": item["id"],
                "label": item["label"],
                "claim": clean_rumor,
                "predicted_label": pred_label,
                "predicted_evidence": pred_evidence,
            }
        )

outfile = "temp-data/zeroshot-ver-rq2-openai.jsonl"
write_jsonlines_from_dicts(outfile, res_jsons)

  0%|          | 0/32 [00:00<?, ?it/s]

Naturalization decree in preparation: Lebanese passports for sale?! https://t.co/UuQ7yMbSWJ https://t.co/Jf1K1NbZJD
	-1.0 The Information Office of the Presidency of the Republic denies a false news broadcast by the MTV station about Baabda Palace preparing a decree naturalizing 4 000 people and recalls that it had denied yesterday the false information published by the French magazine Liberation about the same fabricated news
	-1.0 The Information Office of the Presidency of the Republic What was published by the French newspaper Liberation about the selling of Lebanese passports to non-Lebanese is false and baseless news
label: REFUTES
predicted: REFUTES

In the video.. The spread of unidentified gunmen east of the capital, Baghdad https://t.co/L18KV8tKuZ
	-1.0 The security forces of all types and formations are the ones who hold the reins in all governorates of the country and there is no truth to what some social media sites have reported about the spread of unknown gunmen in vario

### RQ3: verification on retrieved evidence (taken from a TREC run file)

In [12]:
# for RQ3


# for adding scores from TREC file
from clef.utils import combine_rumors_with_trec_file_judgements
import os, json

# Two candidate run files. The second assignment overrides the first, so the
# notebook currently uses the run written by the OpenAI retrieval cell.
# organizers:
submission_path = os.path.join(clef_path, 'submission_samples', 'KGAT_zeroShot_evidence_English_dev.txt')

# own retrieval:
submission_path = './temp-data/openai-trec-dev.txt'

# NOTE(review): this rebinds `dev_jsons` to the helper's return value;
# presumably the helper attaches a `retrieved_evidence` list to every rumor
# (the next cell reads that key) — confirm against clef.utils.
dev_jsons = combine_rumors_with_trec_file_judgements(dev_jsons, submission_path)

# Show one combined item as a sanity check.
print(json.dumps(dev_jsons[2], indent=4))

{
    "id": "AuRED_132",
    "rumor": "*Riyad Salama to NBN, raising the value of the dollar through Resolution 158 from 8,000 LBP per dollar to 15,000 LBP, starting from the first of next month.* *Riyad Salama to NBN, we are raising the ceiling on banking withdrawals, starting from the first of next month, from $400 to $1,500 for individuals.*",
    "label": "REFUTES",
    "timeline": [
        [
            "https://twitter.com/nbntweets",
            "1591521645403029505",
            "Protests in #Germany demanding #Turkey to stop using chemical weapons in #Iraqi Kurdistan https://t.co/URM6stQ2sj https://t.co/8uxLDgH5yD"
        ],
        [
            "https://twitter.com/nbntweets",
            "1591519539355082752",
            "Urgent - The Pakistani Foreign Ministry announces the postponement of the visit of the Saudi Crown Prince to Islamabad"
        ],
        [
            "https://twitter.com/nbntweets",
            "1591517950657581057",
            "Reopening #Beddawi 

In [13]:
from utils import write_jsonlines_from_dicts, clean_tweet_aggressive
from tqdm.auto import tqdm

res_jsons = []

for item in tqdm(dev_jsons[:]):
    rumor = item["rumor"]
    retrieved_evidence = item["retrieved_evidence"]

    # Text cleaning is currently switched off, so the entries are copied as-is.
    # The comprehension keeps its variables local: a module-level loop variable
    # named `id` would rebind the builtin `id` for the rest of the session.
    clean_evidence = [
        [link, tweet_id, evidence, rank, score]  # [link, tweet_id, clean_tweet_aggressive(evidence), rank, score]
        for link, tweet_id, evidence, rank, score in retrieved_evidence
    ]

    clean_rumor = rumor  # clean_tweet_aggressive(rumor)

    if retrieved_evidence:  # only run fact check if we actually have retrieved evidence
        pred_label, pred_evidence = factcheck_using_evidence_openai(clean_rumor, clean_evidence)

        tqdm.write(f'label: {item["label"]}')
        tqdm.write(f'predicted: {pred_label}')
        tqdm.write('')

        res_jsons.append(
            {
                "id": item["id"],
                "label": item["label"],
                "claim": clean_rumor,
                "predicted_label": pred_label,
                "predicted_evidence": pred_evidence,
            }
        )

outfile = "temp-data/zeroshot-ver-rq3-openai.jsonl"
write_jsonlines_from_dicts(outfile, res_jsons)

  0%|          | 0/32 [00:00<?, ?it/s]

Naturalization decree in preparation: Lebanese passports for sale?! https://t.co/UuQ7yMbSWJ https://t.co/Jf1K1NbZJD
	0.0 Today, the President of the Republic, General Michel Aoun, signed 9 laws that were previously approved by the House of Representatives. Details at the following link: https://t.co/wmrSaaEwDu
	0.0 President Aoun received the Minister of Foreign Affairs and Expatriates, Dr. Abdullah Bouhabib, and the Minister of Social Affairs, Hector Hajjar, and discussed with them developments related to the file of displaced Syrians in Lebanon https://t.co/QLQAJSKzs1
	-0.38413405162678044 “The Information Office of the Presidency of the Republic denies a false news broadcast by the MTV station about Baabda Palace preparing a decree naturalizing 4,000 people, and recalls that it had denied yesterday the false information published by the French magazine ‘Liberation’ about the same fabricated news. "
	0.0 The President of the Republic awarded the Papal Ambassador the National Order of

### eval

In [14]:
from scoring_utils import eval_run

task5_dir = '../clef2024-checkthat-lab/task5'
ground_truth_file = task5_dir + '/data/English_dev.json'
out_file = 'temp-data/out.csv'

sample_submission_file = task5_dir + '/submission_samples/KGAT_zeroShot_verification_English_dev.json'

rq2_nli_submission_file = 'temp-data/zeroshot-ver-rq2.jsonl'
rq2_openai_submission_file = 'temp-data/zeroshot-ver-rq2-openai.jsonl'

rq3_nli_submission_file = 'temp-data/zeroshot-ver-rq3.jsonl'
rq3_openai_submission_file = 'temp-data/zeroshot-ver-rq3-openai.jsonl'

# Groups of (title, submission file); a blank line is printed between groups.
report_groups = [
    [('baseline from the authors', sample_submission_file)],
    [('RQ2-nli', rq2_nli_submission_file), ('RQ2-openai', rq2_openai_submission_file)],
    [('RQ3-nli', rq3_nli_submission_file), ('RQ3-openai', rq3_openai_submission_file)],
]
for group_index, group in enumerate(report_groups):
    if group_index:
        print()
    for title, submission_file in group:
        print(title)
        eval_run(submission_file, ground_truth_file, out_file)

baseline from the authors
Macro_F1 0.5081585081585082
Strict Macro_F1 0.5081585081585082

RQ2-nli
Macro_F1 0.7919254658385093
Strict Macro_F1 0.7919254658385093
RQ2-openai
Macro_F1 0.9090909090909091
Strict Macro_F1 0.9090909090909091

RQ3-nli
Macro_F1 0.2857142857142857
Strict Macro_F1 0.2654320987654321
RQ3-openai
Macro_F1 0.8700649675162418
Strict Macro_F1 0.8700649675162418
