In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Device the model inputs are moved onto. Fall back to the CPU so that
# `device` is always defined, even on a machine without a usable CUDA GPU
# (previously the name was only bound inside the `if`, so later cells raised
# a NameError on CPU-only hosts).
device = "cuda" if torch.cuda.is_available() else "cpu"


In [2]:
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2

# TODO: quantization

In [10]:
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization enabled.
# The compute dtype is the dtype the de-quantized weights are multiplied in,
# so it has to be a floating-point type; the integer dtype `torch.int` that
# was configured here cannot be used for those matmuls.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [2]:
# Hugging Face Hub id of the checkpoint this notebook works with.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit loading is currently disabled; kept here for reference:
# model_4bit = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     device_map="auto",
#     quantization_config=quantization_config,
# )

# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_id,
)

In [11]:
# Load the (non-quantized) checkpoint first, then its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",
)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [3]:

# Example conversation: alternating user / assistant turns that ends on a
# user turn, so the model is asked to produce the next assistant reply.
messages = [
    dict(
        role="user",
        content="What is your favourite condiment?",
    ),
    dict(
        role="assistant",
        content="Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!",
    ),
    dict(
        role="user",
        content="Do you have mayonnaise recipes?",
    ),
]

In [4]:
# Run the conversation through the tokenizer's chat template and get the
# result back as a PyTorch tensor (return_tensors="pt").
encodeds = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
)

In [5]:
# Move the encoded prompt onto the selected device; the bare name on the next
# line makes the tensor the cell's displayed output.
model_inputs = encodeds.to(device=device)
model_inputs
# model_4bit.to(device)

tensor([[    1,   733, 16289, 28793,  1824,   349,   574, 16020,  2076,  2487,
         28804,   733, 28748, 16289, 28793,  6824, 28725,   315, 28742, 28719,
          3448, 10473,   298,   264,  1179, 11322, 19961,   302,  6138, 23598,
         18342, 28723,   661, 13633,   776,   272,  1103,  3558,   302,   686,
         16944, 15637,   423,   298,  5681,   315, 28742, 28719, 13198,   582,
           297,   272,  6132, 28808,     2,   733, 16289, 28793,  2378,   368,
           506,   993,  7136,   864, 21116, 28804,   733, 28748, 16289, 28793]],
       device='cuda:0')

In [None]:
# Sample a reply for the encoded conversation.
# `model_4bit` is never created (its loading code is commented out), so using
# it here raised a NameError; generate with `model` instead.
# The inputs are moved to the model's own device so that both tensors and
# weights are guaranteed to live on the same device.
generated_ids = model.generate(
    model_inputs.to(model.device),
    max_new_tokens=1000,
    do_sample=True,
)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

## SBERT

In [12]:
from utils import load_rumors_from_jsonl
import os

# Directory for intermediate files.
out_dir = './temp-data'

# Task directory (relative to the notebook) and the data folder inside it.
clef_path = '../clef2024-checkthat-lab/task5'
data_path = os.path.join(clef_path, 'data')

# Paths of the English train and dev files.
filepath_train, filepath_dev = (
    os.path.join(data_path, split_file)
    for split_file in ('English_train.json', 'English_dev.json')
)

# Load train first, then dev, with the project loader.
train_jsons, dev_jsons = (
    load_rumors_from_jsonl(split_path)
    for split_path in (filepath_train, filepath_dev)
)

print(f'loaded {len(train_jsons)} training json objects and {len(dev_jsons)} dev objects')

loaded 96 training json objects and 32 dev objects


In [54]:
from clef.utils import clean_tweet

def _clean_tweet_rows(rows):
    """Clean the text of every (account_url, tweet_id, text) row.

    Rows whose cleaned text is falsy are dropped; the remaining rows are
    returned, in their original order, as [account_url, tweet_id, cleaned_text].
    """
    kept = []
    for account_url, tweet_id, text in rows:
        cleaned_text = clean_tweet(text)
        if not cleaned_text:
            continue
        kept.append([account_url, tweet_id, cleaned_text])
    return kept


def _clean_entry(entry):
    """Return a copy of one rumor entry with rumor, timeline and evidence cleaned."""
    # Timeline and evidence are processed before the dict is assembled.
    timeline_rows = _clean_tweet_rows(entry['timeline'])
    evidence_rows = _clean_tweet_rows(entry['evidence'])
    return {
        'id': entry['id'],
        'rumor': clean_tweet(entry['rumor']),
        'label': entry['label'],
        'timeline': timeline_rows,
        'evidence': evidence_rows,
    }


# Cleaned version of every training entry, in the original order.
data_cleaned_train = [_clean_entry(entry) for entry in train_jsons]

# data_cleaned_train

In [60]:
from clef.utils import clean_tweet

# Cleaned version of every dev entry: tweet texts are passed through
# clean_tweet and rows whose cleaned text is falsy are dropped.
data_cleaned_dev = []

for entry in dev_jsons:

    # The inner generator cleans each tweet exactly once; the outer
    # comprehension keeps only rows that still have text afterwards.
    tl_clean = [
        [account_url, tweet_id, cleaned]
        for account_url, tweet_id, cleaned in (
            (url, tid, clean_tweet(text)) for url, tid, text in entry['timeline']
        )
        if cleaned
    ]

    ev_clean = [
        [account_url, tweet_id, cleaned]
        for account_url, tweet_id, cleaned in (
            (url, tid, clean_tweet(text)) for url, tid, text in entry['evidence']
        )
        if cleaned
    ]

    data_cleaned_dev.append({
        'id': entry['id'],
        'rumor': clean_tweet(entry['rumor']),
        'label': entry['label'],
        'timeline': tl_clean,
        'evidence': ev_clean,
    })

# data_cleaned_dev[0]

In [57]:
from sentence_transformers import SentenceTransformer, util
import torch

# Sentence-embedding model used by retrieve_relevant_documents_sbert.
embedder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

def retrieve_relevant_documents_sbert(rumor_id, query, timeline, top_k=5):
    """Rank timeline tweets by embedding cosine similarity to a rumor.

    Args:
        rumor_id: Identifier of the rumor; copied into every result row.
        query: Text of the rumor to search with.
        timeline: Sequence of rows in which index 1 holds the tweet id and
            index 2 holds the tweet text.
        top_k: Maximum number of tweets to return. Defaults to 5; fewer rows
            are returned when the timeline is shorter than that.

    Returns:
        A list of ``[rumor_id, tweet_id, rank, score]`` rows ordered best match
        first, where ``rank`` starts at 1 and ``score`` is the cosine
        similarity as a Python float. The list is empty when there is nothing
        to rank.
    """
    # Nothing to rank (e.g. every timeline tweet was dropped during cleaning,
    # or no results were requested): bail out before encoding an empty corpus.
    if not timeline or top_k <= 0:
        return []

    corpus = [row[2] for row in timeline]
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every tweet, then keep the best
    # `top_k` of them (capped at the corpus size, as torch.topk requires).
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=min(top_k, len(corpus)))

    # Convert to plain Python numbers once so the timeline is indexed with an
    # int and the scores are ordinary floats.
    scores = top_results.values.tolist()
    indices = top_results.indices.tolist()

    docs = []
    for rank, (score, idx) in enumerate(zip(scores, indices), start=1):
        tweet_id = timeline[idx][1]
        docs.append([rumor_id, tweet_id, rank, score])

    return docs

In [58]:
from utils import write_trec_format_output

# Gather the ranking rows for every cleaned dev rumor into one flat list.
data = []
for item in data_cleaned_dev:
    query, timeline = item['rumor'], item['timeline']
    data.extend(retrieve_relevant_documents_sbert(item['id'], query, timeline))

# Hand the rows to the project's TREC writer.
out_path = 'temp-data/sbert-trec-dev.txt'
write_trec_format_output(out_path, data, 'SBERT')

In [59]:
import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

# Call pt.init() only when pt.started() reports that PyTerrier has not been
# started yet, so re-running this cell does not initialise it a second time.
if not pt.started():
    pt.init()

def evaluate_run(pred_path, golden_path):
    """Score a TREC-format run file against a qrels file.

    Args:
        pred_path: Path of the run (results) file in TREC format.
        golden_path: Path of the qrels file with the gold labels.

    Returns:
        The result of ``Evaluate`` for the R@5 and MAP metrics, aggregated
        over all queries (``perquery=False``).
    """
    golden = ptio.read_qrels(golden_path)
    # Use the public reader rather than the private `_read_results_trec`
    # helper, which is an implementation detail of pyterrier.io.
    pred = ptio.read_results(pred_path, format='trec')
    return ptpipelines.Evaluate(pred, golden, metrics=[R@5, MAP], perquery=False)

# Task directory; the gold labels and the sample run live below it.
task5_dir = '../clef2024-checkthat-lab/task5'
golden_labels_file = f'{task5_dir}/data/dev_qrels.txt'
sample_submission_file = f'{task5_dir}/submission_samples/KGAT_zeroShot_evidence_English_dev.txt'

# Run files to compare, one per retrieval method.
(
    lucene_submission_file,
    tfidf_submission_file,
    terrier_submission_file,
    sbert_submission_file,
) = (
    'temp-data/lucene-trec.txt',
    'temp-data/tfidf-trec.txt',
    'temp-data/terrier-trec.txt',
    'temp-data/sbert-trec-dev.txt',
)

out_file = 'temp-data/out.csv'

# Score every run against the same gold labels, one printed line per run.
# The middle print argument ('\t' or '') is passed through unchanged; it
# appears to be there only to pad the shorter labels.
for label, pad, run_file in (
    ('sample', '\t', sample_submission_file),
    ('lucence', '', lucene_submission_file),
    ('tfidf', '\t', tfidf_submission_file),
    ('terrier', '', terrier_submission_file),
    ('sbert', '\t', sbert_submission_file),
):
    print(label, pad, evaluate_run(run_file, golden_labels_file))

sample 	 {'R@5': 0.6357894736842106, 'AP': 0.5612280701754385}
lucence  {'R@5': 0.0, 'AP': 0.0}
tfidf 	 {'R@5': 0.7235087719298245, 'AP': 0.6301754385964913}
terrier  {'R@5': 0.05263157894736842, 'AP': 0.05263157894736842}
sbert 	 {'R@5': 0.7080701754385965, 'AP': 0.6363508771929824}
