## data loading

In [2]:
from utils import load_rumors_from_jsonl
import os

out_dir = './temp-data'

clef_path = '../clef2024-checkthat-lab/task5'
data_path = os.path.join(clef_path, 'data')

filepath_train = os.path.join(data_path, 'English_train.json')
filepath_dev = os.path.join(data_path, 'English_dev.json')

train_jsons = load_rumors_from_jsonl(filepath_train)
dev_jsons = load_rumors_from_jsonl(filepath_dev)

print(f'loaded {len(train_jsons)} training json objects and {len(dev_jsons)} dev objects')

loaded 96 training json objects and 32 dev objects


In [3]:
from clef.utils import clean_tweet

data_cleaned_train = []

for entry in train_jsons:
    
    tl_clean = []
    for account_url, tl_tweet_id, tl_tweet in entry['timeline']:
        tl_tweet_cleaned = clean_tweet(tl_tweet)
        if tl_tweet_cleaned:
            tl_clean += [[account_url, tl_tweet_id, tl_tweet_cleaned]]

    ev_clean = []
    for account_url, ev_tweet_id, ev_tweet in entry['evidence']:
        ev_tweet_cleaned = clean_tweet(ev_tweet)
        if ev_tweet_cleaned:
            ev_clean += [[account_url, ev_tweet_id, ev_tweet_cleaned]]

    data_cleaned_train += [{
        'id': entry['id'],
        'rumor': clean_tweet(entry['rumor']),
        'label': entry['label'],
        'timeline': tl_clean,
        'evidence': ev_clean,
    }]

# data_cleaned_train

In [4]:
from clef.utils import clean_tweet

data_cleaned_dev = []

for entry in dev_jsons:
    
    tl_clean = []
    for account_url, tl_tweet_id, tl_tweet in entry['timeline']:
        tl_tweet_cleaned = clean_tweet(tl_tweet)
        if tl_tweet_cleaned:
            tl_clean += [[account_url, tl_tweet_id, tl_tweet_cleaned]]

    ev_clean = []
    for account_url, ev_tweet_id, ev_tweet in entry['evidence']:
        ev_tweet_cleaned = clean_tweet(ev_tweet)
        if ev_tweet_cleaned:
            ev_clean += [[account_url, ev_tweet_id, ev_tweet_cleaned]]

    data_cleaned_dev += [{
        'id': entry['id'],
        'rumor': clean_tweet(entry['rumor']),
        'label': entry['label'],
        'timeline': tl_clean,
        'evidence': ev_clean,
    }]

# data_cleaned_dev

## pyserini

In [3]:
from pyserini.search.lucene import LuceneSearcher
import os
import json
import subprocess

# if you get the error "NameError: name '_C' is not defined" --> restart the Jupyter Kernel

def searchPyserini(query,
                   timeline,
                   k = 5,
                   temp_dir = 'temp-data-dir',
                   index = 'temp-data-dir/index_timeline_dynamic'):
    
    # ensure "working directory" exists (where we store intermediate data like the dynamic index that will be quered later)
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    # set up "dynamic" (= temporary) index using timeline data
    dynamic_idx_filename = 'eng-train-dynamic.jsonl'
    with open(os.path.join(temp_dir, dynamic_idx_filename), mode='w', encoding='utf8') as file:
        for tweet in timeline:
            id = tweet[1]
            text = tweet[2]
            file.write(json.dumps({'id': id, 'contents': text}) + '\n')
    
    # ensure index directory exists and is empty
    if os.path.exists(index):
        for filename in os.listdir(index):
            if os.path.isfile(os.path.join(index, filename)):
                os.remove(os.path.join(index, filename))
    else:
        os.mkdir(index)

    # set up pyserini command since python embeddable is not out yet
    nthreads = 1
    command = f'python -m pyserini.index.lucene ' \
    f'-input {temp_dir} ' \
    f'-collection JsonCollection ' \
    f'-generator DefaultLuceneDocumentGenerator ' \
    f'-index {index} ' \
    f'-threads {nthreads} ' \
    f'-storePositions ' \
    f'-storeDocvectors ' \
    f'-storeRaw ' \
    f'-language en'

    result = subprocess.run(command, capture_output=True)

    # load searcher from index directoy
    searcher = LuceneSearcher(index)
    hits = searcher.search(query)

    ranked_tuples = []

    for hit in hits:
        doc = searcher.doc(hit.docid)
        json_doc = json.loads(doc.raw())

        ranked_tuples += [(hit.docid, hit.score, json_doc["contents"])]

        # wrap(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}\n{json_doc["contents"]}')

    return ranked_tuples

In [5]:
# for testing...
test_rumor = data_cleaned_dev[2]
test_rumor = data_cleaned_dev[2]
query = test_rumor['rumor']
timeline = test_rumor['timeline']

ranked_docs = searchPyserini(query, timeline)
display(ranked_docs)

# simple spot check
for evidence in test_rumor['evidence']:
    print(f'{"WAS FOUND" if evidence[1] in [x[0] for x in ranked_docs] else "NOT FOUND"}\t{evidence[1]} {evidence[2]}')

[('1590400068208988160',
  23.428499221801758,
  'After circulating news that the Governor of the Bank of Lebanon Riad Salameh had announced to NBN about raising the value of the dollar and raising the ceiling on banking withdrawals the NBN channel denies the validity of this information that is being circulated citing the channel and confirms that there is no truth to it on this subject'),
 ('1591489851106668544',
  15.630800247192383,
  'Raising the exchange rate of the customs dollar the TVA and withdrawals from banks What are its repercussions and results Is there economic and financial stability in light of the current political chaos Report Rasha Al-Zein Hashem'),
 ('1589654877890019331',
  11.791999816894531,
  'The exchange rate of the dollar rose on the black market as it touched the threshold of 39 000 liras recording 38 900 liras per dollar'),
 ('1589949764107665409',
  9.603899955749512,
  "Turkish Minister of Energy Turkey's purchases of natural gas from Russia have begun 

WAS FOUND	1590400068208988160 After circulating news that the Governor of the Bank of Lebanon Riad Salameh had announced to NBN about raising the value of the dollar and raising the ceiling on banking withdrawals the NBN channel denies the validity of this information that is being circulated citing the channel and confirms that there is no truth to it on this subject
NOT FOUND	1590364198462435329 There is no truth to the information being circulated quoted by the NBN channel regarding a statement by the Governor of the Central Bank regarding banking circulars


In [12]:
from tqdm import tqdm

data = []

for r in tqdm(data_cleaned_dev):
    rumor_id = r['id']
    query = r['rumor']
    timeline = r['timeline']

    ranked_docs = searchPyserini(query, timeline)

    for rank, (authority_tweet_id, score, doc_text) in enumerate(ranked_docs[:5]):
        data += [(rumor_id, authority_tweet_id, rank+1, score)]

from utils import write_trec_format_output

out_path = 'temp-data/lucene-trec-dev.txt'
write_trec_format_output(out_path, data, 'LUCENE')

# display(data)

100%|██████████| 32/32 [01:12<00:00,  2.27s/it]


In [13]:
from tqdm import tqdm

data = []

for r in tqdm(data_cleaned_train):
    rumor_id = r['id']
    query = r['rumor']
    timeline = r['timeline']

    ranked_docs = searchPyserini(query, timeline)

    for rank, (authority_tweet_id, score, doc_text) in enumerate(ranked_docs[:5]):
        data += [(rumor_id, authority_tweet_id, rank+1, score)]

from utils import write_trec_format_output

out_path = 'temp-data/lucene-trec-train.txt'
write_trec_format_output(out_path, data, 'LUCENE')

# display(data)

100%|██████████| 96/96 [03:36<00:00,  2.25s/it]


## naive tfidf

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_relevant_documents(query, timeline):
    # Get only doc texts
    documents = [t[2] for t in timeline]
    tweet_ids = [t[1] for t in timeline]

    # Combine query and documents for TF-IDF vectorization
    combined_texts = [query] + documents
    print(combined_texts)
    # Generate TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(combined_texts)

    # Calculate similarity of the query to each document
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    print(similarity_scores)
    
    # Rank documents based on similarity scores
    ranked_doc_indices = similarity_scores.argsort()[0][::-1]

    # Sort the documents according to rank
    ranked_documents = [documents[i] for i in ranked_doc_indices]
    ranked_scores = [similarity_scores[0][i] for i in ranked_doc_indices]
    ranked_ids = [tweet_ids[i] for i in ranked_doc_indices]

    # Create a list of tuples of shape (doc, score)
    ranked_tuples = (list(zip(ranked_ids, ranked_scores, ranked_documents)))
    
    return ranked_tuples

In [8]:
from tqdm import tqdm

data = []

for r in tqdm(data_cleaned_dev):
    rumor_id = r['id']
    query = r['rumor']
    timeline = r['timeline']

    # for t in timeline:
    #     print('\t', t)
    
    ranked_docs = retrieve_relevant_documents(query, timeline)
    
    # try:
    # except IndexError:
        # print(query)
        # for t in timeline:
        #     print('\t', t)
        # pass
    for rank, (authority_tweet_id, score, doc_text) in enumerate(ranked_docs[:5]):
        data += [(rumor_id, authority_tweet_id, rank+1, score)]

from utils import write_trec_format_output

out_path = 'temp-data/tfidf-trec.txt'
write_trec_format_output(out_path, data, 'TFIDF-BASIC')

# display(data)

100%|██████████| 32/32 [00:00<00:00, 134.57it/s]


## terrier

In [47]:
import pandas as pd

def jsons_to_pandas(jsons):
    data = []
    for entry in jsons:
        rumor_id = entry['id']
        query = entry['rumor']
        timeline = entry['timeline']

        for author, tw_id, tw in timeline:
            data += [
                [rumor_id, "".join([x if x.isalnum() else " " for x in query]), tw_id, tw]
            ]

    df = pd.DataFrame(data,
                      columns=["qid", "query", "docno", "text"],)
    return df

df = jsons_to_pandas(data_cleaned_dev)

In [62]:
import pandas as pd
import pyterrier as pt

from pyterrier.batchretrieve import TextScorer

if not pt.started():
    pt.init()
    
textscorer = TextScorer(takes="docs", returns="queries", body_attr="text", wmodel="BM25", controls={"qe":"on", "qemodel":"Bo1"})
rtr = textscorer.transform(df)
rtr



Unnamed: 0,qid,docid,docno,rank,score,query
0,AuRED_142,8,1555424541509386240,0,27.443717,Naturalization decree in preparation Lebanese ...
1,AuRED_142,3,1555986659279360001,1,24.391958,Naturalization decree in preparation Lebanese ...
2,AuRED_142,4,1555495801962614786,2,8.471113,Naturalization decree in preparation Lebanese ...
3,AuRED_142,2,1556558220533157890,3,6.758340,Naturalization decree in preparation Lebanese ...
4,AuRED_142,13,1554743913197477888,4,6.454263,Naturalization decree in preparation Lebanese ...
...,...,...,...,...,...,...
4569,AuRED_003,4562,1224611427606040577,47,2.903710,Under the directives of Haitham bin Tariq Al S...
4570,AuRED_003,4525,1225255537669017603,48,2.665409,Under the directives of Haitham bin Tariq Al S...
4571,AuRED_003,4568,1226564900425785344,49,1.213530,Under the directives of Haitham bin Tariq Al S...
4572,AuRED_003,4538,1226469326766723072,50,0.000000,Under the directives of Haitham bin Tariq Al S...


In [63]:
import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

ptio._write_results_trec( rtr.query('rank < 5'), 'temp-data/terrier-trec-bm25-qe.txt')
d = ptio._read_results_trec('temp-data/terrier-trec-bm25-qe.txt')


task5_dir = '../clef2024-checkthat-lab/task5'
golden_labels_file = task5_dir + '/data/dev_qrels.txt'

golden = ptio.read_qrels(golden_labels_file)
eval= ptpipelines.Evaluate(d, golden , metrics = [R@5,MAP],perquery=False)
eval

{'R@5': 0.6859649122807018, 'AP': 0.6412280701754386}

In [59]:
import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

ptio._write_results_trec( rtr.query('rank < 5'), 'temp-data/terrier-trec-bm25-qe.txt')
d = ptio._read_results_trec('temp-data/terrier-trec-bm25-qe.txt')


task5_dir = '../clef2024-checkthat-lab/task5'
golden_labels_file = task5_dir + '/data/dev_qrels.txt'

golden = ptio.read_qrels(golden_labels_file)
eval= ptpipelines.Evaluate(d, golden , metrics = [R@5,MAP],perquery=False)
eval

{'R@5': 0.7189473684210527, 'AP': 0.6810818713450292}

In [60]:
import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

# ptio._write_results_trec( rtr.query('rank < 5'), 'temp-data/terrier-trec-c.txt')
d = ptio._read_results_trec('temp-data/terrier-trec-c.txt')


task5_dir = '../clef2024-checkthat-lab/task5'
golden_labels_file = task5_dir + '/data/dev_qrels.txt'

golden = ptio.read_qrels(golden_labels_file)
eval= ptpipelines.Evaluate(d, golden , metrics = [R@5,MAP],perquery=False)
eval

{'R@5': 0.7189473684210527, 'AP': 0.6806608187134503}

In [42]:
def write_trec_format_output_from_pandas(filename: str, data, tag: str) -> None:
    """
    Writes data to a file in the TREC format.

    Parameters:
    - filename (str): The name of the file to write to.
    - data (List[Tuple[str, int, int, float]]): A list of tuples, where each tuple contains:
        - rumor_id (str): The unique ID for the given rumor.
        - authority_tweet_id (int): The unique ID for the authority tweet.
        - rank (int): The rank of the authority tweet ID for that given rumor_id.
        - score (float): The score given by the model for the authority tweet ID.
    - tag (str): The string identifier of the team/model.
    """
    with open(filename, 'w') as file:
        for row in range(len(data)):
            i = row%5
            line = f"{data.at[i, 'qid']}\tQ0\t{data.at[i, 'docno']}\t{data.at[i, 'rank']}\t{data.at[i, 'score']}\t{tag}\n"
            file.write(line)

write_trec_format_output_from_pandas('temp-data/terrier-trec.txt', rtr.query('rank < 5'), 'TERRIER-BM25')

In [44]:
import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

task5_dir = '../clef2024-checkthat-lab/task5'
golden_labels_file = task5_dir + '/data/dev_qrels.txt'

golden = ptio.read_qrels(golden_labels_file)
eval= ptpipelines.Evaluate(rtr.query('rank < 5'), golden , metrics = [R@5,MAP],perquery=False)
eval

{'R@5': 0.7189473684210527, 'AP': 0.6806608187134503}

In [32]:
d = pd.merge(rtr, df[['docno', 'text']], on='docno', how='left')
d[d['text'].str.contains("Please note that food items unfit for human consumption are destroyed after they are confiscated")]

Unnamed: 0,qid,docid,docno,rank,score,query,text
820,AuRED_099,749,1233784722238705670,149,0.0,Qatar threw Iranian peas into garbage for fear...,Hello my dear brother thank you for your obser...


## eval

In [43]:
import pyterrier as pt
import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import R, MAP    

if not pt.started():
    pt.init()

def evaluate_run(pred_path,golden_path):
    golden = ptio.read_qrels(golden_path)
    pred= ptio._read_results_trec(pred_path)
    eval= ptpipelines.Evaluate(pred, golden , metrics = [R@5,MAP],perquery=False)
    return eval

task5_dir = '../clef2024-checkthat-lab/task5'
sample_submission_file = task5_dir + '/submission_samples/KGAT_zeroShot_evidence_English_dev.txt'
lucene_submission_file = 'temp-data/lucene-trec.txt'
tfidf_submission_file = 'temp-data/tfidf-trec.txt'
terrier_submission_file = 'temp-data/terrier-trec.txt'
golden_labels_file = task5_dir + '/data/dev_qrels.txt'
out_file = 'temp-data/out.csv'

print('sample', '\t', evaluate_run(sample_submission_file,golden_labels_file))
print('lucence', '', evaluate_run(lucene_submission_file,golden_labels_file))
print('tfidf', '\t', evaluate_run(tfidf_submission_file,golden_labels_file))
print('terrier', '', evaluate_run(terrier_submission_file,golden_labels_file))

sample 	 {'R@5': 0.6357894736842106, 'AP': 0.5612280701754385}
lucence  {'R@5': 0.0, 'AP': 0.0}
tfidf 	 {'R@5': 0.7235087719298245, 'AP': 0.6301754385964913}
terrier  {'R@5': 0.05263157894736842, 'AP': 0.05263157894736842}
