## data loading

In [1]:
from utils import load_rumors_from_jsonl
import os

# Scratch directory name (assigned here, not used further in this cell).
out_dir = './temp-data'

# Root of the task-5 checkout and its data sub-folder.
clef_path = '../clef2024-checkthat-lab/task5'
data_path = os.path.join(clef_path, 'data')

# Paths of the English train and dev files inside the data folder.
filepath_train, filepath_dev = (
    os.path.join(data_path, split_file)
    for split_file in ('English_train.json', 'English_dev.json')
)

# Load both splits with the project helper; the results are kept as
# `train_jsons` / `dev_jsons` for the cells below.
train_jsons, dev_jsons = (
    load_rumors_from_jsonl(split_path)
    for split_path in (filepath_train, filepath_dev)
)

print('loaded {} training json objects and {} dev objects'.format(len(train_jsons), len(dev_jsons)))

loaded 96 training json objects and 32 dev objects


In [2]:
# # combine all training data, maybe to use for a global index?
# # but we need to do "zero-shot" retrieval...
# all_timeline_tweets = {}
# duplicates = 0

# for d in train_jsons:
#     for author_url, id, text in d['timeline']:
#         if id not in all_timeline_tweets:
#             all_timeline_tweets[id] = {'author_url': author_url, 'text': text }
#         else:
#             duplicates += 1

# print(len(all_timeline_tweets.keys()))
# print(duplicates)

# import json

# with open('temp-data/eng-train.jsonl', mode='w') as file:
#     file.write('')

# with open('temp-data/eng-train.jsonl', mode='a', encoding='utf8') as file:
#     for id in all_timeline_tweets:
#         o = {'id': id, 'contents': all_timeline_tweets[id]['text']}
#         file.write(json.dumps(o) + '\n')

## pyserini

In [3]:
from pyserini.search.lucene import LuceneSearcher
import os
import json
import subprocess

# if you get the error "NameError: name '_C' is not defined" --> restart the Jupyter Kernel

def searchPyserini(query,
                   timeline,
                   k = 5,
                   temp_dir = 'temp-data-dir',
                   index = 'temp-data-dir/index_timeline_dynamic'):
    """Build a throw-away Lucene index over ``timeline`` and search it for ``query``.

    The timeline is dumped to a JSONL file inside ``temp_dir``, indexed by
    running ``python -m pyserini.index.lucene`` in a subprocess, and the
    resulting index is queried with ``LuceneSearcher``.

    Parameters
    ----------
    query : str
        Query text handed to ``LuceneSearcher.search``.
    timeline : iterable
        Items to index. Each item is read positionally: element ``[1]`` becomes
        the document ``id`` and element ``[2]`` the document ``contents``.
    k : int
        Maximum number of hits to request from the searcher.
    temp_dir : str
        Working directory. The JSONL dump is written here and the directory is
        passed to the indexer as ``-input``.
    index : str
        Directory in which the index is rebuilt on every call; regular files
        already in it are removed first.

    Returns
    -------
    list of tuple
        ``(docid, score, contents)`` tuples in the order returned by the searcher.

    Raises
    ------
    RuntimeError
        If the indexing subprocess exits with a non-zero status.
    """
    import sys  # local import: only needed to locate the running interpreter

    # ensure "working directory" exists (where we store intermediate data like
    # the dynamic index that will be queried later); also works for nested paths
    os.makedirs(temp_dir, exist_ok=True)

    # set up "dynamic" (= temporary) index input using timeline data
    dynamic_idx_filename = 'eng-train-dynamic.jsonl'
    with open(os.path.join(temp_dir, dynamic_idx_filename), mode='w', encoding='utf8') as file:
        for tweet in timeline:
            file.write(json.dumps({'id': tweet[1], 'contents': tweet[2]}) + '\n')

    # ensure index directory exists and contains no files from a previous call
    os.makedirs(index, exist_ok=True)
    for filename in os.listdir(index):
        stale_path = os.path.join(index, filename)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # Run the pyserini indexer as a subprocess. The command is an argument
    # list (not one string) so it is executed the same way on every platform
    # and paths containing spaces are passed through intact; sys.executable
    # selects the interpreter this kernel is running on.
    # NOTE(review): the whole temp_dir is given as -input, so presumably any
    # other collection file left in that directory is indexed too -- verify.
    nthreads = 1
    command = [
        sys.executable, '-m', 'pyserini.index.lucene',
        '-input', temp_dir,
        '-collection', 'JsonCollection',
        '-generator', 'DefaultLuceneDocumentGenerator',
        '-index', index,
        '-threads', str(nthreads),
        '-storePositions',
        '-storeDocvectors',
        '-storeRaw',
        '-language', 'en',
    ]

    result = subprocess.run(command, capture_output=True)
    if result.returncode != 0:
        # Fail here with the indexer's own message instead of letting the
        # searcher trip over a missing or empty index further down.
        stderr_text = result.stderr.decode(errors='replace')
        raise RuntimeError(
            f'pyserini indexing failed with exit code {result.returncode}:\n{stderr_text}'
        )

    # load searcher from index directory and request at most k hits
    searcher = LuceneSearcher(index)
    try:
        hits = searcher.search(query, k=k)

        ranked_tuples = []
        for hit in hits:
            json_doc = json.loads(searcher.doc(hit.docid).raw())
            ranked_tuples.append((hit.docid, hit.score, json_doc["contents"]))
    finally:
        # release the index reader so the files can be replaced by the next call
        searcher.close()

    return ranked_tuples

In [4]:
# Take one dev rumor and search its own timeline with the rumor text.
test_rumor = dev_jsons[2]
query, timeline = test_rumor['rumor'], test_rumor['timeline']

ranked_docs = searchPyserini(query, timeline)
display(ranked_docs)

# simple spot check: for every evidence entry, report whether its id
# (element [1]) is among the retrieved doc ids (element [0] of each result)
retrieved_ids = [x[0] for x in ranked_docs]
for evidence in test_rumor['evidence']:
    status = "WAS FOUND" if evidence[1] in retrieved_ids else "NOT FOUND"
    print(f'{status}\t{evidence[1]} {evidence[2]}')

[('1590400068208988160',
  23.554899215698242,
  'After circulating news that the Governor of the Bank of Lebanon, Riad Salameh, had announced to NBN about raising the value of the dollar and raising the ceiling on banking withdrawals, the NBN channel denies the validity of this information that is being circulated, citing the channel, and confirms that there is no truth to it on this subject. https://t.co/oIF6QW9rTB https://t.co/ZRlfbccr2y'),
 ('1591489851106668544',
  15.838800430297852,
  'Raising the exchange rate of the customs dollar, the TVA, and withdrawals from banks... What are its repercussions and results? Is there economic and financial stability in light of the current political chaos? Report: Rasha Al-Zein Hashem https://t.co/Q7nJM2Dvdt @rashazeinnbn'),
 ('1589949764107665409',
  9.809499740600586,
  "Turkish Minister of Energy: #Turkey's purchases of natural gas from #Russia have begun to be partially paid in Russian rubles.. The share of payments in local currency in e

WAS FOUND	1590400068208988160 After circulating news that the Governor of the Bank of Lebanon, Riad Salameh, had announced to NBN about raising the value of the dollar and raising the ceiling on banking withdrawals, the NBN channel denies the validity of this information that is being circulated, citing the channel, and confirms that there is no truth to it on this subject. https://t.co/oIF6QW9rTB https://t.co/ZRlfbccr2y
NOT FOUND	1590364198462435329 There is no truth to the information being circulated, quoted by the #NBN channel, regarding a statement by the Governor of the Central Bank regarding banking circulars https://t.co/1nnPkPnTfS


In [5]:
from tqdm import tqdm

# Rows of (rumor_id, authority_tweet_id, rank, score), rank starting at 1.
data = []

for r in tqdm(dev_jsons):
    rumor_id, query, timeline = r['id'], r['rumor'], r['timeline']

    ranked_docs = searchPyserini(query, timeline)

    # keep only the five best-ranked results per rumor
    data.extend(
        (rumor_id, authority_tweet_id, rank, score)
        for rank, (authority_tweet_id, score, doc_text) in enumerate(ranked_docs[:5], start=1)
    )

display(data)

100%|██████████| 32/32 [01:21<00:00,  2.55s/it]


[('AuRED_142', '1555986659279360001', 1, 4.115900039672852),
 ('AuRED_142', '1555424541509386240', 2, 2.2701001167297363),
 ('AuRED_142', '1555495801962614786', 3, 1.1647000312805176),
 ('AuRED_142', '1554743913197477888', 4, 1.0306999683380127),
 ('AuRED_142', '1556558220533157890', 5, 0.9453999996185303),
 ('AuRED_144', '1576137274990858240', 1, 3.260200023651123),
 ('AuRED_144', '1575937576061501450', 2, 3.212399959564209),
 ('AuRED_144', '1575169360397996032', 3, 2.446899890899658),
 ('AuRED_144', '1576132457803710464', 4, 2.2381999492645264),
 ('AuRED_144', '1574697757662744576', 5, 2.180500030517578),
 ('AuRED_132', '1590400068208988160', 1, 23.554899215698242),
 ('AuRED_132', '1591489851106668544', 2, 15.838800430297852),
 ('AuRED_132', '1589949764107665409', 3, 9.809499740600586),
 ('AuRED_132', '1590995361132212224', 4, 9.197500228881836),
 ('AuRED_132', '1591404278996168705', 5, 9.040399551391602),
 ('AuRED_099', '1234108219544522752', 1, 3.8231000900268555),
 ('AuRED_099', '

In [6]:
from utils import write_trec_format_output

# Write the (rumor_id, authority_tweet_id, rank, score) rows currently held in
# `data` to a run file, passing 'LUCENE' as the third argument of the helper.
# NOTE(review): `data` is rebound by the TF-IDF cell further down, so this cell
# has to run right after the Lucene loop; presumably 'temp-data/' must already
# exist -- confirm against utils.write_trec_format_output.
out_path = 'temp-data/lucene-trec.txt'
write_trec_format_output(out_path, data, 'LUCENE')

## naive tfidf

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_relevant_documents(query, timeline):
    """Rank timeline entries by TF-IDF cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Text to compare every timeline document against.
    timeline : iterable
        Entries read positionally: element ``[1]`` is used as the tweet id and
        element ``[2]`` as the document text.

    Returns
    -------
    list of tuple
        ``(tweet_id, score, text)`` tuples sorted by descending similarity.
        An empty timeline yields an empty list.
    """
    # Materialise once so a one-shot iterable is not consumed twice.
    entries = list(timeline)

    # Nothing to rank: cosine_similarity rejects a document matrix with no
    # rows, so return early instead of raising.
    if not entries:
        return []

    tweet_ids = [t[1] for t in entries]
    documents = [t[2] for t in entries]

    # Fit TF-IDF on the query plus all documents so they share one vocabulary;
    # row 0 of the matrix is the query, rows 1.. are the documents.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([query] + documents)

    # Similarity of the query to each document (single row of scores).
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    # Document indices ordered from most to least similar.
    ranked_doc_indices = similarity_scores.argsort()[::-1]

    # Tuples of shape (tweet_id, score, text)
    return [(tweet_ids[i], similarity_scores[i], documents[i]) for i in ranked_doc_indices]

In [3]:
from tqdm import tqdm

# Rows of (rumor_id, authority_tweet_id, rank, score), rank starting at 1.
data = []

for r in tqdm(dev_jsons):
    rumor_id, query, timeline = r['id'], r['rumor'], r['timeline']

    ranked_docs = retrieve_relevant_documents(query, timeline)

    # keep only the five best-ranked results per rumor
    data.extend(
        (rumor_id, authority_tweet_id, rank, score)
        for rank, (authority_tweet_id, score, doc_text) in enumerate(ranked_docs[:5], start=1)
    )

# display(data)

from utils import write_trec_format_output

# Write the collected rows to the TF-IDF run file, tagged 'TFIDF-BASIC'.
out_path = 'temp-data/tfidf-trec.txt'
write_trec_format_output(out_path, data, 'TFIDF-BASIC')

  0%|          | 0/32 [00:00<?, ?it/s]

100%|██████████| 32/32 [00:00<00:00, 118.00it/s]


## clustering

In [None]:
# Collect the text (element [2]) of every timeline entry across all training
# rumors and drop duplicates. dict.fromkeys keeps first-seen order, so the
# list order is the same on every kernel run; list(set(...)) would reorder the
# strings from run to run, which also reorders the embeddings built from them.
docs_sem = list(dict.fromkeys(
    tweet[2]
    for rumor in train_jsons
    for tweet in rumor['timeline']
))
len(docs_sem)

In [None]:
# Preview only the first few texts; displaying the whole list floods the
# notebook output.
docs_sem[:10]

In [None]:
import os
# Set HF_HUB_OFFLINE=1 in this process's environment.
# NOTE(review): presumably intended to keep the Hugging Face model load in the
# following cells from contacting the hub; run this cell before those -- confirm.
os.environ.update(HF_HUB_OFFLINE='1')

In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence Transformer model by name ('all-MiniLM-L6-v2').
# The instance is kept in `model` and used by the encoding cell below.
model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:

# Encode the de-duplicated timeline texts collected in `docs_sem`;
# the result is what the clustering cell below fits on.
embeddings = model.encode(docs_sem)


In [None]:

# Perform K-means clustering on the sentence embeddings
num_clusters = 2  # Adjust based on your analysis
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)

# Use t-SNE to project the embeddings to two dimensions for visualization
tsne = TSNE(n_components=2, random_state=42)
reduced_embeddings = tsne.fit_transform(embeddings)

# Plotting: one scatter series per cluster, drawn on an explicit Axes so the
# figure carries its own labels and does not depend on pyplot's current-axes state
fig, ax = plt.subplots(figsize=(12, 8))
for cluster_id in range(num_clusters):
    # boolean mask selecting the points assigned to this cluster
    in_cluster = kmeans.labels_ == cluster_id
    ax.scatter(
        reduced_embeddings[in_cluster, 0],
        reduced_embeddings[in_cluster, 1],
        label=f'Cluster {cluster_id}',
    )
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
ax.set_title('t-SNE visualization of document clusters')
ax.legend()
plt.show()


## eval

In [4]:
import pyterrier as pt
from pyterrier.measures import *
from ir_measures import R, MAP
# Initialise PyTerrier only if it has not been started yet, so re-running
# this cell in the same kernel does not call pt.init() a second time.
if not pt.started():
    pt.init()

def evaluate_run(pred_path,golden_path):
    """Score a TREC-format run file against a qrels file.

    Parameters
    ----------
    pred_path : str
        Path of the run (results) file in TREC format.
    golden_path : str
        Path of the qrels file holding the gold relevance labels.

    Returns
    -------
    The value returned by ``pt.Evaluate`` for the metrics ``R@5`` and ``MAP``
    with ``perquery=False`` (aggregated over all queries, not per query).
    """
    golden = pt.io.read_qrels(golden_path)
    # Public reader for TREC-format results; avoids depending on the private
    # pt.io._read_results_trec helper.
    pred = pt.io.read_results(pred_path, format='trec')
    # Local result is named `scores` so the builtin eval() is not shadowed.
    scores = pt.Evaluate(pred, golden, metrics=[R@5, MAP], perquery=False)
    return scores

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



In [5]:
# Inputs from the task-5 checkout: gold labels and the sample submission.
task5_dir = '../clef2024-checkthat-lab/task5'
golden_labels_file = f'{task5_dir}/data/dev_qrels.txt'
sample_submission_file = f'{task5_dir}/submission_samples/KGAT_zeroShot_evidence_English_dev.txt'

# Run files written by the Lucene and TF-IDF cells above.
lucene_submission_file = 'temp-data/lucene-trec.txt'
tfidf_submission_file = 'temp-data/tfidf-trec.txt'
out_file = 'temp-data/out.csv'

# (label, second print argument, run file) -- evaluated and printed in this order.
runs = (
    ('sample', '\t', sample_submission_file),
    ('lucence', '', lucene_submission_file),
    ('tfidf', '\t', tfidf_submission_file),
)
for label, gap, run_file in runs:
    print(label, gap, evaluate_run(run_file, golden_labels_file))

sample 	 {'R@5': 0.6357894736842106, 'AP': 0.5612280701754385}
lucence  {'R@5': 0.6950877192982456, 'AP': 0.6355555555555555}
tfidf 	 {'R@5': 0.6687719298245615, 'AP': 0.593157894736842}
