# 1) Importing data

In [None]:
import pandas as pd
import numpy as np

## 1.a) Import the collection set
The collection set contains metadata of CORD-19 academic papers.

The preprocessed and filtered CORD-19 dataset is available on the Gitlab repository here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b

Participants should first download the file, then upload it to the Google Colab session by following the steps below.


In [None]:
# 1) Download the collection set from the Gitlab repository: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b
# 2) Drag and drop the downloaded file to the "Files" section (left vertical menu on Colab)
# 3) Modify the path to your local file path
PATH_COLLECTION_DATA = '../subtask4b_collection_data.pkl' #MODIFY PATH

In [None]:
# Load the collection metadata into a DataFrame.
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# downloaded from a source you trust (here: the official task repository).
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [None]:
df_collection.info()

In [None]:
df_collection.head()

## 1.b) Import the query set

The query set contains tweets with implicit references to academic papers from the collection set.

The preprocessed query set is available on the Gitlab repository here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b

Participants should first download the files, then upload them to the Google Colab session by following the steps below.

In [None]:
# 1) Download the query tweets from the Gitlab repository: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4/subtask_4b?ref_type=heads
# 2) Drag and drop the downloaded file to the "Files" section (left vertical menu on Colab)
# 3) Modify the path to your local file path
PATH_QUERY_TRAIN_DATA = '../subtask4b_query_tweets_train.tsv' #MODIFY PATH
PATH_QUERY_DEV_DATA = '../subtask4b_query_tweets_dev.tsv' #MODIFY PATH

In [None]:
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')

In [None]:
df_query_train.head()

In [None]:
df_query_train.info()

In [None]:
df_query_dev.head()

In [None]:
df_query_dev.info()

# 2) Running TF-IDF with preprocessing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import re

# Download necessary NLTK resources (if not already downloaded)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Stop-word set and stemmer shared by every call of standard_tokenizer.
# They are filled in lazily on the first call with a string, so the NLTK
# corpus is read once instead of once per tokenized document.
_STANDARD_TOKENIZER_RESOURCES = {}


def standard_tokenizer(text):
    """Standard tokenizer with stemming and stop word removal.

    Steps: strip URLs, drop every character that is not a letter, digit or
    whitespace, lowercase, tokenize with NLTK, remove English stop words and
    stem the remaining tokens with the English Snowball stemmer.

    Args:
        text: The raw text. Any non-string value (e.g. NaN) is treated as
            an empty document.

    Returns:
        A list of stemmed tokens; [] for non-string input.
    """
    if not isinstance(text, str):
        return []

    # Build the expensive, call-independent resources only once.
    if not _STANDARD_TOKENIZER_RESOURCES:
        _STANDARD_TOKENIZER_RESOURCES['stop_words'] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
        _STANDARD_TOKENIZER_RESOURCES['stemmer'] = SnowballStemmer('english')
    stop_words = _STANDARD_TOKENIZER_RESOURCES['stop_words']
    stemmer = _STANDARD_TOKENIZER_RESOURCES['stemmer']

    text = re.sub(r'http\S+', '', text)
    # Punctuation is deleted, not replaced by a space: "covid-19" -> "covid19".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text.lower())
    return [stemmer.stem(word) for word in tokens if word not in stop_words]

def name_tokenizer(text):
    """Extract candidate person names as lowercased, capitalized n-grams.

    The text is tokenized and POS-tagged with NLTK. A bigram is kept when its
    first tag is 'NNP' and its second tag is 'NN' or 'NNP'; a trigram is kept
    when all three tags are 'NNP'. In both cases every word must start with an
    uppercase letter. Bigrams are listed before trigrams.

    Returns:
        The kept n-grams, each joined with single spaces and lowercased, or
        ["unknown"] when the input is not a string or nothing was kept.
    """
    if not isinstance(text, str):
        return ["unknown"]  # placeholder for missing values

    tagged = nltk.pos_tag(word_tokenize(text))
    candidates = []
    for size in (2, 3):  # bigrams first, then trigrams
        for start in range(len(tagged) - size + 1):
            window = tagged[start:start + size]
            words = [word for word, _ in window]
            tags = [tag for _, tag in window]

            if size == 2:
                tags_ok = tags[0] == 'NNP' and tags[1] in ('NN', 'NNP')
            else:
                tags_ok = all(tag == 'NNP' for tag in tags)
            if not tags_ok:
                continue
            if not all(word[0].isupper() for word in words):
                continue
            candidates.append(" ".join(words).lower())

    # Fall back to the placeholder when no name n-gram was found.
    return candidates if candidates else ["unknown"]

In [None]:
# 1. Metadata Vectorization
# Each paper is represented by one document: its title followed by its abstract.
# Missing values are replaced with '' first (as for authors/journal below),
# because string + NaN would turn the whole document into NaN instead of text.
title_abstract_vectorizer = TfidfVectorizer(tokenizer=standard_tokenizer)
title_abstract_matrix = title_abstract_vectorizer.fit_transform(
    df_collection['title'].fillna('') + " " + df_collection['abstract'].fillna('')
)

# lowercase=False is required: TfidfVectorizer lowercases each document *before*
# handing it to a custom tokenizer, and name_tokenizer only keeps n-grams whose
# words start with an uppercase letter. With the default lowercase=True every
# document (and every query) collapses to the placeholder token "unknown".
authors_vectorizer = TfidfVectorizer(tokenizer=name_tokenizer, lowercase=False)
authors_matrix = authors_vectorizer.fit_transform(df_collection['authors'].fillna(''))

journal_vectorizer = TfidfVectorizer(tokenizer=standard_tokenizer)
journal_matrix = journal_vectorizer.fit_transform(df_collection['journal'].fillna(''))

# Publication-time values and their spread do not depend on the query, so they
# are computed once here instead of once per query.
# NOTE(review): the time term compares `timet` directly with a calendar year
# taken from the tweet. That is only meaningful if `timet` holds years; if it
# holds timestamps the term is mis-scaled — confirm the column's units.
_timet_values = df_collection['timet'].astype(float).to_numpy()
_timet_span = np.nanmax(_timet_values) - np.nanmin(_timet_values)


def extract_year(text):
    """Extract the first plausible calendar year (1900-2099) from text.

    Args:
        text: The text to search. Non-string values are accepted.

    Returns:
        The year as an int, or None when text is not a string or contains no
        stand-alone four-digit number starting with 19 or 20.
    """
    if not isinstance(text, str):
        return None
    # Restricting to 19xx/20xx keeps counts such as "5000 patients" from being
    # mistaken for a publication year.
    match = re.search(r'\b(?:19|20)\d{2}\b', text)
    return int(match.group(0)) if match else None


def get_weighted_similarity_topk(query_text, k=5):
    """Rank the collection against one tweet and return the best cord_uids.

    The score of a paper is a fixed weighted sum of four terms:
    0.6 * title/abstract TF-IDF cosine, 0.2 * author-name TF-IDF cosine,
    0.1 * journal TF-IDF cosine and 0.1 * publication-time closeness.

    Args:
        query_text: The tweet text. Non-string values are treated as ''.
        k: Number of candidates to return (default 5, the original cut-off).
            Pass a larger value when evaluating at deeper cut-offs.

    Returns:
        A list of at most k cord_uid values, best match first.
    """
    if not isinstance(query_text, str):
        query_text = ''

    title_abstract_vector = title_abstract_vectorizer.transform([query_text])
    title_abstract_similarity = cosine_similarity(title_abstract_vector, title_abstract_matrix).flatten()

    # The raw tweet is given to the authors vectorizer so that name_tokenizer
    # sees the original capitalisation. When the tweet contains no name n-gram
    # the term is zeroed; otherwise the "unknown" placeholder of the query
    # would match the "unknown" placeholder of papers without extracted names.
    if name_tokenizer(query_text) == ["unknown"]:
        authors_similarity = np.zeros_like(title_abstract_similarity)
    else:
        authors_vector = authors_vectorizer.transform([query_text])
        authors_similarity = cosine_similarity(authors_vector, authors_matrix).flatten()

    potential_journal = query_text # TODO extract something
    journal_vector = journal_vectorizer.transform([potential_journal])
    journal_similarity = cosine_similarity(journal_vector, journal_matrix).flatten()

    # Time closeness stays 0 when the tweet names no year or when all papers
    # share the same time value (nothing to discriminate, and no division by 0).
    publish_time_similarity = np.zeros_like(title_abstract_similarity)
    potential_year = extract_year(query_text)
    if potential_year is not None and _timet_span > 0:
        # nan_to_num: a missing time must score 0, not NaN. NaN scores would be
        # sorted to the front of the ranking by the reversed argsort below.
        publish_time_similarity = np.nan_to_num(
            1 - np.abs(_timet_values - potential_year) / _timet_span
        )

    weighted_similarity = (
        0.6 * title_abstract_similarity +
        0.2 * authors_similarity +
        0.1 * journal_similarity +
        0.1 * publish_time_similarity
    )

    top_indices = np.argsort(weighted_similarity)[::-1][:k]
    return df_collection['cord_uid'].iloc[top_indices].tolist()

In [None]:
# Retrieve the top-k candidate papers for every tweet of both query sets and
# store them in a new 'tfidf_topk' column (train first, then dev).
for df_queries in (df_query_train, df_query_dev):
    df_queries['tfidf_topk'] = df_queries['tweet_text'].apply(get_weighted_similarity_topk)

# 3) Evaluating TF-IDF with preprocessing

In [None]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k=[1, 5, 10]):
    """Compute MRR@k of ranked predictions for several cut-offs.

    Args:
        data: DataFrame with one query per row. A column "in_topx" is written
            to it (overwritten for each k) holding the per-row reciprocal rank.
        col_gold: Name of the column holding the correct item.
        col_pred: Name of the column holding the ranked list of predictions.
        list_k: Cut-offs to evaluate.

    Returns:
        Dict mapping each k to the mean reciprocal rank over all rows, where a
        row scores 1/rank if the gold item is among its first k predictions
        and 0 otherwise.
    """
    def _reciprocal_rank(row, cutoff):
        # Only the first `cutoff` predictions are considered.
        candidates = list(row[col_pred][:cutoff])
        gold = row[col_gold]
        if gold not in candidates:
            return 0
        return 1 / (candidates.index(gold) + 1)

    mrr_by_k = {}
    for cutoff in list_k:
        data["in_topx"] = data.apply(lambda row: _reciprocal_rank(row, cutoff), axis=1)
        mrr_by_k[cutoff] = data["in_topx"].mean()
    return mrr_by_k

In [None]:
# Score the retrieved candidates against the gold 'cord_uid' of each tweet,
# for the train set first and the dev set second.
results_train_tfidf, results_dev_tfidf = (
    get_performance_mrr(df_queries, 'cord_uid', 'tfidf_topk')
    for df_queries in (df_query_train, df_query_dev)
)

In [None]:
# Print TF-IDF MRR@k results
# Each result is a dict mapping a cut-off k (1, 5, 10 by default) to the mean
# reciprocal rank over the first k retrieved candidates.
# Only five candidates are retrieved per tweet by default, so the k=10 value
# equals the k=5 value.
print(f"TF-IDF Results on the train set: {results_train_tfidf}")
print(f"TF-IDF Results on the dev set: {results_dev_tfidf}")

# 4) Exporting results to prepare the submission on Codalab

In [None]:
#df_query_dev['preds'] = df_query_dev['tfidf_topk'].apply(lambda x: x[:5])

In [None]:
#df_query_dev[['post_id', 'preds']].to_csv('predictions.tsv', index=None, sep='\t')