# CLEF 2025 - CheckThat! Lab  - Task 4 Scientific Web Discourse - Subtask 4b (Scientific Claim Source Retrieval) - NLP Approach



# 1. Importing data

In [1]:
import numpy as np
import pandas as pd

# Input locations, relative to the notebook's working directory.
PATH_COLLECTION_DATA = '../subtask4b_collection_data.pkl'
PATH_QUERY_TRAIN_DATA = '../subtask4b_query_tweets_train.tsv'
PATH_QUERY_DEV_DATA = '../subtask4b_query_tweets_dev.tsv'
PATH_QUERY_TEST_DATA = '../subtask4b_query_tweets_test.tsv'

# Paper collection. NOTE: unpickling can execute arbitrary code, so only load
# a collection file that comes from a trusted source.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

# Query tweets for the three splits, all stored as tab-separated files.
df_query_train, df_query_dev, df_query_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (PATH_QUERY_TRAIN_DATA, PATH_QUERY_DEV_DATA, PATH_QUERY_TEST_DATA)
)


## Code for uploading the model to Hugging Face


In [2]:

# Save the trained model on hugging face (in a new repo)
from huggingface_hub import create_repo

def create_repo_on_huggingface(repo_id_str):
    """Create (or reuse) a private repository on the Hugging Face Hub.

    Args:
        repo_id_str: Repository identifier, forwarded to ``create_repo`` as
            ``repo_id``.

    Returns:
        ``repo_id_str`` unchanged. ``create_repo`` returns the repository URL,
        not the id, so the id string is handed back for later upload calls.

    Raises:
        Exception: Whatever ``create_repo`` raised. A diagnostic message is
            printed first and the original exception is re-raised, so callers
            never continue with an undefined repository id.
    """
    try:
        repo_url = create_repo(repo_id=repo_id_str, exist_ok=True, private=True)
    except TypeError as e:
        print(f"Error creating repository: {e}")
        print("It seems your huggingface_hub library version is incompatible.")
        print("Please update it: pip install -U huggingface_hub")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while creating the repository: {e}")
        raise

    print(f"Created or found repository on Hugging Face Hub: {repo_url}")
    return repo_id_str

ModuleNotFoundError: No module named 'huggingface_hub'

In [None]:
#repo_id = create_repo_on_huggingface('LukasXperiaZ/all-mpnet-base-v2-neural-ir-2-epochs')

Created or found repository on Hugging Face Hub: https://huggingface.co/ChrisNeuba/all-mpnet-base-v2-neural-ir-2-epochs


In [7]:
# Uploads the model to hugging face
from huggingface_hub import upload_folder

def upload_model_to_huggingface(local_folder_path, repo_id):
    """Upload a local model directory to a Hugging Face model repository.

    Args:
        local_folder_path: Local directory holding the trained model files.
        repo_id: Identifier of the destination repository.
    """
    print(f"Uploading files from {local_folder_path} to {repo_id}...")

    # Keyword arguments for the Hub call, collected in one place.
    upload_options = {
        'folder_path': local_folder_path,
        'repo_id': repo_id,
        'repo_type': 'model',  # the destination is a model repository
        'commit_message': 'Upload final model from checkpoint',
    }
    upload_folder(**upload_options)

    print("Upload complete!")

In [None]:

# Create (or reuse) the destination repository in this cell so that `repo_id`
# is defined on a fresh kernel; the call uses exist_ok=True, so re-running is safe.
repo_id = create_repo_on_huggingface('LukasXperiaZ/all-mpnet-base-v2-neural-ir-2-epochs')

# Local directory holding the 2-epoch model files to publish.
local_folder_path = '/output/all-mpnet-base-v2-neural-ir-2-epochs'
upload_model_to_huggingface(local_folder_path, repo_id)

Mounted at /content/drive
Uploading files from /content/drive/MyDrive/fine_tuned_model_5 to ChrisNeuba/all-mpnet-base-v2-neural-ir-2-epochs...


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload complete!


# 2. Running with SBERT



## 2.1. Running fine-tuned model No. 1

This model was fine-tuned on the training data for 2 epochs.

In [None]:

from sentence_transformers import SentenceTransformer, util
import torch

# Hugging Face Hub id of the checkpoint fine-tuned for 2 epochs.
model_name = 'LukasXperiaZ/all-mpnet-base-v2-neural-ir-2-epochs'

# Build the sentence encoder from that Hub id; model_1 is used below for both
# paper and query embeddings.
model_1 = SentenceTransformer(model_name)
print(f"Model loaded successfully from Hugging Face Hub: {model_name}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]






README.md:   0%|          | 0.00/50.0k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Model loaded successfully from Hugging Face Hub: ChrisNeuba/all-mpnet-base-v2-neural-ir-2-epochs


In [None]:
# Replace missing abstracts with '' directly in the collection frame.
df_collection['abstract'] = df_collection['abstract'].fillna('')

# One text per paper, laid out as "<title>. <abstract> <authors> <journal>";
# any missing field contributes an empty string.
papers = [
    title + '. ' + abstract + ' ' + authors + ' ' + journal
    for title, abstract, authors, journal in zip(
        *(df_collection[column].fillna('')
          for column in ('title', 'abstract', 'authors', 'journal'))
    )
]
cord_uids = df_collection['cord_uid'].tolist()

# Embed the whole collection in one pass with model_1 and remember the row
# position of every cord_uid.
paper_embeddings = model_1.encode(papers, convert_to_tensor=True)
uid_to_index = dict(zip(cord_uids, range(len(cord_uids))))


In [None]:
import torch.nn.functional as F

# Rescale every paper embedding (one row each, dim=1) to unit L2 norm; the
# normalised tensor replaces the module-level `paper_embeddings` that
# get_sbert_topk searches.
paper_embeddings = F.normalize(paper_embeddings, p=2, dim=1)

def get_sbert_topk(df_query, model, k=10, corpus_embeddings=None, corpus_uids=None):
    """Return, for every query tweet, the ids of the k most similar papers.

    Args:
        df_query: DataFrame with a 'tweet_text' column holding the queries.
        model: Encoder exposing ``encode(texts, convert_to_tensor=True)``.
        k: Number of papers to return per query. It is clamped to the corpus
            size, so asking for more papers than exist does not raise.
        corpus_embeddings: Paper embedding matrix to search, one row per
            paper. Defaults to the module-level ``paper_embeddings``.
        corpus_uids: Paper ids aligned row-by-row with ``corpus_embeddings``.
            Defaults to the module-level ``cord_uids``.

    Returns:
        A list with one entry per row of ``df_query``. Each entry is a list of
        paper ids ordered from most to least similar. An empty ``df_query``
        yields an empty list.
    """
    # Fall back to the notebook-level corpus so existing calls keep working.
    if corpus_embeddings is None:
        corpus_embeddings = paper_embeddings
    if corpus_uids is None:
        corpus_uids = cord_uids

    queries = df_query['tweet_text'].tolist()
    if not queries:
        # Nothing to encode; skip the model call entirely.
        return []

    query_embeddings = model.encode(queries, convert_to_tensor=True)

    # Normalize embeddings
    query_embeddings = F.normalize(query_embeddings, p=2, dim=1)

    # torch.topk raises when k is larger than the number of candidates.
    k = min(k, len(corpus_uids))

    topk_results = []
    for query_emb in query_embeddings:
        # cos_sim returns a (1, n_papers) matrix for a single query vector.
        scores = util.cos_sim(query_emb, corpus_embeddings)[0]
        top_k = torch.topk(scores, k=k)
        # .tolist() yields plain ints, which index the id list directly.
        topk_results.append([corpus_uids[idx] for idx in top_k.indices.tolist()])
    return topk_results

In [None]:
# Rank the 10 closest papers for every tweet of the train, dev and test
# splits with model_1 and store them in a new 'sbert_topk' column.
for query_frame in (df_query_train, df_query_dev, df_query_test):
    query_frame['sbert_topk'] = get_sbert_topk(query_frame, model_1, k=10)

### 2.1.1 Evaluation

Evaluate the first model

In [17]:
def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute the mean reciprocal rank at several cut-offs.

    For every row, the reciprocal rank at cut-off k is ``1 / position`` of the
    gold id within the first k predictions (1-based), or 0 when the gold id is
    not among them. The value reported for k is the mean over all rows.

    Args:
        data: DataFrame holding gold ids and ranked predictions. It is only
            read; no column is added or modified.
        col_gold: Name of the column containing the gold id of each row.
        col_pred: Name of the column containing each row's ranked predictions
            (a sliceable sequence, best candidate first).
        list_k: Cut-offs to evaluate.

    Returns:
        Dict mapping each k in ``list_k`` to the MRR@k as a float
        (NaN when ``data`` has no rows).
    """
    # Find the gold id's 1-based rank once per row; every cut-off reuses it.
    ranks = []
    for gold, preds in zip(data[col_gold], data[col_pred]):
        ranked = list(preds)
        ranks.append(ranked.index(gold) + 1 if gold in ranked else None)

    d_performance = {}
    for k in list_k:
        reciprocal_ranks = [
            1 / rank if rank is not None and rank <= k else 0
            for rank in ranks
        ]
        d_performance[k] = pd.Series(reciprocal_ranks, dtype='float64').mean()
    return d_performance



In [18]:
# MRR@{1,5,10} of the 'sbert_topk' rankings against the gold 'cord_uid'
# for the train and dev splits.
results_train, results_dev = (
    get_performance_mrr(split_frame, 'cord_uid', 'sbert_topk')
    for split_frame in (df_query_train, df_query_dev)
)
# Test-split evaluation is left disabled (presumably no gold labels are
# available for it; confirm before enabling):
#results_test = get_performance_mrr(df_query_test, 'cord_uid', 'sbert_topk')

for split_name, split_results in (('train', results_train), ('dev', results_dev)):
    print(f"SBERT Results on the {split_name} set: {split_results}")
#print(f"SBERT Results on the test set: {results_test}")

SBERT Results on the train set: {1: np.float64(0.620788920874504), 5: np.float64(0.7006548406338339), 10: np.float64(0.7081666746939446)}
SBERT Results on the dev set: {1: np.float64(0.6035714285714285), 5: np.float64(0.6659047619047619), 10: np.float64(0.6730342970521542)}


### 2.1.2 Exporting results to prepare the submission on Codalab

In [None]:
import os

# Keep only the five best-ranked candidates per tweet for the submission file.
df_query_test['preds'] = df_query_test['sbert_topk'].apply(lambda x: x[:5])

# to_csv does not create missing parent directories, so make sure the
# destination folder exists before writing.
output_tsv = '/output/neural_ir/02_epochs/test_results.tsv'
os.makedirs(os.path.dirname(output_tsv), exist_ok=True)
df_query_test[['post_id', 'preds']].to_csv(output_tsv, index=None, sep='\t')

## 2.2 Model No. 2

Fine-tune another model for more epochs.

### 2.2.1 Fine-Tuning

In [None]:

from sentence_transformers import InputExample, losses
from torch.utils.data import DataLoader

def build_paper_text(row):
    """Join a paper's title, abstract, journal and authors into one string.

    Fields are read with ``row.get`` in that order. Values that are not
    strings, or that are empty after stripping whitespace, are skipped. The
    remaining stripped values are joined with single spaces.
    """
    fragments = []
    for field in ('title', 'abstract', 'journal', 'authors'):
        value = row.get(field, '')
        if not isinstance(value, str):
            # Missing values (None, NaN, ...) contribute nothing.
            continue
        value = value.strip()
        if value:
            fragments.append(value)
    return ' '.join(fragments)

# Start the second model from the base checkpoint.
model_2 = SentenceTransformer('all-mpnet-base-v2') # or use 'pritamdeka/S-PubMedBert-MS-MARCO'

# cord_uid -> concatenated text of that paper.
cord_uid_to_text = dict(
    (paper['cord_uid'], build_paper_text(paper))
    for _, paper in df_collection.iterrows()
)

# One (tweet text, gold paper text) pair per training tweet whose gold
# cord_uid exists in the collection.
train_examples = [
    InputExample(texts=[tweet['tweet_text'], cord_uid_to_text[tweet['cord_uid']]])
    for _, tweet in df_query_train.iterrows()
    if tweet['cord_uid'] in cord_uid_to_text
]

train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model_2)

# Train for 10 epochs, writing the result to the output directory.
model_2.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    warmup_steps=100,
    output_path='/output/all-mpnet-base-v2-neural-ir-10-epochs',
)

# Explicit save to the same directory once training has finished.
model_2.save("/output/all-mpnet-base-v2-neural-ir-10-epochs")


In [None]:
# Publish the 10-epoch model: create (or reuse) the Hub repository first,
# then upload the saved model folder into it.
repo_id_2 = create_repo_on_huggingface('LukasXperiaZ/all-mpnet-base-v2-neural-ir-10-epochs')

# Same directory that model_2.save() wrote to in the fine-tuning cell.
local_folder_path = '/output/all-mpnet-base-v2-neural-ir-10-epochs'
upload_model_to_huggingface(local_folder_path, repo_id_2)

### 2.2.2 Evaluation

In [None]:

# Re-embed every paper with model_2. Queries are encoded with model_2 below,
# so the papers must come from the same model; the earlier embeddings were
# produced by model_1 and are not comparable.
paper_embeddings = model_2.encode(papers, convert_to_tensor=True)
# Scale each paper embedding to unit L2 norm for the cosine-similarity search.
paper_embeddings = F.normalize(paper_embeddings, p=2, dim=1)

# get_sbert_topk searches the module-level `paper_embeddings` assigned above.
df_query_train['sbert_topk'] = get_sbert_topk(df_query_train, model_2, k=10)
df_query_dev['sbert_topk'] = get_sbert_topk(df_query_dev, model_2, k=10)
df_query_test['sbert_topk'] = get_sbert_topk(df_query_test, model_2, k=10)

# Evaluate MRR
results_train = get_performance_mrr(df_query_train, 'cord_uid', 'sbert_topk')
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'sbert_topk')
#results_test = get_performance_mrr(df_query_test, 'cord_uid', 'sbert_topk')

print(f"SBERT Results on the train set: {results_train}")
print(f"SBERT Results on the dev set: {results_dev}")
#print(f"SBERT Results on the test set: {results_test}")

In [None]:

import os

# Keep only the five best-ranked candidates per tweet for the submission file.
df_query_test['preds'] = df_query_test['sbert_topk'].apply(lambda x: x[:5])

# to_csv does not create missing parent directories, so make sure the
# destination folder exists before writing.
output_tsv = '/output/neural_ir/10_epochs/test_results.tsv'
os.makedirs(os.path.dirname(output_tsv), exist_ok=True)
df_query_test[['post_id', 'preds']].to_csv(output_tsv, index=None, sep='\t')