In [9]:
import numpy as np
import pandas as pd
import pickle
import ast
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import torch
from scipy.special import softmax
import random
from transformers import set_seed
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from typing import Dict, Iterable
import json, pprint


In [10]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

# Data Read in 

In [11]:
#predictions
train_preds = pd.read_csv("../predictions/10_intfloat-e5-large-v2_lr7e-6_HN_chunked_all_fields_20250527_220428_train.tsv", sep="\t")
dev_preds = pd.read_csv("../predictions/10_intfloat-e5-large-v2_lr7e-6_HN_chunked_all_fields_20250527_220428_dev.tsv", sep="\t")
test_preds = pd.read_csv("../predictions/10_intfloat-e5-large-v2_lr7e-6_HN_chunked_all_fields_20250527_220428_test.tsv",sep="\t")


#----------------------------------------------fixed ------------------------------------------------------------------
#collection:
PATH_COLLECTION_DATA = '../data/subtask4b_collection_data.pkl'
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

#tweets 
PATH_QUERY_TRAIN_DATA = '../data/subtask4b_query_tweets_train.tsv' 
PATH_QUERY_DEV_DATA = '../data/subtask4b_query_tweets_dev.tsv'
PATH_QUERY_TEST_DATA =  '../data/subtask4b_query_tweets_test_gold.tsv'

df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')
df_query_test = pd.read_csv(PATH_QUERY_TEST_DATA, sep = '\t')

# Data limitation
- as we always give out our top 100 predictions in the previous steps, we need to limit them to the top 5/10/20 in this step
- this has proven to be more successfull than taking all 100 predictions into account 

In [12]:
def truncate_predictions(pred_str):
    pred_list = ast.literal_eval(pred_str)  
    return str(pred_list[:10]) #change amount of papers here 

train_preds["preds"] = train_preds["preds"].apply(truncate_predictions)
dev_preds["preds"] = dev_preds["preds"].apply(truncate_predictions)
test_preds["preds"] = test_preds["preds"].apply(truncate_predictions)

# Data preparation 

In [13]:
train_preds["preds"] = train_preds["preds"].apply(ast.literal_eval)
dev_preds["preds"] = dev_preds["preds"].apply(ast.literal_eval)
test_preds["preds"] = test_preds["preds"].apply(ast.literal_eval)

In [14]:
#create a lookup for the document and matching cord_ids
df_collection["document"] = df_collection[["title", "abstract"]].apply(
    lambda x: f"{x['title']} {x['abstract']}", axis=1
)

document_lookup = df_collection.set_index("cord_uid")["document"].to_dict()

In [15]:
#this creates the appropriate reranking dataframe for the train-set
rerank_data = []

for _, row in train_preds.iterrows():
    post_id = row["post_id"]
    candidate_list = row["preds"]

    for cord_uid in candidate_list:
        if cord_uid in document_lookup:   
            rerank_data.append({
                "post_id": post_id,
                "cord_uid": cord_uid,
                "document": document_lookup[cord_uid]
            })
        else:
            print(f"Missing metadata for paper {cord_uid}")

rerank_df = pd.DataFrame(rerank_data)
rerank_df.head()

Unnamed: 0,post_id,cord_uid,document
0,0,htlvpvz5,Oral Management in Rehabilitation Medicine: Or...
1,0,vccct6hq,Outcomes Among Patients Referred to Outpatient...
2,0,yec87cye,Dysphagia presentation and management followin...
3,0,fkwgq5mr,COVID-19: patient characteristics in the first...
4,0,jv3u1c0e,SARS‐CoV‐2 RNA in dental biofilms: Supragingiv...


In [16]:
full_df = rerank_df.merge(df_query_train, on="post_id", how="left")
full_df["input_text"] = full_df.apply(
    lambda x: f"{x['tweet_text']} [SEP] {x['document']}", axis=1
)

full_df["label"] = (full_df["cord_uid_x"] == full_df["cord_uid_y"]).astype(int)
full_df

Unnamed: 0,post_id,cord_uid_x,document,tweet_text,cord_uid_y,input_text,label
0,0,htlvpvz5,Oral Management in Rehabilitation Medicine: Or...,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,Oral care in rehabilitation medicine: oral vul...,1
1,0,vccct6hq,Outcomes Among Patients Referred to Outpatient...,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,Oral care in rehabilitation medicine: oral vul...,0
2,0,yec87cye,Dysphagia presentation and management followin...,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,Oral care in rehabilitation medicine: oral vul...,0
3,0,fkwgq5mr,COVID-19: patient characteristics in the first...,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,Oral care in rehabilitation medicine: oral vul...,0
4,0,jv3u1c0e,SARS‐CoV‐2 RNA in dental biofilms: Supragingiv...,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,Oral care in rehabilitation medicine: oral vul...,0
...,...,...,...,...,...,...,...
128525,14252,ik4wen7l,Modeling human adaptive immune responses with ...,"when ""the airway immune cells of children are ...",nlsv8bin,"when ""the airway immune cells of children are ...",0
128526,14252,1syeo6ff,Age‐related differences in the immune response...,"when ""the airway immune cells of children are ...",nlsv8bin,"when ""the airway immune cells of children are ...",0
128527,14252,ion9fkpg,Binding and neutralizing antibody responses to...,"when ""the airway immune cells of children are ...",nlsv8bin,"when ""the airway immune cells of children are ...",0
128528,14252,ufmvsvm2,Distinct antibody responses to SARS-CoV-2 in c...,"when ""the airway immune cells of children are ...",nlsv8bin,"when ""the airway immune cells of children are ...",0


In [17]:
#this is a sanity check to see how many gold papers are in the predictions we are looking at 
num_label_1 = (full_df["label"] == 1).sum()
print("Number of rows with label 1:", num_label_1)

num_label_1 / len(df_query_train)

Number of rows with label 1: 11685


0.9091262740216292

In [18]:
#this just creates the reranking dataframe for the dev set  

true_map     = df_query_dev.set_index("post_id")["cord_uid"].to_dict()
tweet_map    = df_query_dev.set_index("post_id")["tweet_text"].to_dict()

rerank_data_dev = []

for _, row in dev_preds.iterrows():
    pid = row["post_id"]
    candidates = row["preds"]
    true_uid = true_map[pid]
    text = tweet_map[pid]

    for cand in candidates:
    
        if cand not in document_lookup:
            print(f"⚠️ Missing metadata for {cand}")
            continue

        rerank_data_dev.append({
            "post_id": pid,
            "cord_uid_x": cand,
            "tweet_text": text,
            "document": document_lookup[cand],
            "cord_uid_y": true_uid,
            "label": int(cand == true_uid)
        })


df_dev = pd.DataFrame(rerank_data_dev)

df_dev["input_text"] = df_dev.apply(
    lambda x: f"{x['tweet_text']} [SEP] {x['document']}",
    axis=1
)

In [19]:
hf_dataset = Dataset.from_pandas(full_df[["input_text", "label"]])
hf_dev = Dataset.from_pandas(df_dev[["input_text", "post_id", "cord_uid_x", "cord_uid_y", "label"]])


# Model Selection
- here we can select one of the three models to rerank
- SciBERT, DistilBERT, and MedBERT

## SciBERT

In [20]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

def tokenize_fn(example):
    return tokenizer(example["input_text"], truncation=True,  max_length=512)


model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## DistilBERT

In [21]:
#model = AutoModelForSequenceClassification.from_pretrained(
#    "distilbert-base-uncased",
#    num_labels=2
#)
#
#tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#
#def tokenize_fn(example):
#    return tokenizer(example["input_text"], truncation=True,  max_length=512)
#

## MedBERT

In [22]:
#BASE_MODEL = "Charangan/MedBERT" 
#
#tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
#model = AutoModelForSequenceClassification.from_pretrained(
#            BASE_MODEL,
#            num_labels=2) 
#
#def tokenize_fn(example):
#    return tokenizer(example["input_text"], truncation=True,  max_length=512)

In [None]:
tokenized_dataset = hf_dataset.map(tokenize_fn, batched=True)
tokenized_dev = hf_dev.map(tokenize_fn, batched=True)

Map:   0%|          | 0/128530 [00:00<?, ? examples/s]

## Training 

In [None]:
print("CUDA available:", torch.cuda.is_available())
print("Device count :", torch.cuda.device_count())
print("Device name  :", torch.cuda.get_device_name(0))

CUDA available: True
Device count : 1
Device name  : NVIDIA GeForce RTX 2060 SUPER


In [None]:
#here we just create the data_collator with our tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
#create the trainer with according hyperparameters 
training_args = TrainingArguments(
   output_dir="./scibert-newpred-10",
   save_strategy="epoch",
   learning_rate=2e-5,
   per_device_train_batch_size=8,
   num_train_epochs=3,
   weight_decay=0.01,
   logging_dir="./logs",
   logging_steps=100,
   save_total_limit=1,
   fp16=True,
   seed = SEED
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset, 
    eval_dataset = tokenized_dev,
    data_collator=data_collator 
)
     

In [None]:
trainer.train()

Step,Training Loss
100,0.4565
200,0.4942
300,0.4644
400,0.4686
500,0.4737
600,0.4848
700,0.3969
800,0.3578
900,0.4059
1000,0.3872


TrainOutput(global_step=24102, training_loss=0.26640965304536135, metrics={'train_runtime': 4765.0564, 'train_samples_per_second': 40.46, 'train_steps_per_second': 5.058, 'total_flos': 5.027516898598596e+16, 'train_loss': 0.26640965304536135, 'epoch': 3.0})

# Load in Model

- this lets us run inference and test metrics without having to train all over again
- just load in a pretrained model by providing the models path

In [None]:
MODEL_PATH = "scibert-newpred-10/checkpoint-24102"

model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH) #here we can load in the pretrained models 

model.to("cuda" if torch.cuda.is_available() else "cpu")
device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainer.model = model

SEP = tokenizer.sep_token or "[SEP]"

# Get Metrics
- this cell computes the relevant metrics by:
    - mapping the true ranks of all datasets from the original data back to the according dataframes
    - computing the metrics {MRR@1, MRR@5, MRR@10, Recall@1, Recall@5, Recall@10}
    - building dataframes on which the model can make predictions
    - predicting on all three splits: train, dev and test set
    - printing out the metrics for all splits

In [None]:
def _true_ranks(df: pd.DataFrame) -> np.ndarray:
 
    ranks = []
    for _, g in df.groupby("post_id"):
        g_sorted = g.sort_values("score", ascending=False)
        true_uid = g["cord_uid_y"].iloc[0]

        match = np.where(g_sorted["cord_uid_x"].values == true_uid)[0]
        rank  = match[0] + 1 if match.size else np.inf
        ranks.append(rank)

    return np.asarray(ranks, dtype=float)



def compute_metrics(df: pd.DataFrame,
                    cutoffs: Iterable[int] = (1, 5, 10)) -> Dict[str, float]:
    ranks = _true_ranks(df)
    rr    = 1.0 / ranks                                   
    out = {}
    for k in cutoffs:
        mask = ranks <= k
        out[f"MRR@{k}"]    = float((rr * mask).mean())
        out[f"Recall@{k}"] = float(mask.mean())
    return out


def build_rerank_dataframe(pred_df: pd.DataFrame,
                           df_query: pd.DataFrame,
                           document_lookup: Dict[str, str]) -> pd.DataFrame:
    true_map  = df_query.set_index("post_id")["cord_uid"].to_dict()
    tweet_map = df_query.set_index("post_id")["tweet_text"].to_dict()

    rows = []
    for _, row in pred_df.iterrows():
        pid, candidates = row["post_id"], row["preds"]
        true_uid, text  = true_map[pid], tweet_map[pid]

        for cand in candidates:
            rows.append({
                "post_id"    : pid,
                "cord_uid_x" : cand,
                "tweet_text" : text,
                "document"   : document_lookup[cand],
                "cord_uid_y" : true_uid,
                "label"      : int(cand == true_uid)
            })

    return pd.DataFrame(rows)


all_metrics = {}
for split in ["train", "dev", "test"]:
    pred_df  = globals()[f"{split}_preds"]          
    df_query = globals()[f"df_query_{split}"]       

   
    df_tmp = build_rerank_dataframe(pred_df, df_query, document_lookup)


    hf_ds = Dataset.from_pandas(
        df_tmp[["tweet_text", "document", "post_id",
                "cord_uid_x", "cord_uid_y", "label"]].assign(
            input_text=lambda x: x["tweet_text"] + " [SEP] " + x["document"]
        )
    )
    tokenised = hf_ds.map(tokenize_fn, batched=True)
    cols_to_remove = set(tokenised.column_names) - set(tokenizer.model_input_names)
    if "label" in cols_to_remove:        
        cols_to_remove.add("label")
    tokenised = tokenised.remove_columns(list(cols_to_remove))            
    logits = trainer.predict(tokenised).predictions
    df_tmp["score"] = softmax(logits, axis=1)[:, 1]

    globals()[f"df_{split}"] = df_tmp                
    all_metrics[split]       = compute_metrics(df_tmp)


pprint.pprint(all_metrics, width=120, compact=True)


Map:   0%|          | 0/64265 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7230 [00:00<?, ? examples/s]

{'dev': {'MRR@1': 0.6764285714285714,
         'MRR@10': 0.7269285714285715,
         'MRR@5': 0.7269285714285715,
         'Recall@1': 0.6764285714285714,
         'Recall@10': 0.8021428571428572,
         'Recall@5': 0.8021428571428572},
 'test': {'MRR@1': 0.6099585062240664,
          'MRR@10': 0.664638082065468,
          'MRR@5': 0.664638082065468,
          'Recall@1': 0.6099585062240664,
          'Recall@10': 0.7468879668049793,
          'Recall@5': 0.7468879668049793},
 'train': {'MRR@1': 0.826888664125107,
           'MRR@10': 0.8458336575118649,
           'MRR@5': 0.8458336575118649,
           'Recall@1': 0.826888664125107,
           'Recall@10': 0.8706916673150237,
           'Recall@5': 0.8706916673150237}}


In [None]:
hallo prlease reror


SyntaxError: invalid syntax (90707109.py, line 1)

In [None]:
# ── Export ranked predictions per post_id in TSV format ──────────────────────
import csv, pandas as pd

TOP_K = None     # set to None for the full ranked list

for split in ["train","dev", "test"]:                # add "train" if you need it
    scored = globals()[f"df_{split}"]        # produced earlier

    ranked = (
        scored
          .sort_values("score", ascending=False)
          .groupby("post_id")
          .head(TOP_K if TOP_K is not None else len(scored))   # keep top-k
          .groupby("post_id")["cord_uid_x"]
          .apply(list)                                         # → python list
          .reset_index(name="preds")
    )

    # Ensure the list serialises with single quotes like your example
    ranked["preds"] = ranked["preds"].apply(lambda lst: str(lst))

    # Save:  post_id <TAB> ['id1', 'id2', ...]
    out_path = f"{split}_ranked_preds.tsv"
    ranked.to_csv(
        out_path,
        sep="\t",
        index=False,
        header=True,
        quoting=csv.QUOTE_NONE,   # no extra double-quotes
        escapechar="\\",          # escape tabs/newlines if they ever occur
    )
    globals()[f"ranked_preds_{split}"] = ranked   # keep available in memory
    print(f"Wrote {out_path}  ({len(ranked)} queries)")

# Quick peek
display(ranked_preds_dev.head())
display(ranked_preds_test.head())


Wrote train_ranked_preds.tsv  (12853 queries)
Wrote dev_ranked_preds.tsv  (1400 queries)
Wrote test_ranked_preds.tsv  (1446 queries)


Unnamed: 0,post_id,preds
0,16,"['3qvh482o', 'hg3xpej0', 'styavbvi', 'nksd3wuw..."
1,69,"['r58aohnu', 'r5iu5nr1', 'ajco76bb', 'u1q6wl45..."
2,73,"['sts48u9i', 'qkg8fwbp', 'a7frertc', '6xc4j09c..."
3,93,"['3sr2exq9', 'k0f4cwig', 'sv48gjkk', 'kca5r5hr..."
4,96,"['ybwwmyqy', 'ouvq2wpq', 'rs3umc1x', 'vabb2f26..."


Unnamed: 0,post_id,preds
0,1,"['qgwu9fsk', 'x4zuv4jo', 'lty6oq3d', '5aev7ltr..."
1,2,"['wigakw9h', 'mm2aotem', 'g4i4hkhz', '4vkkaqhz..."
2,3,"['00ugdhvf', 'yosjlefy', 'm3m2n3fw', 'gtp5daep..."
3,4,"['ru2ty1y9', 'r4q0zqam', '1zp73mvu', 'ruwmeau5..."
4,5,"['f5p37j7g', 'nzat41wu', 'lmzkseiz', '2jdlavwj..."
