In [21]:
import re
import string
import unicodedata
import pandas as pd
from pathlib import Path
from datasets import load_dataset
from collections import defaultdict
from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from datasets import Dataset

import sys
sys.path.append("../../src")
import util.preprocessing_util as util

In [22]:
DATA_DIR = Path("../../data/dev")
processed_dir = DATA_DIR / "processed"

In [23]:
datasets = ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']

# load pubmedqa from RAGBENCH
pubmedqa = load_dataset("rungalileo/ragbench", "pubmedqa")

In [70]:
train_data = pubmedqa['train']
test_data = pubmedqa['test']

In [71]:
train_data = train_data.to_pandas()
test_data = test_data.to_pandas()

# Exploration

In [72]:
train_data.head()

Unnamed: 0,id,question,documents,response,generation_model_name,annotating_model_name,dataset_name,documents_sentences,response_sentences,sentence_support_information,...,trulens_groundedness,trulens_context_relevance,ragas_faithfulness,ragas_context_relevance,gpt3_adherence,gpt3_context_relevance,gpt35_utilization,relevance_score,utilization_score,completeness_score
0,pubmedqa_39085,Is there a functional neural correlate of indi...,[The present study tested whether individuals ...,"Yes, the study mentioned in the first piece of...",gpt-3.5-turbo-0125,gpt-4o,pubmedqa_train,"[[[0a, The present study tested whether indivi...","[[a, Yes, the study mentioned in the first pie...",[{'explanation': 'This sentence is directly su...,...,,,,,,,,0.142857,0.142857,1.0
1,pubmedqa_42813,Can we use the Omron T9P automated blood press...,"[Recent events in our hospital, combined with ...","Yes, based on the context provided, we can use...",gpt-3.5-turbo-0125,gpt-4o,pubmedqa_train,"[[[0a, Recent events in our hospital, combined...","[[a, Yes, based on the context provided, we ca...",[{'explanation': 'This sentence summarizes the...,...,,,,,,,,0.454545,0.181818,0.4
2,pubmedqa_57525,Intraabdominal vascular injury: are we getting...,[Intraabdominal vascular injury (IAVI) as a re...,"Based on the pieces of context provided, the s...",gpt-3.5-turbo-0125,gpt-4o,pubmedqa_train,"[[[0a, Intraabdominal vascular injury (IAVI) a...","[[a, Based on the pieces of context provided, ...",[{'explanation': 'This sentence is partially s...,...,,,,,,,,0.461538,0.307692,0.666667
3,pubmedqa_29863,Hand grip and pinch strength in patients with ...,[The hand grip strength test and pinch was sig...,"Yes, according to the context provided, the ha...",gpt-3.5-turbo-0125,gpt-4o,pubmedqa_train,"[[[0a, The hand grip strength test and pinch w...","[[a, Yes, according to the context provided, t...",[{'explanation': 'This sentence is supported b...,...,,,,,,,,0.818182,0.181818,0.222222
4,pubmedqa_36126,Is Canada ready for patient accessible electro...,[Access to personal health information through...,"Based on the information provided, Canada is n...",gpt-3.5-turbo-0125,gpt-4o,pubmedqa_train,"[[[0a, Access to personal health information t...","[[a, Based on the information provided, Canada...",[{'explanation': 'The sentence is supported by...,...,,,,,,,,0.636364,0.454545,0.714286


In [73]:
train_data.shape

(19600, 26)

In [74]:
train_data.columns

Index(['id', 'question', 'documents', 'response', 'generation_model_name',
       'annotating_model_name', 'dataset_name', 'documents_sentences',
       'response_sentences', 'sentence_support_information',
       'unsupported_response_sentence_keys', 'adherence_score',
       'overall_supported_explanation', 'relevance_explanation',
       'all_relevant_sentence_keys', 'all_utilized_sentence_keys',
       'trulens_groundedness', 'trulens_context_relevance',
       'ragas_faithfulness', 'ragas_context_relevance', 'gpt3_adherence',
       'gpt3_context_relevance', 'gpt35_utilization', 'relevance_score',
       'utilization_score', 'completeness_score'],
      dtype='object')

In [78]:
train_data.all_relevant_sentence_keys.unique

<bound method Series.unique of 0                                        [0a]
1                        [0a, 1a, 3a, 3b, 4a]
2                    [0a, 0b, 2a, 3c, 3e, 4a]
3        [0a, 0b, 1a, 2a, 2b, 2c, 2d, 2e, 2f]
4                [0a, 0b, 0c, 0d, 1a, 1b, 1c]
                         ...                 
19595        [0a, 0b, 1a, 1b, 2a, 2c, 2d, 3a]
19596                    [0a, 0b, 0c, 0d, 1b]
19597                [0a, 1a, 2a, 3a, 3d, 4c]
19598        [0b, 0c, 0d, 0e, 1a, 1b, 1c, 1d]
19599                [0b, 1a, 1b, 1c, 4a, 4f]
Name: all_relevant_sentence_keys, Length: 19600, dtype: object>

In [82]:
observation = train_data.iloc[1]

In [83]:
observation.question

'Can we use the Omron T9P automated blood pressure monitor in pregnancy?'

In [84]:
observation.all_relevant_sentence_keys

array(['0a', '1a', '3a', '3b', '4a'], dtype=object)

# Attributes

**question:** The medical question being asked. (Used as input)
**documents_sentences:** Contains the sentences from the source documents. (Context for classification)
**all_relevant_sentence_keys:**	Identifies which sentences are relevant. (Binary label for classification)

**all_relevant_sentence_keys:**

- This column contains sentence identifiers (e.g., '0a', '1b') that point to relevant sentences in documents_sentences.
- Format: 'Xd', where X is the index of the retrieved document, and d is the sentence ID within that document.
- These keys identify which sentences from the retrieved documents were deemed relevant for answering the question.

**Goal:**

- Pair each question with its individual document sentences → (question, sentence) pairs.
- Label each sentence as relevant (1) or not relevant (0) → Using all_relevant_sentence_keys.
- Train a classifier using BERT embeddings to classify each sentence.


In [86]:
train_data = train_data[["question", "documents_sentences", "all_relevant_sentence_keys"]]
test_data = test_data[["question", "documents_sentences", "all_relevant_sentence_keys"]]

# Attribute Transformation

In [87]:
import ast

In [88]:
def safe_eval(val):
    if isinstance(val, str):  # If it's a string, evaluate it
        return ast.literal_eval(val)
    elif isinstance(val, list) or isinstance(val, tuple):  # If already a list, return as is
        return list(val)
    else:
        return []  # Default case, return empty list

In [89]:
train_data.loc[:, "documents_sentences"] = train_data["documents_sentences"] \
    .apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

test_data.loc[:, "documents_sentences"] = test_data["documents_sentences"] \
    .apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [90]:
def transform_dataset(train_data):
    """
    Transforms the dataset into a list of (question, sentence, label) tuples.

    Args:
        train_data (pd.DataFrame): DataFrame containing 'question', 'all_relevant_sentence_keys', and 'documents_sentences'.

    Returns:
        list: A list of tuples (question, sentence_text, label).
    """
    data_rows = []

    for _, row in train_data.iterrows():
        question = row["question"]
        relevant_keys = set(row["all_relevant_sentence_keys"])  # Convert to set for quick lookup

        for doc_sentences in row["documents_sentences"]:
            for sentence in doc_sentences:  # sentence is a list like [key, text]
                if len(sentence) == 2:  # Ensure correct format
                    sentence_key, sentence_text = sentence
                    label = 1 if sentence_key in relevant_keys else 0  # Assign label
                    data_rows.append((question, sentence_text, label))

    return data_rows

In [91]:
train_data = pd.DataFrame(transform_dataset(train_data), columns=["question", "sentence", "label"])
test_data = pd.DataFrame(transform_dataset(test_data), columns=["question", "sentence", "label"])

In [92]:
train_data.head()

Unnamed: 0,question,sentence,label
0,Is there a functional neural correlate of indi...,The present study tested whether individuals w...,1
1,Is there a functional neural correlate of indi...,This study examined whether heightened cardiov...,0
2,Is there a functional neural correlate of indi...,"Heart rate variability (HRV), a measure of aut...",0
3,Is there a functional neural correlate of indi...,Previous studies have also not controlled for ...,0
4,Is there a functional neural correlate of indi...,Low socioeconomic status is associated with in...,0


# Preprocessing

In [75]:
train_data.dtypes

question    object
sentence    object
label        int64
dtype: object

Convert the text to lowercase

In [76]:
train_data["question"] = train_data["question"].astype(str)
train_data["sentence"] = train_data["sentence"].astype(str)
test_data["question"] = test_data["question"].astype(str)
test_data["sentence"] = test_data["sentence"].astype(str)

Check for missing values

In [77]:
print(train_data.isnull().sum())

question    0
sentence    0
label       0
dtype: int64


**Note:** We might want to check for duplicates here

Convert strings to lowercase

In [78]:
train_data["question"] = train_data["question"].str.lower()
train_data["sentence"] = train_data["sentence"].str.lower()
test_data["question"] = test_data["question"].str.lower()
test_data["sentence"] = test_data["sentence"].str.lower()

Remove Extra Whitespaces & Newlines

In [26]:
def clean_whitespace(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    return text.strip()

In [79]:
train_data["question"] = train_data["question"].apply(clean_whitespace)
train_data["sentence"] = train_data["sentence"].apply(clean_whitespace)
test_data["question"] = test_data["question"].apply(clean_whitespace)
test_data["sentence"] = test_data["sentence"].apply(clean_whitespace)

Normalize Unicode Characters

In [80]:
def normalize_unicode(text):
    return unicodedata.normalize("NFKC", text)

In [81]:
train_data["question"] = train_data["question"].apply(normalize_unicode)
train_data["sentence"] = train_data["sentence"].apply(normalize_unicode)
test_data["question"] = test_data["question"].apply(normalize_unicode)
test_data["sentence"] = test_data["sentence"].apply(normalize_unicode)

Remove Punctuation

In [82]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [83]:
train_data["question"] = train_data["question"].apply(remove_punctuation)
train_data["sentence"] = train_data["sentence"].apply(remove_punctuation)
test_data["question"] = test_data["question"].apply(remove_punctuation)
test_data["sentence"] = test_data["sentence"].apply(remove_punctuation)

Safe if necessary

In [85]:
'''
train_data.to_csv("../../data/dev/processed/pubmedqa_train.csv", index=False)
test_data.to_csv("../../data/dev/processed/pubmedqa_test.csv", index=False)
''';

In [86]:
train_data.head()

Unnamed: 0,question,sentence,label
0,is there a functional neural correlate of indi...,the present study tested whether individuals w...,1
1,is there a functional neural correlate of indi...,this study examined whether heightened cardiov...,0
2,is there a functional neural correlate of indi...,heart rate variability hrv a measure of autono...,0
3,is there a functional neural correlate of indi...,previous studies have also not controlled for ...,0
4,is there a functional neural correlate of indi...,low socioeconomic status is associated with in...,0


# Masking

**Mask Sentences**

- Replace all sentences except one with a [MASK] token
- Keep only one sentence unmasked at a time

In [87]:
def mask_sentences(sentences, target_idx):
    """
    Masks all sentences except the one at target_idx.
    """
    return " ".join(
        [sent if idx == target_idx else "[MASK]" for idx, (sid, sent) in enumerate(sentences)]
    )

Aggregate Sentences Per Question

In [88]:
# Dictionary to store aggregated data
aggregated_train_data = defaultdict(lambda: {"sentences": [], "labels": []})
aggregated_test_data = defaultdict(lambda: {"sentences": [], "labels": []})

# aggregate sentences per question
for _, row in train_data.iterrows():
    q = row["question"]
    aggregated_train_data[q]["sentences"].append(row["sentence"])
    aggregated_train_data[q]["labels"].append(row["label"])

for _, row in test_data.iterrows():
    q = row["question"]
    aggregated_test_data[q]["sentences"].append(row["sentence"])
    aggregated_test_data[q]["labels"].append(row["label"])

# convert to df
aggregated_train_df = pd.DataFrame([
    {"question": q, "sentences": v["sentences"], "labels": v["labels"]}
    for q, v in aggregated_train_data.items()
])
aggregated_test_df = pd.DataFrame([
    {"question": q, "sentences": v["sentences"], "labels": v["labels"]}
    for q, v in aggregated_test_data.items()
])

Mask Sentences One by One - Create multiple masked versions for training

In [89]:
def create_masked_inputs(df):
    masked_data = []

    for _, row in df.iterrows():
        question = row["question"]
        sentences = row["sentences"]
        labels = row["labels"]

        for i, sentence in enumerate(sentences):
            # Mask all sentences except the one at index i
            masked_context = ["[MASK]" if j != i else s for j, s in enumerate(sentences)]
            full_context = " ".join(masked_context)

            # Store new training instance
            masked_data.append({"question": question, "context": full_context, "label": labels[i]})

    return pd.DataFrame(masked_data)

In [90]:
masked_train_df = create_masked_inputs(aggregated_train_df)
masked_test_df = create_masked_inputs(aggregated_test_df)

In [91]:
masked_train_df.head()

Unnamed: 0,question,context,label
0,is there a functional neural correlate of indi...,the present study tested whether individuals w...,1
1,is there a functional neural correlate of indi...,[MASK] this study examined whether heightened ...,0
2,is there a functional neural correlate of indi...,[MASK] [MASK] heart rate variability hrv a mea...,0
3,is there a functional neural correlate of indi...,[MASK] [MASK] [MASK] previous studies have als...,0
4,is there a functional neural correlate of indi...,[MASK] [MASK] [MASK] [MASK] low socioeconomic ...,0


# Tokenization (Not End-To-End)

We encode the question together with the entire context (all sentences in the document). This allows the model to have full context while classifying.

However, we only predict the relevance of one sentence at a time.

To achieve this, we mask the other sentences in the context so that the model focuses only on the target sentence.

This ensures that during training, only the sentence being classified is "visible" for learning, while the rest are ignored.

Get tokenizer from HuggingFace

In [39]:
token = "XXXX";

In [40]:
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base", num_labels=2, token=token)

In [41]:
'''
Normally we can create the encodings like this but since its taking very long on my laptop we use the custom version with the tqdm progress bar which is also much faster and memory efficient.
'''
'''
encodings = tokenizer(
    list(masked_df["question"]),
    list(masked_df["context"]),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)''';

In [92]:
# Convert the dataframe into lists for iteration
questions = list(masked_train_df["question"])
contexts = list(masked_train_df["context"])

# Initialize an empty list to store tokenized outputs
input_ids, attention_masks = [], []

In [93]:
for question, context in tqdm(zip(questions, contexts), total=len(questions), desc="Tokenizing"):
    encoding = tokenizer(
        question,
        context,
        padding="max_length",  # Ensures uniform length
        truncation=True,       # Cuts off if too long
        max_length=512,        # ModernBERT supports up to 512 tokens
        return_tensors="pt"
    )

    input_ids.append(encoding["input_ids"].squeeze())  # Remove batch dim
    attention_masks.append(encoding["attention_mask"].squeeze())

Tokenizing:   0%|          | 0/237544 [00:00<?, ?it/s]

In [94]:
input_ids = torch.stack(input_ids)
attention_masks = torch.stack(attention_masks)

# Check the shape of tokenized data
print(f"Tokenized dataset shape: {input_ids.shape}")

Tokenized dataset shape: torch.Size([237544, 512])


Convert Encodings to PyTorch Dataset

In [95]:
class SentenceRelevanceDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = torch.tensor(labels)  # Convert labels to tensor

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": self.labels[idx]
        }

# Convert labels to tensor
labels = torch.tensor(list(masked_train_df["label"]))

# Create dataset
dataset = SentenceRelevanceDataset(input_ids, attention_masks, labels)

print(f"Dataset size: {len(dataset)}")

Dataset size: 237544


  self.labels = torch.tensor(labels)  # Convert labels to tensor


Split the dataset

In [96]:
# Define split ratio (80% train, 20% validation)
train_size = 0.8
train_idx, val_idx = train_test_split(
    range(len(dataset)), test_size=1 - train_size, random_state=1050
)

# Create train and validation subsets
from torch.utils.data import Subset
train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Training samples: 190035
Validation samples: 47509


Create DataLoaders

In [97]:
BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Training End-To-End

Load ModernBERT Model (Trainable Embeddings)

In [48]:
model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base", num_labels=2, token=token
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Check if model parameters are trainable (Ensure We Are Training End-to-End)

In [49]:
for param in model.parameters():
    param.requires_grad = True  # Ensure the entire model is updated

Prepare the dataset

In [50]:
hf_dataset_train = Dataset.from_pandas(masked_train_df)
hf_dataset_test = Dataset.from_pandas(masked_test_df)

In [51]:
# Create a global tqdm progress bar
progress_bar = tqdm(total=(len(hf_dataset_train) + len(hf_dataset_test)),
                    desc="Tokenizing", position=0, leave=True)

Tokenizing:   0%|          | 0/237544 [00:00<?, ?it/s]

In [52]:
def tokenize_batch(batch):
    """Tokenizes question + masked context while updating tqdm."""

    # Tokenize batch
    encodings = tokenizer(
        batch["question"],
        batch["context"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Update tqdm manually (batch_size at a time)
    progress_bar.update(len(batch["question"]))

    return {
        "input_ids": encodings["input_ids"].tolist(),
        "attention_mask": encodings["attention_mask"].tolist(),
        "labels": batch["label"]  # Keep labels
    }

In [53]:
BATCH_SIZE = 16

In [54]:
tokenized_dataset_train = hf_dataset_train.map(
    tokenize_batch,
    batched=True,
    batch_size=32  # Adjust based on your system
)
tokenized_dataset_test = hf_dataset_test.map(
    tokenize_batch,
    batched=True,
    batch_size=32  # Adjust based on your system
)



Map:   0%|          | 0/237544 [00:00<?, ? examples/s]

In [55]:
progress_bar.close()

Convert to PyTorch Dataset

In [56]:
# Ensure dataset is set to PyTorch format
tokenized_dataset_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_dataset_test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Split dataset into train/test
'''
split = 0.8
train_size = int(split * len(tokenized_dataset))
test_size = len(tokenized_dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(tokenized_dataset, [train_size, test_size])
'''

print(f"Training samples: {len(tokenized_dataset_train)}, Test samples: {len(tokenized_dataset_test)}")

Training samples: 190035, Test samples: 47509


Create DataLoader for Batching (optional)

In [57]:
'''
BATCH_SIZE = 16

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Check one batch
batch = next(iter(train_dataloader))
print({key: value.shape for key, value in batch.items()})
''';

Quick check if everything is correct before training

In [58]:
print("----- Training Set -----")
# Check correct format
print(tokenized_dataset_train)
# Check required fields
print(tokenized_dataset_train.column_names)
print("----- Test Set -----")
# Check correct format
print(tokenized_dataset_test)
# Check required fields
print(tokenized_dataset_test.column_names)

Dataset({
    features: ['question', 'context', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 237544
})
['question', 'context', 'label', 'input_ids', 'attention_mask', 'labels']


In [59]:
output_dir = Path("../../results")

In [61]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
)

In [None]:
# Train model
trainer.train()