In [116]:
import re
import torch
import string
import unicodedata
import numpy as np
import pandas as pd
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer

import sys
sys.path.append('../../')

In [None]:
DATA_DIR = Path("../data/dev")
processed_dir = DATA_DIR / "processed"

In [21]:
datasets = ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']

# load a specific split of a subset dataset
pubmedqa = load_dataset("rungalileo/ragbench", "pubmedqa")

In [50]:
train_data = pubmedqa['train']

In [51]:
train_data = train_data.to_pandas()

# Exploration

In [64]:
train_data.head()

Unnamed: 0,id,question,documents,response,documents_sentences,response_sentences,all_relevant_sentence_keys,relevance_score
0,pubmedqa_39085,Is there a functional neural correlate of indi...,[The present study tested whether individuals ...,"Yes, the study mentioned in the first piece of...","[[[0a, The present study tested whether indivi...","[[a, Yes, the study mentioned in the first pie...",[0a],0.142857
1,pubmedqa_42813,Can we use the Omron T9P automated blood press...,"[Recent events in our hospital, combined with ...","Yes, based on the context provided, we can use...","[[[0a, Recent events in our hospital, combined...","[[a, Yes, based on the context provided, we ca...","[0a, 1a, 3a, 3b, 4a]",0.454545
2,pubmedqa_57525,Intraabdominal vascular injury: are we getting...,[Intraabdominal vascular injury (IAVI) as a re...,"Based on the pieces of context provided, the s...","[[[0a, Intraabdominal vascular injury (IAVI) a...","[[a, Based on the pieces of context provided, ...","[0a, 0b, 2a, 3c, 3e, 4a]",0.461538
3,pubmedqa_29863,Hand grip and pinch strength in patients with ...,[The hand grip strength test and pinch was sig...,"Yes, according to the context provided, the ha...","[[[0a, The hand grip strength test and pinch w...","[[a, Yes, according to the context provided, t...","[0a, 0b, 1a, 2a, 2b, 2c, 2d, 2e, 2f]",0.818182
4,pubmedqa_36126,Is Canada ready for patient accessible electro...,[Access to personal health information through...,"Based on the information provided, Canada is n...","[[[0a, Access to personal health information t...","[[a, Based on the information provided, Canada...","[0a, 0b, 0c, 0d, 1a, 1b, 1c]",0.636364


In [65]:
type(train_data)

pandas.core.frame.DataFrame

In [66]:
train_data.columns

Index(['id', 'question', 'documents', 'response', 'documents_sentences',
       'response_sentences', 'all_relevant_sentence_keys', 'relevance_score'],
      dtype='object')

Select relevant attributes

In [60]:
relevant_columns = ['id', 'question', 'documents', 'response', 'documents_sentences',
                   'response_sentences', 'all_relevant_sentence_keys', 'relevance_score']
train_data = train_data[relevant_columns]

In [67]:
observation = train_data.iloc[0]

In [68]:
observation.question

'Is there a functional neural correlate of individual differences in cardiovascular reactivity?'

In [46]:
train_data.all_relevant_sentence_keys.unique

<bound method Series.unique of 0                                        [0a]
1                        [0a, 1a, 3a, 3b, 4a]
2                    [0a, 0b, 2a, 3c, 3e, 4a]
3        [0a, 0b, 1a, 2a, 2b, 2c, 2d, 2e, 2f]
4                [0a, 0b, 0c, 0d, 1a, 1b, 1c]
                         ...                 
19595        [0a, 0b, 1a, 1b, 2a, 2c, 2d, 3a]
19596                    [0a, 0b, 0c, 0d, 1b]
19597                [0a, 1a, 2a, 3a, 3d, 4c]
19598        [0b, 0c, 0d, 0e, 1a, 1b, 1c, 1d]
19599                [0b, 1a, 1b, 1c, 4a, 4f]
Name: all_relevant_sentence_keys, Length: 19600, dtype: object>

# Attribute Selection

**question:** The medical question being asked. (Used as input)
**documents_sentences:** Contains the sentences from the source documents. (Context for classification)
**all_relevant_sentence_keys:**	Identifies which sentences are relevant. (Binary label for classification)

**Goal:**

- Pair each question with its individual document sentences → (question, sentence) pairs.
- Label each sentence as relevant (1) or not relevant (0) → Using all_relevant_sentence_keys.
- Train a classifier using BERT embeddings to classify each sentence.


In [48]:
train_data = train_data[["question", "documents_sentences", "all_relevant_sentence_keys"]]

In [49]:
train_data.head()

Unnamed: 0,question,documents_sentences,all_relevant_sentence_keys
0,Is there a functional neural correlate of indi...,"[[[0a, The present study tested whether indivi...",[0a]
1,Can we use the Omron T9P automated blood press...,"[[[0a, Recent events in our hospital, combined...","[0a, 1a, 3a, 3b, 4a]"
2,Intraabdominal vascular injury: are we getting...,"[[[0a, Intraabdominal vascular injury (IAVI) a...","[0a, 0b, 2a, 3c, 3e, 4a]"
3,Hand grip and pinch strength in patients with ...,"[[[0a, The hand grip strength test and pinch w...","[0a, 0b, 1a, 2a, 2b, 2c, 2d, 2e, 2f]"
4,Is Canada ready for patient accessible electro...,"[[[0a, Access to personal health information t...","[0a, 0b, 0c, 0d, 1a, 1b, 1c]"


# Attribute Transformation

In [72]:
import ast

In [76]:
def safe_eval(val):
    if isinstance(val, str):  # If it's a string, evaluate it
        return ast.literal_eval(val)
    elif isinstance(val, list) or isinstance(val, tuple):  # If already a list, return as is
        return list(val)
    else:
        return []  # Default case, return empty list

In [77]:
train_data["all_relevant_sentence_keys"] = train_data["all_relevant_sentence_keys"].apply(safe_eval)

In [75]:
train_data["documents_sentences"] = train_data["documents_sentences"]\
    .apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [78]:
# List to store transformed data
data_rows = []

# Iterate through dataset
for _, row in train_data.iterrows():
    question = row["question"]
    relevant_keys = set(row["all_relevant_sentence_keys"])  # Convert to set for quick lookup

    # Iterate through each document’s sentences (handling nested lists)
    for doc_sentences in row["documents_sentences"]:
        for sentence in doc_sentences:  # sentence is a list like [key, text]
            if len(sentence) == 2:  # Ensure correct format
                sentence_key, sentence_text = sentence
                label = 1 if sentence_key in relevant_keys else 0  # Assign label
                data_rows.append((question, sentence_text, label))

In [81]:
data = pd.DataFrame(data_rows, columns=["question", "sentence", "label"])

In [82]:
data.head()

Unnamed: 0,question,sentence,label
0,Is there a functional neural correlate of indi...,The present study tested whether individuals w...,0
1,Is there a functional neural correlate of indi...,This study examined whether heightened cardiov...,0
2,Is there a functional neural correlate of indi...,"Heart rate variability (HRV), a measure of aut...",0
3,Is there a functional neural correlate of indi...,Previous studies have also not controlled for ...,0
4,Is there a functional neural correlate of indi...,Low socioeconomic status is associated with in...,0


# Preprocessing

In [84]:
data.dtypes

question    object
sentence    object
label        int64
dtype: object

Convert the text to lowercase

In [85]:
data["question"] = data["question"].astype(str)
data["sentence"] = data["sentence"].astype(str)

Check for missing values

In [87]:
print(data.isnull().sum())

question    0
sentence    0
label       0
dtype: int64


**Note:** We might want to check for duplicates here

Convert strings to lowercase

In [95]:
data["question"] = data["question"].str.lower()
data["sentence"] = data["sentence"].str.lower()

Remove Extra Whitespaces & Newlines

In [98]:
def clean_whitespace(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    return text.strip()

In [99]:
data["question"] = data["question"].apply(clean_whitespace)
data["sentence"] = data["sentence"].apply(clean_whitespace)

Normalize Unicode Characters

In [102]:
def normalize_unicode(text):
    return unicodedata.normalize("NFKC", text)

In [103]:
data["question"] = data["question"].apply(normalize_unicode)
data["sentence"] = data["sentence"].apply(normalize_unicode)

Remove Punctuation

In [105]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [106]:
data["question"] = data["question"].apply(remove_punctuation)
data["sentence"] = data["sentence"].apply(remove_punctuation)

Safe if neccessary

In [107]:
# data.to_csv("pubmedqa.csv", index=False)

# Tokenization

**ModernBERT** expects subword tokenization using the AutoTokenizer from the Hugging Face Transformers library

In [112]:
token = "XXX";

In [114]:
# Load ModernBERT tokenizer (make sure you have the correct model name)
tokenizer = AutoTokenizer.from_pretrained("UCLNLP/ModernBERT", token=token)

OSError: UCLNLP/ModernBERT is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [115]:
# Tokenizing questions and sentences together
encodings = tokenizer(
    list(data["question"]),
    list(data["sentence"]),
    padding=True,        # Ensures all sequences have the same length
    truncation=True,
    max_length=512,      # ModernBERT can handle 512 tokens
    return_tensors="pt"
)

NameError: name 'tokenizer' is not defined

Prepare data for model

In [None]:
# Convert labels to tensors (assuming 0 = not relevant, 1 = essential)
labels = torch.tensor(data["label"].values, dtype=torch.long)

# Create dataset object
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

dataset = CustomDataset(encodings, labels)

Train the model (WIP)