# BioBERT-based model to classify biomedical QA pairs into yes/no/maybe.

Two-stage pipeline:
  1. Fine-tune BioBERT on labeled + (weighted) artificial data.
  2. Evaluate model.

## Importing Dataset

In [None]:
%pip install datasets --upgrade

In [2]:
from datasets import load_dataset

# Fetch the three PubMedQA configurations (labeled, unlabeled, artificial)
# from the "qiaojin/PubMedQA" repository, one load_dataset call per configuration.
dataset_labeled, dataset_unlabeled, dataset_artificial = (
    load_dataset("qiaojin/PubMedQA", config_name)
    for config_name in ('pqa_labeled', 'pqa_unlabeled', 'pqa_artificial')
)

In [3]:
# Turn the 'train' split of each subset into a pandas DataFrame so the
# preprocessing below can use ordinary DataFrame operations.
import pandas as pd

df_labeled_original, df_unlabeled_original, df_artificial_original = (
    pd.DataFrame(subset['train'])
    for subset in (dataset_labeled, dataset_unlabeled, dataset_artificial)
)

In [4]:
# Work on copies so the *_original frames stay untouched by later cells.
df_labeled, df_unlabeled, df_artificial = (
    original.copy()
    for original in (df_labeled_original, df_unlabeled_original, df_artificial_original)
)

In [None]:
%pip install transformers torch

## Preprocessing

In [6]:
# Build the model input text for one row: question followed by its contexts
def merge_fields(row):
    """Return the row's question and all of its context passages as one string.

    ``row`` must support ``row['question']`` and ``row['context']['contexts']``
    (an iterable of strings). The passages are joined with single spaces and
    appended to the question after one separating space.
    """
    return "{} {}".format(row['question'], " ".join(row['context']['contexts']))

# Add the merged text to every frame as a new 'context_str' column (row-wise apply).
for df in (df_labeled, df_artificial, df_unlabeled):
    df['context_str'] = df.apply(merge_fields, axis='columns')

In [7]:
# Encode the textual decision as an integer class id.
label_map = {'yes': 0, 'no': 1, 'maybe': 2}
df_labeled['label'] = df_labeled['final_decision'].map(label_map)
df_artificial['label'] = df_artificial['final_decision'].map(label_map)

# Series.map leaves NaN for any value that is not a key of label_map, which
# would silently turn the label column into floats with missing entries.
# Fail here, next to the mapping, instead of deep inside training.
assert df_labeled['label'].notna().all(), \
    "df_labeled contains final_decision values that are missing from label_map"
assert df_artificial['label'].notna().all(), \
    "df_artificial contains final_decision values that are missing from label_map"

In [8]:
# Keep only the model input text and its integer label.
df_labeled_final, df_artificial_final = (
    frame[['context_str', 'label']]
    for frame in (df_labeled, df_artificial)
)

In [9]:
# Per-class row counts: labeled set first, artificial set second.
print(
    df_labeled_final['label'].value_counts(),
    df_artificial_final['label'].value_counts(),
    sep='\n',
)

label
0    552
1    338
2    110
Name: count, dtype: int64
label
0    196144
1     15125
Name: count, dtype: int64


In [10]:
# Balance the artificial set by downsampling every class to the size of the
# smallest one, then shuffle all rows. Both steps use a fixed seed.
df_artificial_final = (
    df_artificial_final
    .groupby('label')
    .sample(
        n=int(df_artificial_final['label'].value_counts().min()),
        random_state=42,
    )
    .sample(frac=1, random_state=42)
)

In [11]:
from sklearn.model_selection import train_test_split

def _stratified_holdout(frame):
    """Split ``frame`` 80/20 with a fixed seed, stratifying on its 'label' column."""
    return train_test_split(
        frame,
        test_size=0.2,            # hold out 20% of the rows for testing
        random_state=42,          # fixed seed so the split is reproducible
        stratify=frame['label'],  # keep the class proportions in both parts
    )


# The same split settings are applied to the labeled and the artificial data.
labeled_train, labeled_test = _stratified_holdout(df_labeled_final)
artificial_train, artificial_test = _stratified_holdout(df_artificial_final)


In [12]:
# Training set: labeled training rows followed by artificial training rows,
# with a fresh 0..n-1 index.
df_train = pd.concat(
    (labeled_train, artificial_train),
    ignore_index=True,
    sort=False,
)
df_train.head()

Unnamed: 0,context_str,label
0,Increased neutrophil migratory activity after ...,0
1,Are UK radiologists satisfied with the trainin...,1
2,Do patients with rheumatoid arthritis establis...,0
3,A short stay or 23-hour ward in a general and ...,0
4,Do family physicians know the costs of medical...,1


In [13]:
# Test set: labeled test rows followed by artificial test rows,
# with a fresh 0..n-1 index.
df_test = pd.concat(
    (labeled_test, artificial_test),
    ignore_index=True,
    sort=False,
)
df_test.head()

Unnamed: 0,context_str,label
0,Are home sampling kits for sexually transmitte...,2
1,Scrotal approach to both palpable and impalpab...,0
2,Are polymorphisms in oestrogen receptors genes...,0
3,Do elderly patients benefit from surgery in ad...,1
4,Does route of delivery affect maternal and per...,1


## Finetune BioBERT for QA Classification

Tokenize the training texts with the BioBERT tokenizer.

In [14]:
from transformers import AutoTokenizer
# Load the tokenizer published under the 'dmis-lab/biobert-v1.1' checkpoint name;
# the same checkpoint name is used when the classification model is loaded.
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
def encode_data(tokenizer, questions, max_length):
    """Tokenize a batch of texts and return their ids and attention masks.

    Args:
        tokenizer: A Hugging Face tokenizer; it is invoked through ``__call__``.
        questions: Sequence of input strings (list, tuple, ...). It is
            materialised as a list before being handed to the tokenizer.
        max_length: Upper bound on tokens per text; longer texts are truncated.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer
        output. Texts are padded to the longest one in the batch and, because
        ``return_tensors='pt'`` is requested, come back as PyTorch tensors.
    """
    # Call the tokenizer directly: batch_encode_plus is the deprecated spelling
    # of this operation and the library documents __call__ as its replacement.
    encoded = tokenizer(
        list(questions),
        truncation=True,
        padding='longest',
        max_length=max_length,
        return_tensors='pt',  # return PyTorch tensors
    )
    return encoded["input_ids"], encoded["attention_mask"]
# Encode every training text, truncating at 512 tokens
# (presumably the model's maximum input length — TODO confirm).
input_ids, attention_mask = encode_data(
    tokenizer,
    df_train['context_str'].tolist(),
    max_length=512,
)

Fine-tuning BioBERT as a 3-class (yes/no/maybe) sequence classifier.


In [15]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Start from the pre-trained BioBERT checkpoint and attach a 3-class
# sequence-classification head (one class per entry of label_map).
model = AutoModelForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-v1.1',
    num_labels=3,
)

# Settings handed to the Trainer.
training_args = TrainingArguments(
    # output locations
    output_dir='./results',
    logging_dir='./logs',
    # schedule
    num_train_epochs=3,
    warmup_steps=500,
    # batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # regularisation
    weight_decay=0.01,
)

from datasets import Dataset

# Convert the token tensors and the label column to plain Python lists and
# wrap them in a Dataset with the columns 'input_ids', 'attention_mask', 'labels'.
data = dict(
    input_ids=input_ids.tolist(),
    attention_mask=attention_mask.tolist(),
    labels=df_train['label'].tolist(),
)

train_ds = Dataset.from_dict(data)

# Wire the model, the training arguments, and the training set into a Trainer,
# then run fine-tuning. No evaluation dataset is passed here.
trainer = Trainer(model=model, args=training_args, train_dataset=train_ds)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


KeyboardInterrupt: 