# Baselines

In [3]:
from src.data import load_omnimed_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from datasets import load_dataset
import pandas as pd
import tempfile
from transformers import DataCollatorForMultipleChoice
import evaluate
import numpy as np
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
train_df, val_df, test_df = load_omnimed_dataset()

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

# Check for image overlap
print("Overlap train-test:", len(set(train_df['image_path']) & set(test_df['image_path'])))
print("Overlap train-val:", len(set(train_df['image_path']) & set(val_df['image_path'])))


Train size: 42380
Validation size: 7472
Test size: 5535
Overlap train-test: 0
Overlap train-val: 0


In [5]:
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
print("Using device:", device)

Using device: mps


In [6]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
model = AutoModelForMultipleChoice.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
model = model.to(device)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def to_hf_dataset(df:pd.DataFrame):
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
        df.to_csv(tmp.name, index=False)
        dataset = load_dataset('csv', data_files={'data': tmp.name}, split='data')
    return dataset

def preprocess_function(examples):
    option_cols = ["option_A", "option_B", "option_C", "option_D"]
    first_sentences = []
    second_sentences = []
    labels = []

    for i in range(len(examples["question"])):
        question = str(examples["question"][i])
        options = [str(examples[col][i]) for col in option_cols]
        first_sentences.extend([question] * 4)
        second_sentences.extend(options)
        label = option_cols.index(str(examples["gt_label"][i]))
        labels.append(label)

    tokenized = tokenizer(first_sentences, second_sentences, truncation=True)
    result = {k: [v[i:i+4] for i in range(0, len(v), 4)] for k, v in tokenized.items()}
    #use tokenzier.decode to ensure the results are correct and no data leakage
    
    
    result["labels"] = labels
    return result

In [8]:
hf_test = to_hf_dataset(test_df)
hf_train = to_hf_dataset(train_df)
hf_val = to_hf_dataset(val_df)

Generating data split: 5535 examples [00:00, 405432.54 examples/s]
Generating data split: 42380 examples [00:00, 566994.90 examples/s]
Generating data split: 7472 examples [00:00, 339587.37 examples/s]


In [9]:
remove_columns = ["dataset","question_id","modality_type","question_type","image_path","option_A","option_B","option_C","option_D","gt_label"]
tokenized_train = hf_train.map(preprocess_function, batched=True,remove_columns=remove_columns)
tokenized_val = hf_val.map(preprocess_function, batched=True,remove_columns=remove_columns)
tokenized_test = hf_test.map(preprocess_function, batched=True,remove_columns=remove_columns)

Map:   0%|          | 0/42380 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 42380/42380 [00:02<00:00, 20746.25 examples/s]
Map: 100%|██████████| 7472/7472 [00:00<00:00, 20617.18 examples/s]
Map: 100%|██████████| 5535/5535 [00:00<00:00, 18671.07 examples/s]


In [10]:
collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [None]:
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments(
    output_dir="bert_text_baseline",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

Inference

In [20]:
tokenizer = AutoTokenizer.from_pretrained("models/bert_text_baseline/checkpoint-2649")
model = AutoModelForMultipleChoice.from_pretrained("models/bert_text_baseline/checkpoint-2649")
model = model.to(device)

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def infere_and_evaluate(test_df,model,tokenizer,device):
    #Not bached but one at a time
    preds = []
    gt = []
    for idx, row in test_df.iterrows():
        question = str(row["question"])
        options = [str(row[col]) for col in ["option_A", "option_B", "option_C", "option_D"]]
        gt_label = row["gt_label"]
        
        inputs = tokenizer([[question,options[0]], [question,options[1]],[question,options[2]],[question,options[3]]], return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})
            logits = outputs.logits
            pred = logits.argmax(dim=1).cpu().item()

        preds.append(pred)
        #gt is the index of the correct option
        gt.append(["option_A", "option_B", "option_C", "option_D"].index(gt_label))

    accuracy = accuracy_score(gt, preds)
    precision = precision_score(gt, preds, average='macro', zero_division=0)
    recall = recall_score(gt, preds, average='macro', zero_division=0)
    f1 = f1_score(gt, preds, average='macro', zero_division=0)
    results = {
        "accuracy": accuracy,
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f1
    }

    return results, preds, gt

In [22]:
results, preds,gt = infere_and_evaluate(test_df, model, tokenizer, device)
print(f"Test accuracy: {results["accuracy"]:.4f}")
print(f"Test precision (macro): {results["precision_macro"]:.4f}")
print(f"Test recall (macro): {results["recall_macro"]:.4f}")
print(f"Test F1 (macro): {results["f1_macro"]:.4f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Test accuracy: 0.9906
Test precision (macro): 0.9909
Test recall (macro): 0.9913
Test F1 (macro): 0.9911


Accuracy: 0.9906
Precision: 0.9909
Recall: 0.9913
F1-Score: 0.9911

In [23]:
unique_questions_test_list = test_df['question'].unique()
unique_questions_train_list = train_df['question'].unique()
questions_both = set(unique_questions_test_list) & set(unique_questions_train_list)
print("Number of unique questions appearing in both train and test sets:", len(questions_both))
print("Total number of unique questions in test set:", len(unique_questions_test_list))

Number of unique questions appearing in both train and test sets: 1868
Total number of unique questions in test set: 2316


In [27]:
test_df["preds"] = preds
test_df["preds"] = test_df["preds"].map({0:"option_A",1:"option_B",2:"option_C",3:"option_D"})


In [25]:
test_both = test_df[test_df['question'].isin(questions_both)]
accuracy_both = accuracy_score(test_both['gt_label'].map({"option_A":0,"option_B":1,"option_C":2,"option_D":3}), test_both['preds'].map({"option_A":0,"option_B":1,"option_C":2,"option_D":3}))
print("Accuracy on questions appearing in both train and test sets:", accuracy_both)

Accuracy on questions appearing in both train and test sets: 0.9916881060755987


In [26]:
test_unique = test_df[~test_df['question'].isin(questions_both)]
accuracy_unique = accuracy_score(test_unique['gt_label'].map({"option_A":0,"option_B":1,"option_C":2,"option_D":3}), test_unique['preds'].map({"option_A":0,"option_B":1,"option_C":2,"option_D":3}))
print("Accuracy on questions appearing only in the test set:", accuracy_unique)

Accuracy on questions appearing only in the test set: 0.979253112033195
