In [None]:
# The rest of the code requires this package update

!pip install datasets==4.0.0

import datasets
print(datasets.__version__)

4.0.0


In [None]:
# Mount Google Drive at /content/drive; the input CSV and all output folders used
# below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the cleaned MIMIC notes table from Google Drive

import pandas as pd

notes_csv_path = "/content/drive/MyDrive/Colab Notebooks/Postdoc/cleaned_stripped_mimic_notes.csv"
df = pd.read_csv(notes_csv_path)

# Quick sanity check: (rows, columns) that were read
print(df.shape)

(51695, 4)


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for Bio_ClinicalBERT (the same checkpoint name is used for the model later on)
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

from datasets import Dataset

# Convert pandas dataframe to HuggingFace dataset so it can be tokenized with .map()
dataset = Dataset.from_pandas(df)

# Tokenize every note; with batched=True, .map() hands the function a batch of rows per call
def tokenize_batch(example):
    """Tokenize the note text of a batch, padding/truncating each note to 512 tokens."""
    note_texts = example["clean_relevant_note_truncate"]
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return tokenizer(note_texts, **tokenizer_options)

tokenized_dataset = dataset.map(tokenize_batch, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/51695 [00:00<?, ? examples/s]

In [None]:
# Return PyTorch tensors, and only the columns the model/loss need
model_columns = ['input_ids', 'attention_mask', 'label']
tokenized_dataset.set_format(type='torch', columns=model_columns)

# Same seed for both splits so the partition is reproducible
split_seed = 42

# First cut: 70% train, 30% held out
temp_split = tokenized_dataset.train_test_split(test_size=0.3, seed=split_seed)
train_dataset, temp_dataset = temp_split["train"], temp_split["test"]

# Second cut: the held-out 30% is halved into validation and test
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=split_seed)
val_dataset, test_dataset = val_test_split["train"], val_test_split["test"]

In [None]:
from transformers import TrainingArguments

# Training arguments
# - evaluation and checkpointing are both set to once per epoch
# - checkpoints and logs are written to the "v3" folders on Google Drive
# - the best checkpoint (by the "f1" value that compute_metrics returns) is reloaded when training ends
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/Postdoc/results/v3",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/Colab Notebooks/Postdoc/logs/v3",
    logging_steps=50,  # log the training loss every 50 steps
    load_best_model_at_end=True,
    metric_for_best_model="f1"  # must match a key of the dict returned by compute_metrics
)

In [None]:
import numpy as np
!pip install evaluate
import evaluate

# Metric objects used by compute_metrics below
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Convert a (logits, labels) pair into a dict of evaluation scores.

    The predicted class is the arg-max over axis 1 of the logits. The returned
    dict has the keys "accuracy", "f1", "recall" and "precision".
    """
    raw_scores, references = eval_pred
    predicted = np.argmax(raw_scores, axis=1)

    # (result key, metric object) pairs, in the order they appear in the output dict
    scorers = (
        ("accuracy", accuracy),
        ("f1", f1),
        ("recall", recall),
        ("precision", precision),
    )
    return {
        key: scorer.compute(predictions=predicted, references=references)[key]
        for key, scorer in scorers
    }



In [None]:
from transformers import AutoModelForSequenceClassification

# Import pre-trained model with a 2-class sequence-classification head.
# The classifier weights are not part of the checkpoint and are newly initialized
# (see the warning printed below), so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import torch

# Function to evaluate the model on the test set
def evaluate_model_on_test_set(trainer, test_dataset):
    """Predict on `test_dataset` with `trainer`, print a report and return the metrics.

    Args:
        trainer: object whose `predict(dataset)` result exposes `.predictions`
            (logits with one column per class) and `.label_ids`.
        test_dataset: HuggingFace dataset providing the 'input_ids',
            'attention_mask' and 'label' columns. It is not modified.

    Returns:
        dict with the keys "accuracy", "precision", "recall", "f1", "auroc"
        and "confusion_matrix".
    """
    # with_format returns a re-formatted dataset object, so the caller's dataset
    # keeps whatever format it had (set_format would change it in place).
    eval_dataset = test_dataset.with_format(
        type='torch', columns=['input_ids', 'attention_mask', 'label']
    )

    # Run model predictions
    predictions = trainer.predict(eval_dataset)
    logits = predictions.predictions
    labels = predictions.label_ids
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1)
    preds = torch.argmax(probs, dim=1).numpy()
    # Probability of class 1, as a plain numpy array for scikit-learn
    positive_probs = probs[:, 1].numpy()

    # Compute metrics
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds)
    rec = recall_score(labels, preds)
    f1_value = f1_score(labels, preds)
    auc = roc_auc_score(labels, positive_probs)
    cm = confusion_matrix(labels, preds)

    # Display results
    print("🔍 Test Set Evaluation:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1_value:.4f}")
    print(f"AUROC:     {auc:.4f}")
    print("\nConfusion Matrix:")
    print(cm)

    print("\nDetailed Classification Report:")
    print(classification_report(labels, preds, digits=4))

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1_value,
        "auroc": auc,
        "confusion_matrix": cm
    }

In [None]:
import torch
from transformers import Trainer

# Custom class to give label = 0 or 1 different weights in the cross entropy loss function, to account for class imbalance
class WeightedLossTrainer(Trainer):
    """Trainer that replaces the default loss with a class-weighted cross entropy.

    The per-class weights live in `class_weights` so that a subclass (or an
    assignment on the class) can change them without editing `compute_loss`.
    """

    # Loss weight per class, indexed by label id: (weight for label 0, weight for label 1)
    class_weights = (0.72, 1.60)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained/evaluated.
            inputs: mapping of batch tensors; must contain "labels".
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            **kwargs: extra arguments passed by the Trainer; ignored here.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.get("labels")

        # Forward pass without the labels: the loss is computed below, so there is
        # no need for the model to also compute its own (unweighted) loss.
        # `inputs` itself is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Build the weight tensor on the same device as the logits
        weights = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Build the trainer with the weighted loss.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in Trainer and triggers a warning on this call.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = WeightedLossTrainer(


In [None]:
# Baseline: run the test-set evaluation before any fine-tuning.
# The classification head is still randomly initialized at this point, so F1 and
# recall for class 1 are expected to be close to zero and AUROC close to 0.5.

results = evaluate_model_on_test_set(trainer, test_dataset)

[34m[1mwandb[0m: Currently logged in as: [33mlili-zeng[0m ([33mlili-zeng-mcgill-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


🔍 Test Set Evaluation:
Accuracy:  0.6823
Precision: 0.2466
Recall:    0.0151
F1 Score:  0.0284
AUROC:     0.4962

Confusion Matrix:
[[5255  110]
 [2354   36]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.6906    0.9795    0.8101      5365
           1     0.2466    0.0151    0.0284      2390

    accuracy                         0.6823      7755
   macro avg     0.4686    0.4973    0.4192      7755
weighted avg     0.5538    0.6823    0.5692      7755



In [None]:
# Train the model for the 3 epochs configured in training_args (num_train_epochs=3).
# Validation metrics are reported after every epoch (eval_strategy="epoch").

trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1,Recall,Precision
1,0.4861,0.508443,0.0087,0.771602,0.656849,0.70802,0.612577
2,0.4037,0.532182,0.0087,0.789399,0.669634,0.691312,0.649274
3,0.3402,0.608529,0.0087,0.792236,0.664025,0.664996,0.663057


TrainOutput(global_step=13572, training_loss=0.4607840450049641, metrics={'train_runtime': 10580.5098, 'train_samples_per_second': 10.26, 'train_steps_per_second': 1.283, 'total_flos': 2.856280994777088e+16, 'train_loss': 0.4607840450049641, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one folder
finetuned_dir = "/content/drive/MyDrive/Colab Notebooks/Postdoc/Bio_ClinicalBERT-fine_tuned/v3"
trainer.save_model(finetuned_dir)          # model weights + config
tokenizer.save_pretrained(finetuned_dir)   # tokenizer files

('/content/drive/MyDrive/Colab Notebooks/Postdoc/Bio_ClinicalBERT-fine_tuned/v3/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Postdoc/Bio_ClinicalBERT-fine_tuned/v3/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Postdoc/Bio_ClinicalBERT-fine_tuned/v3/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/Postdoc/Bio_ClinicalBERT-fine_tuned/v3/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/Postdoc/Bio_ClinicalBERT-fine_tuned/v3/tokenizer.json')

In [None]:
# Evaluate model again after training, on the same test set as the baseline run.
# Accuracy and F1 scores dramatically improve.
# Note: this rebinds `results`, so the pre-training metrics are no longer held in that variable.

results = evaluate_model_on_test_set(trainer, test_dataset)

🔍 Test Set Evaluation:
Accuracy:  0.7929
Precision: 0.6546
Recall:    0.6946
F1 Score:  0.6740
AUROC:     0.8461

Confusion Matrix:
[[4489  876]
 [ 730 1660]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8601    0.8367    0.8483      5365
           1     0.6546    0.6946    0.6740      2390

    accuracy                         0.7929      7755
   macro avg     0.7574    0.7656    0.7611      7755
weighted avg     0.7968    0.7929    0.7945      7755



In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Optional code to find threshold that gives best F1 score
# NOTE(review): the threshold is selected on the test set, so the reported "best F1"
# is optimistic. Consider selecting it on val_dataset and only reporting on test_dataset.

# Get raw logits from trainer.predict
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()
labels = predictions.label_ids

# Try a range of thresholds
best_f1 = 0
best_thresh = 0.5

for thresh in np.arange(0.1, 0.9, 0.01):
    preds_thresh = (probs >= thresh).astype(int)
    # Keep the score in its own name: binding it to `f1` would overwrite the
    # module-level `f1` metric object that compute_metrics() calls, breaking any
    # later evaluation/training run in this session. Indexing the result also
    # avoids rebinding `_`.
    # zero_division=0 keeps the score at 0 (without a warning) when a threshold
    # yields no positive predictions.
    f1_at_thresh = precision_recall_fscore_support(
        labels, preds_thresh, average='binary', zero_division=0
    )[2]
    if f1_at_thresh > best_f1:
        best_f1 = f1_at_thresh
        best_thresh = thresh

print(f"Best F1: {best_f1:.4f} at threshold: {best_thresh:.2f}")

Best F1: 0.6749 at threshold: 0.56
