In [1]:
# pip install transformers torch pandas scikit-learn

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Folder holding the project files; the path is relative to the working directory.
path = "drive/MyDrive/HealthML/final_project_files/"
df = pd.read_csv(f"{path}processed_notes.csv")

# Two-stage split: hold out 10% of all rows as the test set, then take 10% of
# what remains as the validation set. The fixed seed keeps both splits reproducible.
split_options = dict(test_size=0.1, random_state=42)
train_val, test = train_test_split(df, **split_options)
train, val = train_test_split(train_val, **split_options)

# Model Preparation

Load the ClinicalBERT checkpoint (`emilyalsentzer/Bio_ClinicalBERT`) with a sequence-classification head from the Hugging Face `transformers` library: https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, logging
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Keep transformers at warning level so routine info messages stay out of the cell output.
logging.set_verbosity_warning()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single source of truth for the checkpoint so the tokenizer and model cannot drift apart.
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Binary classifier: the pretrained encoder plus a 2-way classification head.
# The head (`classifier.weight` / `classifier.bias`) is not part of the checkpoint, so it is
# newly initialised and must be fine-tuned before the model is used for prediction.
#
# `ignore_mismatched_sizes` is intentionally NOT set: it does not silence warnings, it makes
# loading skip (and re-initialise) any checkpoint weight whose shape disagrees with the model.
# Nothing here needs that, and leaving it off means a genuine shape mismatch fails loudly.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
model.to(device)

class NotesDataset(Dataset):
    """Map-style dataset that tokenizes one note per item for sequence classification.

    Args:
        texts: Indexable, sized collection of note texts (each item is passed through ``str``).
        labels: Indexable, sized collection of labels aligned with ``texts``
            (each item is passed through ``int``).
        tokenizer: Callable tokenizer; it is invoked with ``return_tensors='pt'`` and must
            return a mapping containing ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Fail early on misaligned inputs instead of silently pairing the wrong label
        # with a note or raising IndexError in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of notes in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the note at ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` flattened to 1-D tensors,
            and ``labels`` as a 0-dim ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # Pad every sequence to max_len so batches stack cleanly
            truncation=True,  # Cut sequences that exceed max_len
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, batch_size, max_len=512, shuffle=False, num_workers=2):
    """Wrap a notes DataFrame in a ``NotesDataset`` and return a ``DataLoader`` over it.

    Args:
        df: DataFrame with a ``TEXT`` column (note text) and a ``Label`` column.
        tokenizer: Tokenizer forwarded to ``NotesDataset``.
        batch_size: Number of examples per batch.
        max_len: Padded/truncated sequence length forwarded to ``NotesDataset``.
        shuffle: Whether to reshuffle the examples every epoch. Defaults to ``False``
            so existing callers keep a fixed order; enable it for training data.
        num_workers: Number of worker processes used for loading.

    Returns:
        A ``DataLoader`` yielding the dict batches produced by ``NotesDataset``.
    """
    ds = NotesDataset(
        texts=df.TEXT.to_numpy(),
        labels=df.Label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

batch_size = 32
# Shuffle only the training data so each epoch sees a different batch order;
# validation and test keep a fixed order so their results are comparable across runs.
train_data_loader = create_data_loader(train, tokenizer, batch_size, shuffle=True)
val_data_loader = create_data_loader(val, tokenizer, batch_size)
test_data_loader = create_data_loader(test, tokenizer, batch_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from tqdm import tqdm

# Optimizer and scheduler
num_epochs = 10
steps_per_epoch = len(train_data_loader)
# The scheduler is stepped once per batch, so it needs the step count of the whole run.
total_steps = steps_per_epoch * num_epochs

optimizer = AdamW(model.parameters(), lr=2e-5)
# Linear learning-rate schedule over all training steps, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader`` and report accuracy and mean loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        data_loader: Sized iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, average_loss)``: ``accuracy`` is a 0-dim float64 tensor equal to
        correct predictions divided by ``n_examples``; ``average_loss`` is the mean of
        the per-batch losses, or ``nan`` when the loader yielded no batches.
    """
    model = model.train()
    losses = []
    # Start from a tensor rather than the int 0 so `.double()` below also works when
    # the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    total_batches = len(data_loader)

    progress = tqdm(data_loader, total=total_batches, desc="Training", position=0, leave=True)
    for step, batch in enumerate(progress):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients left over from the previous batch.
        model.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        # Predicted class is the index of the largest logit.
        preds = outputs.logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        # Read the scalar once; it is reused for the running list and the progress line.
        batch_loss = loss.item()
        losses.append(batch_loss)

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Report the loss every 10 batches and on the final batch.
        if (step + 1) % 10 == 0 or step == total_batches - 1:
            print(f'Batch {step + 1}/{total_batches}, Loss: {batch_loss:.4f}')

    # np.mean([]) would emit a RuntimeWarning; return nan explicitly for an empty epoch.
    average_loss = np.mean(losses) if losses else float("nan")
    accuracy = correct_predictions.double() / n_examples
    return accuracy, average_loss

In [6]:
# Fine-tune for 10 epochs, printing the training loss and accuracy after each one.
for epoch in tqdm(range(10)):
    print(f'Epoch {epoch + 1}')
    epoch_metrics = train_epoch(
        model=model,
        data_loader=train_data_loader,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(train),  # accuracy denominator: number of rows in the training split
    )
    train_acc, train_loss = epoch_metrics
    print(f'Train loss {train_loss:.4f}, Accuracy {train_acc:.4f}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1


Training:   1%|          | 10/1421 [00:29<1:21:08,  3.45s/it]

Batch 10/1421, Loss: 0.5516


Training:   1%|▏         | 20/1421 [00:57<1:19:00,  3.38s/it]

Batch 20/1421, Loss: 0.7019


Training:   2%|▏         | 30/1421 [01:25<1:17:12,  3.33s/it]

Batch 30/1421, Loss: 0.5910


Training:   3%|▎         | 40/1421 [01:53<1:18:02,  3.39s/it]

Batch 40/1421, Loss: 0.6910


Training:   4%|▎         | 50/1421 [02:21<1:17:39,  3.40s/it]

Batch 50/1421, Loss: 0.5654


Training:   4%|▍         | 60/1421 [02:49<1:16:23,  3.37s/it]

Batch 60/1421, Loss: 0.5270


Training:   5%|▍         | 70/1421 [03:17<1:16:03,  3.38s/it]

Batch 70/1421, Loss: 0.5099


Training:   6%|▌         | 80/1421 [03:46<1:15:44,  3.39s/it]

Batch 80/1421, Loss: 0.5747


Training:   6%|▋         | 90/1421 [04:14<1:14:20,  3.35s/it]

Batch 90/1421, Loss: 0.5009


Training:   6%|▋         | 91/1421 [04:16<1:02:35,  2.82s/it]
  0%|          | 0/10 [04:17<?, ?it/s]


KeyboardInterrupt: 