# Full Training of a BERT Model

## Loading Dataset

In [1]:
# load the dataset and tokenizer
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint; the tokenizer below is loaded from it.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# MNLI configuration of the GLUE dataset.
raw_datasets = load_dataset(path="glue", name="mnli")

## Tokenize

In [2]:
# Tokenize every premise/hypothesis pair in the dataset.
def tokenize_function(example):
    """Tokenize premise/hypothesis pairs with the notebook's tokenizer.

    Sequences are truncated (``truncation=True``); no padding is requested
    here.
    """
    premises = example["premise"]
    hypotheses = example["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True)


# batched=True: `map` passes batches of examples to the function.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [3]:
# Post-processing to prepare for the DataLoader.
# The steps below are guarded so the cell can be re-run safely: without the
# guards a second run fails because the columns were already removed/renamed.
# NOTE(review): the "train" split is used as the reference for which columns
# exist; this assumes every split shares the same columns -- confirm.
train_columns = tokenized_datasets["train"].column_names

# Drop the raw-text and index columns that are still present.
columns_to_drop = [
    name for name in ("premise", "hypothesis", "idx") if name in train_columns
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

# Rename "label" -> "labels" (the name used when batches are fed to the model).
if "label" in train_columns:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

tokenized_datasets.set_format("torch")  # return PyTorch tensors
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [5]:
# No padding done yet: show the attention mask of the first example and the
# labels of the first 8 examples.
# Rows are sliced first so only 8 examples are materialised instead of the
# whole column, and the mask is printed explicitly because a notebook cell
# only displays its last expression.
first_rows = tokenized_datasets["train"][:8]
print(first_rows["attention_mask"][:1])
first_rows["labels"]

tensor([1, 0, 0, 0, 1, 0, 1, 0])

## Data Loader

In [6]:
from torch.utils.data import DataLoader
from datasets import concatenate_datasets

# Number of examples per batch, shared by the training and evaluation loaders.
BATCH_SIZE = 8

# Collator used for dynamic padding when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training batches: shuffled at each epoch, padded dynamically by the collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

# Combine both validation sets (matched + mismatched) into one dataset.
validation_splits = [
    tokenized_datasets[split_name]
    for split_name in ("validation_matched", "validation_mismatched")
]
combined_validation = concatenate_datasets(validation_splits)

# Evaluation batches: same batch size and collator, no shuffling.
eval_dataloader = DataLoader(
    dataset=combined_validation,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

In [7]:
# Distinct label values present in the combined validation set.
validation_label_ids = combined_validation["labels"].unique()
print(validation_label_ids)

tensor([0, 1, 2])


In [8]:
# Pull a single batch from the training loader and inspect its tensor shapes.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 62]),
 'token_type_ids': torch.Size([8, 62]),
 'attention_mask': torch.Size([8, 62])}

## Load BERT model

In [9]:
# Load the base model
from transformers import AutoModelForSequenceClassification
# The encoder weights are loaded from the checkpoint; only the classification
# head (classifier.weight / classifier.bias) is newly initialised.
NUM_LABELS = 3
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=NUM_LABELS
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Smoke test: collate the first 8 training examples into a single batch.
train_split = tokenized_datasets["train"]
sample = train_split[:8]
batch = data_collator(sample)

In [11]:
# Forward pass on the sample batch; report the loss and the logits' shape.
outputs = model(**batch)
sample_loss = outputs.loss
sample_logits_shape = outputs.logits.shape
print(sample_loss, sample_logits_shape)

tensor(1.2584, grad_fn=<NllLossBackward0>) torch.Size([8, 3])


## Optimiser

In [12]:
# Define optimiser
from torch.optim import AdamW

LEARNING_RATE = 5e-5  # learning rate handed to AdamW
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

## Scheduler

In [13]:
# Define scheduler to change learning rate
from transformers import get_scheduler

num_epochs = 2

# Total scheduler steps = number of epochs * number of batches per epoch.
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

# Linear decay with no warm-up: the learning rate starts at its maximum value
# and decreases linearly over `num_training_steps`.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

98176


## Move model to GPU

In [14]:
# Move model to GPU if avail or CPU
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
device

device(type='cuda')

## Set Model to Train

In [None]:
# Importing the tqdm library for progress bars
from tqdm.auto import tqdm

# Put the model in training mode so layers like dropout are active.
model.train(mode=True)

## Training Model

In [None]:
# Fine-tune the model: `num_epochs` full passes over the training DataLoader.
for epoch in range(num_epochs):
    # Re-enter training mode at the start of every epoch so dropout is active
    # even if model.eval() was called since the last training run.
    model.train()

    # One progress bar per epoch, wrapping the DataLoader directly; iterating
    # over it advances the bar, so no manual update() call is needed.
    progress_bar = tqdm(
        train_dataloader,
        desc=f"Epoch {epoch+1}/{num_epochs}",
        total=len(train_dataloader),
    )

    # enumerate() supplies the step counter. The DataLoader yields one batch
    # (a mapping of tensors) per iteration, not a (step, batch) pair.
    for step, batch in enumerate(progress_bar, start=1):
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass; the batch contains "labels", so the outputs carry a loss.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, parameter update, then learning-rate update.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()

        # Report the running loss on the progress bar instead of printing one
        # line per step; refresh=False leaves redraw timing to tqdm.
        progress_bar.set_postfix(loss=f"{loss.item():.4f}", refresh=False)

## Validation / Evaluation

In [18]:
# Importing the evaluate library for metrics
import evaluate

# Load the 'mnli' metric of the GLUE benchmark.
metric = evaluate.load("glue", "mnli")

# Evaluation mode: deactivates dropout and other training-only behaviour.
model.eval()

# Gradients are not needed for evaluation, so the whole loop runs under
# torch.no_grad().
with torch.no_grad():
    for batch in eval_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then take the raw logits from the outputs.
        outputs = model(**batch)
        logits = outputs.logits

        # Predicted class = index of the largest logit for each example.
        predictions = logits.argmax(dim=-1)

        # Accumulate this batch's predictions and true labels in the metric.
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute the final metric over all accumulated batches.
metric.compute()

{'accuracy': 0.8310683564920853}

## Save model weights

In [None]:
# Save the trained model and its tokenizer into the same directory.
import os

# The default keeps the original save location; set the BERT_MNLI_OUTPUT_DIR
# environment variable to save elsewhere without editing the notebook.
DEFAULT_OUTPUT_DIR = "C:/Users/steve/HuggingFace Models/BERT_MNLI_model"
output_dir = os.environ.get("BERT_MNLI_OUTPUT_DIR", DEFAULT_OUTPUT_DIR)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)