|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: Clip, freeze, and schedule BERT</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# run this code, then restart the python session (and then comment it out)
# !pip install -U datasets huggingface_hub fsspec

In [None]:
# typical python libraries
import numpy as np
import matplotlib.pyplot as plt
# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# pytorch libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# huggingface libraries
from transformers import BertModel, BertTokenizer
from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from datasets import load_dataset, DatasetDict

In [None]:
# use the first CUDA device when PyTorch can see one, otherwise run on the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Exercise 1: The data and the model

In [None]:
# load the IMDB dataset (from HF)
dataset = load_dataset('imdb')

# keep only rows 5,000 through 19,999 of each split
# (note: this overwrites the `dataset` variable with the reduced version!)
kept_rows = range(5_000,20_000)
subsets = {}
for split in ['train','test']:
  subsets[split] = dataset[split].select(kept_rows)
dataset = DatasetDict(subsets)

In [None]:
# visualize the category label of every training sample in dataset order
# (a DatasetDict cannot be plotted directly, so extract the label column first)
train_labels = np.array(dataset['train']['label'])

plt.figure(figsize=(10,3))
plt.plot(train_labels,'m.',markersize=1,alpha=.2)
plt.gca().set(xlabel='Sample index',ylabel='Label',yticks=[0,1],
              title=f'Training labels ({np.mean(train_labels==1):.0%} are label 1)')
plt.show()

In [None]:
# tokenization function that is applied to the data samples
def tokenize_function(one_sample):
  """Tokenize the 'text' entry of `one_sample` with the notebook-level `tokenizer`.

  The text is truncated to at most 512 tokens and padded up to exactly
  512 tokens. Returns whatever the tokenizer call returns.
  """
  tokenizer_options = {
    'max_length' : 512,           # max sequence length
    'padding'    : 'max_length',  # pad to max length
    'truncation' : True,          # truncate if longer than max_length
  }
  return tokenizer(one_sample['text'], **tokenizer_options)


# apply the tokenization function to the dataset (batched for efficiency)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# remove the raw text column (only the token tensors and labels are used below)
tokenized_dataset = tokenized_dataset.remove_columns(['text'])

# change format to pytorch tensors
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# create DataLoaders for training and testing
# shuffle=True so that every fresh iterator over a loader starts with a
# different random batch (the training loop relies on this)
batch_size = 32 # reduce if the GPU runs out of memory
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=batch_size, shuffle=True)
test_dataloader  = DataLoader(tokenized_dataset['test'],  batch_size=batch_size, shuffle=True)

In [None]:
# pull one batch from the training DataLoader and display it
next(iter(train_dataloader))

### Create an LLM model using pretrained BERT with a new head

In [None]:
class BertForBinaryClassification(nn.Module):
  """Pretrained BERT encoder with a new linear classification head.

  Args:
    num_labels: number of output categories (2 for binary sentiment).

  forward() returns the raw (unnormalized) logits of size
  [batch, num_labels]; pass them to a cross-entropy loss.
  """

  def __init__(self, num_labels=2):
    super(BertForBinaryClassification, self).__init__()

    # load the pre-trained BERT model
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # classification head that converts the pooled output (768-d for
    # bert-base) into the final outputs
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
    self.dropout = nn.Dropout(.1) # 10% (the nn.Dropout default would be 50%)

    # initialize the weights and biases
    nn.init.xavier_uniform_(self.classifier.weight)
    nn.init.zeros_(self.classifier.bias)


  def forward(self, input_ids, attention_mask=None, token_type_ids=None):

    # forward pass through the downloaded (pretrained) BERT
    outputs = self.bert(
      input_ids      = input_ids,
      attention_mask = attention_mask,
      token_type_ids = token_type_ids)

    # extract the pooled output and apply dropout
    pooled_output = self.dropout(outputs.pooler_output)

    # final push through the classification layer.
    logits = self.classifier(pooled_output)
    return logits

In [None]:
# build the classification model...
model = BertForBinaryClassification()

# ...and move it to the selected device
model = model.to(device)

In [None]:
# display the `bert` attribute of the model
model.bert

### Freeze the attention and embedding weights

In [None]:
# parameter counts
trainParamsCount  = 0
frozenParamsCount = 0

# freeze every parameter whose name contains 'attention' or 'embeddings';
# everything else is explicitly set to be trainable
for name,param in model.named_parameters():
  if ('attention' in name) or ('embeddings' in name):
    param.requires_grad = False
    frozenParamsCount += param.numel()
    print(f'--- Layer {name} is frozen (.requires_grad = {param.requires_grad}).')

  else:
    param.requires_grad = True
    trainParamsCount += param.numel()
    print(f'+++ Layer {name} is trainable (.requires_grad = {param.requires_grad}).')

# report the counts and their percentages of the total
totalParamsCount = frozenParamsCount + trainParamsCount
print(f'\n\nThere are {frozenParamsCount:,} ({100*frozenParamsCount/totalParamsCount:.2f}%) frozen weights,')
print(f'      and {trainParamsCount:,} ({100*trainParamsCount/totalParamsCount:.2f}%) trainable weights.')

# Exercise 2: Set up a learning rate scheduler

In [None]:
# training steps
num_samples = 300

# create an optimizer with a "model"
# (a throwaway layer, so this demo cannot touch the real model's weights)
dummy_model = nn.Linear(2,2)
optimizer = torch.optim.AdamW(dummy_model.parameters(),lr=3e-5)

# learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = int(num_samples*.05),  # gentle 5% warmup
    num_training_steps = int(num_samples*1.5) # steps set to 150% so the lr stays >0
)

# quick test to see the learning rates
lrs = np.zeros(num_samples)
for i in range(num_samples):
  optimizer.step() # update the optimizer (must come before scheduler.step())
  scheduler.step() # run the scheduler
  lrs[i] = scheduler.get_last_lr()[0] # get the actual learning rate

# plot!
plt.figure(figsize=(10,3))
plt.plot(lrs,'ko',markersize=5,markerfacecolor=[.7,.7,.9],alpha=.3)

plt.gca().set(xlabel='Training epoch',ylabel='Learning rate')
plt.show()

In [None]:
# now for the real optimizer and loss function
# (parameters with requires_grad=False never receive gradients, so the
#  optimizer leaves them untouched)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-5)
loss_fun = nn.CrossEntropyLoss() # (cross-entropy loss for multi-class classification)

# IMPORTANT: redefine the scheduler
# (a scheduler is bound to one optimizer instance and keeps its own step counter)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = int(num_samples*.05),  # gentle 5% warmup
    num_training_steps = int(num_samples*1.5) # steps set to 150% so the lr stays >0
)

# Exercise 3: Fine-tune with adjustments

In [None]:
# initialize performance metrics
train_losses = np.zeros(num_samples)
train_accuracy = np.zeros(num_samples)
test_losses = np.zeros(num_samples)
test_accuracy = np.zeros(num_samples)
norms = np.zeros((num_samples,2))

# BertModel.from_pretrained returns the encoder in eval mode, so switch the
# entire model to train mode before the first step (enables dropout)
model.train()


## loop over data samples
for sampli in range(num_samples):

  # get a batch of data
  batch = next(iter(train_dataloader))

  # and move it to the GPU
  tokenz  = batch['input_ids'].to(device)
  att_msk = batch['attention_mask'].to(device)
  labels  = batch['label'].to(device)

  # clear the previous gradients
  optimizer.zero_grad()

  # forward pass and get model predictions
  logits = model(tokenz,attention_mask=att_msk)
  predLabels = torch.argmax(logits,dim=1)

  # calculate and store loss + average accuracy
  loss = loss_fun(logits,labels)
  train_losses[sampli] = loss.item()
  train_accuracy[sampli] = (predLabels==labels).float().mean().item()

  # backward pass
  loss.backward()

  # get two gradient norms (before clipping):
  #  [0] an MLP weight matrix inside the last transformer block
  #      (NOTE: which MLP layer to monitor is a free choice)
  #  [1] the new classification head
  norms[sampli,0] = torch.norm(model.bert.encoder.layer[-1].intermediate.dense.weight.grad).item()
  norms[sampli,1] = torch.norm(model.classifier.weight.grad).item()

  # gradient clipping to prevent exploding gradients
  nn.utils.clip_grad_norm_(model.parameters(),max_norm=1.0)

  # update the weights and the learning rate
  optimizer.step()
  scheduler.step()

  # test the model and report losses every k samples
  if sampli%10 == 0:

    # evaluation using the test set
    model.eval()
    with torch.no_grad():

      # get a batch of data and move it to the GPU
      batch   = next(iter(test_dataloader))
      tokenz  = batch['input_ids'].to(device)
      att_msk = batch['attention_mask'].to(device)
      labels  = batch['label'].to(device)

      # forward pass and get model predictions
      logits = model(tokenz,attention_mask=att_msk)
      predLabels = torch.argmax(logits,dim=1)

      # calculate and store loss + accuracy
      test_losses[sampli] = loss_fun(logits,labels).item()
      test_accuracy[sampli] = (predLabels==labels).float().mean().item()

      # report the results
      print(f'Sample {sampli:4}/{num_samples}, losses (train/test): {train_losses[sampli]:.2f}/{test_losses[sampli]:.2f}, accuracy: {train_accuracy[sampli]:.2f}/{test_accuracy[sampli]:.2f}')

      # put the model back into train mode
      model.train()

In [None]:
_,axs = plt.subplots(1,2,figsize=(12,3.5))

# test metrics were only computed on every 10th training sample
test_idx = range(0,num_samples,10)

# plot the losses
axs[0].plot(train_losses,'ko',markerfacecolor=[.9,.7,.7],alpha=.5,label='Train')
axs[0].plot(test_idx,test_losses[::10],'ks-',
            markerfacecolor=[.7,.9,.7],markersize=8,alpha=.8,linewidth=3,label='Test')
axs[0].set(xlabel='Training sample',ylabel='Loss',title='Cross-entropy loss',xlim=[0,num_samples])
axs[0].legend()

# plot the prediction accuracy
axs[1].plot(train_accuracy,'ko',markerfacecolor=[.9,.7,.7],alpha=.5,label='Train')
axs[1].plot(test_idx,test_accuracy[::10],'ks-',
            markerfacecolor=[.7,.9,.7],markersize=8,alpha=.8,linewidth=3,label='Test')
axs[1].set(xlabel='Training sample',ylabel='Accuracy (proportion correct)',title='Prediction accuracy',xlim=[0,num_samples])
axs[1].legend()

plt.tight_layout()
plt.show()

In [None]:
_,axs = plt.subplots(1,2,figsize=(12,3.6))

# plot the gradient norms that were stored before clipping
# (column 0 is shown as the MLP layer, column 1 as the classifier head)
axs[0].plot(norms[:,0],'ko',markerfacecolor=[.9,.7,.7],alpha=.7)
axs[0].set(xlabel='Training sample',ylabel='Norm',title='MLP: Pre-clip gradient norm',xlim=[0,num_samples])

axs[1].plot(norms[:,1],'ks',markerfacecolor=[.7,.9,.7],alpha=.7)
axs[1].set(xlabel='Training sample',ylabel='Norm',title='Classifier: Pre-clip gradient norm',xlim=[0,num_samples])

# same layout handling as the loss/accuracy figure
plt.tight_layout()
plt.show()