|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>Partial fine-tuning by freezing attention weights</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoModelForCausalLM,GPT2Tokenizer
import requests

In [None]:
# fetch the pretrained GPT-2 tokenizer and causal-LM weights by their shared model name
model_name = 'gpt2'
tokenizer  = GPT2Tokenizer.from_pretrained(model_name)
gpt2       = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# data-sampling hyperparameters
seq_len = 256    # number of consecutive tokens in each training sequence
batch_size = 16  # number of sequences drawn per training step

# run on the first CUDA device when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

In [None]:
# download Gulliver's Travels as plain text from Project Gutenberg
# (the timeout keeps the cell from hanging forever if the server does not answer)
response = requests.get('https://www.gutenberg.org/cache/epub/829/pg829.txt',timeout=30)

# fail loudly on an HTTP error instead of silently tokenizing an error page
response.raise_for_status()
text = response.text

# tokenize the whole book and keep the first row of the returned tensor as a flat token sequence
gtTokens = tokenizer.encode(text,return_tensors='pt')[0]

# Freeze all attention head weights

In [None]:
# Walk over every named parameter and freeze the ones whose name contains '.h.'.
# NOTE(review): '.h.' matches everything stored under transformer.h (the mlp weights that the
# sanity-check below treats as frozen are included), not only attention weights -- confirm
# that freezing the full blocks is intended.
for name,param in gpt2.named_parameters():

  # parameters outside of transformer.h are left untouched and only reported
  if '.h.' not in name:
    print(f'+++ Layer {name} is trainable (.requires_grad = {param.requires_grad}).')
    continue

  # switch off gradient tracking for this parameter, then report it
  param.requires_grad = False
  print(f'--- Layer {name} is frozen (.requires_grad = {param.requires_grad}).')

In [None]:
# as a sanity-check, snapshot the weights from one frozen and one trainable layer.
# The snapshots must be independent copies: a bare .data shares memory with the live
# parameter, so on a CPU-only run (where moving the model to `device` is a no-op) the
# optimizer's in-place updates would also change the "pre" tensors and the post-minus-pre
# comparison would be zero even for the trainable layer.
# .cpu() keeps the snapshots on the same device as the post-training copies they are compared with.
frozenW_pre = gpt2.transformer.h[6].mlp.c_fc.weight.detach().cpu().clone()
trainW_pre  = gpt2.transformer.ln_f.weight.detach().cpu().clone()

In [None]:
# place all model parameters on the selected device (GPU when available, else CPU)
gpt2 = gpt2.to(device=device)

# Fine-tune the model

In [None]:
# create the optimizer, handing it only the parameters that are still trainable
# (frozen parameters never receive gradients, so there is nothing for the optimizer to update)
trainable_params = [ p for p in gpt2.parameters() if p.requires_grad ]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-5, weight_decay=.01)

# Note: don't need the loss function here, because it's calculated internally in the model (thanks HF :D )

In [None]:
# number of training steps (one randomly drawn batch per step)
num_samples = 123

# initialize losses
train_loss = []

# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode for the fine-tuning steps
gpt2.train()

for sampli in range(num_samples):

  # get a batch of data: random start positions, then seq_len consecutive tokens from each
  ix = torch.randint(len(gtTokens)-seq_len,size=(batch_size,))
  X  = gtTokens[ix[:,None] + torch.arange(seq_len)].to(device)

  # clear the gradients left over from the previous step
  gpt2.zero_grad()

  # forward pass (Hugging Face shifts X internally to get y)
  outputs = gpt2(X,labels=X)
  loss = outputs.loss

  # backprop
  loss.backward()
  optimizer.step()

  # store the per-sample loss
  train_loss.append( loss.item() )

  # update progress display (every 27th step)
  if sampli%27==0:
    print(f'Sample {sampli:4}/{num_samples}, train loss: {train_loss[-1]:.4f}')

# return to evaluation mode so any later inference runs without dropout
gpt2.eval()

In [None]:
# draw the per-step training loss on an explicit figure/axes pair
fig,ax = plt.subplots(figsize=(8,3))
ax.plot(train_loss,'k',markersize=8,label='Train loss')

ax.set(xlabel='Data sample',ylabel='Loss')
ax.legend()
plt.show()

In [None]:
# fetch the same two weight tensors after training, detached from autograd and on the CPU
frozen_layer = gpt2.transformer.h[6].mlp.c_fc
train_layer  = gpt2.transformer.ln_f

frozenW_pst = frozen_layer.weight.detach().cpu()
trainW_pst  = train_layer.weight.detach().cpu()

In [None]:
# size of the weight change per layer (post minus pre); expected to be zero for the frozen layer
frozen_change = torch.norm(frozenW_pst - frozenW_pre)
train_change  = torch.norm(trainW_pst - trainW_pre)

print('Frozen layer, norm(post-pre):')
print('  ',frozen_change)

print('\nTrainable layer, norm(post-pre):')
print('  ',train_change)