|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Instruction tuning</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Instruction tuning GPT2-large</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# One-time setup: uncomment the install line below and run this cell, then restart
# the python session and comment the line out again before running the notebook.
# !pip install -U datasets huggingface_hub fsspec

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch

from transformers import AutoModelForCausalLM,GPT2Tokenizer
from datasets import load_dataset

import textwrap

# vector plots: have the inline matplotlib backend emit figures in 'svg' format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# run on the first CUDA GPU when torch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Exercise 1: Lengths of questions and answers

In [None]:
# tokenizer: the 'gpt2' tokenizer, with its end-of-sequence token reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# model: pretrained GPT-2 large with its language-modeling head
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2-large')

In [None]:
# display the model (last expression of the cell is shown as its output)
gpt2

In [None]:
# WebGLM-QA dataset; the cells below read its 'question' and 'answer' fields
# https://huggingface.co/datasets/THUDM/webglm-qa
dataset = load_dataset('THUDM/webglm-qa')
dataset  # show the dataset summary as the cell output

In [None]:
# token counts for every training sample: column 0 = question, column 1 = answer
trainSplit = dataset['train']
sampleLengths = np.zeros((len(trainSplit),2))

# walk through the split once, tokenizing both text fields of each sample
for i,sample in enumerate(trainSplit):
  sampleLengths[i,0] = len(tokenizer.encode(sample['question']))
  sampleLengths[i,1] = len(tokenizer.encode(sample['answer']))

In [None]:
# density histograms of question and answer lengths over a shared set of bins
binedges = np.linspace(0,400,71)
yQ,xQ = np.histogram(sampleLengths[:,0],bins=binedges,density=True)
yA,xA = np.histogram(sampleLengths[:,1],bins=binedges,density=True)

# xA/xQ hold the bin edges, so each bar is anchored at its left edge (align='edge');
# the default align='center' would draw every bar half a bin to the left of its bin
plt.figure(figsize=(8,4))
plt.bar(xA[:-1],yA,width=xA[1]-xA[0],align='edge',edgecolor='k',facecolor=[.9,.7,.7],alpha=.8,label='Answers')
plt.bar(xQ[:-1],yQ,width=xQ[1]-xQ[0],align='edge',edgecolor='k',facecolor=[.7,.7,.9],alpha=.8,label='Questions')

plt.gca().set(xlim=[-5,300],xlabel='Number of tokens',ylabel='Density (pdf estimate)',title='Distribution of Q&A token lengths')
plt.legend()
plt.show()

# Exercise 2: Create question-starting batches

In [None]:
# max sequence length (in tokens) of each row in the token matrices
seq_len = 256

In [None]:
def _build_token_matrix(split,num_rows,max_len):
  """Tokenize the first `num_rows` Q&A samples of a dataset split into one padded matrix.

  Each sample is formatted as "QUESTION: <question> ANSWER: <answer>." and tokenized.
  Sequences longer than `max_len` are truncated; shorter ones keep the pad token
  in their remaining positions.

  Args:
    split: name of the dataset split to read (e.g. 'train' or 'validation').
    num_rows: number of samples to take from the start of the split.
    max_len: number of token positions per row.

  Returns:
    Tensor of token ids with shape (num_rows, max_len).
  """
  # start from a matrix filled with the pad token
  tokens = torch.full((num_rows,max_len),tokenizer.pad_token_id)
  samples = dataset[split]

  for idx in range(num_rows):

    # construct the token sequence (fetch the sample once, use both fields)
    sample = samples[idx]
    txt = f"QUESTION: {sample['question']} ANSWER: {sample['answer']}."
    tokz = tokenizer.encode(txt,add_special_tokens=True)

    # insert this sequence into the data matrix, truncating when necessary
    endOfSeq = min(max_len,len(tokz))
    tokens[idx,:endOfSeq] = torch.tensor(tokz[:endOfSeq])

  return tokens


# just using the first 10k training samples and the first 1k validation samples
trainTokens = _build_token_matrix('train',10000,seq_len)
testTokens  = _build_token_matrix('validation',1000,seq_len)

In [None]:
# attention mask for the first training sequence: 1 at real tokens, 0 at pad tokens
firstSeq = trainTokens[0]
padID = tokenizer.pad_token_id
attn_mask = (firstSeq != padID).long()

# show the tokens next to their mask
print(f'Training tokens:\n{firstSeq}\n')
print(f'Attention mask:\n{attn_mask}')

In [None]:
# check a random batch
# Row indices are drawn over the number of sequences (trainTokens.shape[0]).
# Using trainTokens[0].shape as the upper bound would pass the sequence length
# instead, so only the first seq_len rows could ever be selected.
ix = np.random.randint(0,trainTokens.shape[0],8)
X  = trainTokens[ix]

# 1 at real tokens, 0 at pad tokens
attn_mask = (X != tokenizer.pad_token_id).long()


print(f'Size of batch: {X.shape}')
print(f'Size of attention mask: {attn_mask.shape}\n')

# decode a few sequences back into text to inspect them
print('Some examples:')
for t in range(5):
  print(f'*** Example: \n',textwrap.fill(tokenizer.decode(X[t]),123),'\n')

In [None]:
# fraction of positions holding the pad token (which is the EOS token) in each token matrix
for splitName,tokenMatrix in (('TRAIN',trainTokens),('TEST',testTokens)):
  aveAM = (tokenMatrix == tokenizer.pad_token_id).sum()/torch.numel(tokenMatrix)
  print(f'{aveAM*100:5.2f}% of {splitName} token positions are EOS.')
  if splitName=='TRAIN': print()

# Exercise 3: Fine-tune the model

In [None]:
# put the model on the selected device
gpt2 = gpt2.to(device)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(
  gpt2.parameters(),
  lr = 1e-4,
  weight_decay = .01,
)

In [None]:
batch_size  =   8
num_samples = 123

# initialize losses (test_loss stays 0 at iterations without a test evaluation)
train_loss = np.zeros(num_samples)
test_loss  = np.zeros(num_samples)

# Hugging Face's from_pretrained returns the model in eval mode, so switch to
# training mode here; otherwise the first step would run without dropout while
# all later steps (after the gpt2.train() call below) run with it.
gpt2.train()

for sampli in range(num_samples):

  # get a batch of data and create a mask
  # (indices are drawn over the number of sequences, trainTokens.shape[0];
  #  trainTokens[0].shape is the sequence length and would restrict sampling
  #  to the first seq_len rows)
  ix = np.random.randint(0,trainTokens.shape[0],batch_size)
  X  = trainTokens[ix]
  attn_mask = (X != tokenizer.pad_token_id).long()

  # move data to GPU
  attn_mask = attn_mask.to(device)
  X = X.to(device)

  # forward pass (Hugging Face shifts X internally to get y)
  # NOTE(review): labels=X includes the padded positions, so they appear to
  # contribute to the loss -- confirm this is intended.
  gpt2.zero_grad()
  outputs = gpt2(X,labels=X,attention_mask=attn_mask)
  loss = outputs.loss

  # backprop
  loss.backward()
  optimizer.step()

  # store the per-sample loss
  train_loss[sampli] = loss.item()

  # test and update progress display
  if sampli%5==0:

    # get a batch of data and create a mask (same indexing rule as above)
    ix = np.random.randint(0,testTokens.shape[0],batch_size)
    X  = testTokens[ix]
    attn_mask = (X != tokenizer.pad_token_id).long()

    # move data to GPU
    attn_mask, X = attn_mask.to(device), X.to(device)

    # forward pass and get loss (eval mode, no gradient tracking)
    gpt2.eval()
    with torch.no_grad():
      outputs = gpt2(X,labels=X,attention_mask=attn_mask)
      test_loss[sampli] = outputs.loss.item()

    # back to training mode for the next optimization step
    gpt2.train()

    # report progress
    print(f'Sample {sampli:4}/{num_samples}, train/test loss: {train_loss[sampli]:.4f}/{test_loss[sampli]:.4f}')

In [None]:
# iterations at which a test loss was recorded (all other entries are still zero)
x4test = np.where(test_loss != 0)[0]

# train loss at every iteration, test loss only where it was evaluated
plt.figure(figsize=(10,3.5))
ax = plt.gca()
ax.plot(train_loss,'k',label='Train loss')
ax.plot(x4test,test_loss[x4test],'r',label='Test loss')

ax.set(xlabel='Data sample',ylabel='Loss',xlim=[-1,num_samples])
ax.legend()
plt.show()

In [None]:
# Qualitative assessment
prompt = 'QUESTION: Where does the word "butterfly" come from?'
# prompt = 'QUESTION: Would it be strange to have a pet rock and feed it styrofoam?'
in2gpt = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

# the training loop leaves the model in training mode (its last mode switch is
# gpt2.train()), so switch to eval mode to generate without dropout
gpt2.eval()

# sample a continuation; pad id comes from the tokenizer (its pad token is the EOS token)
output = gpt2.generate(in2gpt,do_sample=True,max_length=200,pad_token_id=tokenizer.pad_token_id)
print(textwrap.fill(tokenizer.decode(output[0]),80))