|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Instruction tuning</h1>|
|<h2>Lecture:</h2>|<h1><b>Instruction training with GPT2</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# run this code, then restart the python session (and then comment it out)
# !pip install -U datasets huggingface_hub fsspec

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch

import textwrap

from transformers import AutoModelForCausalLM,GPT2Tokenizer
from datasets import load_dataset

In [None]:
# pretrained GPT-2: the causal language model and its matching tokenizer
# are both loaded from the same checkpoint name
checkpoint = 'gpt2'
gpt2      = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# hyperparameters: max sequence length (in tokens) and sequences per batch
seq_len,batch_size = 256,32

# run on the first CUDA GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Import text data

In [None]:
# Load the WebGLM-QA dataset by its Hugging Face Hub identifier.
# Dataset card: https://huggingface.co/datasets/THUDM/webglm-qa
# Later cells read the 'train' split and the 'question' / 'answer' fields of each example.
dataset = load_dataset('THUDM/webglm-qa')
dataset

In [None]:
# inspect a single training example (index 3) to see its fields
dataset['train'][3]

In [None]:
# my minor modification: wrap the first training example in
# explicit QUESTION / ANSWER markers
first_example = dataset['train'][0]
txt = f"QUESTION: {first_example['question']} ANSWER: {first_example['answer']}"
txt

In [None]:
# tokenize the data into one long stream of token ids
train_split = dataset['train']

# just the first 5k samples (~800k tokens) for speed and simplicity
# (clamped so a shorter split cannot cause an out-of-range index)
num_examples = min(5000,len(train_split))

trainTokens = []
for idx in range(num_examples):

  # fetch the example once (each lookup materializes the whole row)
  example = train_split[idx]

  # get text from this example
  txt = f"QUESTION: {example['question']} ANSWER: {example['answer']}."

  # tokenize and concatenate it
  trainTokens += tokenizer.encode(txt)

  # mark the end of the example; without a separator the examples run
  # together and the model never sees where an answer stops
  trainTokens.append(tokenizer.eos_token_id)

# needs to be a torch tensor
trainTokens = torch.tensor(trainTokens)
trainTokens.shape

In [None]:
# check a random batch:
# draw batch_size random start positions, then take seq_len consecutive tokens from each
ix = torch.randint(len(trainTokens)-seq_len,size=(batch_size,))
window = torch.arange(seq_len)
X = trainTokens[ix.unsqueeze(1) + window]
print(X)

In [None]:
# decode the first five sequences of the batch and print them wrapped at 75 characters
for tokens in X[:5]:
  decoded = tokenizer.decode(tokens)
  print('*** Example: \n',textwrap.fill(decoded,75),'\n')

# Fine-tune the model

In [None]:
# put the model on the selected device (GPU if available, otherwise CPU)
gpt2 = gpt2.to(device)

# AdamW optimizer over all of the model's parameters
optimizer = torch.optim.AdamW(
  gpt2.parameters(),
  lr           = 5e-5,
  weight_decay = .01,
)

In [None]:
# (training takes ~8 mins on A100)
num_samples = 1234

# initialize losses (one value per training step)
train_loss = np.zeros(num_samples)

# from_pretrained() hands back the model in eval mode (dropout disabled),
# so switch to training mode explicitly before fine-tuning
gpt2.train()

# within-sequence offsets are the same for every batch, so build them once
offsets = torch.arange(seq_len)

for sampli in range(num_samples):

  # get a batch of data: random start indices -> (batch_size, seq_len) token windows
  ix = torch.randint(len(trainTokens)-seq_len,size=(batch_size,))
  X  = trainTokens[ix[:,None] + offsets].to(device)

  # forward pass (passing labels makes the model return the language-modeling loss)
  optimizer.zero_grad()
  outputs = gpt2(X,labels=X)
  loss = outputs.loss

  # backprop
  loss.backward()
  optimizer.step()

  # store the loss from this step
  train_loss[sampli] = loss.item()

  # update progress display
  if sampli%77==0:
    print(f'Sample {sampli:4}/{num_samples}, train loss: {train_loss[sampli]:.4f}')

# back to eval mode so later cells generate text without dropout
# (trailing semicolon keeps the notebook from printing the model repr)
gpt2.eval();

In [None]:
# plot the training loss across all steps
fig,ax = plt.subplots(figsize=(8,3))
ax.plot(train_loss,'k')
ax.set(xlim=[0,num_samples+1],xlabel='Data sample',ylabel='Loss',title='Train loss')
plt.show()

In [None]:
# Qualitative assessment
prompt = 'QUESTION: Would it be strange to have a pet rock and feed it styrofoam?'
# prompt = 'QUESTION: Where did the word "butterfly" come from?'

# tokenize the prompt and add a batch dimension -> shape (1, prompt_length)
in2gpt = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

# sample a continuation; max_length counts the prompt tokens too
output = gpt2.generate(
  in2gpt,
  attention_mask = torch.ones_like(in2gpt),  # single unpadded prompt: attend to every token
  max_length     = 100,
  pad_token_id   = tokenizer.eos_token_id,   # taken from the tokenizer instead of hard-coding 50256
  do_sample      = True,
)

# show the prompt, then only the newly generated tokens (prompt stripped off)
print(tokenizer.decode(in2gpt[0].cpu()),'\n')
print(textwrap.fill(tokenizer.decode(output[0][in2gpt.shape[1]:]),60))