|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Gulliver's learning rates</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoModelForCausalLM,GPT2Tokenizer
import requests

# vector plots: ask the inline matplotlib backend to emit figures in 'svg' format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# pretrained GPT-2 tokenizer (used to encode the text and to look up the EOS token)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# hyperparameters: max sequence length (in tokens) and number of sequences per batch
seq_len,batch_size = 256,16

# compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Exercise 1: Functions to train and evaluate the model

In [None]:
# download Gulliver's travels from Project Gutenberg and tokenize it
response = requests.get('https://www.gutenberg.org/cache/epub/829/pg829.txt',timeout=60)
response.raise_for_status() # fail loudly rather than tokenizing an HTTP error page
text = response.text

# 1D tensor of token IDs for the entire text ([0] drops the batch dimension)
gtTokens = tokenizer.encode(text,return_tensors='pt')[0]

# find the most frequent 100 tokens
uniq,counts = np.unique(gtTokens.numpy(),return_counts=True) # unique token IDs and their counts
freqidx = np.argsort(counts)[::-1] # indices into uniq, ordered from most to least frequent
top100 = uniq[freqidx[:100]]

In [None]:
def countFreqTokens(model,numreps=10,numtoks=100):
  """
  Measure how often a model generates the most frequent Gulliver's-travels tokens.

  The model is prompted with `numreps` single random tokens and samples
  `numtoks` new tokens after each one. The random prompt tokens themselves
  are excluded from the count.

  Parameters
  ----------
  model   : model with a Hugging Face-style .generate() method (presumably
            already on `device`, because the prompts are moved there)
  numreps : number of random starting tokens, i.e., number of generated sequences
  numtoks : number of tokens to generate per sequence

  Returns
  -------
  Percent (0-100) of all generated tokens that are in the global `top100`.
  """

  # token ID(s) of the end-of-sequence token: banned from the output and used for padding
  eos_ids = tokenizer.encode(tokenizer.eos_token)

  # random starting tokens (one per repetition)
  randstarts = torch.randint(tokenizer.vocab_size,(numreps,1)).to(device)

  # generate some data (min_length == max_length forces a fixed output length)
  out = model.generate(
    randstarts,
    max_length = numtoks+1,
    min_length = numtoks+1,
    do_sample  = True,
    bad_words_ids = [eos_ids],
    pad_token_id = eos_ids[0]
  ).cpu()

  # drop the first column (the random prompt) and test membership in top100
  isFrequent = np.isin(out[:,1:],top100).flatten()

  # return proportion (as percent)
  return np.mean(100*isFrequent)

In [None]:
def trainTheModel(lr,num_samples):
  """
  Fine-tune a freshly loaded pretrained GPT-2 on random batches of `gtTokens`.

  Parameters
  ----------
  lr          : learning rate given to the AdamW optimizer
  num_samples : number of training iterations (one random batch per iteration)

  Returns
  -------
  train_loss   : numpy array of length num_samples with the loss of each batch
  pretrainEval : countFreqTokens() result before any training
  psttrainEval : countFreqTokens() result after training
  """

  # every call starts from the original pretrained weights
  model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)

  # baseline evaluation before the weights are updated
  pretrainEval = countFreqTokens(model)

  # NOTE(review): this function never changes the model's train/eval mode;
  # confirm that the mode from_pretrained() leaves it in is the intended one.
  opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=.01)

  # one loss value per training iteration
  train_loss = np.zeros(num_samples)

  # within-sequence offsets, added to each random start index to index a window
  offsets = torch.arange(seq_len)

  for step in range(num_samples):

    # random start positions -> (batch_size, seq_len) batch of consecutive tokens
    starts = torch.randint(len(gtTokens)-seq_len,size=(batch_size,))
    batch  = gtTokens[starts.unsqueeze(1) + offsets].to(device)

    # clear old gradients, then forward pass; the inputs double as the labels
    # (per the original note, Hugging Face shifts them internally to get the targets)
    model.zero_grad()
    loss = model(batch,labels=batch).loss

    # backprop and parameter update
    loss.backward()
    opt.step()
    train_loss[step] = loss.item()

  # evaluation after training
  psttrainEval = countFreqTokens(model)

  return train_loss,pretrainEval,psttrainEval

# Exercise 2: Fine-tune the model with different learning rates

In [None]:
# learning rates to compare, and the number of training iterations per model
learningRates = [ 1e-4,1e-5,1e-6 ]
training_samples = 800

# results: one row per learning rate; columns are [pre-train, post-train] evaluations
# (sized from the list so that changing learningRates doesn't break the indexing)
evalsPcts = np.zeros((len(learningRates),2))
losses = [] # one array of training losses per learning rate


for idx,lr in enumerate(learningRates):

  # train a fresh model and get the results
  train_loss,pretrainEval,psttrainEval = trainTheModel(lr,training_samples)

  # store the results
  evalsPcts[idx,0] = pretrainEval
  evalsPcts[idx,1] = psttrainEval
  losses.append(train_loss)

# Exercise 3: Compare the evaluations

In [None]:
# left panel: training losses; right panel: pre- vs. post-training evaluations
_,axs = plt.subplots(1,2,figsize=(12,4))

colors = [ [.7,.7,.9],[.7,.9,.7],[.9,.7,.7] ]
shapes = 'so^'

# x-axis positions of all bars (collected in the loop, used for the tick marks)
barTicks = []

for i,thisLR in enumerate(learningRates):

  # cycle through the styles in case there are more learning rates than styles
  color = colors[i%len(colors)]
  shape = shapes[i%len(shapes)]

  # plot the losses (every 7th value, presumably to thin out the markers)
  axs[0].plot(range(0,training_samples,7),losses[i][::7],f'k{shape}',markerfacecolor=color,
              alpha=.7,markersize=8,label=f'lr = {thisLR}')

  # plot the percent of common GT tokens (pre bar left of i, post bar right of i)
  barx = [i-.2,i+.2]
  axs[1].bar(barx,evalsPcts[i,:],width=.4,edgecolor='k',
             facecolor=color,label=f'lr = {thisLR}')
  barTicks.extend(barx)



axs[0].set(xlabel='Training sample',ylabel='Train loss',title='Losses')
axs[0].legend()

# the x-axis of the bar plot shows pre/post evaluations, not training samples
axs[1].set(xlabel='Pre- vs. post-training',xticks=barTicks,
           xticklabels=['pre','post']*len(learningRates),
           title='Percent GT tokens generated',ylabel='% common GT tokens',ylim=[30,65])
axs[1].legend()


plt.tight_layout()
plt.show()