|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Pretrain LLMs</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Train a model to like "X"</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# pytorch stuff
import torch
import torch.nn as nn
from torch.nn import functional as F

# for printing
import textwrap

In [None]:
# GPT-2's tokenizer: load the pretrained 'gpt2' vocabulary.
# Used below for its vocab_size and to decode token ids back into text.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# pick the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Exercise 1: The model and its “x” preference

### Model 5 all in one cell

In [None]:
# hyperparameters for GPT2-124M
# (module-level constants; the model classes below read them directly)
# NOTE(review): seq_len is 256 here, presumably shortened for this exercise — confirm
n_vocab    = 50257     # GPT-2 vocab size
embed_dim  =   768     # embedding dimension
seq_len    =   256     # max sequence length (also the size of the position-embedding table)
n_heads    =    12     # attention heads (each head gets embed_dim // n_heads dimensions)
n_blocks   =    12     # transformer blocks
batch_size =    16     # number of random sequences per training step



class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention with a fused Q/K/V projection.

  Layer sizes are read from the module-level hyperparameters `embed_dim`
  and `n_heads` at construction time.
  """

  def __init__(self):
    super().__init__()

    # number of attention heads, and the width of each head
    self.num_heads = n_heads
    self.head_dim  = embed_dim // n_heads

    # the three Q,K,V weights matrices are initialized as one, and are split inside forward()
    self.QKV = nn.Linear(embed_dim, 3*embed_dim, bias=True)

    # linear mixing after attention
    self.W0 = nn.Linear(embed_dim, embed_dim, bias=True)


  def forward(self,x):
    """Apply causal self-attention to `x`.

    Args:
      x: activations of shape [batch, seq_len, embed_dim].

    Returns:
      Tensor with the same shape as `x`.
    """

    # sizes for later use
    B, T, E = x.shape # [batch, seq_len, embed_dim]

    # push data through Q, K, and V in one concatenated matrix
    qkv = self.QKV(x) # [batch, sequence, 3*embed]
    q,k,v = torch.split(qkv,E,dim=2) # each matrix is [B, T, E]

    # reshape to [B, T, nHeads, head_dim]
    #  and then transpose to [B, nHeads, T, head_dim]
    q = q.view(B, T, self.num_heads, self.head_dim).transpose(1,2) # [B, nHeads, T, head_dim]
    k = k.view(B, T, self.num_heads, self.head_dim).transpose(1,2)
    v = v.view(B, T, self.num_heads, self.head_dim).transpose(1,2)

    # Pytorch's dot-product attention function handles multi-head shapes
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True) # [B, nHeads, T, head_dim]

    # recombine heads: (B, nHeads, T, head_dim) -> [B, T, E]
    # reshape() instead of view(): after the transpose the tensor is only
    # view-compatible for some memory layouts; reshape() copies when it has
    # to, so this works regardless of which attention backend ran.
    out = out.transpose(1,2).reshape(B, T, E)

    # finally, linearly mix the attention heads
    out = self.W0(out)

    return out




class TransformerBlock(nn.Module):
  """One pre-norm transformer block: attention and MLP, each with a residual add.

  The width comes from the module-level `embed_dim`; the MLP expands to
  4*embed_dim and contracts back.
  """

  def __init__(self):
    super().__init__()

    # --- attention sub-block: layernorm -> multi-head attention
    self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.attn = MultiHeadAttention()

    # --- feedforward sub-block: layernorm -> expand (4x) -> GELU -> contract
    self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.mlp_1 = nn.Linear(embed_dim, 4*embed_dim, bias=True)
    self.gelu  = nn.GELU()
    self.mlp_2 = nn.Linear(4*embed_dim, embed_dim, bias=True)

  def forward(self, x):
    """Run `x` through the attention and MLP sub-blocks; the output has the shape of `x`."""

    # attention acts on the normalized input and is added back onto the raw input
    attn_out = self.attn(self.layernorm_1(x))
    resid = x + attn_out

    # MLP acts on the normalized residual stream and is added back onto it
    hidden = self.mlp_1(self.layernorm_2(resid))
    mlp_out = self.mlp_2(self.gelu(hidden))

    return resid + mlp_out



class Model(nn.Module):
  """GPT-2-style decoder-only language model that returns log-probabilities.

  Sizes are read from the module-level hyperparameters `n_vocab`,
  `embed_dim`, `seq_len` and `n_blocks` at construction time.
  """

  def __init__(self):
    super().__init__()

    # token + position embeddings
    self.wte = nn.Embedding(n_vocab, embed_dim) # token embedding
    self.wpe = nn.Embedding(seq_len, embed_dim) # position embedding

    # transformer blocks
    self.transformerBlocks = nn.Sequential(*[TransformerBlock() for _ in range(n_blocks)])

    # final layernorm
    self.layernorm_final = nn.LayerNorm(embed_dim, eps=1e-5)

    # lm head, with weights tied to token embedding.
    # Assign the embedding's Parameter object itself: wrapping it in a new
    # nn.Parameter(...) would register a second, separate parameter that the
    # optimizer updates independently and that is not kept in sync with the
    # embedding when the model is moved to another device.
    self.final_head = nn.Linear(embed_dim, n_vocab, bias=False)
    self.final_head.weight = self.wte.weight


  def forward(self, idx):
    """Compute next-token log-probabilities.

    Args:
      idx: integer token ids of shape [B, T]; T may not exceed `seq_len`
        (the size of the position-embedding table).

    Returns:
      Log-probabilities of shape [B, T, n_vocab]: a log-softmax over logits
      that are first scaled by 1/sqrt(embed_dim).
    """

    # token + position embeddings (positions are created on the input's device)
    token_emb = self.wte(idx) # [B, T, E]
    positions = torch.arange(idx.shape[-1], device=idx.device) # [T]
    posit_emb = self.wpe(positions) # [T, E]
    x = token_emb + posit_emb # [B, T, E]

    # pass through each transformer block
    x = self.transformerBlocks(x)

    # final layernorm and unembeddings
    x = self.layernorm_final(x)
    logits = self.final_head(x)  # [B, T, n_vocab]

    # scale and logsoftmax
    outputs = F.log_softmax(logits/np.sqrt(embed_dim),dim=-1)

    return outputs


  @torch.no_grad() # sampling needs no autograd graph
  def generate(self, idx, n_new_tokens=50):
    """Autoregressively sample new tokens and append them to `idx`.

    Args:
      idx: integer token ids of shape [B, T] used as the prompt.
      n_new_tokens: number of tokens to sample.

    Returns:
      Token ids of shape [B, T + n_new_tokens] (prompt followed by samples).
    """

    for _ in range(n_new_tokens):

      # forward pass on (at most) the last seq_len tokens
      log_probs = self(idx[:,-seq_len:])  # [B, T, n_vocab]
      log_probs = log_probs[:,-1,:]  # last token's log-probs: [B, n_vocab]

      # undo the log-softmax to get "normal" softmax (probability values)
      probs = torch.exp(log_probs) # [B, n_vocab]

      # sample next token
      idx_next = torch.multinomial(probs, num_samples=1) # [B, 1]

      # append
      idx = torch.cat((idx, idx_next), dim=1) # [B, T+1]
    return idx


In [None]:
# create a new (randomly initialized, untrained) instance and move it to
# `device` (the GPU when available, otherwise the CPU)
model = Model().to(device)

In [None]:
# qualitative test: what does the model generate?

# a random prompt of seq_len token ids (created on the CPU, then moved to the device)
X = torch.randint(0, tokenizer.vocab_size, (1, seq_len))
X = X.to(device)

# let the model append 200 sampled tokens to the prompt
Y = model.generate(X, n_new_tokens=200)

# decode the full sequence (prompt + new tokens) and wrap it for display
print(
  textwrap.fill(
    tokenizer.decode(Y[0].tolist()),
    width=100,
  )
)

In [None]:
# quantitative test: among the newly generated tokens (everything after the
# seq_len-long prompt), count those whose decoded text contains the target
hasTarget = sum('x' in tokenizer.decode(t) for t in Y[0][seq_len:])

print(f'{hasTarget} of {Y[0][seq_len:].shape[0]} tokens have a target.')

# Exercise 2: Create a target token probability distribution

In [None]:
# ids of all vocabulary tokens whose decoded text contains the target letter
target_ids = [t for t in range(tokenizer.vocab_size) if 'x' in tokenizer.decode([t])]

# binary mask over the vocabulary: 1 for target tokens, 0 otherwise
# (filled with one indexed assignment instead of one write per token)
mask = torch.zeros(tokenizer.vocab_size)
mask[target_ids] = 1

# report with tensor reductions; Python's builtin sum() would iterate over
# the tensor one element at a time
print(f'{int(mask.sum())} out of {len(mask):,} ({100*mask.mean():.2f}%) tokens have target letter "x"')

# then normalize to probability dist
mask = mask/torch.sum(mask)

In [None]:
# show the target distribution: one dot per token index
plt.figure(figsize=(10,3))
plt.plot(mask,'k.')
plt.xlabel('Token index')
plt.ylabel('Probability')
plt.show()

# Exercise 3: Create a custom loss function

In [None]:
class myLoss_x(nn.Module):
  """KL-divergence loss that pulls predictions toward tokens containing "x".

  The target is a fixed probability distribution over the tokenizer
  vocabulary: uniform over every token whose decoded text contains the
  letter "x", and zero everywhere else.
  """

  def __init__(self):
    super().__init__()

    # mask: 1 if token contains a target, 0 otherwise.
    # Built on the CPU with a single indexed assignment, which avoids one
    # tiny device write per vocabulary entry.
    target_ids = [t for t in range(tokenizer.vocab_size) if 'x' in tokenizer.decode([t])]
    mask = torch.zeros(tokenizer.vocab_size)
    mask[target_ids] = 1

    # normalize to pdist
    mask = mask/torch.sum(mask)

    # Register as a buffer so the target follows the module through
    # .to()/.cuda()/.cpu(); non-persistent keeps it out of state_dict.
    # It is still placed on the global `device` up front, as before.
    self.register_buffer('mask', mask.to(device), persistent=False)

  def forward(self, log_probs):
    """Return the batch-mean KL divergence between the target and the predictions.

    Args:
      log_probs: log-probabilities whose last dimension spans the vocabulary.

    Returns:
      Scalar loss tensor.
    """
    return F.kl_div(log_probs, self.mask, reduction='batchmean')

# Exercise 4: Train the model

In [None]:
# the custom loss function instance, on the compute device
loss_function = myLoss_x().to(device)

# AdamW over all of the model's parameters
optimizer = torch.optim.AdamW(
  model.parameters(),
  lr=1e-3,
  weight_decay=1e-2,
)

In [None]:
num_epochs = 200

# one loss value is stored per epoch
total_loss = np.zeros(num_epochs)

for epoch in range(num_epochs):

  # start from clean gradients
  optimizer.zero_grad()

  # a fresh batch of random token ids (created on the CPU, then moved to the device)
  X = torch.randint(0, tokenizer.vocab_size, (batch_size, seq_len))
  X = X.to(device)

  # forward pass: log-probabilities for every position
  log_probs = model(X)

  # only the final position's prediction is scored by the loss function
  loss = loss_function(log_probs[:, -1, :])

  # backprop and parameter update
  loss.backward()
  optimizer.step()

  # store this epoch's loss
  total_loss[epoch] = loss.item()

  # progress report every 25th epoch
  if not epoch % 25:
    print(f'Finished epoch {epoch:4} with loss {total_loss[epoch]:.4f}')

In [None]:
# loss per epoch, drawn as white-filled square markers on a black line
plt.plot(total_loss,'ks-',markerfacecolor='w',markersize=8)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# repeat the qualitative evaluation, now with the trained model

# a new random prompt of seq_len token ids (created on the CPU, then moved to the device)
X = torch.randint(0, tokenizer.vocab_size, (1, seq_len))
X = X.to(device)

# append 200 sampled tokens to the prompt
Y = model.generate(X, n_new_tokens=200)

# decode the full sequence (prompt + new tokens) and wrap it for display
print(
  textwrap.fill(
    tokenizer.decode(Y[0].tolist()),
    width=100,
  )
)

In [None]:
# how many of the newly generated tokens (everything after the seq_len-long
# prompt) contain the target letter?
hasTarget = sum('x' in tokenizer.decode(t) for t in Y[0][seq_len:])

print(f'{hasTarget} of {Y[0][seq_len:].shape[0]} tokens have a target.')