|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Pretrain LLMs</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Train model 5 with weight inits</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import requests

import torch
import torch.nn as nn
import torch.nn.functional as F

# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# GPT-2 tokenizer: used below to encode the training text and prompts
# into token ids, and to decode generated token ids back into text
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# hyperparameters for GPT2-124M
# (these module-level names are read directly by the model classes and by
#  get_data_batch(), so they must be defined before those cells are run)
n_vocab    = 50257     # GPT-2 vocab size
embed_dim  =   768     # embedding dimension
seq_len    =   256     # max sequence length (also the size of the position-embedding table)
n_heads    =    12     # attention heads
n_blocks   =    12     # transformer blocks
batch_size =    16     # number of sequences per batch drawn by get_data_batch()

# use GPU
# (falls back to the CPU when CUDA is not available)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# tokenize the text
# Gulliver's travels :)
# Download the book as plain text. The timeout prevents the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the notebook on
# an HTTP error instead of silently tokenizing (and training on) an error page.
response = requests.get('https://www.gutenberg.org/cache/epub/829/pg829.txt', timeout=60)
response.raise_for_status()
text = response.text

# encode the whole book into one long 1D tensor of token ids
gtTokens = torch.tensor( tokenizer.encode(text),dtype=torch.long )

# total number of tokens (shown as the cell output)
len(gtTokens)

In [None]:
# train/test split

# fraction of the token stream used for training; the rest is held out
train_ratio = .9

# position of the first test token
test_split_point = int(train_ratio*len(gtTokens))

# contiguous split: leading chunk for training, trailing chunk for testing
train_data, test_data = gtTokens[:test_split_point], gtTokens[test_split_point:]


# a function that returns a batch of data samples
def get_data_batch(training=True):
  """Draw a random batch of token windows and their next-token targets.

  Args:
    training: if True (default) sample from train_data, otherwise from test_data.

  Returns:
    (X, y): two [batch_size, seq_len] tensors of token ids, where y is X
    shifted forward by one token.
  """

  # choose the token stream to sample from
  source = train_data if training else test_data

  # one random start position per sequence; the upper bound leaves room
  # for seq_len inputs plus the one-step-shifted targets
  starts = torch.randint(len(source)-seq_len,size=(batch_size,))

  # [batch_size, seq_len] grid of indices: row i is starts[i], starts[i]+1, ...
  window = starts.unsqueeze(1) + torch.arange(seq_len)

  # targets are the same windows advanced by one token
  inputs  = source[window]
  targets = source[window + 1]
  return inputs,targets


# example
# draw one training batch and show the token ids and their sizes;
# y holds the same windows as X shifted forward by one token
X,y = get_data_batch()
print(f'Input data (size {X.shape}):\n',X)
print(f'\n\nTargets (size {y.shape}):\n',y)

In [None]:
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention.

  Sizes come from the notebook-level hyperparameters `embed_dim` and `n_heads`.
  Queries, keys and values are produced by one fused linear layer (`QKV`),
  split per head, passed through PyTorch's scaled dot-product attention with a
  causal mask, and finally mixed by the output projection `W0`.
  """

  def __init__(self):
    super().__init__()

    # the embedding is split evenly across heads; fail early with a clear
    # message instead of a shape error inside forward()
    if embed_dim % n_heads != 0:
      raise ValueError(f'embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})')

    # number of attention heads
    self.num_heads = n_heads
    self.head_dim  = embed_dim // n_heads

    # the three Q,K,V weights matrices are initialized as one, and are split inside forward()
    self.QKV = nn.Linear(embed_dim, 3*embed_dim, bias=True)

    # linear mixing after attention
    self.W0 = nn.Linear(embed_dim, embed_dim, bias=True)


  def forward(self,x):
    """Apply causal self-attention.

    Args:
      x: activations of size [batch, seq_len, embed_dim].

    Returns:
      Tensor of the same size as x.
    """

    # sizes for later use
    B, T, E = x.shape # [batch, seq_len, embed_dim]

    # push data through Q, K, and V in one concatenated matrix
    qkv = self.QKV(x) # [batch, sequence, 3*embed]
    q,k,v = torch.split(qkv,E,dim=2) # each matrix is [B, T, E]

    # reshape to [B, T, nHeads, head_dim]
    #  and then transpose to [B, nHeads, T, head_dim]
    q = q.view(B, T, self.num_heads, self.head_dim).transpose(1,2) # [B, nHeads, T, head_dim]
    k = k.view(B, T, self.num_heads, self.head_dim).transpose(1,2)
    v = v.view(B, T, self.num_heads, self.head_dim).transpose(1,2)

    # Pytorch's dot-product attention function handles multi-head shapes
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True) # [B, nHeads, T, head_dim]

    # recombine heads: (B, nHeads, T, head_dim) -> [B, T, E]
    # reshape() rather than view(): the transposed tensor is not guaranteed to be
    # contiguous (it depends on the attention backend's output layout), and
    # view() raises an error in that case. reshape() copies only when needed.
    out = out.transpose(1,2).reshape(B, T, E)

    # finally, linearly mix the attention heads
    out = self.W0(out)

    return out




class TransformerBlock(nn.Module):
  """One pre-norm transformer block: attention subblock followed by an MLP subblock.

  Each subblock normalizes its input with a LayerNorm, transforms it, and adds
  the result back onto the un-normalized input (residual connection).
  """

  def __init__(self):
    super().__init__()

    # width of the hidden MLP layer (4x expansion of the embedding size)
    hidden_dim = 4*embed_dim

    # --- attention subblock ---
    self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.attn = MultiHeadAttention()

    # --- feedforward (MLP) subblock: expand, nonlinearity, contract ---
    self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.mlp_1 = nn.Linear(embed_dim, hidden_dim, bias=True)
    self.gelu  = nn.GELU()
    self.mlp_2 = nn.Linear(hidden_dim, embed_dim, bias=True)

  def forward(self, x):
    """Run x through the attention and MLP subblocks; output has the same size as x."""

    # attention on the normalized input, added back onto the input
    attended = self.attn(self.layernorm_1(x))
    after_attn = x + attended

    # MLP on the normalized attention output: expansion -> GELU -> contraction
    hidden = self.mlp_1(self.layernorm_2(after_attn))
    hidden = self.gelu(hidden)
    contracted = self.mlp_2(hidden)

    # second residual connection
    return after_attn + contracted

In [None]:
class LanguageModel(nn.Module):
  """GPT-2-style decoder-only language model.

  The architecture is configured by the notebook-level hyperparameters
  (n_vocab, embed_dim, seq_len, n_blocks). The output ("unembedding") layer
  shares its weight matrix with the token embedding. forward() returns
  log-probabilities (log-softmax), so it pairs with nn.NLLLoss.
  """

  def __init__(self):
    super().__init__()

    # token + position embeddings
    self.wte = nn.Embedding(n_vocab, embed_dim) # token embedding
    self.wpe = nn.Embedding(seq_len, embed_dim) # position embedding

    # transformer blocks
    self.transformerBlocks = nn.Sequential(*[TransformerBlock() for _ in range(n_blocks)])

    # final layernorm
    self.layernorm_final = nn.LayerNorm(embed_dim, eps=1e-5)

    # lm head, with weights tied to token embedding
    # Assign the *same* Parameter object. Wrapping it as nn.Parameter(self.wte.weight)
    # would create a second Parameter that merely shares storage: the optimizer
    # would treat the two as separate parameters, and the sharing would be lost
    # as soon as the model is moved to another device with .to(device).
    self.final_head = nn.Linear(embed_dim, n_vocab, bias=False)
    self.final_head.weight = self.wte.weight


    ### --- Exercise 1: weight initializations
    self.apply(self.weightInits)

  def weightInits(self, module):
    """Initialize one submodule; called for every submodule via self.apply().

    nn.Linear weights are drawn from N(0, .02^2) and their biases set to zero;
    nn.Embedding weights use Xavier-normal. Because final_head (an nn.Linear)
    is registered after wte and shares its weight matrix, the tied matrix ends
    up with the nn.Linear initialization.
    """

    # initialize nn.Linear to normal with std=.02
    if isinstance(module, nn.Linear):
      nn.init.normal_(module.weight,mean=0,std=.02)

      # initialize bias terms to zero
      if module.bias is not None:
        nn.init.zeros_(module.bias)

    # nn.Embeddings to Xavier
    elif isinstance(module, nn.Embedding):
      nn.init.xavier_normal_(module.weight)
  ### ---


  def forward(self, idx):
    """Compute next-token log-probabilities.

    Args:
      idx: token ids of size [B, T], with T no larger than seq_len.

    Returns:
      Log-probabilities of size [B, T, n_vocab].
    """

    # token + position embeddings
    # (positions are created on the same device as the input tokens)
    token_emb = self.wte(idx) # [B, T, E]
    posit_emb = self.wpe(torch.arange(idx.shape[-1],device=idx.device)) # [T, E]
    x = token_emb + posit_emb # [B, T, E]

    # pass through each transformer block
    x = self.transformerBlocks(x)

    # final layernorm and unembeddings
    x = self.layernorm_final(x)
    logits = self.final_head(x)  # [B, T, n_vocab]

    # scale and logsoftmax
    outputs = F.log_softmax(logits/np.sqrt(embed_dim),dim=-1)

    return outputs


  def generate(self, idx, max_new_tokens=50):
    """Sample new tokens one at a time and append them to idx.

    Args:
      idx: starting token ids of size [B, T].
      max_new_tokens: number of tokens to append.

    Returns:
      Token ids of size [B, T + max_new_tokens].
    """

    # sampling needs no gradients; skipping graph construction saves memory
    with torch.no_grad():
      for _ in range(max_new_tokens):

        # forward pass on (at most) the last seq_len tokens
        log_probs = self(idx[:,-seq_len:])  # [B, T, n_vocab]
        log_probs = log_probs[:,-1,:]  # last token's log-probs: [B, n_vocab]

        # undo the log-softmax to get "normal" softmax (probability values)
        probs = torch.exp(log_probs) # [B, n_vocab]

        # sample next token
        idx_next = torch.multinomial(probs, num_samples=1) # [B, 1]

        # append
        idx = torch.cat((idx, idx_next), dim=1) # [B, T+1]
    return idx


In [None]:
# create an instance and test with some data
# (weights are initialized inside LanguageModel.__init__, then the model is moved to the device)
model = LanguageModel().to(device)

# one training batch, moved to the same device as the model
X,y = get_data_batch()
X,y = X.to(device), y.to(device)

# forward pass: the model returns log-probabilities over the vocabulary for each position
out = model(X) # ~45s on cpu, <1s on gpu :D
print(f'Input size:  {X.shape}')
print(f'Output size: {out.shape}')

In [None]:
# check the weights distributions
# (compare against weightInits(): nn.Linear biases are set to zero and
#  nn.Linear weights are drawn with std=.02; nn.Embedding uses Xavier-normal)
print('mlp bias vector: ',model.transformerBlocks[1].mlp_1.bias.data)
print('std of mlp weights: ',torch.std(model.transformerBlocks[1].mlp_1.weight.data))
print('std of embeddings weights: ',torch.std(model.wte.weight.data))
print('std of positions weights: ',torch.std(model.wpe.weight.data))

In [None]:
# where do the xavier std numbers come from?
# Xavier-normal (gain=1) uses std = sqrt( 2 / (fan_in + fan_out) );
# for a 2D weight matrix, fan_in + fan_out is the sum of its two dimensions
np.sqrt(2/sum(list(model.wte.weight.shape)))

In [None]:
# but it is an nn.Embedding, not nn.Linear...
# NOTE(review): the wte weight matrix is also used by final_head (an nn.Linear),
# so the nn.Linear branch of weightInits() presumably determines its final std -- confirm
isinstance(model.wte, nn.Embedding),isinstance(model.wte, nn.Linear)

In [None]:
# pre-train exploration...

In [None]:
# text generation from the untrained model (this cell runs before the training loop)
prompt = 'I went on holiday to Liliput and'

# encode the prompt and add a batch dimension -> size [1, n_prompt_tokens]
in2gpt = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

# sample 100 additional tokens and decode the single sequence in the batch
output = model.generate(in2gpt,max_new_tokens=100)
print(tokenizer.decode(output[0]))

# Prepare for training

In [None]:
# create the loss and optimizer functions
# NLLLoss takes log-probabilities as input, which is what the model's forward() returns (log-softmax)
loss_function = nn.NLLLoss().to(device)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=.001, weight_decay=.01)

In [None]:
# check loss function with sizes
X,y = get_data_batch()

# keep only the first sequence of the batch, retaining the batch dimension
X,y = X[:1], y[:1]
log_probs = model(X.to(device))

print(f'Model input is size:   {X.shape}')
print(f'Model output is size:  {log_probs.shape}')
print(f'Target tokens is size: {y.shape}')

# flatten as before: one row of log-probabilities per token position
n_classes = log_probs.shape[-1]
log_probs_flat = log_probs.view(-1,n_classes)

# compute the loss against the flattened targets (moved to the model's device)
targets_flat = y.view(-1).to(device)
loss = loss_function(log_probs_flat, targets_flat)
print('\nLoss:',loss)

In [None]:
### --- for Exercise 2: Finding the path to the attention weights
# histogram of the fused Q/K/V weight matrix of transformer block 1, using
# 100 equal-width bins between -.1 and .1 (values outside that range are not counted)
yh,xh = np.histogram(model.transformerBlocks[1].attn.QKV.weight.detach().cpu(),bins=np.linspace(-.1,.1,101))

# plot the counts against the left edge of each bin
plt.plot(xh[:-1],yh);

# Now train the model!

In [None]:
# number of training iterations (one batch per iteration)
num_samples = 501

# initialize losses
train_loss = []
test_loss = []

# Exercise 2 storage: one row per training iteration. Only rows where
# sampli%50==0 are filled in the loop below; all other rows stay zero.
attn_W_dists = np.zeros((num_samples,100))
attn_W_stds = np.zeros((num_samples,len(model.transformerBlocks)))


for sampli in range(num_samples):

  # get a batch of data
  X,y = get_data_batch()

  # move data to GPU
  X,y = X.to(device), y.to(device)

  # clear previous gradients
  model.zero_grad(set_to_none=True)

  # forward pass
  log_probs = model(X)

  # calculate the losses on the (reshaped) targets
  loss = loss_function(log_probs.view(-1,log_probs.shape[-1]),y.view(-1))

  # backprop
  loss.backward()
  optimizer.step()

  # store the per-sample loss
  train_loss.append( loss.item() )


  # evaluate the model with the test set
  # (every 50th iteration, starting with iteration 0)
  if sampli%50==0:



    ### --- for Exercise 2
    # get the attention weights distributions
    # (note: this runs after optimizer.step(), so the weights already include this iteration's update)
    hidx = 4 # just from one transformer block
    qkvWeights = model.transformerBlocks[hidx].attn.QKV.weight.detach().cpu()
    # 100 bins between -.1 and .1; weights outside that range are not counted
    yh,xh = np.histogram(qkvWeights,bins=np.linspace(-.1,.1,101))
    attn_W_dists[sampli,:] = yh
    # standard deviation of the fused QKV weights in every transformer block
    # (this loop reuses the name hidx as its loop variable)
    for hidx in range(len(model.transformerBlocks)):
      qkvWeights = model.transformerBlocks[hidx].attn.QKV.weight.detach().cpu()
      attn_W_stds[sampli,hidx] = torch.std(qkvWeights)
    # ----------



    # test-set loss on one random test batch, without tracking gradients
    with torch.no_grad():
      X,y = get_data_batch(False)       # False -> testset data
      X,y = X.to(device), y.to(device)  # push it to the GPU
      out = model(X)                    # forward pass
      thisloss = loss_function(out.view(-1,out.shape[-1]),y.view(-1)) # calculate loss
      test_loss.append( thisloss.item() )

      # update our progress :)
      print(f'Sample {sampli:4}, train loss: {train_loss[-1]:5.2f}, test loss: {test_loss[-1]:5.2f}')

In [None]:
# plot the losses
# train loss has one value per training iteration (one batch each)
plt.plot(train_loss,'k',label='Train loss')
# test loss was stored every 50th iteration, so it is plotted at those positions
plt.plot(range(0,num_samples,50),test_loss,'rs-',markerfacecolor='w',markersize=8,label='Test loss')

plt.legend()
# NOTE(review): the x-axis counts training iterations (batches), although it is labelled 'Epoch'
plt.gca().set(xlabel='Epoch',ylabel='Loss')
plt.show()

# Exercise 2: Distributions of attention weights during training

In [None]:
# colors for each line
# (one color per training iteration; only every 50th entry is used below)
linecolors = plt.cm.plasma(np.linspace(0,1,num_samples))

_,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: weight histograms at the iterations where they were stored (every 50th),
# plotted against the left edge of each bin
for i in range(0,num_samples,50):
  axs[0].plot(xh[:-1],attn_W_dists[i,:],color=linecolors[i],label=f'{i}')

axs[0].legend()
axs[0].set(xlim=xh[[0,-1]],xlabel='Weight value',ylabel='Count',title='Attention weights distributions over learning')


# plot the standard deviations
# right panel: one marker color per transformer block; [::50] selects the stored rows
linecolors = plt.cm.plasma(np.linspace(0,1,attn_W_stds.shape[1]))
for i in range(attn_W_stds.shape[1]):
  axs[1].plot(range(0,num_samples,50),attn_W_stds[::50,i],'ks',markerfacecolor=linecolors[i],markersize=9,
              label=f'Layer {i}')

axs[1].set(xlabel='Training epoch',ylabel='Standard deviation')
axs[1].legend()

plt.tight_layout()
plt.show()

In [None]:
# A fun little test :)

In [None]:
# text generation with the same prompt used before training
prompt = 'I went on holiday to Liliput and'

# encode the prompt and add a batch dimension -> size [1, n_prompt_tokens]
in2gpt = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

# sample 100 additional tokens and decode the single sequence in the batch;
# carriage returns in the decoded text are replaced by newlines for printing
output = model.generate(in2gpt,max_new_tokens=100)
print(tokenizer.decode(output[0]).replace('\r','\n'))