|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Pretrain LLMs</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: What happens to unused tokens?</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import requests

import torch
import torch.nn as nn
import torch.nn.functional as F

# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the pretrained GPT-2 tokenizer; it is used below to encode text into token IDs and to decode IDs back into text
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# model and data hyperparameters (GPT2-124M configuration)
n_vocab = 50257    # number of tokens in the GPT-2 vocabulary
embed_dim = 768    # width of the embedding vectors
seq_len = 256      # maximum number of tokens per sequence
n_heads = 12       # attention heads per transformer block
n_blocks = 12      # number of stacked transformer blocks
batch_size = 16    # sequences per batch

# run on the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Exercise 1: How many tokens are in Gulliver's Travels?

In [None]:
# download Gulliver's Travels from Project Gutenberg
# (the timeout prevents an indefinite hang; raise_for_status() fails loudly
#  instead of silently tokenizing an HTTP error page)
response = requests.get('https://www.gutenberg.org/cache/epub/829/pg829.txt', timeout=30)
response.raise_for_status()
text = response.text

# tokenize the full text into a 1D tensor of integer token IDs
gtTokens = torch.tensor( tokenizer.encode(text),dtype=torch.long )
len(gtTokens)

In [None]:

print(f'{len(gtTokens):,} total tokens, and {:,} unique tokens.')
print(f"That's {:.1f}% of the tokenizer vocabulary.")

# Exercise 2: Find the 100 most common, 100 least common, and 100 unused tokens

In [None]:
# frequencies of tokens
freqidx =

# most and least common
mostcommon =
leastcommon =

# tokens in the dictionary that don't appear in the text
neverused = []
i = 0
while len(neverused)<100:
  if i not in gtTokens:

  i+=1

In [None]:
# print out some examples
# NOTE(review): indexing counts[i] and uniq[i] assumes `mostcommon` holds positions
#   into `uniq`/`counts` (not raw token IDs) - confirm against your Exercise 2 solution
print('20 MOST common tokens:')
for i in mostcommon[:20]:
  print(f' {counts[i]:4} occurrences of "{tokenizer.decode(uniq[i])}"')


print('\n\n20 LEAST common tokens:')



print('\n\n20 NEVER USED tokens:')


# Exercise 3: Average log-softmax output per token category in a random (untrained) model

In [None]:
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention.

  The layer sizes come from the module-level hyperparameters `n_heads` and
  `embed_dim`, which are read when the object is constructed.

  Input and output of forward() are both [batch, seq_len, embed_dim].

  Raises:
    ValueError: if `embed_dim` is not divisible by `n_heads`.
  """
  def __init__(self):
    super().__init__()

    # each head gets an equal slice of the embedding, so the split must be exact
    if embed_dim % n_heads != 0:
      raise ValueError(
        f'embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})')

    # number of attention heads
    self.num_heads = n_heads
    self.head_dim  = embed_dim // n_heads

    # the three Q,K,V weights matrices are initialized as one, and are split inside forward()
    self.QKV = nn.Linear(embed_dim, 3*embed_dim, bias=True)

    # linear mixing after attention
    self.W0 = nn.Linear(embed_dim, embed_dim, bias=True)


  def forward(self,x):
    """Apply causal self-attention to x of shape [batch, seq_len, embed_dim]."""

    # sizes for later use
    B, T, E = x.shape # [batch, seq_len, embed_dim]

    # push data through Q, K, and V in one concatenated matrix
    qkv = self.QKV(x) # [batch, sequence, 3*embed]
    q,k,v = torch.split(qkv,E,dim=2) # each matrix is [B, T, E]

    # reshape to [B, T, nHeads, head_dim]
    #  and then transpose to [B, nHeads, T, head_dim]
    q = q.view(B, T, self.num_heads, self.head_dim).transpose(1,2) # [B, nHeads, T, head_dim]
    k = k.view(B, T, self.num_heads, self.head_dim).transpose(1,2)
    v = v.view(B, T, self.num_heads, self.head_dim).transpose(1,2)

    # Pytorch's dot-product attention function handles multi-head shapes;
    #  is_causal=True stops each token from attending to later tokens
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True) # [B, nHeads, T, head_dim]

    # recombine heads: (B, nHeads, T, head_dim) -> [B, T, E]
    # reshape (not view): the transposed tensor is not guaranteed to be contiguous
    #  for every attention backend, and view() raises an error in that case
    out = out.transpose(1,2).reshape(B, T, E)

    # finally, linearly mix the attention heads
    return self.W0(out)




class TransformerBlock(nn.Module):
  """One pre-norm transformer block: attention then MLP, each with a residual connection.

  Layer widths come from the module-level hyperparameter `embed_dim`.
  Input and output of forward() have the same shape.
  """
  def __init__(self):
    super().__init__()

    # --- attention sub-block: normalize, then multi-head attention
    self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.attn = MultiHeadAttention()

    # --- MLP sub-block: normalize, expand to 4x the embedding width,
    #     apply GELU, and contract back to the embedding width
    self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-5)
    hidden_dim = 4*embed_dim
    self.mlp_1 = nn.Linear(embed_dim, hidden_dim, bias=True)
    self.gelu  = nn.GELU()
    self.mlp_2 = nn.Linear(hidden_dim, embed_dim, bias=True)

  def forward(self, x):
    """Return x after the attention and MLP residual updates."""

    # attention on the normalized input, added back onto the un-normalized input
    after_attn = x + self.attn(self.layernorm_1(x))

    # MLP on the normalized attention output: expand -> GELU -> contract
    expanded = self.mlp_1(self.layernorm_2(after_attn))
    contracted = self.mlp_2(self.gelu(expanded))

    # second residual connection
    return after_attn + contracted

In [None]:
class LanguageModel(nn.Module):
  """Decoder-only transformer language model.

  Sizes come from the module-level hyperparameters `n_vocab`, `embed_dim`,
  `seq_len`, and `n_blocks`.

  forward(idx) takes integer token IDs of shape [B, T] and returns
  log-probabilities over the vocabulary of shape [B, T, n_vocab]
  (log-softmax of the logits after dividing them by sqrt(embed_dim)).
  """
  def __init__(self):
    super().__init__()

    # token + position embeddings
    self.wte = nn.Embedding(n_vocab, embed_dim) # token embedding
    self.wpe = nn.Embedding(seq_len, embed_dim) # position embedding

    # transformer blocks
    self.transformerBlocks = nn.Sequential(*[TransformerBlock() for _ in range(n_blocks)])

    # final layernorm
    self.layernorm_final = nn.LayerNorm(embed_dim, eps=1e-5)

    # lm head, with weights tied to token embedding.
    # Assign the *same* Parameter object (do not wrap it in a new nn.Parameter):
    #  a second Parameter object would be registered as a separate parameter, and
    #  moving the model to another device would copy it separately and break the tie.
    self.final_head = nn.Linear(embed_dim, n_vocab, bias=False)
    self.final_head.weight = self.wte.weight


    ### --- weight initializations
    # NOTE: apply() visits wte before final_head, so the tied matrix ends up with the
    #  nn.Linear initialization (normal, std=.02) written over the embedding initialization
    self.apply(self.weightInits)

  def weightInits(self, module):
    """Initialize one submodule's weights in place (called via self.apply)."""

    # initialize nn.linear to normal with std=.02
    if isinstance(module, nn.Linear):
      nn.init.normal_(module.weight,mean=0,std=.02)

      # initialize bias terms to zero
      if module.bias is not None:
        nn.init.zeros_(module.bias)

    # nn.Embeddings to Xavier
    elif isinstance(module, nn.Embedding):
      nn.init.xavier_normal_(module.weight)
  ### ---


  def forward(self, idx):
    """Return log-probabilities [B, T, n_vocab] for token IDs idx of shape [B, T]."""

    # token + position embeddings
    # (positions are created on the same device as the input tokens)
    token_emb = self.wte(idx) # [B, T, E]
    positions = torch.arange(idx.shape[-1],device=idx.device)
    posit_emb = self.wpe(positions) # [T, E]
    x = token_emb + posit_emb # [B, T, E]

    # pass through each transformer block
    x = self.transformerBlocks(x)

    # final layernorm and unembeddings
    x = self.layernorm_final(x)
    logits = self.final_head(x)  # [B, T, n_vocab]

    # scale and logsoftmax
    outputs = F.log_softmax(logits/np.sqrt(embed_dim),dim=-1)

    return outputs

In [None]:
# instantiate the language model, then move it to the selected device
model = LanguageModel()
model = model.to(device)

In [None]:
# generate a batch of data
ix = torch.randint(len(gtTokens) - seq_len, size=(batch_size,))
X = gtTokens[ix[:,None] + torch.arange(seq_len)].to(device)

# forward pass
out =

# create a submatrix of all outputs for this set of words
submatrix =

print(f'Size of input: {X.shape}')
print(f'Size of output: {out.shape}')
print(f'Size of submatrix: {submatrix.shape}')

In [None]:
# average all output values:
most_ave  =
least_ave =
never_ave =

print(f'{most_ave:.2f} for most common')
print(f'{least_ave:.2f} for least common')
print(f'{never_ave:.2f} for unused')

In [None]:
# but the above calculations include model outputs *to* the actual tokens, which introduces a bias.

# reshape
Xflat = X.view(-1).cpu().numpy()

# extract a submatrix and flatten
submatrix =
subflat =

# filter using boolean indexing
mask = ~np.isin(Xflat,mostcommon)  # Boolean mask for filtering
data2ave =  # select only non-common token outputs


print(f'Size of input: {Xflat.shape}')
print(f'Size of output: {out.shape}')
print(f'Size of submatrix: {submatrix.shape}')
print(f'Size of flattened submatrix: {subflat.shape}')

In [None]:
# sanity-check that the mask has no high-frequency tokens
filtered_tokens = Xflat[mask]

# show the text that survives the mask
# (printed explicitly: a bare expression is only displayed when it is the last line of a cell)
print(tokenizer.decode(filtered_tokens))

# list any high-frequency token that slipped through the mask (ideally nothing is printed)
for m in mostcommon:
  if m in filtered_tokens:
    print(m)

# Exercise 4: Train the model

In [None]:
# negative log-likelihood loss, averaged over all targets
# (the model's forward pass already returns log-probabilities)
loss_function = nn.NLLLoss(reduction='mean')
loss_function = loss_function.to(device)

# AdamW optimizer over all model parameters
optimizer = torch.optim.AdamW(
  model.parameters(),
  lr=1e-3,
  weight_decay=1e-2,
)

In [None]:
num_samples = 501
tokenProbs = np.zeros((num_samples,3))

# initialize losses
train_loss = []

for sampli in range(num_samples):

  # get a batch of data
  ix =
  X  =
  y  =

  # move data to GPU
  X,y = X.to(device), y.to(device)

  # clear previous gradients
  model.zero_grad(set_to_none=True)

  # forward pass
  logProbs = model(X)

  # calculate the losses on the (reshaped) tokens
  loss = loss_function

  # backprop
  loss.backward()
  optimizer.step()

  # store the per-sample loss
  train_loss.append( loss.item() )



  ### calculate outputs for token frequency category
  # flatten the model inputs and outputs
  Xflat = X.view
  outs = logProbs.

  # MOST COMMON tokens: extract submatrix, and average tokens not present in this batch
  submatrix = outs.reshape(,-1)
  mask = ~np.isin(Xflat,mostcommon)
  tokenProbs[sampli,0] = submatrix[mask,:].mean()

  # LEAST COMMON tokens: extract submatrix, and average tokens not present in this batch


  # UNUSED tokens: extract submatrix, and average tokens not present in this batch



  # update progress display
  if sampli%100==0:
    print(f'Sample {sampli:4}, train loss: {train_loss[-1]:.4f}')

In [None]:
# training loss per batch, drawn as a black line
plt.figure(figsize=(8,4))
loss_axis = plt.gca()
loss_axis.plot(train_loss,'k',markersize=8)
loss_axis.set(xlabel='Data batch',ylabel='Loss')
plt.show()

In [None]:
# show the averages
plt.figure(figsize=(12,4))


plt.show()