|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>Model 3: One attention head</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# pytorch stuff
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
#n load GPT2 tokenizer
# `from_pretrained('gpt2')` fetches the pretrained GPT-2 tokenizer files;
# the resulting `tokenizer` is used below to encode text into token ids and decode them back.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Hyperparameters

In [None]:
# data hyperparameters
seq_len = 8 # aka context window (size of the position-embedding table; generation only looks at the last seq_len tokens)
n_vocab = tokenizer.vocab_size #n the transformers GPT2Tokenizer exposes `vocab_size` (`n_vocab` is the tiktoken attribute name)

# model hyperparameters
embed_dim = 2**6 # 64

# training hyperparameters
batch_size = 5

# The model

In [None]:
class Model(nn.Module):
  """Minimal GPT-style language model with a single causal self-attention head.

  Pipeline: token + position embeddings -> layernorm -> one attention head
  -> linear output projection (W0) -> unembedding (tied to the token embedding).
  This model has no MLP sublayer.

  The sizes come from the module-level hyperparameters `n_vocab`, `embed_dim`
  and `seq_len`.
  """

  def __init__(self):
    super().__init__()

    # embedding matrices
    self.embedding = nn.Embedding(n_vocab,embed_dim)
    self.positions = nn.Embedding(seq_len,embed_dim)

    # final output linear layer (unembeddings)
    self.finalLinear = nn.Linear(embed_dim,n_vocab,bias=False)

    # initialize the k,q,v matrices for attention          #n
    self.layernormA = nn.LayerNorm(embed_dim)              #n
    self.key   = nn.Linear(embed_dim,embed_dim,bias=False) #n
    self.query = nn.Linear(embed_dim,embed_dim,bias=False) #n
    self.value = nn.Linear(embed_dim,embed_dim,bias=False) #n
    self.W0    = nn.Linear(embed_dim,embed_dim)            #n

    # the final output layer is tied to the token embedding.
    # Assign the very same Parameter object (instead of wrapping it in a new
    # nn.Parameter) so that both layers share one parameter and one gradient.
    self.finalLinear.weight = self.embedding.weight



  def forward(self,tokx):
    """Compute next-token logits for a batch of token sequences.

    Args:
      tokx: integer token indices of shape [batch, numtokens], with
        numtokens <= seq_len (the position-embedding table has seq_len rows).

    Returns:
      A tuple `(logits, (pastmask, qk_scaled, qk_softmax))` where `logits` is
      [batch, numtokens, n_vocab] and the three attention matrices are each
      [batch, numtokens, numtokens].
    """

    n_tokens = tokx.shape[-1]

    # create the token+position embedding
    token_embed = self.embedding(tokx)
    posit_embed = self.positions(torch.arange(n_tokens,device=tokx.device)) # [numtokens, embedding_dims]

    # their sum is the output of the embeddings (the addition will broadcast for batch>1)
    x = token_embed + posit_embed # [batch, numtokens, embedding_dims]


    ##n --- attention sublayer begins here
    # layernorm before attention
    x = self.layernormA(x)

    # attention algo
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    qk = q@k.transpose(-2,-1) # dot product between query and keys (would be cosine similarity if normalized)
    qk_scaled = qk * embed_dim**-.5 # variance-scale the QK (for this model, embed_dim = head_size)

    # lower-triangular mask sized to the actual input length (not the global
    # seq_len), so that sequences shorter than the context window also work
    pastmask = torch.tril(torch.ones(x.shape[0],n_tokens,n_tokens,device=x.device))
    qk_scaled = qk_scaled.masked_fill(pastmask==0,-torch.inf) # future tokens get -inf -> zero weight after softmax
    qk_softmax = F.softmax(qk_scaled,dim=-1) # softmaxify
    y = qk_softmax @ v # final attention mechanism

    # linear output projection of the attention head (a plain assignment:
    # an in-place `*=` would multiply y by its own projection and overwrite a
    # tensor that autograd needs for the backward pass)
    y = self.W0(y)
    #n --- end attention


    # -o-
    # MLP sublayer would be here
    # -o-


    # final output transformation (unembeddings)
    y = self.finalLinear(y) / np.sqrt(embed_dim)

    return y, (pastmask,qk_scaled,qk_softmax) #n output some attention matrices for inspection (hooks are better in real models!)


  def generate(self,tokx,temperature=1,n_new_tokens=50):
    """Autoregressively sample new tokens and append them to `tokx`.

    Args:
      tokx: integer token indices of shape [batch, tokens].
      temperature: divisor applied to the logits before the softmax.
      n_new_tokens: number of tokens to sample.

    Returns:
      Tensor of shape [batch, tokens + n_new_tokens].
    """

    # sampling needs no gradients, so skip building the autograd graph
    with torch.no_grad():
      for _ in range(n_new_tokens):

        # get predictions, but only from the past seq_len tokens
        logits = self(tokx[:,-seq_len:])[0] # [batch, <=seq_len, n_vocab]

        # extract the final token to predict the next
        logits = logits[:,-1,:] # [batch, n_vocab]

        # apply softmax to get probability values over all tokens in the vocab - with temperature
        probs = F.softmax(logits/temperature,dim=-1) # [batch, n_vocab]

        # probabilistically sample from the distribution
        tokx_next = torch.multinomial(probs,num_samples=1) # [batch, 1]

        # append
        tokx = torch.cat( (tokx,tokx_next),dim=1) # [batch, (tokens+1)]
    return tokx


# Calculate logits (model output)

In [None]:
# build one example: the inputs are all tokens but the last,
# the targets are the same tokens shifted one position to the left
tokens = tokenizer.encode('I prefer oat milk in my coffee.')
X = torch.tensor([tokens[:-1]]) # wrapping in a list adds the leading batch dimension
y = torch.tensor([tokens[1:]])

# show both shapes, one per line
print(X.shape, y.shape, sep='\n')

In [None]:
# create a fresh (untrained) model and push the example sequence through it.
# `out` holds the model output (logits); `attn` is the tuple
# (pastmask, qk_scaled, qk_softmax) that forward() returns for inspection.
model = Model()
out,attn = model(X)

# logits shape
print(out.shape)

In [None]:
# compare the untrained model against chance level:
# a uniform distribution over the vocab gives a loss of -log(1/vocab_size)
print(f'Expected loss for random weights: {-np.log(1/tokenizer.vocab_size):.3f}')
# average of -log_softmax over ALL vocab entries (not only the target tokens)
print(f'Observed mean log-softmax output: {torch.mean(-F.log_softmax(out.detach(),dim=-1)):.3f}')
# the actual loss: logits flattened to [tokens, vocab] and compared against the flattened targets
print(f'Cross-entropy loss from pytorch:  {F.cross_entropy(out.view(-1, out.shape[-1]), y.view(-1)):.3f}')

In [None]:
# inspect the three attention matrices returned by forward(), in the order
# (pastmask, qk_scaled, qk_softmax)
print('Time-causal mask:\n',attn[0])
print('\nqk_scaled:\n',attn[1])
print('\nqk_softmax:\n',attn[2])

In [None]:
# visualize the softmaxed attention weights as an image
# (squeeze drops the singleton batch dimension; the color range is clipped to [0, .6])
plt.imshow(attn[2].detach().squeeze(),vmin=0,vmax=.6,cmap='plasma')
plt.gca().set(xlabel='Token weighting',ylabel='Token being processed',title='Softmaxified QK$^T$')
plt.colorbar(pad=.02)
plt.show()

In [None]:
# sanity-check ;)
# the softmax was taken over the last dimension, so after squeezing out the
# batch dimension each row should sum to 1
attn[2].detach().squeeze().sum(dim=1)

# Generate text

In [None]:
# prompt for the model to continue
text = 'When I grow up, I want to be a'

# encode the prompt; wrapping the id list in another list gives the batch dimension
tokens = torch.tensor([tokenizer.encode(text)])

# sample 10 additional tokens and keep the (only) sequence in the batch
generated_tokens = model.generate(tokens,n_new_tokens=10,temperature=2)[0]

# let's see how it looks!
tokenizer.decode(generated_tokens.tolist())

In [None]:
# generate from the same prompt at several sampling temperatures

temps = [ .2, .7, 1, 2, 10 ] # outrageous values...

for T in temps:
  # sample 10 new tokens, then take the single batch entry as a plain list of ids
  tokz = model.generate(tokens,n_new_tokens=10,temperature=T)[0].tolist()
  print(f'Temp = {T}:\n  {tokenizer.decode(tokz)}\n')