|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>Model 4: Multiple Transformer blocks</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# pytorch stuff
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
#n load GPT2 tokenizer
# (in this file it supplies the vocabulary size and encodes the example sentence)
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Hyperparameters

In [None]:
# data hyperparameters
seq_len = 8 # aka context window; also the number of rows in the position-embedding table
n_vocab = tokenizer.vocab_size # vocabulary size reported by the tokenizer

# model hyperparameters
embed_dim = 128 # length of each token/position embedding vector
nTransformerBlocks = 12 # how many Transformer blocks are stacked in the model


# training hyperparameters
batch_size = 5 # NOTE(review): not referenced in the visible part of this file

# #n One attention head

In [None]:
# a single causal self-attention head
class OneAttentionHead(nn.Module):
  """One attention head: k/q/v projections, causal attention, output mixing.

  Args:
    embed_dim: size of the last axis of the input; all four weight matrices
      are square (embed_dim x embed_dim) and carry no bias term.
  """

  def __init__(self,embed_dim):
    super().__init__()

    # register the four bias-free projections in a fixed order
    # (key, query, value, then the post-attention mixing matrix W0)
    for layer_name in ('key','query','value','W0'):
      setattr(self,layer_name,nn.Linear(embed_dim,embed_dim,bias=False))

  def forward(self,x):
    """Apply causal self-attention to `x` and return a tensor of the same shape.

    Args:
      x: embeddings whose last axis has length embed_dim
        (assumed [batch, seq_len, embed_dim] -- confirm against the caller).
    """

    # project the embeddings into queries, keys, and values
    queries = self.query(x)
    keys    = self.key(x)
    values  = self.value(x)

    # causal masking is requested via is_causal=True
    attended = F.scaled_dot_product_attention(queries,keys,values,is_causal=True)

    # linear weightings post-attention
    return self.W0(attended)

# #n Transformer block

In [None]:
# one Transformer block: attention sublayer followed by an MLP sublayer
class TransformerBlock(nn.Module):
  """Transformer block with LayerNorm applied before each sublayer.

  Both sublayers add their output back onto their input (residual
  connections), so the block returns a tensor shaped like its input.

  Args:
    embed_dim: size of the embedding axis; the MLP expands it 4x internally.
  """

  def __init__(self,embed_dim):
    super().__init__()

    hidden_dim = 4*embed_dim # width of the MLP's hidden layer

    # attention sublayer
    self.layerNormAttn = nn.LayerNorm(embed_dim)
    self.attn = OneAttentionHead(embed_dim)

    # feedforward (MLP) sublayer: expand -> nonlinearity -> contract
    self.layerNormMLP  = nn.LayerNorm(embed_dim)
    self.W1   = nn.Linear(embed_dim,hidden_dim)
    self.gelu = nn.GELU()
    self.W2   = nn.Linear(hidden_dim,embed_dim)


  def forward(self,x):
    """Run `x` through the attention sublayer, then the MLP sublayer."""

    # attention sublayer: normalize, attend, add back onto the input
    attn_out = self.attn(self.layerNormAttn(x))
    x = x + attn_out

    # MLP sublayer: normalize, expand, GELU, contract, add back
    hidden = self.W1(self.layerNormMLP(x))
    hidden = self.gelu(hidden)
    mlp_out = self.W2(hidden)

    return x + mlp_out

# The full model

In [None]:
# the full model class, which calls the previously defined classes
class LanguageModel(nn.Module):
  """Language model: embeddings -> Transformer blocks -> unembedding.

  Relies on the module-level names `n_vocab` (vocabulary size) and `seq_len`
  (context-window length) being defined before an instance is created.

  Args:
    nTransformerBlocks: number of TransformerBlock modules applied in sequence.
    embed_dim: length of the token/position embedding vectors.
  """

  def __init__(self,nTransformerBlocks,embed_dim):
    super().__init__()

    # embedding matrices (token identity and token position)
    self.embedding = nn.Embedding(n_vocab,embed_dim)
    self.positions = nn.Embedding(seq_len,embed_dim)


    #n multiple Transformer blocks, applied one after the other
    self.transformerBlocks = nn.Sequential(*[TransformerBlock(embed_dim) for _ in range(nTransformerBlocks)])


    # embedding to output (linear) layer
    self.finalLayerNorm = nn.LayerNorm(embed_dim) # final layernorm after transformers and before unembeddings
    self.finalLinear = nn.Linear(embed_dim,n_vocab,bias=False)

    # The final output layer is tied to the token embedding by sharing the SAME
    # Parameter object. Wrapping the weight in a new nn.Parameter would create a
    # second parameter that only shares storage: it would be listed twice by
    # .parameters(), get its own gradient/optimizer state, and lose the tie
    # when the model is moved to another device.
    self.finalLinear.weight = self.embedding.weight


  def forward(self,tokx):
    """Return next-token logits for every position in `tokx`.

    Args:
      tokx: integer token indices, [batch, tokens]; the number of tokens
        must not exceed the size of the position-embedding table.

    Returns:
      Logits of shape [batch, tokens, n_vocab].
    """

    ## --------------------- embeddings --------------------- ##
    token_embed = self.embedding(tokx)
    # build the position indices on the same device as the tokens so the
    # model also works after being moved off the CPU
    posit_idx = torch.arange(tokx.shape[-1],device=tokx.device)
    posit_embed = self.positions(posit_idx) # [tokens x embedding_dim]
    x = token_embed + posit_embed # [batch, tokens, embedding_dim]
    ## ------------------------------------------------------ ##


    ## --- transformer blocks --- ##
    x = self.transformerBlocks(x)
    ## -------------------------- ##


    ## - finally, unembeddings - ##
    x = self.finalLayerNorm(x)
    x = self.finalLinear(x)
    ## ------------------------- ##

    return x # not returning attention matrices like in model3


  def generate(self,tokx,temperature=1.,n_new_tokens=50):
    """Autoregressively sample `n_new_tokens` tokens and append them to `tokx`.

    Args:
      tokx: integer token indices used as the prompt, [batch, tokens].
      temperature: divisor applied to the logits before the softmax.
      n_new_tokens: number of tokens to sample.

    Returns:
      Token indices of shape [batch, tokens + n_new_tokens].
    """

    # the context window is fixed by the position-embedding table of THIS model
    context_len = self.positions.num_embeddings

    # sampling needs no gradients; skip building the autograd graph
    with torch.no_grad():
      for _ in range(n_new_tokens):

        # get predictions, but only from the most recent context_len tokens
        logits = self(tokx[:,-context_len:]) # [batch, <=context_len, n_vocab]

        # keep the final position, which predicts the next token
        logits = logits[:,-1,:] # [batch, n_vocab]

        # softmax with temperature -> probabilities over the vocab
        probs = F.softmax(logits/temperature,dim=-1) # [batch, n_vocab]

        # probabilistically sample from the distribution
        tokx_next = torch.multinomial(probs,num_samples=1) # [batch, 1]

        # append
        tokx = torch.cat( (tokx,tokx_next),dim=1) # [batch, (tokens+1)]

    return tokx


# Create a model instance and inspect

In [None]:
# build the model from the hyperparameters defined earlier
llm = LanguageModel(nTransformerBlocks,embed_dim)
llm # bare expression: as the last line of a notebook cell, this displays the model

In [None]:
# look at a single Transformer block (index 4 of the nn.Sequential container)
llm.transformerBlocks[4]

In [None]:
# the attention head inside that block
llm.transformerBlocks[4].attn

In [None]:
# the query weights of that head, detached from autograd and converted to a NumPy array
llm.transformerBlocks[4].attn.query.weight.detach().cpu().numpy()

In [None]:
# build a one-sequence batch for next-token prediction
tokens = tokenizer.encode('I prefer oat milk in my coffee.')
X = torch.tensor([tokens[:-1]]) # inputs: every token except the last, with a leading batch axis
y = torch.tensor([tokens[1:]])  # targets: the same tokens shifted one position ahead

# report both shapes, one per line
print(X.shape,y.shape,sep='\n')

In [None]:
# forward pass of the example tokens through the model
out = llm(X)

# the model's forward returns the output of its final linear (unembedding) layer
print(out.shape)