|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Time model5 on CPU and GPU</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import time

In [None]:
# run on the first CUDA GPU when one is available; otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

In [None]:
# architecture constants matching GPT2-124M
n_vocab = 50257   # number of tokens in the GPT2 vocabulary
embed_dim = 768   # width of every embedding vector
seq_len = 1024    # longest sequence the position embedding covers
n_heads = 12      # attention heads in each transformer block
n_blocks = 12     # how many transformer blocks are stacked

# sequences per batch in the timing experiments
batch_size = 8

# Exercise 1: Create (and time) two model instances

In [None]:
### ----------- Class for multihead attention ----------- ###
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention with one fused Q/K/V projection.

  The layer sizes are taken from the module-level hyperparameters
  `embed_dim` and `n_heads` at the moment an instance is created.
  """

  def __init__(self):
    super().__init__()

    # fail early (instead of inside forward) if the embedding cannot be split evenly
    if embed_dim % n_heads != 0:
      raise ValueError(f'embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})')

    # number of attention heads, and the width of each head
    self.num_heads = n_heads
    self.head_dim  = embed_dim // n_heads

    # the three Q,K,V matrices are initialized as one, and are split inside forward()
    self.QKV = nn.Linear(embed_dim, 3*embed_dim, bias=True)

    # linear projection after attention
    self.W0 = nn.Linear(embed_dim, embed_dim, bias=True)


  def forward(self,x):
    """Apply causal self-attention.

    Args:
      x: activations of shape [batch, seq_len, embed_dim].

    Returns:
      Tensor with the same shape as `x`.
    """

    # sizes for later use
    B, T, E = x.shape # [batch, seq_len, embed_dim]

    # push data through Q, K, and V in one concatenated matrix
    qkv = self.QKV(x) # [batch, sequence, 3*embed]
    q,k,v = torch.split(qkv,E,dim=2) # each matrix is [B, T, E]

    # reshape to [B, T, nHeads, head_dim]
    #  and then transpose to [B, nHeads, T, head_dim]
    q = q.view(B, T, self.num_heads, self.head_dim).transpose(1,2) # [B, nHeads, T, head_dim]
    k = k.view(B, T, self.num_heads, self.head_dim).transpose(1,2)
    v = v.view(B, T, self.num_heads, self.head_dim).transpose(1,2)

    # Pytorch's dot-product attention function handles multi-head shapes
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True) # [B, nHeads, T, head_dim]

    # recombine heads: (B, nHeads, T, head_dim) -> [B, T, E]
    # reshape (not view): the transposed tensor is not guaranteed to be contiguous,
    #  and that depends on which attention backend produced it
    out = out.transpose(1,2).reshape(B, T, E)

    # finally, linearly mix the attention heads
    out = self.W0(out)

    return out
### --------------- multihead attention --------------- ###





### ----------- Class for Transformer block ----------- ###
class TransformerBlock(nn.Module):
  """One pre-norm transformer block: attention sub-block followed by an MLP sub-block.

  Each sub-block normalizes its input, transforms it, and adds the result
  back onto the un-normalized input (residual connection).
  """

  def __init__(self):
    super().__init__()

    # --- attention sub-block: layernorm, then multihead attention
    self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.attn = MultiHeadAttention()

    # --- MLP sub-block: layernorm, expand to 4x the embedding width,
    #     apply GELU, and contract back to the embedding width
    self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.mlp_1 = nn.Linear(embed_dim, 4*embed_dim, bias=True)
    self.gelu  = nn.GELU()
    self.mlp_2 = nn.Linear(4*embed_dim, embed_dim, bias=True)

  def forward(self, x):
    """Run `x` through the attention and MLP sub-blocks; the output has the shape of `x`."""

    # attention on the normalized input, added onto the original input
    after_attn = x + self.attn(self.layernorm_1(x))

    # expansion -> nonlinearity -> contraction on the normalized attention output
    hidden = self.mlp_1(self.layernorm_2(after_attn))
    mlp_out = self.mlp_2(self.gelu(hidden))

    # second residual connection
    return after_attn + mlp_out
### --------------- Transformer block --------------- ###






### --------------- class for the language model --------------- ###
class LanguageModel(nn.Module):
  """Decoder-only transformer language model.

  Token and position embeddings feed a stack of `n_blocks` transformer
  blocks, followed by a final layernorm and an unembedding layer whose
  weight matrix is shared with the token embedding. Sizes are read from the
  module-level hyperparameters when an instance is created.

  Args:
    device: kept for backward compatibility and stored as `self.device`.
      The forward pass takes the device from its input, so the stored value
      does not have to be kept in sync when the model is moved.
  """

  def __init__(self,device=None):
    super().__init__()

    # token + position embeddings
    self.wte = nn.Embedding(n_vocab, embed_dim) # token embedding
    self.wpe = nn.Embedding(seq_len, embed_dim) # position embedding

    # transformer blocks
    self.transformerBlocks = nn.Sequential(*[TransformerBlock() for _ in range(n_blocks)])

    # final layernorm
    self.layernorm_final = nn.LayerNorm(embed_dim, eps=1e-5)

    # lm head, with weights tied to token embedding.
    # Assign the very same Parameter object (no new nn.Parameter wrapper): a wrapper
    #  would be a second, distinct parameter, so the optimizer would receive the
    #  matrix twice and .to(device) would give each one its own copy.
    self.final_head = nn.Linear(embed_dim, n_vocab, bias=False)
    self.final_head.weight = self.wte.weight

    self.device = device


  def forward(self, idx):
    """Compute next-token logits.

    Args:
      idx: integer token indices, [B, T] with T <= seq_len.

    Returns:
      Logits of shape [B, T, n_vocab] (no softmax is applied).
    """

    # token + position embeddings
    # (positions are created on the same device as the input tokens)
    token_emb = self.wte(idx) # [B, T, E]
    posit_emb = self.wpe(torch.arange(idx.shape[-1],device=idx.device)) # [T, E]
    x = token_emb + posit_emb # [B, T, E]

    # pass through each transformer block
    x = self.transformerBlocks(x)

    # final layernorm and unembeddings
    x = self.layernorm_final(x)
    logits = self.final_head(x)  # [B, T, n_vocab]

    return logits


  def generate(self, idx, temperature=1., max_new_tokens=50):
    """Sample `max_new_tokens` tokens and append them to `idx`.

    Args:
      idx: starting token indices, [B, T].
      temperature: divisor applied to the logits before the softmax.
      max_new_tokens: number of tokens to sample.

    Returns:
      Token indices of shape [B, T + max_new_tokens].
    """

    # sampling does not need gradients, so don't build an autograd graph per step
    with torch.no_grad():
      for _ in range(max_new_tokens):

        # forward pass (only the most recent seq_len tokens fit the position embedding)
        logits = self(idx[:,-seq_len:])  # [B, T, n_vocab]
        logits = logits[:,-1,:]  # last token's logits: [B, n_vocab]

        # apply temperature + softmax
        probs = F.softmax(logits/temperature, dim=-1) # [B, n_vocab]

        # sample next token
        idx_next = torch.multinomial(probs, num_samples=1) # [B, 1]

        # append
        idx = torch.cat((idx, idx_next), dim=1) # [B, T+1]
    return idx
### ------------------ language model ------------------ ###

In [None]:
# time how long it takes to create each model instance on the CPU vs GPU
print('--- Creating the model ---')

# CUDA work is launched asynchronously, so wait for the GPU both before starting
#  and before stopping the clock (skipped when `device` is the CPU, where
#  torch.cuda.synchronize() would raise)
if device.type == 'cuda':
  torch.cuda.synchronize()
start_time = time.perf_counter()
model_gpu = LanguageModel(device=device).to(device)
if device.type == 'cuda':
  torch.cuda.synchronize()
print(f'--- GPU: {time.perf_counter()-start_time:.3f} sec')

start_time = time.perf_counter()
model_cpu = LanguageModel(device='cpu')
print(f'--- CPU: {time.perf_counter()-start_time:.3f} sec')

In [None]:
# confirm where the weights live: first line is the GPU model, second line the CPU model
print(model_gpu.wte.weight.device, model_cpu.wte.weight.device, sep='\n')

# Exercise 2: Time a forward pass

In [None]:
numReps = 5


### test the GPU model
# (synchronize only when a GPU is really in use; the call raises without CUDA)
if device.type == 'cuda':
  torch.cuda.synchronize() # finish any pending GPU work before starting the clock
start_time = time.perf_counter()
for _ in range(numReps):

  # some fake data (one batch)
  data = torch.randint(0,n_vocab,size=(batch_size,seq_len),device=device)
  outG = model_gpu(data)

# CUDA kernels are launched asynchronously: wait until they have finished,
#  otherwise only the launch time is measured
if device.type == 'cuda':
  torch.cuda.synchronize()
print(f'--- GPU: {time.perf_counter()-start_time:6.3f} sec')



### repeat for the CPU model
start_time = time.perf_counter()
for _ in range(numReps):

  # some fake data (one batch)
  data = torch.randint(0,n_vocab,size=(batch_size,seq_len),device='cpu')
  outC = model_cpu(data)

print(f'--- CPU: {time.perf_counter()-start_time:6.3f} sec')

# Exercise 3: Time backprop

In [None]:
### run this cell to create a loss function and optimizer

# The models output raw logits (no log-softmax), so use cross-entropy, which applies
#  log-softmax and negative log-likelihood in one step. NLLLoss on its own expects
#  log-probabilities as input.

# GPU: create the loss and optimizer functions
lossfun_gpu = nn.CrossEntropyLoss().to(device)
optimizer_gpu = torch.optim.AdamW(model_gpu.parameters(), lr=.001)

# CPU: create the loss and optimizer functions
lossfun_cpu = nn.CrossEntropyLoss()
optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=.001)

In [None]:
### copy these lines to put into a for-loop to run backprop

data = torch.randint(0,n_vocab,size=(batch_size,seq_len),device=device) # create data directly on the target device
model_gpu.zero_grad() # clear previous gradients
outG = model_gpu(data) # forward pass
lossG = lossfun_gpu(outG[:,-1,:],data[:,-1]) # calculate the loss (final position only)
lossG.backward() # backprop: calculate gradients
optimizer_gpu.step() # update the weights using the gradients

In [None]:
### test the GPU model
# (synchronize only when a GPU is really in use; the call raises without CUDA)
if device.type == 'cuda':
  torch.cuda.synchronize() # finish any pending GPU work before starting the clock
start_time = time.perf_counter()
for _ in range(numReps):
  data = torch.randint(0,n_vocab,size=(batch_size,seq_len),device=device) # create data directly on the device
  model_gpu.zero_grad() # clear previous gradients
  outG = model_gpu(data) # forward pass
  lossG = lossfun_gpu(outG[:,-1,:],data[:,-1]) # calculate the loss
  lossG.backward() # backprop: calculate gradients
  optimizer_gpu.step() # update the weights

# CUDA kernels are launched asynchronously: wait for them before stopping the clock
if device.type == 'cuda':
  torch.cuda.synchronize()
print(f'--- GPU: {time.perf_counter()-start_time:6.3f} sec')


### test the CPU model
# (CPU ops run synchronously, so no synchronization is needed here)
start_time = time.perf_counter()
for _ in range(numReps):
  data = torch.randint(0,n_vocab,size=(batch_size,seq_len)) # create data
  model_cpu.zero_grad() # clear previous gradients
  outC = model_cpu(data) # forward pass
  lossC = lossfun_cpu(outC[:,-1,:],data[:,-1]) # calculate the loss
  lossC.backward() # backprop: calculate gradients
  optimizer_cpu.step() # update the weights

print(f'--- CPU: {time.perf_counter()-start_time:6.3f} sec')