|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Pretrain LLMs</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Fine dropout in model 5</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
from transformers import GPT2Tokenizer
# load the pretrained GPT-2 tokenizer; it converts text to integer token ids (encode) and back (decode)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# vocabulary size; used later to size the token-embedding table and the model's output layer
n_vocab = tokenizer.vocab_size

In [None]:
# model and training hyperparameters (model sizes follow GPT2-124M)
embed_dim = 768   # width of every token/position embedding vector
seq_len = 256     # number of tokens in each training sequence (max context length)
n_heads = 12      # attention heads per transformer block
n_blocks = 12     # number of stacked transformer blocks
batch_size = 64   # sequences per batch
dropout = 0.1     # dropout proportion (fraction of activations that is dropped)

# run on the first GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Exercise 1: Create a fine dataset

In [None]:
!pip install datatrove
from datatrove.pipeline.readers import ParquetReader

In [None]:
# import the docs
numDocs = 1000 # each doc has ~750 tokens
data_reader = ParquetReader('hf://datasets/HuggingFaceFW/fineweb/data',limit=numDocs)

# join all texts into one token vector.
# The ids are gathered in a plain Python list and converted once at the end:
# list.extend is amortized O(1) per token, whereas growing a numpy array with
# np.append re-copies the entire array for every document.
token_list = []
for t in data_reader():
  token_list.extend(tokenizer.encode(t.text))

# need a pytorch tensor for training
# (built directly as integers, so the ids never pass through a floating-point array)
texttokens = torch.tensor(token_list,dtype=torch.long)

In [None]:
# token count: total length of the token vector and the number of distinct token ids in it
# (torch.unique returns the distinct values directly, so no conversion to numpy/set is needed)
print(f'\nThere are {len(texttokens):,} tokens, of which {texttokens.unique().numel():,} are unique.')

### Function to get training/testing data

In [None]:
# train/test split: the leading part of the token vector is for training, the rest for testing

# proportion of tokens used for training
train_ratio = .9

# index of the first test token
test_split_point = int(len(texttokens)*train_ratio)

# contiguous (unshuffled) slices on either side of the split point
train_data,test_data = texttokens[:test_split_point],texttokens[test_split_point:]


# a function that returns a batch of data samples
def get_data_batch(training=True):
  """Draw a random batch of token windows and their next-token targets.

  Args:
    training: if truthy, sample from `train_data`; otherwise from `test_data`.

  Returns:
    (X, y): two tensors of size [batch_size, seq_len]. Row i of X is a window of
    `seq_len` consecutive tokens that starts at a random position; row i of y is
    the same window shifted one token to the right (i.e., the target for each
    position of X is the token that follows it).
  """

  # pick the dataset to use
  data = train_data if training else test_data

  # random start positions; the upper bound leaves room for the shifted target window
  starts = torch.randint(len(data)-seq_len,size=(batch_size,))

  # broadcasting a column of starts against a row of offsets gives a [batch_size, seq_len] index matrix
  window_idx = starts.unsqueeze(1) + torch.arange(seq_len)

  # inputs, and targets one token further along
  X = data[window_idx]
  y = data[window_idx+1]
  return X,y


# example: draw one training batch and inspect it
# (each row of y is the same window as the matching row of X, shifted one token to the right)
X,y = get_data_batch()
print(f'Input data (size {X.shape}):\n',X)
print(f'\n\nTargets (size {y.shape}):\n',y)

# Exercise 2: The model with dropout

In [None]:
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention with a fused Q/K/V projection.

  Sizes come from the module-level hyperparameters `embed_dim` and `n_heads`;
  the attention-dropout rate comes from the module-level `dropout`.

  forward(x):
    x:       [batch, seq_len, embed_dim]
    returns: [batch, seq_len, embed_dim]

  Raises:
    ValueError: at construction, if `embed_dim` is not divisible by `n_heads`.
  """
  def __init__(self):
    super().__init__()

    # the embedding is cut into n_heads equal slices, so it must divide evenly
    if embed_dim % n_heads != 0:
      raise ValueError(f'embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})')

    # number of attention heads, and the width of each head
    self.num_heads = n_heads
    self.head_dim  = embed_dim // n_heads

    # the three Q,K,V weights matrices are initialized as one, and are split inside forward()
    self.QKV = nn.Linear(embed_dim, 3*embed_dim, bias=True)

    # linear mixing after attention
    self.W0 = nn.Linear(embed_dim, embed_dim, bias=True)

    # no nn.Dropout module here: attention dropout is applied inside F.scaled_dot_product_attention


  def forward(self,x):

    # sizes for later use
    B, T, E = x.shape # [batch, seq_len, embed_dim]

    # push data through Q, K, and V in one concatenated matrix
    qkv = self.QKV(x) # [batch, sequence, 3*embed]
    q,k,v = torch.split(qkv,E,dim=2) # each matrix is [B, T, E]

    # reshape to [B, T, nHeads, head_dim]
    #  and then transpose to [B, nHeads, T, head_dim]
    q = q.view(B, T, self.num_heads, self.head_dim).transpose(1,2) # [B, nHeads, T, head_dim]
    k = k.view(B, T, self.num_heads, self.head_dim).transpose(1,2)
    v = v.view(B, T, self.num_heads, self.head_dim).transpose(1,2)

    # the functional attention call has no train/eval switch of its own,
    # so the dropout rate is set to zero by hand outside of training
    dropp = dropout if self.training else 0.
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True, dropout_p=dropp)

    # recombine heads: [B, nHeads, T, head_dim] -> [B, T, E]
    # reshape (not view): the transposed tensor is not guaranteed to be contiguous
    # in memory, and view raises an error when it isn't; reshape copies only if needed
    out = out.transpose(1,2).reshape(B, T, E)

    # finally, linearly mix the attention heads
    out = self.W0(out)

    return out




class TransformerBlock(nn.Module):
  """One pre-norm transformer block: an attention subblock followed by an MLP subblock.

  Each subblock computes an adjustment from a layer-normalized copy of its input,
  applies dropout to that adjustment, and adds it back to the un-normalized input.
  The same nn.Dropout module is used for both subblocks.

  forward(x) returns a tensor with the same size as x.
  """
  def __init__(self):
    super().__init__()

    # --- attention subblock ---
    self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-5)
    self.attn = MultiHeadAttention()

    # --- feedforward (MLP) subblock: expand to 4x the embedding size, nonlinearity, contract back ---
    self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-5)
    hidden_dim = 4*embed_dim
    self.mlp_1 = nn.Linear(embed_dim, hidden_dim, bias=True)
    self.gelu  = nn.GELU()
    self.mlp_2 = nn.Linear(hidden_dim, embed_dim, bias=True)

    # dropout applied to the output of each subblock before the residual addition
    self.trn_dropout = nn.Dropout(dropout)

  def forward(self, x):

    # attention: normalize -> attend -> dropout -> add to the input
    attended = self.attn(self.layernorm_1(x))
    x = x + self.trn_dropout(attended)

    # MLP: normalize -> expand -> GELU -> contract -> dropout -> add
    expanded = self.gelu(self.mlp_1(self.layernorm_2(x)))
    return x + self.trn_dropout(self.mlp_2(expanded))

In [None]:
class LanguageModel(nn.Module):
  """GPT-2-style decoder-only language model with dropout.

  Architecture: token + position embeddings -> dropout -> `n_blocks` transformer
  blocks -> final layernorm -> unembedding layer that shares its weight matrix
  with the token-embedding table.

  All sizes come from the module-level hyperparameters
  (`n_vocab`, `embed_dim`, `seq_len`, `n_blocks`, `dropout`).
  """
  def __init__(self):
    super().__init__()

    # token + position embeddings
    self.wte = nn.Embedding(n_vocab, embed_dim) # token embedding
    self.wpe = nn.Embedding(seq_len, embed_dim) # position embedding
    # dropout applied to the summed embeddings
    self.emb_dropout = nn.Dropout(dropout)

    # transformer blocks
    self.transformerBlocks = nn.Sequential(*[TransformerBlock() for _ in range(n_blocks)])

    # final layernorm
    self.layernorm_final = nn.LayerNorm(embed_dim, eps=1e-5)

    # lm head, with weights tied to token embedding.
    # The tie is made by assigning the very same Parameter object: wrapping it in a
    # new nn.Parameter(...) would create a second, distinct parameter that
    # model.parameters() (and therefore the optimizer) lists separately, and that
    # can end up with its own storage after the model is moved to another device.
    self.final_head = nn.Linear(embed_dim, n_vocab, bias=False)
    self.final_head.weight = self.wte.weight

    # weight initializations
    # (final_head is visited after wte, so for the shared matrix it is the
    #  nn.Linear initialization that remains)
    self.apply(self.weightInits)

  def weightInits(self, module):
    """Initialization rule applied to every submodule via self.apply()."""

    # initialize nn.linear to normal with std=.02
    if isinstance(module, nn.Linear):
      nn.init.normal_(module.weight,mean=0,std=.02)

      # initialize bias terms to zero
      if module.bias is not None:
        nn.init.zeros_(module.bias)

    # nn.Embeddings to Xavier
    if isinstance(module, nn.Embedding):
      nn.init.xavier_normal_(module.weight)


  def forward(self, idx):
    """Map token indices to scaled output logits.

    Args:
      idx: integer tensor of token indices, [B, T] with T <= seq_len.

    Returns:
      Tensor [B, T, n_vocab]: logits divided by sqrt(embed_dim). No (log-)softmax
      is applied here; the caller is responsible for it.
    """

    # token + position embeddings
    # (positions are created on the same device as the input tokens, so the model
    #  works wherever it and its inputs live, independent of the global `device`)
    token_emb = self.wte(idx) # [B, T, E]
    posit_emb = self.wpe(torch.arange(idx.shape[-1],device=idx.device)) # [T, E]
    x = token_emb + posit_emb # [B, T, E]
    x = self.emb_dropout(x) # dropout after summing token and position embeddings

    # pass through each transformer block
    x = self.transformerBlocks(x)

    # final layernorm and unembeddings
    x = self.layernorm_final(x)
    logits = self.final_head(x)  # [B, T, n_vocab]

    # scaled outputs (no logsoftmax here)
    outputs = logits/np.sqrt(embed_dim)

    return outputs


  def generate(self, idx, temperature, max_new_tokens=50):
    """Autoregressively sample `max_new_tokens` tokens to follow `idx`.

    Generation runs in eval mode (dropout off) and without gradient tracking;
    the model's previous train/eval mode is restored before returning.

    Args:
      idx: integer tensor of prompt tokens, [B, T].
      temperature: divisor applied to the logits before the softmax
        (larger -> flatter distribution). Must be nonzero.
      max_new_tokens: number of tokens to append.

    Returns:
      Integer tensor [B, T+max_new_tokens]: the prompt followed by the sampled tokens.
    """

    # remember the current mode so that calling generate() mid-training doesn't change it
    was_training = self.training
    self.eval()

    try:
      with torch.no_grad():
        for _ in range(max_new_tokens):

          # forward pass on (at most) the most recent seq_len tokens
          logits = self(idx[:,-seq_len:])  # [B, T, n_vocab]
          logits = logits[:,-1,:]  # last token's logits: [B, n_vocab]

          # softmax probability values, with temperature
          probs = F.softmax(logits/temperature,dim=-1)

          # sample next token
          idx_next = torch.multinomial(probs, num_samples=1) # [B, 1]

          # append
          idx = torch.cat((idx, idx_next), dim=1) # [B, T+1]
    finally:
      self.train(was_training)

    return idx


In [None]:
# create a new instance of the model (sizes come from the hyperparameters defined earlier)
# and move its parameters to the selected device
model = LanguageModel().to(device)

# Exercise 3: Train on final token

In [None]:
# create the loss and optimizer functions
# NLLLoss takes log-probabilities as input; the model returns raw (scaled) logits,
# so log-softmax is applied to the model output wherever the loss is computed
loss_function = nn.NLLLoss().to(device)
# AdamW over all model parameters, with learning rate .001 and weight decay .01
optimizer = torch.optim.AdamW(model.parameters(), lr=.001, weight_decay=.01)

In [None]:
# sanity check on a single batch before training
# get a batch of data
X,y = get_data_batch()

# push it through the model
logits = model(X.to(device))

# calculate the loss on the final target token only (with log-softmax!):
# logits[:,-1,:] is the prediction at the last sequence position, and y[:,-1] is the token that follows it
loss = loss_function(F.log_softmax(logits[:,-1,:],dim=-1),y[:,-1].to(device))
# bare expression: shows the loss value as the cell output when run in a notebook
loss

# Now train the model!

In [None]:
# number of training batches to process
num_samples = 1234

# loss histories: one training entry per batch, one test entry per evaluation
train_loss = []
test_loss = []


for sampli in range(num_samples):

  # draw a fresh training batch
  X,y = get_data_batch()

  # discard the gradients from the previous step
  model.zero_grad(set_to_none=True)

  # forward pass, scoring only the prediction for the final target token (with log-softmax!)
  logits = model(X.to(device))
  loss = loss_function(F.log_softmax(logits[:,-1,:],dim=-1),y[:,-1].to(device))

  # backprop and parameter update
  loss.backward()
  optimizer.step()

  # store this batch's loss
  train_loss.append( loss.item() )


  # the rest of the loop body is the test-set evaluation, run on every 80th batch (starting with the first)
  if sampli%80!=0:
    continue

  model.eval() # deactivate dropout
  with torch.no_grad():
    X,y = get_data_batch(training=False) # testset data
    out = model(X.to(device))            # forward pass
    thisloss = loss_function(F.log_softmax(out[:,-1,:],dim=-1),y[:,-1].to(device)) # calculate loss
  test_loss.append( thisloss.item() )
  model.train() # reactivate dropout

  # update our progress
  print(f'Sample {sampli:4}, train loss: {train_loss[-1]:6.3f}, test loss: {test_loss[-1]:6.3f}')

In [None]:
# plot the losses
plt.plot(train_loss,'k',markersize=8,label='Train loss')

# one test loss was stored every 80 training batches, starting at batch 0.
# The x-values are derived from the number of stored test losses, so x and y always
# have matching lengths (also when training was stopped early or re-run with a
# different number of batches); plt.plot raises an error on mismatched lengths.
plt.plot(range(0,80*len(test_loss),80),test_loss,'r',markerfacecolor='w',markersize=8,label='Test loss')

plt.legend()
plt.gca().set(xlabel='Epoch',ylabel='Loss')
plt.show()

In [None]:
# A little test: let the trained model continue a short prompt
prompt = 'I went on holiday to Liliput and'

# tokenize the prompt; wrapping the token list in an outer list gives the batch dimension -> [1, T]
in2gpt = torch.tensor([tokenizer.encode(prompt)]).to(device)

# sample 100 new tokens, then convert the full sequence (prompt + continuation) back to text
output = model.generate(in2gpt,max_new_tokens=100,temperature=1.1)
print(tokenizer.decode(output[0]))