# Imports

In [1]:
import math, os, torch, inspect, time, tiktoken
import matplotlib.pyplot as plt
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,random_split
import torch.nn.functional as F
# Seed PyTorch's global random number generator.
torch.manual_seed(42)
# Ask PyTorch for "medium" float32 matmul precision.
torch.set_float32_matmul_precision("medium")
# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda



# Embeddings, feedforward, Heads

In [2]:
class FeedF_norm(nn.Module):
  """Position-wise feed-forward sub-layer: Linear -> GELU -> Linear -> Dropout.

  The hidden layer is four times as wide as the embedding and neither linear
  layer has a bias term.
  """

  def __init__(self, context, emb,  dropout):
    """Build the two projections, the activation and the dropout layer.

    Args:
      context: accepted but not used by this module.
      emb: width of the input and output features.
      dropout: dropout probability applied to the final projection.
    """
    super().__init__()
    hidden = 4 * emb
    self.lin1 = nn.Linear(emb, hidden, bias=False)
    self.act1 = nn.GELU()
    self.drop = nn.Dropout(dropout)
    self.output_projection = nn.Linear(hidden, emb, bias=False)

  def forward(self, x: torch.Tensor):
    """Expand, activate, project back to `emb` features, then apply dropout."""
    expanded = self.lin1(x)
    activated = self.act1(expanded)
    projected = self.output_projection(activated)
    return self.drop(projected)
    
class Attention_block(nn.Module):
  """Causal multi-head self-attention.

  A single linear layer produces queries, keys and values; they are split into
  `n_heads` heads, attended with a causal mask, merged back and passed through
  an output projection followed by dropout.

  When `torch.nn.functional.scaled_dot_product_attention` exists it is used;
  otherwise an explicit softmax(QK^T / sqrt(head_dim)) V fallback is used.
  """

  def __init__(self,  n_heads: int, emb: int, context: int, dropout: float):
    """
    Args:
      n_heads: number of attention heads; must divide `emb`.
      emb: embedding width of the input and output.
      context: maximum sequence length (size of the causal mask).
      dropout: dropout probability for attention weights and for the output.
    """
    super().__init__()
    assert emb % n_heads == 0, "embedding dimensionality must be a multiple of n_heads"
    self.emb = emb
    self.nh = n_heads
    # The original never stored `context`, so the fallback branch crashed on
    # `self.context` at construction time.
    self.context = context
    self.dropout_p = dropout
    self.projection = nn.Linear(emb, emb*3)
    self.output_projection = nn.Linear(emb, emb)
    self.output_dropout = nn.Dropout(dropout)
    # Fallback-path helpers. They hold no parameters, and the mask is a
    # non-persistent buffer, so state_dict keys are the same with or without
    # the fused kernel.
    self.att_drop = nn.Dropout(dropout)
    self.register_buffer(
        "causal_mask",
        torch.tril(torch.ones(context, context, dtype=torch.bool)).view(1, 1, context, context),
        persistent=False)
    self.flash = hasattr(nn.functional, "scaled_dot_product_attention")
    if not self.flash:
      print("flesh is not working")

  def get_attention(self, query: torch.Tensor, keys: torch.Tensor, values: torch.Tensor):
    """Causal scaled dot-product attention over per-head tensors.

    Args:
      query, keys, values: tensors laid out as (batch, heads, seq, head_dim),
        which is how `forward` prepares them.

    Returns:
      Tensor with the same layout as `values`.
    """
    if self.flash:
      out = torch.nn.functional.scaled_dot_product_attention(
          query, keys, values, attn_mask=None,
          dropout_p=self.dropout_p if self.training else 0.0,
          is_causal=True)
    else:
      seq_len, head_dim = query.size(-2), query.size(-1)
      # Scale by the per-head width (not the full embedding width) so this
      # path matches the default scaling of the fused kernel.
      att_scores = (query @ keys.transpose(-2, -1)) / math.sqrt(head_dim)
      visible = self.causal_mask[:, :, :seq_len, :seq_len]
      att_scores = att_scores.masked_fill(~visible, float('-inf'))
      att_scores = att_scores.softmax(dim=-1)
      att_scores = self.att_drop(att_scores)
      out = att_scores @ values
    return out


  def forward(self, x: torch.Tensor):
    """Apply self-attention to `x` of shape (batch, seq, emb); output has the same shape."""
    B, C, E = x.shape
    q, k, v = self.projection(x).split(self.emb, dim = 2) # each: batch, context, emb
    head_dim = E // self.nh
    q = q.view(B, C, self.nh, head_dim).transpose(1,2) # b, nh, c, head_dim
    k = k.view(B, C, self.nh, head_dim).transpose(1,2)
    v = v.view(B, C, self.nh, head_dim).transpose(1,2)
    x = self.get_attention(q, k, v) # b, nh, c, head_dim
    # Merge the heads back into one embedding axis.
    x = x.transpose(1,2).contiguous().view(B, C, E)
    x = self.output_dropout(self.output_projection(x))
    return x
  
class TransformerBlock(nn.Module):
  """One transformer layer: attention then feed-forward, each normalised first and added residually."""

  def __init__(self, context, emb, n_heads,dropout):
    """Create the two bias-free LayerNorms, the attention block and the feed-forward block."""
    super().__init__()
    self.ln1 = nn.LayerNorm(emb, bias=False)
    self.MultiHead = Attention_block(n_heads=n_heads, emb=emb, context=context, dropout=dropout)
    self.ln2 = nn.LayerNorm(emb, bias=False)
    self.FeedForward = FeedF_norm(context=context, emb=emb, dropout=dropout)

  def forward(self, x: torch.Tensor):
    """Return `x` after the attention and feed-forward residual updates."""
    attended = self.MultiHead(self.ln1(x))
    x = x + attended
    transformed = self.FeedForward(self.ln2(x))
    return x + transformed


# Model


In [3]:
class GPT1(nn.Module):
  """Decoder-only transformer language model.

  Token embeddings and learned position embeddings are summed, passed through
  `n_layers` TransformerBlocks and projected to vocabulary logits by
  `generator`, whose weight matrix is shared with the token embedding.
  """

  def __init__(self,context, emb, vocab_size, n_layers, n_heads,  dropout):
    """
    Args:
      context: maximum sequence length the model accepts.
      emb: embedding width.
      vocab_size: number of token ids.
      n_layers: number of TransformerBlocks.
      n_heads: attention heads per block.
      dropout: dropout probability used throughout the model.
    """
    super().__init__()
    self.drop = nn.Dropout(dropout)
    self.emb = nn.Embedding(vocab_size, emb)
    self.pos = nn.Embedding(context, emb)
    self.context = context
    self.TransformerBlocks = nn.ModuleList(
        [TransformerBlock(context, emb, n_heads,dropout)
          for _ in range(n_layers)])
    self.generator = nn.Linear(emb, vocab_size, bias = False)
    # Weight tying: the token embedding and the output projection share one tensor.
    self.emb.weight =  self.generator.weight
    self.apply(self._init_weights)
    # Residual output projections get a smaller init, scaled by the layer count.
    for pn, p in self.named_parameters():
      if pn.endswith('output_projection.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * n_layers))

  def _init_weights(self, module):
    """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def get_num_params(self, non_embedding=True):
    """Count parameters; with `non_embedding` the position-embedding table is excluded."""
    n_params = sum(p.numel() for p in self.parameters())
    if non_embedding:
      n_params -= self.pos.weight.numel()
    return n_params

  def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
    """Build an AdamW optimizer with weight decay applied only to tensors of rank >= 2.

    Args:
      weight_decay: decay coefficient for the decayed group.
      learning_rate: initial learning rate.
      betas: AdamW beta pair.
      device_type: accepted for interface compatibility; not used here.

    Returns:
      The configured `torch.optim.AdamW` instance.
    """
    # Only parameters that require gradients are optimised.
    param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
    # Tensors with two or more dimensions are decayed; 1-D tensors are not.
    decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
    nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
    optim_groups = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': nodecay_params, 'weight_decay': 0.0}
    ]
    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)
    print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
    print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas)
    return optimizer

  def forward(self, x: torch.Tensor, targets = None):
    """Run the model on token ids `x` of shape (batch, seq).

    Args:
      x: integer token ids, with seq <= `self.context`.
      targets: optional token ids of the same shape as `x`; entries equal to
        -1 are ignored by the loss.

    Returns:
      `(logits, loss)`. With targets, logits cover every position and loss is
      the cross-entropy. Without targets, logits cover only the last position
      (shape (batch, 1, vocab)) and loss is None.

    Raises:
      ValueError: if the sequence is longer than the model context.
    """
    B, C = x.shape
    if C > self.context:
      # Otherwise the position-embedding lookup would index out of range.
      raise ValueError(f"sequence length {C} exceeds the model context {self.context}")
    embs = self.emb(x) #batch, seq, emb
    # Build position ids on the input's own device rather than a global `device`.
    positions = torch.arange(C, dtype=torch.int64, device=x.device)
    pos = self.pos(positions)
    x = self.drop(embs + pos)
    for Transf in self.TransformerBlocks:
      x = Transf(x)
    if targets is not None:
      # if we are given some desired targets also calculate the loss
      logits = self.generator(x)
      loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
    else:
      # Only the last position is needed to sample the next token.
      logits = self.generator(x[:, [-1], :])
      loss = None
    return logits, loss

  #generation from previous tokens
  @torch.no_grad()
  def generate(self, idx, max_tokens, verbose=True):
    """Sample `max_tokens` new tokens after the prompt `idx` of shape (batch, seq).

    Sampling runs in eval mode (no dropout); the previous train/eval mode is
    restored afterwards.

    Args:
      idx: prompt token ids.
      max_tokens: number of tokens to append to each row.
      verbose: when True, decode each row with the module-level `tokenizer`
        and print it, as the original method did.

    Returns:
      Tensor of shape (batch, seq + max_tokens) holding prompt and samples.
    """
    was_training = self.training
    self.eval()
    try:
      for _ in range(max_tokens):
        # Keep only the most recent `context` tokens as model input.
        window = idx[:, -self.context:]
        logits, _ = self(window)
        probs = logits[:, -1, :].softmax(dim = -1)
        next_toks = torch.multinomial(probs, num_samples =1)
        idx = torch.cat((idx, next_toks), dim = 1)
    finally:
      self.train(was_training)
    if verbose:
      # .tolist() yields plain Python ints rather than 0-d tensors.
      for row in idx.tolist():
        print(tokenizer.decode(row))
    return idx
  


# Training

In [4]:
# Hyperparameters passed as keyword arguments to GPT1.
params = dict(
    context=256,       # context that the model can see
    emb=256,
    vocab_size=50304,  # total number of tokens (gpt2)
    n_layers=4,        # number of transformer layer
    n_heads=8,         # number of head per layer
    dropout=0.2,
)
# Initialization of the model with the given parameters, then move it to `device`.
model = GPT1(**params).to(device)

In [5]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
data_dir = 'data'
def get_batch(split):
    """Sample a random batch of input/target token windows from a .bin file.

    Reads `train.bin` when `split == 'train'` and `val.bin` for any other
    value, both as uint16 arrays under `data_dir`. Uses the module-level
    `context` (window length), `batch` (number of windows) and `device`.

    Returns:
        `(x, y)` int64 tensors of shape (batch, context) on `device`, where
        `y` is `x` shifted one token to the right.

    Raises:
        ValueError: if the file holds too few tokens for one window.
    """
    filename = 'train.bin' if split == 'train' else 'val.bin'
    # The memmap is opened on every call, as in the original code.
    data = np.memmap(os.path.join(data_dir, filename), dtype=np.uint16, mode='r')
    if len(data) <= context:
        raise ValueError(
            f"{filename} holds {len(data)} tokens; need more than context={context}")
    starts = torch.randint(len(data) - context, (batch,)).tolist()
    x = torch.stack([torch.from_numpy((data[i:i+context]).astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+context]).astype(np.int64)) for i in starts])

    # Compare the device *type* so 'cuda', 'cuda:0' and torch.device objects
    # all take the pinned-memory path.
    if torch.device(device).type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

In [6]:
# Window length used by get_batch; taken from the model config so the two
# cannot drift apart (a window longer than the model context would index
# past the position-embedding table).
context = params['context']
batch = 16

# Optimizer settings.
learning_rate = 6e-4
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0  # maximum gradient norm
min_lr = 6e-5
# Pass the real device type instead of a hard-coded "cuda".
optimizer = model.configure_optimizers(
    weight_decay, learning_rate, (beta1, beta2), torch.device(device).type)

# Learning-rate schedule settings read by cosine_scheduler.
warmup_iters = 500
lr_decay_iters = 10000

max_steps = 10000

# Prompt used for sample generation: the same text repeated on 5 rows.
text = "Hello, I am a language model"
tokenizer= tiktoken.get_encoding("gpt2")
idx = torch.tensor(tokenizer.encode(text), device = device).unsqueeze(0).repeat(5, 1)
# Tokens consumed per optimisation step.
tokens_processed = batch * context
def cosine_scheduler(it):
    """Return the learning rate for step `it`.

    Linear warmup for the first `warmup_iters` steps, cosine decay from
    `learning_rate` to `min_lr` until `lr_decay_iters`, then `min_lr`.
    """
    # Warmup phase: ramp up linearly.
    if it < warmup_iters:
        return learning_rate * (it + 1) / warmup_iters
    # Past the decay horizon: hold the floor.
    if it > lr_decay_iters:
        return min_lr
    # Cosine phase: fraction of the decay window already covered.
    decay_span = lr_decay_iters - warmup_iters
    decay_ratio = (it - warmup_iters) / decay_span
    assert 0 <= decay_ratio <= 1
    cosine = math.cos(math.pi * decay_ratio)
    coeff = 0.5 * (1.0 + cosine)  # goes from 1 down to 0
    lr_range = learning_rate - min_lr
    return min_lr + coeff * lr_range


num decayed parameter tensors: 18, with 7,241,728 parameters
num non-decayed parameter tensors: 16, with 3,072 parameters


# Training Loop

In [7]:
# Training loop: one optimisation step per iteration, with a validation pass
# and sample generation every `eval_interval` steps.
# NOTE(review): the saved output of this cell shows a CUDA device-side assert;
# one thing to confirm is that every token id in train.bin / val.bin is
# smaller than the model's vocab_size.
train_losses = []
val_losses = []
eval_interval = 200   # steps between validation passes
eval_batches = 10     # validation batches per pass
log_interval = 20     # steps between recorded training losses

model.train()
for iteration in range(max_steps):
    if iteration > 0 and iteration % eval_interval == 0:
        # Validate without dropout and without building autograd graphs.
        model.eval()
        with torch.no_grad():
            for _ in range(eval_batches):
                xb, yb = get_batch("val")
                _, val_loss = model(xb, yb)
                print(f"val| loss: { val_loss.item():.6f}")
                val_losses.append(val_loss.item())
        model.generate(idx, max_tokens = 20)
        model.train()

    # The training step runs on every iteration, including evaluation ones,
    # so no step of the learning-rate schedule is skipped.
    t0 = time.time()
    xb, yb = get_batch("train")
    lr = cosine_scheduler(iteration)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr
    optimizer.zero_grad()
    logits, loss = model(xb, yb)
    loss.backward()
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    if torch.cuda.is_available():
        # Wait for queued GPU work so the timing below is meaningful.
        torch.cuda.synchronize()
    t1 = time.time()
    dt = t1 -t0
    # The last field is milliseconds per token, matching its label.
    print(f"step {iteration} | lr : {lr:6f}| loss: { loss.item():.6f} | norm: {norm:.4f} | dt: { dt*1000}ms | ms / tokens_processed: { (dt * 1000 / tokens_processed):.5f}")
    if iteration % log_interval == 0:
        train_losses.append(loss.item())


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Scatter plot of the recorded training losses, in the order they were logged.
fig, ax = plt.subplots()
ax.scatter(np.arange(len(train_losses)), train_losses)
ax.set_xlabel("logged sample (one every 20 training steps)")
ax.set_ylabel("training loss")
ax.set_title("Training loss");

NameError: name 'plt' is not defined