# FINAL GPT MODEL

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import tiktoken
import urllib.request
import zipfile 
import os
from pathlib import Path
# from gpt_download import download_and_load_gpt2
import time
import json
import urllib

In [2]:
# Read the whole training corpus into memory as one UTF-8 string.
file_path = "the-verdict.txt"
text_data = Path(file_path).read_text(encoding="utf-8")

## GPT Configuration Dictionary

In [3]:
# Hyperparameter dictionary consumed by the model classes below via cfg[...].
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=1024,   # Context length (maximum number of positions)
    emb_dim=768,           # Embedding dimension
    n_heads=12,            # Number of attention heads
    n_layers=12,           # Number of transformer blocks
    drop_rate=0.1,         # Dropout rate
    qkv_bias=False,        # Whether the query/key/value projections use a bias
)

## Multi-Head Attention Mechanism

### Dataloader Implementation

In [4]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once with ``tokenizer.encode``. Windows of
    ``max_length`` tokens are taken every ``stride`` tokens; each target
    window is the matching input window shifted one token to the right.
    A text with ``max_length`` tokens or fewer yields an empty dataset.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        encoded = tokenizer.encode(txt)

        # A start offset is valid only if the shifted target window still fits.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [5]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a PyTorch ``DataLoader`` over sliding windows of ``txt``.

    The text is tokenized with the tiktoken "gpt2" encoding and chunked by
    ``GPTDatasetV1`` using ``max_length`` and ``stride``. The remaining
    arguments are passed straight through to ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [6]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of ``d_out // num_heads`` features each,
    attended with scaled dot-product attention under an upper-triangular
    mask, merged back to ``d_out`` and passed through an output projection.
    """

    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Layers are created in this fixed order so seeded initialisation
        # always draws the same random numbers.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions.
        future_positions = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", future_positions)

    def _split_heads(self, projected, batch, seq_len):
        """Reshape (batch, seq, d_out) to (batch, heads, seq, head_dim)."""
        per_head = projected.view(
            batch, seq_len, self.num_heads, self.head_dim
        )
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return the attention output for a 3-D input ``x``.

        ``x`` is unpacked as (batch, tokens, features); the result has
        shape (batch, tokens, d_out).
        """
        batch, seq_len, _ = x.shape

        k = self._split_heads(self.W_key(x), batch, seq_len)
        q = self._split_heads(self.W_query(x), batch, seq_len)
        v = self._split_heads(self.W_value(x), batch, seq_len)

        # Raw dot-product scores per head: (batch, heads, seq, seq).
        scores = q @ k.transpose(2, 3)

        # Crop the stored mask to the current sequence length and block
        # attention to later tokens before the softmax.
        causal = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(causal, -torch.inf)

        weights = torch.softmax(scores / k.shape[-1]**0.5, dim=-1)
        weights = self.dropout(weights)

        # Put the head axis back next to the feature axis and merge them.
        merged = (weights @ v).transpose(1, 2)
        merged = merged.contiguous().view(batch, seq_len, self.d_out)

        return self.out_proj(merged)

### GELU Activation Function

In [7]:
class GELU(nn.Module):
    """GELU activation using the tanh-based approximation formula."""

    def forward(self, x):
        """Apply 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(sqrt_two_over_pi * cubic_term)
        return 0.5 * x * gate

### Layer Normalization Class

In [8]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable affine.

    ``scale`` starts at ones and ``shift`` at zeros, both of length
    ``emb_dim``.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # Added to the variance to avoid division by zero.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last axis, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N, not N - 1).
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)

        centered = x - mu
        std = torch.sqrt(sigma_sq + self.eps)
        normalised = centered / std
        return self.scale * normalised + self.shift

### Neural Network Feed Forward Class

In [9]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The first linear layer expands ``cfg["emb_dim"]`` to four times that
    width; the second projects back down to ``cfg["emb_dim"]``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden_width = 4 * width

        self.layers = nn.Sequential(
            nn.Linear(width, hidden_width),   # expand
            GELU(),                           # non-linearity
            nn.Linear(hidden_width, width),   # compress back to input width
        )

    def forward(self, x):
        """Run ``x`` through the three stacked layers."""
        return self.layers(x)

## Transformer Implementation

In [10]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``. The
    same dropout module is used for both residual branches.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        dropout_p = cfg["drop_rate"]

        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            dropout=dropout_p,
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        self.drop_shortcut = nn.Dropout(dropout_p)

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Attention branch added onto the untouched input.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward branch added onto the attention result.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

## GPT Model Implementation

In [11]:
class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, transformer blocks, norm, output head.

    ``cfg`` supplies ``vocab_size``, ``context_length``, ``emb_dim``,
    ``n_layers`` and ``drop_rate`` here; the remaining keys are read by
    ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        vocab = cfg["vocab_size"]

        # Token and learned positional embeddings share the same width.
        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Stack of cfg["n_layers"] identical-shape transformer blocks.
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(width)
        # Bias-free projection from hidden width to one logit per vocab entry.
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids (batch, seq) to per-token logits."""
        _, seq_len = in_idx.shape

        # Positions 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [12]:
# Build a small demo batch from two prompts.
tokenizer = tiktoken.get_encoding("gpt2")   # GPT-2 tokenizer from tiktoken
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to a 1-D tensor, then stack along a new leading
# dimension. torch.stack requires both prompts to encode to the same length.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(prompt)) for prompt in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [13]:
# Seed PyTorch's RNG before building the model so the randomly initialised
# weights are the same on every run.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Forward pass on the demo batch. No .eval() call is made here, so the
# dropout layers are active for this pass.
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)     # Expected [2, 4, 50257]: (sequences, tokens, vocab_size)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4222, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)


### Total Parameters and Trainable Parameters

In [14]:
# Count every parameter element registered on the model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

# With weight tying the output head would reuse the token-embedding matrix,
# so its parameters are subtracted from the total.
out_head_params = sum(p.numel() for p in model.out_head.parameters())
total_params_gpt2 = total_params - out_head_params

print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Total number of parameters: 163,009,536
Number of trainable parameters considering weight tying: 124,412,160


### Calculating total size of model weights

In [15]:
# Estimate the weight footprint, assuming 4 bytes per parameter
# (presumably float32 storage - confirm against the model's dtype).
total_size_bytes = total_params * 4
total_size_mb = total_size_bytes / 1024 ** 2   # bytes -> mebibytes
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


# Pretraining

## Encoder and Decoder

### Instantiating Our GPT-Model

In [16]:
# Redefinition of the configuration for pretraining: identical to the earlier
# one except that the context length is reduced from 1024 to 256.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=256,    #! Context length dropped from 1024 -> 256.
    emb_dim=768,           # Embedding dimension
    n_heads=12,            # Number of attention heads
    n_layers=12,           # Number of transformer blocks
    drop_rate=0.1,         # Dropout rate
    qkv_bias=False,        # Whether the query/key/value projections use a bias
)

In [17]:
def generate_text_simple(model, idx,
                         max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    Args:
        model: Callable mapping a (batch, seq) id tensor to logits whose
            second axis is the sequence position.
        idx: 2-D tensor of token ids; new ids are appended along dim 1.
        max_new_tokens: Number of decoding steps to run.
        context_size: Only the last ``context_size`` ids are fed to
            ``model`` at each step.

    Returns:
        ``idx`` with one extra column per decoding step.
    """
    remaining = max_new_tokens
    while remaining > 0:
        remaining -= 1

        # Crop to the most recent tokens the model is allowed to see.
        window = idx[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction at the final position is used.
        last_logits = all_logits[:, -1, :]
        probas = torch.softmax(last_logits, dim=-1)
        next_id = probas.argmax(dim=-1, keepdim=True)
        idx = torch.cat([idx, next_id], dim=1)

    return idx

In [18]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor of shape [1, seq_length].

    The '<|endoftext|>' special token is allowed during encoding.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # unsqueeze(0) adds the leading batch dimension.
    return torch.tensor(ids).unsqueeze(0)
 
def token_ids_to_text(token_ids, tokenizer):
    """Decode a batched id tensor back to text.

    The leading batch dimension is squeezed away, the ids are converted to
    a plain Python list, and that list is handed to ``tokenizer.decode``.
    """
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)

# Generate a short continuation of a fixed prompt with the current model.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),   # prompt as a [1, seq] tensor
    max_new_tokens=10,                                 # append 10 new tokens
    context_size=GPT_CONFIG_124M["context_length"],    # cap on tokens fed to the model per step
)

generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", generated_text)

Output text:
 Every effort moves youprotect sufferedperformancebenef extracting innateOTAL replied793 gloves


### Cross Entropy

#### Splitting the Training and Validation Data

In [19]:
# NOTE(review): `probas` is only assigned in a later cell, so running the
# notebook top to bottom raises NameError here (see the captured output
# below). The same two statements appear again after `probas` is computed;
# this cell looks like an out-of-order duplicate - confirm and remove or move.
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

NameError: name 'probas' is not defined

In [None]:
# Two example sequences of three token ids each.
inputs = torch.tensor([
    [16833, 3626, 6100],   # "every effort moves"
    [40, 1107, 588],       # "I really like"
])

# Targets are the inputs shifted one token ahead.
targets = torch.tensor([
    [3626, 6100, 345],     # " effort moves you"
    [1107, 588, 11311],    # " really like chocolate"
])

In [None]:
# Forward pass without gradient tracking, then turn the logits into a
# probability distribution over the last (vocabulary) axis.
with torch.no_grad():
    logits = model(inputs)

probas = logits.softmax(dim=-1)
print(probas.shape)
print(probas)

torch.Size([2, 3, 50257])
tensor([[[1.4382e-05, 1.1071e-05, 3.3824e-05,  ..., 1.6957e-05,
          2.2516e-05, 5.1281e-06],
         [1.5546e-05, 5.5489e-06, 8.5387e-06,  ..., 1.1989e-05,
          2.0051e-05, 1.4695e-05],
         [3.6291e-05, 1.5771e-05, 2.3542e-05,  ..., 2.0037e-05,
          9.7197e-06, 1.4231e-05]],

        [[2.9708e-05, 1.3300e-05, 5.1529e-05,  ..., 1.2909e-05,
          3.5289e-05, 1.0048e-05],
         [1.6296e-05, 2.7914e-05, 1.8383e-05,  ..., 5.3830e-06,
          2.7065e-05, 1.3654e-05],
         [3.2494e-05, 1.4264e-05, 3.5843e-05,  ..., 1.3403e-05,
          1.9582e-05, 3.1462e-05]]])


In [None]:
# For each of the 2 sequences and each of their 3 positions, pick the vocabulary
# index with the highest probability; keepdim=True keeps a trailing axis of size 1.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[21876],
         [11552],
         [20610]],

        [[  387],
         [50090],
         [26958]]])


In [None]:
# Printing the initial Softmax probability scores
# Look up the probability the model currently assigns to each target token,
# one value per position, for both example sequences.
positions = [0, 1, 2]

text_idx = 0
target_probas_1 = probas[text_idx, positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, positions, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([2.9170e-05, 2.5213e-05, 1.4608e-05])
Text 2: tensor([4.1427e-05, 1.7147e-05, 7.0246e-06])


In [None]:
# Split the raw text by character position: the first 90% is used for
# training and the remainder for validation.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

In [None]:
# Print both shapes before computing cross entropy: the logits carry an extra
# trailing vocabulary axis (2 x 3 x 50257 for the example above) while the
# targets hold one token id per position (2 x 3).
print("Logits shape:", logits.shape)
print("Targets shape:", targets.shape)

In [None]:
torch.manual_seed(123)

# Both loaders use windows of exactly one context length with a stride of the
# same size, so consecutive windows do not overlap.
_window = GPT_CONFIG_124M["context_length"]
_shared_loader_args = dict(
    batch_size=2,
    max_length=_window,
    stride=_window,
    num_workers=0,
)

# Training loader: shuffled, incomplete final batch dropped.
train_loader = create_dataloader_v1(
    train_data,
    drop_last=True,
    shuffle=True,
    **_shared_loader_args,
)

# Validation loader: fixed order, incomplete final batch kept.
val_loader = create_dataloader_v1(
    val_data,
    drop_last=False,
    shuffle=False,
    **_shared_loader_args,
)

### Function to train the model

In [None]:
# calc_loss_batch, evaluate_model and generate_and_print_sample are not defined
# above this point; they must exist before this function is called.
def train_model_simple(model, train_loader, val_loader,
                       optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Run a basic training loop with periodic evaluation.

    For every batch the gradients are reset, the loss from
    ``calc_loss_batch`` is back-propagated and the optimizer takes a step.
    Every ``eval_freq`` steps (starting with step 0) ``evaluate_model`` is
    called and its losses are recorded and printed. After each epoch
    ``generate_and_print_sample`` is called with ``start_context``.

    Returns:
        A tuple ``(train_losses, val_losses, track_tokens_seen)`` of lists
        with one entry per evaluation.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    steps_done = 0

    for epoch_idx in range(num_epochs):
        # Re-enter training mode each epoch (presumably the evaluation and
        # sampling helpers switch the model to eval mode - confirm).
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(
                input_batch, target_batch, model, device
            )
            batch_loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()

            # Steps are numbered from 0; the first batch is always evaluated.
            current_step = steps_done
            steps_done += 1
            if current_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)

            print(f"Ep {epoch_idx+1} (Step {current_step:06d}): "
                  f"Train loss {train_loss:.3f}, "
                  f"Val loss {val_loss:.3f}")

        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen