# 3. Implementing a GPT model from scratch to generate text

## 3.1 Coding an LLM Architecture

### 3.1.1 GPT-124M Configuration Setup (Model Hyperparameters)

In [1]:
# Hyperparameters for the 124M-parameter GPT configuration.
# Keys are looked up by name (cfg["..."]) when the model is constructed.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size: rows of the token-embedding table, width of the logits
    context_length=1024,  # Context length: number of positions in the positional-embedding table
    emb_dim=768,          # Embedding dimension shared by token and positional embeddings
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of transformer blocks stacked in the model
    drop_rate=0.1,        # Dropout probability
    qkv_bias=False,       # Whether the query/key/value projections use a bias term
)

### 3.1.2 Dummy GPT Model Skeleton (Embeddings, Transformer Blocks, Output Head)

In [2]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
    """
    A minimal GPT-like model skeleton used for wiring and shape-checking.

    Data flow (see ``forward``):
      1. Token embedding plus positional embedding
      2. Dropout on the summed embeddings
      3. A stack of placeholder transformer blocks (DummyTransformerBlock)
      4. A final normalization placeholder (DummyLayerNorm)
      5. A bias-free linear output head producing logits over the vocabulary

    Note:
      DummyTransformerBlock and DummyLayerNorm are identity modules here, so
      this model does not actually transform representations between the
      embedding layer and the output head. It mainly demonstrates how tensors
      flow through a GPT-style architecture.
    """
    def __init__(self, cfg):
        """
        Initialize the dummy GPT model.

        Args:
            cfg (dict): Configuration dictionary containing:
                - "vocab_size" (int): Vocabulary size.
                - "emb_dim" (int): Embedding dimension.
                - "context_length" (int): Max sequence length for positional embeddings.
                - "drop_rate" (float): Dropout probability.
                - "n_layers" (int): Number of transformer blocks.
        """
        super().__init__()

        # Map token ids -> token embeddings
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])

        # Positional embeddings for positions [0..context_length-1]
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])

        # Dropout applied after adding token + position embeddings
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Stack of dummy transformer blocks (identity forward pass)
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        # Final "layer norm" placeholder (identity forward pass)
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])

        # Output projection to vocabulary logits (no bias term)
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        """
        Forward pass.

        Args:
            in_idx (Tensor): Token indices of shape (batch_size, seq_len).

        Returns:
            Tensor: Logits of shape (batch_size, seq_len, vocab_size).

        Raises:
            ValueError: If ``in_idx`` is not 2-dimensional, or if ``seq_len``
                exceeds the number of positions in the positional-embedding
                table (the configured context length).
        """
        # Only the sequence length is needed here; the batch dimension is
        # handled by broadcasting when the embeddings are summed below.
        _, seq_len = in_idx.shape

        # Fail early with a readable message instead of letting the positional
        # embedding lookup fail on an out-of-range position index.
        max_len = self.pos_emb.num_embeddings
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model's context "
                f"length {max_len}"
            )

        # Token embeddings: (batch_size, seq_len, emb_dim)
        tok_embeds = self.tok_emb(in_idx)

        # Position indices: (seq_len,)
        # Create on the same device as in_idx to avoid device mismatch
        pos_ids = torch.arange(seq_len, device=in_idx.device)

        # Positional embeddings: (seq_len, emb_dim)
        pos_embeds = self.pos_emb(pos_ids)

        # Broadcast-add positional embeddings across the batch dimension
        # Result: (batch_size, seq_len, emb_dim)
        x = tok_embeds + pos_embeds

        # Apply dropout to embeddings
        x = self.drop_emb(x)

        # Pass through the (dummy) transformer blocks
        x = self.trf_blocks(x)

        # Apply the (dummy) final normalization
        x = self.final_norm(x)

        # Project to vocabulary logits
        logits = self.out_head(x)

        return logits


class DummyTransformerBlock(nn.Module):
    """
    Stand-in for a transformer block that performs no computation.

    A real block would combine:
      - Multi-head self-attention (with causal masking)
      - A feed-forward network (MLP)
      - Residual connections and layer norms

    This placeholder registers no parameters or submodules; its forward pass
    hands the input straight back.
    """
    def __init__(self, cfg):
        """
        Set up the placeholder block.

        Args:
            cfg (dict): Model configuration. Accepted so the constructor call
                matches what a real block would take; it is not read here.
        """
        super().__init__()

    def forward(self, x):
        """
        Return the input unchanged.

        Args:
            x (Tensor): Input tensor.

        Returns:
            Tensor: The very same tensor object that was passed in.
        """
        return x


class DummyLayerNorm(nn.Module):
    """
    Stand-in for layer normalization that performs no computation.

    A real implementation would normalize activations across the embedding
    dimension. This placeholder registers no parameters; its forward pass
    hands the input straight back.
    """
    def __init__(self, normalized_shape, eps=1e-5):
        """
        Set up the placeholder layer norm.

        Args:
            normalized_shape (int or tuple): Shape a real layer norm would
                normalize over. Accepted for signature parity; not read here.
            eps (float): Numerical stability constant. Accepted for signature
                parity; not read here.
        """
        super().__init__()

    def forward(self, x):
        """
        Return the input unchanged.

        Args:
            x (Tensor): Input tensor.

        Returns:
            Tensor: The very same tensor object that was passed in.
        """
        return x

### 3.1.3 Tokenizing Text Inputs into a Batch Using GPT-2 Tokenizer

In [3]:
import tiktoken

# Load the "gpt2" encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Two example prompts that form one batch.
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to a 1-D tensor of token ids, then stack the rows along
# a new leading batch dimension. torch.stack needs every row to have the same
# number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


### 3.1.4 Running a Forward Pass Through the Dummy GPT Model

In [4]:
# Seed the RNG before the model is constructed so that the randomly
# initialized weights, and therefore the printed logits, are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
# NOTE(review): no .eval() call is made, so the embedding dropout is
# presumably active during this pass -- confirm if deterministic inference
# output is intended.
logits = model(batch)
# Logits have shape (batch_size, seq_len, vocab_size).
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)
