# Chapter 4 - Exercises

> Author : Badr TAJINI - Large Language model (LLMs) - ESIEE 2024-2025

---

# Exercise 4.1: Parameters in the feed forward versus attention module

**Key Exercise Question: How do the parameter counts differ between the `feed-forward` neural network module and `multi-head attention` mechanism in our transformer architecture?**

*Methodological Approach:*
The investigation focuses on a systematic computational analysis of parameter allocation across two critical transformer neural network components:

1. **Feed-Forward Neural Network Module**
   - Characterization: Nonlinear transformation module
   - Primary computational function: Introducing network complexity and representational capacity
   - Parametric considerations: Linear transformation layers, activation functions

2. **Multi-Head Attention Mechanism**
   - Characterization: Contextual feature interaction module
   - Primary computational function: Capturing inter-token relational dynamics
   - Parametric considerations: Projection matrices, attention computation

*Analytical Objectives:*
- Quantify the exact number of trainable parameters in each architectural component
- Comparative assessment of parametric complexity
- Understand the relative computational resource allocation

*Theoretical Implications:*
- Insights into architectural parameter efficiency
- Empirical understanding of transformer module design
- Potential implications for model optimization and architectural design

*Computational Methodology:*
1. Enumerate parameters in `feed-forward` module
2. Enumerate parameters in `multi-head attention` module
3. Perform comparative statistical analysis
4. Interpret parametric distribution characteristics

*Recommended Investigative Approach:*
- Utilize precise computational tracing
- Consider layer-specific parameter counting
- Account for bias terms and weight matrices

In [2]:
import torch
import torch.nn as nn

In [3]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``
    element-wise. The module has no parameters.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        # Constant factor sqrt(2 / pi), built as a tensor exactly as before.
        scale = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(scale * cubic)
        return 0.5 * x * gate

class FeedForward(nn.Module):
    """Feed-forward sub-layer: expand, apply GELU, project back.

    Args:
        cfg: Configuration mapping; only ``cfg["emb_dim"]`` is read. The hidden
            width is four times that value.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        # The two linear layers are created in the original order
        # (expansion first) so seeded initialisation is unchanged.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expansion, activation and projection layers."""
        return self.layers(x)

In [4]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are each produced by one linear layer of width
    ``d_out`` and then split into ``num_heads`` heads of ``d_out // num_heads``
    features. A final linear layer mixes the concatenated head outputs.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the pre-computed causal mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # per-head feature width

        # Layers are created in the order query, key, value, output so that
        # seeded initialisation and parameter listings are unchanged.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions a token must
        # not attend to (those after it).
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', causal_mask)

    def _split_heads(self, projected):
        """Reshape (b, num_tokens, d_out) into (b, num_heads, num_tokens, head_dim)."""
        b, num_tokens, _ = projected.shape
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return the context vectors for ``x`` of shape (b, num_tokens, d_in).

        The result has shape (b, num_tokens, d_out).
        """
        b, num_tokens, _ = x.shape

        keys = self._split_heads(self.W_key(x))
        queries = self._split_heads(self.W_query(x))
        values = self._split_heads(self.W_value(x))

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)

        # Crop the stored mask to the current sequence length and hide the
        # masked positions before the softmax.
        hidden = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(hidden, -torch.inf)

        scaled_scores = attn_scores / self.head_dim**0.5
        attn_weights = self.dropout(torch.softmax(scaled_scores, dim=-1))

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        per_head_context = (attn_weights @ values).transpose(1, 2)

        # Concatenate the heads back into d_out = num_heads * head_dim features
        merged = per_head_context.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)

In [5]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension; ``scale`` starts as ones and
            ``shift`` as zeros of that size.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift it."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N rather than N - 1).
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalised + self.shift

class TransformerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers with shortcuts.

    Each sub-layer is preceded by its own LayerNorm and followed by dropout
    before being added back to its input.

    Args:
        cfg: Configuration mapping. Reads ``emb_dim``, ``context_length``,
            ``n_heads``, ``drop_rate`` and ``qkv_bias``. The single
            ``drop_rate`` is used both inside attention and on the shortcuts.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        # Sub-modules are registered in the original order (att, ff, norm1,
        # norm2, drop_shortcut) so parameter listings stay the same.
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both sub-layers to ``x`` of shape [batch_size, num_tokens, emb_size]."""
        # Attention sub-layer: normalise, attend, drop, add the input back
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer: same pattern on the attention result
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

The feed-forward module is configured only by the embedding dimension and the activation function. This is because its first and last steps change the dimension of the input batch (expanding it to 4 × `emb_dim`, then projecting it back to `emb_dim`), while the middle step applies the activation function to the batch.

The multi-head attention module takes more arguments: the input and output dimensions, the context length, the number of heads, the dropout rate and an optional bias for the query/key/value projections. Its objective is to compute the context vectors over multiple heads. This requires many operations, for example computing the K, V and Q matrices.

In [6]:
class GPTModel(nn.Module):
    """GPT-style model: embeddings, transformer blocks, final norm, output head.

    Args:
        cfg: Configuration mapping. This class reads ``vocab_size``,
            ``emb_dim``, ``context_length``, ``drop_rate`` and ``n_layers``;
            the same mapping is handed to every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # One block per layer, chained in order
        blocks = (TransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Maps each final hidden vector to one logit per vocabulary entry
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits of shape [batch_size, num_tokens, vocab_size].

        ``in_idx`` is a 2-D tensor of token indices, [batch_size, num_tokens].
        """
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # Token and position embeddings are summed: [batch_size, num_tokens, emb_size]
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.final_norm(self.trf_blocks(hidden))
        return self.out_head(hidden)

In [7]:
# Reduced configuration used to inspect per-module parameter counts.
# NOTE: despite its name this is not the full 124M model: it keeps the
# GPT-2 small embedding size but uses a single head and a single layer.
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 1,          # Number of attention heads (reduced to 1)
    "n_layers": 1,         # Number of layers (reduced to 1)
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}
# One layer and one head keep the parameter listing short and easy to read.
# Seed the random number generator before the model's weights are created.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

In [8]:
# Transformer module: for every parameter tensor in the transformer blocks,
# print its name, its shape and its number of elements on separate lines.
for name, param in model.trf_blocks.named_parameters():
    print(name, param.shape, param.numel(), sep="\n")

0.att.W_query.weight
torch.Size([768, 768])
589824
0.att.W_key.weight
torch.Size([768, 768])
589824
0.att.W_value.weight
torch.Size([768, 768])
589824
0.att.out_proj.weight
torch.Size([768, 768])
589824
0.att.out_proj.bias
torch.Size([768])
768
0.ff.layers.0.weight
torch.Size([3072, 768])
2359296
0.ff.layers.0.bias
torch.Size([3072])
3072
0.ff.layers.2.weight
torch.Size([768, 3072])
2359296
0.ff.layers.2.bias
torch.Size([768])
768
0.norm1.scale
torch.Size([768])
768
0.norm1.shift
torch.Size([768])
768
0.norm2.scale
torch.Size([768])
768
0.norm2.shift
torch.Size([768])
768


Contrary to what the module definitions suggest, the feed-forward module ends up with more trainable parameters than the attention module. Counting each of them for the example above, we have:

2360064 Parameters for the attention module.

4722432 Parameters for the feedforward module.

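The absolute difference grows with the number of layers, since every transformer block adds the same counts. The number of heads, however, does not change it: the heads only split the same `d_out` dimensions, so the projection matrices keep their size.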

# Exercise 4.2: Initialize larger GPT models

- **GPT2-small** (the 124M configuration we already implemented):
    - "emb_dim" = 768
    - "n_layers" = 12
    - "n_heads" = 12

- **GPT2-medium:**
    - "emb_dim" = 1024
    - "n_layers" = 24
    - "n_heads" = 16

- **GPT2-large:**
    - "emb_dim" = 1280
    - "n_layers" = 36
    - "n_heads" = 20

- **GPT2-XL:**
    - "emb_dim" = 1600
    - "n_layers" = 48
    - "n_heads" = 25

**Key Exercise Question: Can you systematically scale the GPT-2 model architecture from the small configuration to medium, large, and XL variants by exclusively modifying the configuration parameters?**

*Architectural Scaling Challenge:*
This exercise explores the methodological expansion of the GPT-2 model across different scales, demonstrating how architectural complexity can be incrementally increased through strategic parameter modifications.

*Model Variants to Implement:*
1. **GPT-2 Small (Current Implementation)**
   - Embedding Dimensions ("emb_dim"): 768
   - Transformer Blocks ("n_layers"): 12
   - Multi-Head Attention Heads ("n_heads"): 12

2. **GPT-2 Medium**
   - Embedding Dimensions ("emb_dim"): 1,024
   - Transformer Blocks ("n_layers"): 24
   - Multi-Head Attention Heads ("n_heads"): 16

3. **GPT-2 Large**
   - Embedding Dimensions ("emb_dim"): 1,280
   - Transformer Blocks ("n_layers"): 36
   - Multi-Head Attention Heads ("n_heads"): 20

4. **GPT-2 XL**
   - Embedding Dimensions ("emb_dim"): 1,600
   - Transformer Blocks ("n_layers"): 48
   - Multi-Head Attention Heads ("n_heads"): 25

*Methodological Constraints:*
- Modify only the configuration file
- Utilize the existing `GPTModel` class without code alterations
- Demonstrate parameter scaling capabilities
- Calculate total parameters for each model variant

**Bonus Challenge:**
**Compute the total number of trainable parameters for each model variant, highlighting the exponential growth in model complexity.**



In [9]:
def _gpt2_config(emb_dim, n_heads, n_layers):
    """Return a GPT-2 configuration dict for the given width, heads and depth.

    Vocabulary size, context length, dropout rate and the QKV bias flag are
    the same for every model size.
    """
    return {
        "vocab_size": 50257,       # Vocabulary size
        "context_length": 1024,    # Context length
        "emb_dim": emb_dim,        # Embedding dimension
        "n_heads": n_heads,        # Number of attention heads
        "n_layers": n_layers,      # Number of layers
        "drop_rate": 0.1,          # Dropout rate
        "qkv_bias": False,         # Query-Key-Value bias
    }


# The four GPT-2 sizes differ only in embedding width, head count and depth.
GPT_CONFIG_SMALL = _gpt2_config(emb_dim=768, n_heads=12, n_layers=12)
GPT_CONFIG_MEDIUM = _gpt2_config(emb_dim=1024, n_heads=16, n_layers=24)
GPT_CONFIG_LARGE = _gpt2_config(emb_dim=1280, n_heads=20, n_layers=36)
GPT_CONFIG_XL = _gpt2_config(emb_dim=1600, n_heads=25, n_layers=48)

As long as `emb_dim % n_heads == 0`, we can scale the GPT-2 architecture through the configuration alone. Beyond that, it only depends on the memory available on the machine.

In [10]:
# Build only the small variant here; the medium, large and XL variants are
# instantiated one at a time in the cells below.
modelSmall = GPTModel(GPT_CONFIG_SMALL)

In [11]:
# Total the element counts of every parameter tensor of the small model.
param_sizes = (weights.numel() for weights in modelSmall.parameters())
total_params = sum(param_sizes)
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


In [18]:
# Drop the reference to the small model so its memory can be reclaimed.
del modelSmall

In [12]:
# Build the GPT-2 medium variant (emb_dim 1024, 24 layers, 16 heads).
modelMedium = GPTModel(GPT_CONFIG_MEDIUM)

In [13]:
# Total the element counts of every parameter tensor of the medium model.
param_sizes = (weights.numel() for weights in modelMedium.parameters())
total_params = sum(param_sizes)
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 406,212,608


In [16]:
# Drop the reference to the medium model so its memory can be reclaimed.
del modelMedium

In [14]:
# Build the GPT-2 large variant (emb_dim 1280, 36 layers, 20 heads).
modelLarge = GPTModel(GPT_CONFIG_LARGE)

In [15]:
# Total the element counts of every parameter tensor of the large model.
param_sizes = (weights.numel() for weights in modelLarge.parameters())
total_params = sum(param_sizes)
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 838,220,800


In [17]:
# Drop the reference to the large model so its memory can be reclaimed.
del modelLarge

In [20]:
# Build the GPT-2 XL variant (emb_dim 1600, 48 layers, 25 heads).
modelXL = GPTModel(GPT_CONFIG_XL)

In [21]:
# Total the element counts of every parameter tensor of the XL model.
param_sizes = (weights.numel() for weights in modelXL.parameters())
total_params = sum(param_sizes)
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 1,637,792,000


In [22]:
# Drop the reference to the XL model so its memory can be reclaimed.
del modelXL

We can see that the number of parameters roughly doubles with each successive model (about ×2.5, ×2.1 and ×2.0), reaching a total of about 1.6 billion for the XL model. At this point Google Colab can barely hold all of them in memory.

# Exercise 4.3: Using separate dropout parameters

**Key Exercise Question: How can we enhance the dropout configuration of the GPT model by implementing layer-specific dropout rates?**

*Architectural Dropout Refinement:*
The current implementation employs a uniform dropout rate across multiple model components, which presents an opportunity for more nuanced regularization strategies. This exercise challenges you to develop a more sophisticated approach to dropout implementation within neural network architectures.

*Dropout Localization:*
Three critical architectural components require distinct dropout configurations:
1. Embedding Layer
2. Shortcut (Residual) Connections
3. Multi-Head Attention Module

*Methodological Approach:*
You must modify the existing `GPT_CONFIG_124M` configuration to:
- Replace the monolithic `drop_rate` parameter
- Introduce a hierarchical dropout configuration
- Maintain the overall structural integrity of the model architecture

*Conceptual Challenge:*
The exercise requires a deep understanding of:
- Regularization techniques in neural network design
- The functional role of dropout in different architectural components
- Systematic configuration of model hyperparameters

In [26]:
# GPT-2 small configuration in which the single "drop_rate" entry is replaced
# by one dropout rate per component.
GPT_CONFIG_DROPOUT = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate_emb": 0.1,       # Dropout on the summed token + position embeddings
    "drop_rate_shortcut": 0.2,   # Dropout on each sub-layer output before the shortcut addition
    "drop_rate_attention": 0.3,    # Dropout on the attention weights
    "qkv_bias": False       # Query-Key-Value bias
}

In [27]:
class TransformerBlock(nn.Module):
    """Transformer block with separate attention and shortcut dropout rates.

    Each sub-layer is preceded by its own LayerNorm and followed by the
    shortcut dropout before being added back to its input.

    Args:
        cfg: Configuration mapping. Reads ``emb_dim``, ``context_length``,
            ``n_heads``, ``qkv_bias``, ``drop_rate_attention`` (dropout inside
            attention) and ``drop_rate_shortcut`` (dropout on each sub-layer
            output).
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        # Sub-modules are registered in the original order (att, ff, norm1,
        # norm2, drop_shortcut) so parameter listings stay the same.
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate_attention"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate_shortcut"])

    def forward(self, x):
        """Apply both sub-layers to ``x`` of shape [batch_size, num_tokens, emb_size]."""
        # Attention sub-layer: normalise, attend, drop, add the input back
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer: same pattern on the attention result
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

In [28]:
class GPTModel(nn.Module):
    """GPT-style model whose embedding dropout has its own rate.

    Args:
        cfg: Configuration mapping. This class reads ``vocab_size``,
            ``emb_dim``, ``context_length``, ``drop_rate_emb`` and
            ``n_layers``; the same mapping is handed to every
            ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate_emb"])

        # One block per layer, chained in order
        blocks = (TransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Maps each final hidden vector to one logit per vocabulary entry
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits of shape [batch_size, num_tokens, vocab_size].

        ``in_idx`` is a 2-D tensor of token indices, [batch_size, num_tokens].
        """
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # Token and position embeddings are summed: [batch_size, num_tokens, emb_size]
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.final_norm(self.trf_blocks(hidden))
        return self.out_head(hidden)

We modify the configuration, the `TransformerBlock` class and the `GPTModel` class so that each component has its own dropout rate instead of sharing a common one.