Exercise 4.1 Number of parameters in feed forward and attention modules
Calculate and compare the number of parameters that are contained in the feed forward module and those that are contained in the multi-head attention module.

Notes:
A transformer block = multi-head attention + feed-forward network (each preceded by a LayerNorm and wrapped in a shortcut connection).

In [None]:
import torch
import torch.nn as nn
import math

# ------------------------------
# 1. GELU 
# ------------------------------
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Apply the activation element-wise to ``x`` and return the result."""
        scale = math.sqrt(2 / math.pi)
        cubic = x + 0.044715 * x.pow(3)
        gate = 1 + torch.tanh(scale * cubic)
        return 0.5 * x * gate

# ------------------------------
# 2. FeedForward 
# ------------------------------
class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand to 4x width, GELU, project back.

    Args:
        cfg: Configuration mapping; only ``cfg["emb_dim"]`` is read.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        # Layers are created in expand -> activation -> contract order so the
        # Sequential indices stay 0, 1, 2.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        contract = nn.Linear(hidden_dim, emb_dim)
        self.net = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the three-stage network and return the output."""
        out = self.net(x)
        return out

# ------------------------------
# 3. LayerNorm (learnable scale and bias)
# ------------------------------
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
        eps: Constant added to the variance before taking the square root.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        # Scale starts at one and shift at zero.
        self.weight = nn.Parameter(torch.ones(emb_dim))
        self.bias = nn.Parameter(torch.zeros(emb_dim))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        mu = x.mean(-1, keepdim=True)
        # unbiased=False: divide by N rather than N - 1.
        sigma2 = x.var(-1, keepdim=True, unbiased=False)
        scaled = self.weight * (x - mu)
        denom = torch.sqrt(sigma2 + self.eps)
        return scaled / denom + self.bias

# ------------------------------
# 4. MultiHeadAttention
# ------------------------------
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a fused query/key/value projection.

    Args:
        d_in: Size of the last dimension of the input; it is split evenly
            across the heads.
        d_out: Size of the last dimension of the output.
        context_length: Side length of the causal mask, i.e. the longest
            sequence ``forward`` accepts.
        num_heads: Number of attention heads; must divide ``d_in`` evenly.
        dropout: Dropout probability, used both on the attention weights and
            on the output of the final projection.
        qkv_bias: Whether the fused QKV projection has a bias term.

    Raises:
        ValueError: If ``d_in`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias):
        super().__init__()
        # Fail early with a clear message instead of a shape error in forward().
        if d_in % num_heads != 0:
            raise ValueError(
                f"d_in ({d_in}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_head = d_in // num_heads

        # One linear layer produces queries, keys and values together.
        self.qkv_proj = nn.Linear(d_in, 3 * d_in, bias=qkv_bias)
        self.out_proj = nn.Linear(d_in, d_out)

        self.attn_drop = nn.Dropout(dropout)
        # NOTE(review): TransformerBlock applies its own dropout to this
        # module's output, so the output is dropped twice during training.
        # Confirm this is intended.
        self.resid_drop = nn.Dropout(dropout)

        # Lower-triangular mask of shape (1, 1, context_length, context_length).
        self.register_buffer("mask", torch.tril(torch.ones(context_length, context_length)).unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, d_in).

        Returns:
            Tensor of shape (B, T, d_out).

        Raises:
            ValueError: If the sequence length T exceeds the mask size.
        """
        B, T, C = x.shape
        max_len = self.mask.size(-1)
        if T > max_len:
            raise ValueError(
                f"sequence length ({T}) exceeds context_length ({max_len})"
            )

        qkv = self.qkv_proj(x)
        # (B, T, 3, heads, d_head) -> (B, heads, 3, T, d_head)
        qkv = qkv.view(B, T, 3, self.num_heads, self.d_head).transpose(1, 3)
        q, k, v = qkv.unbind(dim=2)  # each (B, heads, T, d_head)

        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_head)
        # Positions above the diagonal (future tokens) get zero weight after softmax.
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = torch.softmax(att, dim=-1)
        att = self.attn_drop(att)

        y = att @ v
        # Merge the heads back into a single (B, T, C) tensor.
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_drop(self.out_proj(y))
        return y

# ------------------------------
# 5. Transformer Block
# ------------------------------
class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers,
    each followed by dropout and added back to its input.

    Args:
        cfg: Configuration mapping; reads ``emb_dim``, ``context_length``,
            ``n_heads``, ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        attention_kwargs = {
            "d_in": emb_dim,
            "d_out": emb_dim,
            "context_length": cfg["context_length"],
            "num_heads": cfg["n_heads"],
            "dropout": cfg["drop_rate"],
            "qkv_bias": cfg["qkv_bias"],
        }
        self.attn = MultiHeadAttention(**attention_kwargs)
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # A single dropout module is shared by both sub-layers.
        self.drop = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward sub-layers."""
        # Attention sub-layer
        shortcut = x
        branch = self.norm1(x)
        branch = self.attn(branch)
        branch = self.drop(branch)
        x = shortcut + branch

        # Feed-forward sub-layer
        shortcut = x
        branch = self.norm2(x)
        branch = self.ff(branch)
        branch = self.drop(branch)
        x = shortcut + branch
        return x

# ------------------------------
# 6. GPTModel
# ------------------------------
class GPTModel(nn.Module):
    """GPT-style language model: token and position embeddings, a stack of
    transformer blocks, a final LayerNorm and a bias-free output head.

    Args:
        cfg: Configuration mapping; reads ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_rate`` and ``n_layers`` here and is
            passed on unchanged to every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = []
        for _ in range(cfg["n_layers"]):
            blocks.append(TransformerBlock(cfg))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token indices of shape (B, T) to logits over the vocabulary."""
        _, seq_len = in_idx.shape
        # Position ids are created on the same device as the token ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)


In [21]:
# Configuration of the 124M-parameter model (from page 95)
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=1024,  # Context length
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.1,        # Dropout rate
    qkv_bias=False,       # Query-Key-Value bias
)


In [22]:
# Build the model described by GPT_CONFIG_124M; the cells below read their
# parameter counts from this instance.
model = GPTModel(GPT_CONFIG_124M)

In [30]:
# Theoretical calculations (weight matrices only; bias terms are not counted)
print("Theoretical parameter count:")
# Both totals depend only on the embedding dimension; the head count is read
# for reference but does not enter the formulas.
emb_dim = GPT_CONFIG_124M["emb_dim"]
n_heads = GPT_CONFIG_124M["n_heads"]

# Feedforward parameters calculation:
# two linear layers, emb_dim -> 4 * emb_dim and 4 * emb_dim -> emb_dim.
feedforward_parameters = emb_dim * (4 * emb_dim) + (4 * emb_dim) * emb_dim
print(f"Feedforward parameters: {feedforward_parameters}")

# Attention parameters calculation:
# two linear layers, the fused query/key/value projection (emb_dim -> 3 * emb_dim)
# and the output projection (emb_dim -> emb_dim).
input_parameters = 3 * emb_dim * emb_dim
output_parameters = emb_dim ** 2

attention_parameters = input_parameters + output_parameters
print(f"Attention parameters: {attention_parameters}")


Theoretical parameter count:
Feedforward parameters: 4718592
Attention parameters: 2359296


In [None]:
# Actual calculations, read from the first transformer block of `model`
print("Actual parameter count: ")

print("For the whole Transformer block:")
first_block = model.trf_blocks[0]
# .parameters() yields weights and biases alike, so these two totals include
# the bias vectors of the linear layers that have one.
ffn_params = sum(param.numel() for param in first_block.ff.parameters())
attn_params = sum(param.numel() for param in first_block.attn.parameters())

print(f"FeedForward parameters: {ffn_params:,}")
print(f"Attention parameters: {attn_params:,}")

# Weight matrices only: fused QKV projection plus output projection.
attn_proj_params = sum(
    layer.weight.numel()
    for layer in (first_block.attn.qkv_proj, first_block.attn.out_proj)
)

# Weight matrices only: indices 0 and 2 of the Sequential are the linear layers.
ff_proj_params = sum(
    first_block.ff.net[idx].weight.numel() for idx in (0, 2)
)

print("Only for Feedforward and Attention layers:")
print(f"FeedForward weights (no bias): {ff_proj_params:,}")
print(f"Attention weights (no bias): {attn_proj_params:,}")


Actual parameter count: 
For the whole Transformer block:
FeedForward parameters: 4,722,432
Attention parameters: 2,360,064
Only for Feedforward and Attention layers:
FeedForward weights (no bias): 4,718,592
Attention weights (no bias): 2,359,296


Exercise 4.2 Initializing larger GPT models
We initialized a 124-million-parameter GPT model, which is known as “GPT-2 small.” Without making any code modifications besides updating the configuration file, use the GPTModel class to implement GPT-2 medium (using 1,024-dimensional embeddings, 24 transformer blocks, 16 multi-head attention heads),  GPT-2 large (1,280-dimensional embeddings, 36 transformer blocks, 20 multi-head attention heads), and GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 multi-head attention heads). As a bonus, calculate the total number of parameters in each GPT model.

In [None]:
# Configurations for different sizes of GPT models
# GPT-2 medium: 1,024-dimensional embeddings, 24 blocks, 16 heads
GPT_CONFIG_MEDIUM = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1024,
    n_heads=16,
    n_layers=24,
    drop_rate=0.1,
    qkv_bias=False,
)

# GPT-2 large: 1,280-dimensional embeddings, 36 blocks, 20 heads
GPT_CONFIG_LARGE = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1280,
    n_heads=20,
    n_layers=36,
    drop_rate=0.1,
    qkv_bias=False,
)

# GPT-2 XL: 1,600-dimensional embeddings, 48 blocks, 25 heads
GPT_CONFIG_XL = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1600,
    n_heads=25,
    n_layers=48,
    drop_rate=0.1,
    qkv_bias=False,
)


In [32]:
# Instantiate the three larger variants; only the configuration dictionary
# differs, the GPTModel class is reused unchanged.
model_medium = GPTModel(GPT_CONFIG_MEDIUM)
model_large = GPTModel(GPT_CONFIG_LARGE)
model_xl = GPTModel(GPT_CONFIG_XL)

In [41]:
# Count parameters:
def count_parameters(model):
    """Return the total number of elements over all parameters of ``model``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

def count_tied_parameters(model):
    """Return the parameter count of ``model`` with ``model.out_head`` left out.

    This is the total from ``count_parameters`` minus the number of elements
    in the parameters of the output head.
    """
    head_size = 0
    for param in model.out_head.parameters():
        head_size += param.numel()
    total = count_parameters(model)
    return total - head_size

In [None]:
# Report the total parameter count of each larger model, one line per model.
print("The total number of parameters: ")
for label, net in (
    ("GPT-2 Medium:", model_medium),
    ("GPT-2 Large:", model_large),
    ("GPT-2 XL:", model_xl),
):
    print(label, count_parameters(net))

GPT-2 Medium: 406212608
GPT-2 Large: 838220800
GPT-2 XL: 1637792000


Exercise 4.3 Using separate dropout parameters
At the beginning of this chapter, we defined a global drop_rate setting in the GPT_CONFIG_124M dictionary to set the dropout rate in various places throughout the GPTModel architecture. Change the code to specify a separate dropout value for the various dropout layers throughout the model architecture. (Hint: there are three distinct places where we used dropout layers: the embedding layer, shortcut layer, and multi-head attention module.)

To do that, we need new versions of TransformerBlock and GPTModel that read a separate dropout rate from the configuration for each of these places.

In [44]:
class TransformerBlockSeparateDropout(nn.Module):
    """Transformer block whose dropout rates come from separate config keys.

    Args:
        cfg: Configuration mapping; reads ``emb_dim``, ``context_length``,
            ``n_heads``, ``qkv_bias``, ``drop_attn`` (passed to the attention
            module) and ``drop_resid`` (used on each sub-layer output before
            the shortcut is added).
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        attention_kwargs = {
            "d_in": emb_dim,
            "d_out": emb_dim,
            "context_length": cfg["context_length"],
            "num_heads": cfg["n_heads"],
            "dropout": cfg["drop_attn"],
            "qkv_bias": cfg["qkv_bias"],
        }
        self.att = MultiHeadAttention(**attention_kwargs)
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # One dropout module serves both shortcut connections.
        self.drop_shortcut = nn.Dropout(cfg["drop_resid"])

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward sub-layers."""
        # Attention sub-layer: normalize, attend, drop, add the shortcut.
        x = self.drop_shortcut(self.att(self.norm1(x))) + x
        # Feed-forward sub-layer: normalize, transform, drop, add the shortcut.
        x = self.drop_shortcut(self.ff(self.norm2(x))) + x
        return x


In [45]:
class GPTModelSeparateDropout(nn.Module):
    """GPT-style model whose embedding dropout rate is read from ``drop_emb``.

    Args:
        cfg: Configuration mapping; reads ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_emb`` and ``n_layers`` here and is
            passed on unchanged to every ``TransformerBlockSeparateDropout``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_emb"])

        blocks = []
        for _ in range(cfg["n_layers"]):
            blocks.append(TransformerBlockSeparateDropout(cfg))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token indices of shape (batch, seq_len) to vocabulary logits."""
        _, seq_len = in_idx.shape
        # Position ids are created on the same device as the token ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.drop_emb(self.tok_emb(in_idx) + self.pos_emb(positions))
        hidden = self.final_norm(self.trf_blocks(hidden))
        return self.out_head(hidden)


In [46]:
# 124M configuration with one dropout rate per location instead of a single
# global "drop_rate".
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_emb=0.1,     # embedding dropout
    drop_attn=0.2,    # dropout inside the attention module
    drop_resid=0.15,  # dropout on the shortcut connections
    qkv_bias=False,
)


In [47]:
# Build the separate-dropout model; this rebinds the notebook-level name
# `model`, which an earlier cell assigned to a GPTModel instance.
model = GPTModelSeparateDropout(GPT_CONFIG_124M)
