<b><u>Exercise 4.1</u></b>: Number of parameters in feed forward and attention modules

Calculate and compare the number of parameters that are contained in the feed forward module and those in the multi-head attention module.

In [1]:
# Fix path to be able to import classes
import sys
from pathlib import Path

# Put the project's src folder on the import path so its modules can be
# imported from this notebook.
# Adjust the relative path based on where your notebook is.
src_path = (Path("..") / "src").resolve()
sys.path.append(str(src_path))

In [2]:
import tiktoken
import torch

# Tokenise two short sample prompts with the GPT-2 encoding and stack the
# resulting token-ID tensors along a new leading (batch) dimension.
tokeniser = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch = torch.stack(
    [torch.tensor(tokeniser.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [3]:
from Chapter04 import GPTModel, GPT_CONFIG_124M

# Seed PyTorch's RNG before constructing the model so repeated runs start
# from the same random state.
torch.manual_seed(42)
# Build the model from the imported GPT_CONFIG_124M configuration.
model = GPTModel(GPT_CONFIG_124M)

In [4]:
# Count every parameter held by the stack of transformer blocks.
total_tblock = sum(p.numel() for p in model.trf_blocks.parameters())
print(f"Total number of parameters in Transformer block: {total_tblock:,}")

# Take the number of blocks from the model itself rather than hard-coding 12,
# so this cell stays correct if the model is built from a different config.
# Integer division keeps the count an exact int (no float round-trip); it is
# exact as long as every block has the same parameter count.
n_blocks = len(model.trf_blocks)
per_layer = total_tblock // n_blocks
print(f"Total number of parameters in block per layer: {per_layer:,}")

Total number of parameters in Transformer block: 85,026,816
Total number of parameters in block per layer: 7,085,568


In [5]:
# Build one transformer block on its own so that its feed forward (`ff`)
# and attention (`att`) sub-modules can be measured directly.
from Chapter04 import TransformerBlock

block = TransformerBlock(GPT_CONFIG_124M)

# Tally the parameter counts first, then report both.
ff_params = sum(weight.numel() for weight in block.ff.parameters())
mha_params = sum(weight.numel() for weight in block.att.parameters())

print(f"Total number of parameters in feed forward module: {ff_params:,}")
print(f"Total number of parameters in attention module: {mha_params:,}")

Total number of parameters in feed forward module: 4,722,432
Total number of parameters in attention module: 2,360,064


The feed forward module contains approximately twice as many parameters as the attention module (4,722,432 vs. 2,360,064).

<b><u>Exercise 4.2</u></b>: Initialising larger GPT models

We initialised a 124-million-parameter GPT model. Without making any code modifications besides updating the configuration file, implement a GPT-2 medium, large, and XL model. 

In [6]:
# Medium model
# NOTE: this overwrites the size-related entries of the shared config dict
# in place, so from here on GPT_CONFIG_124M no longer describes the 124M model.
GPT_CONFIG_124M.update(emb_dim=1_024, n_layers=24, n_heads=16)
GPT_CONFIG_124M


{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 1024,
 'n_heads': 16,
 'n_layers': 24,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [7]:
# Instantiate the model from the config dict, which at this point holds the
# medium-size settings assigned in the previous cell.
model_m = GPTModel(GPT_CONFIG_124M)
# Bare expression: the notebook renders the module's repr as the cell output.
model_m

GPTModel(
  (tok_emb): Embedding(50257, 1024)
  (pos_emb): Embedding(1024, 1024)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=1024, out_features=1024, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linea

In [10]:
# Number of parameters in the medium-sized model
total_params = sum(p.numel() for p in model_m.parameters())
# Exclude the output head's parameters to get the count reported under
# weight tying.
total_params_m = total_params - sum(
    p.numel() for p in model_m.out_head.parameters()
)
print(f"Number of trainable parameters "
      f"considering weight tying: {total_params_m:,}")

# Memory footprint of all parameters (output head included). Each tensor's
# own element size is used instead of assuming 4 bytes per value, so the
# estimate stays correct if the model is not stored in float32.
total_size_bytes = sum(
    p.numel() * p.element_size() for p in model_m.parameters()
)
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total sizes of the model: {total_size_mb:.2f} MB")

Number of trainable parameters considering weight tying: 354,749,440
Total sizes of the model: 1549.58 MB


In [11]:
# Large model
# NOTE: this overwrites the size-related entries of the shared config dict
# in place; the name GPT_CONFIG_124M is kept only for continuity.
GPT_CONFIG_124M.update(emb_dim=1_280, n_layers=36, n_heads=20)
GPT_CONFIG_124M


{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 1280,
 'n_heads': 20,
 'n_layers': 36,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [12]:
# Instantiate the model from the config dict, which at this point holds the
# large-size settings.
model_l = GPTModel(GPT_CONFIG_124M)

# Number of parameters in the large-sized model
total_params = sum(p.numel() for p in model_l.parameters())
# Exclude the output head's parameters to get the count reported under
# weight tying.
total_params_l = total_params - sum(
    p.numel() for p in model_l.out_head.parameters()
)
print(f"Number of trainable parameters "
      f"considering weight tying: {total_params_l:,}")

# Memory footprint of all parameters (output head included). Each tensor's
# own element size is used instead of assuming 4 bytes per value, so the
# estimate stays correct if the model is not stored in float32.
total_size_bytes = sum(
    p.numel() * p.element_size() for p in model_l.parameters()
)
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total sizes of the model: {total_size_mb:.2f} MB")

Number of trainable parameters considering weight tying: 773,891,840
Total sizes of the model: 3197.56 MB


In [13]:
# XL model
# NOTE: this overwrites the size-related entries of the shared config dict
# in place; the name GPT_CONFIG_124M is kept only for continuity.
GPT_CONFIG_124M.update(emb_dim=1_600, n_layers=48, n_heads=25)
GPT_CONFIG_124M


{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 1600,
 'n_heads': 25,
 'n_layers': 48,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [14]:
# Instantiate the model from the config dict, which at this point holds the
# XL-size settings.
model_xl = GPTModel(GPT_CONFIG_124M)

# Number of parameters in the XL-sized model
total_params = sum(p.numel() for p in model_xl.parameters())
# Exclude the output head's parameters to get the count reported under
# weight tying.
total_params_xl = total_params - sum(
    p.numel() for p in model_xl.out_head.parameters()
)
print(f"Number of trainable parameters "
      f"considering weight tying: {total_params_xl:,}")

# Memory footprint of all parameters (output head included). Each tensor's
# own element size is used instead of assuming 4 bytes per value, so the
# estimate stays correct if the model is not stored in float32.
total_size_bytes = sum(
    p.numel() * p.element_size() for p in model_xl.parameters()
)
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total sizes of the model: {total_size_mb:.2f} MB")

Number of trainable parameters considering weight tying: 1,557,380,800
Total sizes of the model: 6247.68 MB
