In [1]:
V = 50257 # Vocab Size
T = 1024 # Context Length
L = 48 # Num Layers
d = 1600 # Model Dim
H = 25 # Num Heads
d_ff = 6400 # Feedforward Dimension

In [2]:
# (a) Parameter Count

# Embedding (Token + Positional)
token = V * d
positional = T * d

embedding_total = token + positional

# Per Layer (Attention + MLP)

# Attention (4 Matrices of shape (d, d) -> Q, K, V, Output)
attention = 4 * (d * d)

# MLP (2 Matrices of shape (d, d_ff) and (d_ff, d))
mlp = 2 * (d * d_ff)

# Layer Normalization (2 per layer)
layer_norm = 2 * (2 * d)  # 2 parameters (scale & shift)

param_per_layer = attention + mlp + layer_norm

total = embedding_total + (param_per_layer * L)

print(f"Total Parameters: {total / 1e9:.2f} Billion")

# Memory Load
total_bytes = total * 4  # 4 bytes per parameter (float32)
total_gb = total_bytes / (1024 ** 3)

print(f"Memory Load: {total_gb:.2f} GB")

Total Parameters: 1.56 Billion
Memory Load: 5.80 GB


In [3]:
# (b) Matrix Multiplication to complete a forward pass
# How many FLOPs matrix multiplications are needed

num_flops = 2 * T * L * (
    (4 * d * d) +  # Attention
    (2 * d * d_ff)  # MLP
)

print(f"Total FLOPs for Forward Pass: {num_flops / 1e12:.2f} TFLOPs")

Total FLOPs for Forward Pass: 3.02 TFLOPs


In [4]:
# (c) Which parts of the model require the most FLOPs?

print(f"Flops in Attention: { (2 * T * L * 4 * d * d) / num_flops * 100:.2f}%")
print(f"Flops in MLP: { (2 * T * L * 2 * d * d_ff) / num_flops * 100:.2f}%")

Flops in Attention: 33.33%
Flops in MLP: 66.67%


In [5]:
# (d) Analysis 1 (GPT-2 Small)
V = 50257 # Vocab Size
T = 1024 # Context Length
L = 12 # Num Layers
d = 768 # Model Dim
H = 12 # Num Heads
d_ff = 6400 # Feedforward Dimension

# (a) Parameter Count

# Embedding (Token + Positional)
token = V * d
positional = T * d

embedding_total = token + positional

# Per Layer (Attention + MLP)

# Attention (4 Matrices of shape (d, d) -> Q, K, V, Output)
attention = 4 * (d * d)

# MLP (2 Matrices of shape (d, d_ff) and (d_ff, d))
mlp = 2 * (d * d_ff)

# Layer Normalization (2 per layer)
layer_norm = 2 * (2 * d)  # 2 parameters (scale & shift)

param_per_layer = attention + mlp + layer_norm

total = embedding_total + (param_per_layer * L)

print(f"Total Parameters: {total / 1e9:.2f} Billion")

# Memory Load
total_bytes = total * 4  # 4 bytes per parameter (float32)
total_gb = total_bytes / (1024 ** 3)

print(f"Memory Load: {total_gb:.2f} GB")

# (b) Matrix Multiplication to complete a forward pass
# How many FLOPs matrix multiplications are needed

num_flops = 2 * T * L * (
    (4 * d * d) +  # Attention
    (2 * d * d_ff)  # MLP
)

print(f"Total FLOPs for Forward Pass: {num_flops / 1e12:.2f} TFLOPs")

# (c) Which parts of the model require the most FLOPs?

print(f"Flops in Attention: { (2 * T * L * 4 * d * d) / num_flops * 100:.2f}%")
print(f"Flops in MLP: { (2 * T * L * 2 * d * d_ff) / num_flops * 100:.2f}%")

Total Parameters: 0.19 Billion
Memory Load: 0.69 GB
Total FLOPs for Forward Pass: 0.30 TFLOPs
Flops in Attention: 19.35%
Flops in MLP: 80.65%


In [6]:
# Analysis 2 (GPT-2 Medium)
V = 50257 # Vocab Size
T = 1024 # Context Length
L = 24 # Num Layers
d = 1024 # Model Dim
H = 16 # Num Heads
d_ff = 6400 # Feedforward Dimension

# (a) Parameter Count

# Embedding (Token + Positional)
token = V * d
positional = T * d

embedding_total = token + positional

# Per Layer (Attention + MLP)

# Attention (4 Matrices of shape (d, d) -> Q, K, V, Output)
attention = 4 * (d * d)

# MLP (2 Matrices of shape (d, d_ff) and (d_ff, d))
mlp = 2 * (d * d_ff)

# Layer Normalization (2 per layer)
layer_norm = 2 * (2 * d)  # 2 parameters (scale & shift)

param_per_layer = attention + mlp + layer_norm

total = embedding_total + (param_per_layer * L)

print(f"Total Parameters: {total / 1e9:.2f} Billion")

# Memory Load
total_bytes = total * 4  # 4 bytes per parameter (float32)
total_gb = total_bytes / (1024 ** 3)

print(f"Memory Load: {total_gb:.2f} GB")


num_flops = 2 * T * L * (
    (4 * d * d) +  # Attention
    (2 * d * d_ff)  # MLP
)

print(f"Total FLOPs for Forward Pass: {num_flops / 1e12:.2f} TFLOPs")

print(f"Flops in Attention: { (2 * T * L * 4 * d * d) / num_flops * 100:.2f}%")
print(f"Flops in MLP: { (2 * T * L * 2 * d * d_ff) / num_flops * 100:.2f}%")

Total Parameters: 0.47 Billion
Memory Load: 1.74 GB
Total FLOPs for Forward Pass: 0.85 TFLOPs
Flops in Attention: 24.24%
Flops in MLP: 75.76%


In [7]:
# Analysis 2 (GPT-2 Large)
V = 50257 # Vocab Size
T = 1024 # Context Length
L = 36 # Num Layers
d = 1280 # Model Dim
H = 20 # Num Heads
d_ff = 6400 # Feedforward Dimension

# (a) Parameter Count

# Embedding (Token + Positional)
token = V * d
positional = T * d

embedding_total = token + positional

# Per Layer (Attention + MLP)

# Attention (4 Matrices of shape (d, d) -> Q, K, V, Output)
attention = 4 * (d * d)

# MLP (2 Matrices of shape (d, d_ff) and (d_ff, d))
mlp = 2 * (d * d_ff)

# Layer Normalization (2 per layer)
layer_norm = 2 * (2 * d)  # 2 parameters (scale & shift)

param_per_layer = attention + mlp + layer_norm

total = embedding_total + (param_per_layer * L)

print(f"Total Parameters: {total / 1e9:.2f} Billion")

# Memory Load
total_bytes = total * 4  # 4 bytes per parameter (float32)
total_gb = total_bytes / (1024 ** 3)

print(f"Memory Load: {total_gb:.2f} GB")


num_flops = 2 * T * L * (
    (4 * d * d) +  # Attention
    (2 * d * d_ff)  # MLP
)

print(f"Total FLOPs for Forward Pass: {num_flops / 1e12:.2f} TFLOPs")


print(f"Flops in Attention: { (2 * T * L * 4 * d * d) / num_flops * 100:.2f}%")
print(f"Flops in MLP: { (2 * T * L * 2 * d * d_ff) / num_flops * 100:.2f}%")

Total Parameters: 0.89 Billion
Memory Load: 3.32 GB
Total FLOPs for Forward Pass: 1.69 TFLOPs
Flops in Attention: 28.57%
Flops in MLP: 71.43%


In [11]:
# (e) Increase context length to 16,384. How does FlOPs for one forward pass change? 
# How does relative contribution of FLOPs of the model components change?

V = 50257 # Vocab Size
T = 16384 # Context Length
L = 48 # Num Layers
d = 1600 # Model Dim
H = 25 # Num Heads
d_ff = 6400 # Feedforward Dimension

flops_linear_per_layer = 2 * T * ( (4*d*d) + (2*d*d_ff) )

flops_quadratic_per_layer = 2 * (2 * T * T * d)

flops_logits = 2 * T * d * V

total_flops = L * (flops_linear_per_layer + flops_quadratic_per_layer) + flops_logits

prop_linear = (L * flops_linear_per_layer) / total_flops
prop_quadratic = (L * flops_quadratic_per_layer) / total_flops

print(f"Total FLOPs: {total_flops / 1e12:.2f} TFLOPs")
print(f"Relative Contribution of Components:")
print(f"  - Linear Projections (Dense Layers): {prop_linear:.1%}")
print(f"  - Attention Mechanism (Quadratic):   {prop_quadratic:.1%}")

Total FLOPs: 133.42 TFLOPs
Relative Contribution of Components:
  - Linear Projections (Dense Layers): 36.2%
  - Attention Mechanism (Quadratic):   61.8%
