In [1]:
import torch
import torch.nn as nn

# 1. The FeedForwardBlock class 
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)    # W1: d_model → d_ff, B1: d_ff
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)    # W2: d_ff → d_model, B2: d_model
    
    def forward(self, x):
        # Shape: (batch, seq_len, d_model) → (batch, seq_len, d_ff) → (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [2]:
print("=== TRANSFORMER FEEDFORWARD BLOCK STEP-BY-STEP ===\n")

# 2. Hyperparameters (tiny for easy reading)
d_model = 4    # Input/output dimension
d_ff = 8       # Hidden dimension (usually 4x larger)
dropout = 0.1  # Dropout rate
batch_size = 2
seq_len = 3

=== TRANSFORMER FEEDFORWARD BLOCK STEP-BY-STEP ===



In [3]:

# 3. Create tiny sample input: (batch=2, seq=3, d_model=4)
x = torch.arange(24).float().reshape(batch_size, seq_len, d_model)
print("Step 1 - Input x shape:", x.shape)
print("Input x:\n", x)
print()

Step 1 - Input x shape: torch.Size([2, 3, 4])
Input x:
 tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]],

        [[12., 13., 14., 15.],
         [16., 17., 18., 19.],
         [20., 21., 22., 23.]]])



In [6]:
# 4. Create the FeedForward block
ff_block = FeedForwardBlock(d_model=d_model, d_ff=d_ff, dropout=dropout)
print("Step 2 - Created FeedForwardBlock(d_model=4, d_ff=8)")
print("linear_1 weights shape:", ff_block.linear_1.weight.shape)  # [8, 4]
print("linear_2 weights shape:", ff_block.linear_2.weight.shape)  # [4, 8]
print("linear_1 bias shape:", ff_block.linear_1.bias.shape)  # [8]
print("linear_2 bias shape:", ff_block.linear_2.bias.shape)  # [4]
print()

Step 2 - Created FeedForwardBlock(d_model=4, d_ff=8)
linear_1 weights shape: torch.Size([8, 4])
linear_2 weights shape: torch.Size([4, 8])
linear_1 bias shape: torch.Size([8])
linear_2 bias shape: torch.Size([4])



In [16]:
# 5. STEP 1: linear_1(x) - First linear expansion
# nn.Linear: x @ W^T + b  (matrix multiplication + bias)
# Shape: (2,3,4) → (2,3,8)
print('original input :',x)
hidden = ff_block.linear_1(x)
print("Step 3 - After linear_1(x): shape", hidden.shape)
print('Output after passing through the first linear layer',hidden)
print("hidden (first few values):\n", hidden[:,:,:2])  # Show first 2 dims for readability
print()

original input : tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]],

        [[12., 13., 14., 15.],
         [16., 17., 18., 19.],
         [20., 21., 22., 23.]]])
Step 3 - After linear_1(x): shape torch.Size([2, 3, 8])
Output after passing through the first linear layer tensor([[[-2.2255, -0.1673,  1.5966, -1.5649, -0.1716,  0.4000,  0.1738,
          -0.5781],
         [-3.7350,  0.6146,  4.0118, -2.1933,  1.8573,  3.0914,  3.0974,
           0.1763],
         [-5.2446,  1.3965,  6.4269, -2.8217,  3.8862,  5.7829,  6.0210,
           0.9307]],

        [[-6.7541,  2.1784,  8.8421, -3.4501,  5.9151,  8.4744,  8.9446,
           1.6852],
         [-8.2637,  2.9603, 11.2573, -4.0785,  7.9440, 11.1658, 11.8682,
           2.4396],
         [-9.7732,  3.7423, 13.6725, -4.7069,  9.9728, 13.8573, 14.7918,
           3.1940]]], grad_fn=<ViewBackward0>)
hidden (first few values):
 tensor([[[-2.2255, -0.1673],
         [-3.7350,  0.6146],
         [-5

In [17]:
# 6. STEP 2: torch.relu() - ReLU activation
# ReLU: max(0, x) - kills all negative values
# Shape unchanged: (2,3,8)
relu_out = torch.relu(hidden)
print("Step 4 - After ReLU: kills negatives")
print("Before ReLU (had negatives):", (hidden < 0).sum().item(), "negative values")
print("After ReLU (all non-negative):", (relu_out < 0).sum().item(), "negative values")
print('Output after Relu',relu_out)
print("relu_out (first few):\n", relu_out[:,:,:2])
print()

Step 4 - After ReLU: kills negatives
Before ReLU (had negatives): 15 negative values
After ReLU (all non-negative): 0 negative values
Output after Relu tensor([[[ 0.0000,  0.0000,  1.5966,  0.0000,  0.0000,  0.4000,  0.1738,
           0.0000],
         [ 0.0000,  0.6146,  4.0118,  0.0000,  1.8573,  3.0914,  3.0974,
           0.1763],
         [ 0.0000,  1.3965,  6.4269,  0.0000,  3.8862,  5.7829,  6.0210,
           0.9307]],

        [[ 0.0000,  2.1784,  8.8421,  0.0000,  5.9151,  8.4744,  8.9446,
           1.6852],
         [ 0.0000,  2.9603, 11.2573,  0.0000,  7.9440, 11.1658, 11.8682,
           2.4396],
         [ 0.0000,  3.7423, 13.6725,  0.0000,  9.9728, 13.8573, 14.7918,
           3.1940]]], grad_fn=<ReluBackward0>)
relu_out (first few):
 tensor([[[0.0000, 0.0000],
         [0.0000, 0.6146],
         [0.0000, 1.3965]],

        [[0.0000, 2.1784],
         [0.0000, 2.9603],
         [0.0000, 3.7423]]], grad_fn=<SliceBackward0>)



In [21]:
# 7. STEP 3: self.dropout() - Randomly zero some values during training
# nn.Dropout: randomly sets p% of elements to 0, scales rest by 1/(1-p)
# Shape unchanged: (2,3,8)
dropout_out = ff_block.dropout(relu_out)
print('before droput',relu_out)
print("Linear Layer - Relu, before droput :", (relu_out == 0).sum().item(), "zeros")
print("*"*30)
print("Step 5 - After dropout (training mode):")
print("Some values zeroed out:", (dropout_out == 0).sum().item(), "zeros")
print('after droput',dropout_out)
print("*"*30)
print("dropout_out (first few):\n", dropout_out[:,:,:2])
print()

before droput tensor([[[ 0.0000,  0.0000,  1.5966,  0.0000,  0.0000,  0.4000,  0.1738,
           0.0000],
         [ 0.0000,  0.6146,  4.0118,  0.0000,  1.8573,  3.0914,  3.0974,
           0.1763],
         [ 0.0000,  1.3965,  6.4269,  0.0000,  3.8862,  5.7829,  6.0210,
           0.9307]],

        [[ 0.0000,  2.1784,  8.8421,  0.0000,  5.9151,  8.4744,  8.9446,
           1.6852],
         [ 0.0000,  2.9603, 11.2573,  0.0000,  7.9440, 11.1658, 11.8682,
           2.4396],
         [ 0.0000,  3.7423, 13.6725,  0.0000,  9.9728, 13.8573, 14.7918,
           3.1940]]], grad_fn=<ReluBackward0>)
Linear Layer - Relu, before droput : 15 zeros
******************************
Step 5 - After dropout (training mode):
Some values zeroed out: 21 zeros
after droput tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.4444,  0.0000,
           0.0000],
         [ 0.0000,  0.6829,  4.4575,  0.0000,  0.0000,  0.0000,  0.0000,
           0.1959],
         [ 0.0000,  1.5517,  7.1411,  0.0000,  4.3

In [22]:
# 8. STEP 4: linear_2() - Second linear projection back to d_model
# Shape: (2,3,8) → (2,3,4)
output = ff_block.linear_2(dropout_out)
print("Step 6 - After linear_2(): back to d_model=4")
print("Final output shape:", output.shape)
print("Final output:\n", output)
print()

Step 6 - After linear_2(): back to d_model=4
Final output shape: torch.Size([2, 3, 4])
Final output:
 tensor([[[-0.1719,  0.0875,  0.0182,  0.1130],
         [-0.5034,  0.4950, -1.1832,  0.4113],
         [-0.4243,  0.4544,  1.0347,  1.3692]],

        [[-0.4609,  0.3650,  1.8865,  1.9850],
         [-0.4975,  0.2756,  2.7382,  2.6008],
         [ 0.8554, -2.1799,  8.1412,  2.1299]]], grad_fn=<ViewBackward0>)



In [23]:
# 9. END-TO-END: Just call the module!
print("Step 7 - END-TO-END: ff_block(x)")
direct_output = ff_block(x)
print("direct_output shape:", direct_output.shape)
print("Matches step-by-step?", torch.allclose(output, direct_output))
print()

Step 7 - END-TO-END: ff_block(x)
direct_output shape: torch.Size([2, 3, 4])
Matches step-by-step? False



In [24]:
# 10. Compare input vs output (shows transformation)
print("Step 8 - INPUT vs OUTPUT comparison:")
print("Input range:  ", x.min().item(), "to", x.max().item())
print("Output range:", output.min().item(), "to", output.max().item())
print("Output is NON-LINEAR transformation of input!")

Step 8 - INPUT vs OUTPUT comparison:
Input range:   0.0 to 23.0
Output range: -2.1798744201660156 to 8.14122200012207
Output is NON-LINEAR transformation of input!


Step 3 - After linear_1: (2,3,8)  ← EXPANDED from 4→8 dims
Step 4 - After ReLU: all ≥0      ← NON-LINEARITY introduced
Step 5 - After dropout: ~10% zeros ← REGULARIZATION
Step 6 - After linear_2: (2,3,4) ← BACK to original dim
Key insights from "Attention is All You Need"
Paper reference (Section 3.3):
FFN(x) = max(0, xW₁ + b₁)W₂ + b₂

Expands to higher dimension (d_ff = 2048 typically, 4x d_model=512)

ReLU adds non-linearity

Contracts back to d_model

Pointwise (independent per position)

Why this design?
Expansion: More capacity for complex transformations

ReLU: Non-linearity (linear→linear = boring)

Dropout: Prevents overfitting

Back to d_model: Compatible with attention residuals

## Understanding Importance of Bias Term 

In [7]:
# NO BIAS: y = w1*x1 + w2*x2
# All decision boundaries MUST pass through (0,0)

import matplotlib.pyplot as plt
import numpy as np

x1 = np.array([1, 2, 3])
x2 = np.array([2, 4, 6])
y = np.array([0, 1, 1])  # Want to separate these

# Without bias: line must go through origin
w1, w2 = 1, 0.5
line_no_bias = w1 * x1 + w2 * x2  # Always passes through (0,0)
print("Without bias at (0,0):", w1*0 + w2*0)  # 0

Matplotlib is building the font cache; this may take a moment.


Without bias at (0,0): 0.0


In [25]:
# Problem: If your data doesn't cluster around origin, no-bias model struggles.

# WITH bias: Can shift anywhere
# python
# WITH BIAS: y = w1*x1 + w2*x2 + b
b = -1.0  # Shift the line down
line_with_bias = w1 * x1 + w2 * x2 + b
print("With bias at (0,0):", b)  # -1.0 (shifted!)

With bias at (0,0): -1.0


In [26]:
# In neural networks (your FeedForward)
# Each nn.Linear(d_model, d_ff) automatically includes bias:

linear_1 = nn.Linear(4, 8)  # Has weight (8,4) + bias (8,)
# Computation: x @ W^T + b  ← b shifts every output neuron


In [27]:
# Without bias (nn.Linear(..., bias=False)):

# hidden[i] = sum(x[j] * W[i,j])  # Must be 0 when x=0
# With bias:

# hidden[i] = sum(x[j] * W[i,j]) + b[i]  # Can be anything when x=0

x_all_zero = torch.zeros(2, 3, 4)  # All inputs = 0
print('Input Tensor',x_all_zero)
ff_block = FeedForwardBlock(4, 8, 0.1) ## input dim , hidden dim , dropout prob
zero_output = ff_block(x_all_zero)
print('Output after FFN',zero_output)
print("Input all zeros → Output:", zero_output[0,0])  # NOT zero! Thanks to biases

Input Tensor tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])
Output after FFN tensor([[[ 0.0029, -0.0089,  0.2006, -0.1386],
         [ 0.0018, -0.0075,  0.2000, -0.1376],
         [ 0.0029, -0.0089,  0.2006, -0.1386]],

        [[ 0.0979,  0.0076,  0.2264, -0.2689],
         [ 0.0029, -0.0089,  0.2006, -0.1386],
         [ 0.0370,  0.0981,  0.0927, -0.2013]]], grad_fn=<ViewBackward0>)
Input all zeros → Output: tensor([ 0.0029, -0.0089,  0.2006, -0.1386], grad_fn=<SelectBackward0>)


Biases ensure even zero-input produces meaningful activation.

Why biases matter in Transformers
Residual connections: x + FF(x) needs FF(x) to have flexibility

Non-zero activations: Even if attention output is small, FF can still activate

Shift invariance: Model learns patterns regardless of input scale/origin