In [27]:
import torch
import torch.nn as nn

# 1. Create the LayerNorm module
class LayerNormalization(nn.Module):
    def __init__(self, eps: float = 1e-5) -> None:  # Fixed eps to 1e-5 (standard value)
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))  # Learnable scale
        self.bias = nn.Parameter(torch.zeros(1))  # Learnable shift
    
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias



In [28]:
# 2. Create tiny sample data: batch=2, seq_len=3, features=4
print("=== STEP-BY-STEP LAYER NORMALIZATION / FEATURE NORMALIZATION ===\n")

x = torch.tensor([[[1.0, 2.0, 3.0, 4.0],      # Sentence 1 -eg : dogs love meat
                   [5.0, 6.0, 7.0, 8.0],
                   [9.0, 10.0, 11.0, 12.0]],

                  [[13.0, 14.0, 15.0, 16.0],    # Sentence 2 -eg : cats love fish
                   [17.0, 18.0, 19.0, 20.0],
                   [21.0, 22.0, 23.0, 24.0]]])
print("Step 1 - Input x shape:", x.shape)
print("Input x:\n", x)
print()

=== STEP-BY-STEP LAYER NORMALIZATION / FEATURE NORMALIZATION ===

Step 1 - Input x shape: torch.Size([2, 3, 4])
Input x:
 tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]],

        [[13., 14., 15., 16.],
         [17., 18., 19., 20.],
         [21., 22., 23., 24.]]])



In [29]:
# 3. Create LayerNorm
layer_norm = LayerNormalization(eps=1e-5)
print("Step 2 - Initial alpha:", layer_norm.alpha.item())
print("Initial bias:", layer_norm.bias.item())
print(layer_norm)

Step 2 - Initial alpha: 1.0
Initial bias: 0.0
LayerNormalization()


In [30]:
### numpy implementation on single array / token 
import numpy as np
np.mean([ 1.,  2.,  3.,  4.]) , np.std([ 1.,  2.,  3.,  4.])
data = np.array([1, 2, 3, 4])
print(np.std(data))        #  (population std, ddof=0)
print(np.std(data, ddof=1)) # (sample std)
normalized=(data-np.mean(data)) / np.std(data,ddof=1)+layer_norm.eps
print(normalized)
output = layer_norm.alpha.item() * normalized + layer_norm.bias.item()
print(output)

1.118033988749895
1.2909944487358056
[-1.161885   -0.38728833  0.38730833  1.161905  ]
[-1.161885   -0.38728833  0.38730833  1.161905  ]


In [31]:
# 4. Compute mean along LAST dimension (features=dim=-1)
mean = x.mean(dim=-1, keepdim=True)
print("Step 3 - Mean along features (dim=-1):")
print("mean shape:", mean.shape)
print(mean)
print()

Step 3 - Mean along features (dim=-1):
mean shape: torch.Size([2, 3, 1])
tensor([[[ 2.5000],
         [ 6.5000],
         [10.5000]],

        [[14.5000],
         [18.5000],
         [22.5000]]])



In [32]:
# 5. Compute std along LAST dimension
std = x.std(dim=-1, keepdim=True)
print("Step 4 - Std along features (dim=-1):")
print("std shape:", std.shape)
print(std)
print()

Step 4 - Std along features (dim=-1):
std shape: torch.Size([2, 3, 1])
tensor([[[1.2910],
         [1.2910],
         [1.2910]],

        [[1.2910],
         [1.2910],
         [1.2910]]])



In [33]:
# 6. Normalize: (x - mean) / (std + eps)
normalized = (x - mean) / (std + layer_norm.eps)
print("Step 5 - Normalized: (x-mean)/(std+eps)")
print("normalized shape:", normalized.shape)
print(normalized)
print()

Step 5 - Normalized: (x-mean)/(std+eps)
normalized shape: torch.Size([2, 3, 4])
tensor([[[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]],

        [[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]]])



In [34]:
# 7. Apply learnable alpha and bias
output = layer_norm.alpha * normalized + layer_norm.bias
print("Step 6 - Final output: alpha * normalized + bias")
print("output shape:", output.shape)
print(output)
print()
# y = α × [(x - μ) / (σ + ε)] + β

Step 6 - Final output: alpha * normalized + bias
output shape: torch.Size([2, 3, 4])
tensor([[[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]],

        [[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]]], grad_fn=<AddBackward0>)



In [35]:

# 8. Verify: each row should have mean≈0, std≈1
print("Step 7 - VERIFICATION (should be mean≈0, std≈1 per row):")
print("Means per row:", output.mean(dim=-1))
print("Stds per row:", output.std(dim=-1))
print()




Step 7 - VERIFICATION (should be mean≈0, std≈1 per row):
Means per row: tensor([[0., 0., 0.],
        [0., 0., 0.]], grad_fn=<MeanBackward1>)
Stds per row: tensor([[1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000]], grad_fn=<StdBackward0>)



In [36]:
# 9. Using the module directly
print("Step 8 - Using layer_norm(x) directly:")
result = layer_norm(x)
print(result)
print("result == output?", torch.allclose(result, output))

Step 8 - Using layer_norm(x) directly:
tensor([[[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]],

        [[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]]], grad_fn=<AddBackward0>)
result == output? True
