In [22]:
# In this notebook, you learn:
#
# 1) What is Layer Normalization?
# 2) How to use Layer Normalization in Transformers?

In [2]:
import torch
from torch import nn

In [3]:
# Resources to go through before continuing further in this notebook:
#
# 1) https://www.youtube.com/watch?v=tNIpEZLv_eg
#       -- This video explains the concept of Batch Normalization in a very simple way.
#       -- It is useful to understand Batch Normalization before understanding Layer Normalization.
# 2) https://www.youtube.com/watch?v=em6dfRxYkYU
#       -- This video gives intuition on why Batch Normalization works.
# 3) https://www.kaggle.com/code/halflingwizard/how-does-layer-normalization-work
#       -- This blog explains the concept of Layer Normalization.
# 4) https://www.youtube.com/watch?v=2V3Uduw1zwQ&t=103s
#       -- This video runs Layer Normalization on a simple example and explains the results.
# 5) https://leimao.github.io/blog/Layer-Normalization/
#       -- Explains the mathematics behind Layer Normalization.

## [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html#torch.nn.LayerNorm)

In [4]:
# Toy batch for the first example: 4 inputs (one per row), 5 features each (one per column),
# filled with the values 0..19.
t1 = torch.arange(20, dtype=torch.float32).view(4, 5)
print("shape: ", t1.size())
print("t1: ", t1)

shape:  torch.Size([4, 5])
t1:  tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.]])


In [5]:
# Layer Normalization treats every input independently: for each row (a single input) it computes
# the mean and variance over that row's features and normalizes those same features with them.
#
#   LayerNorm(x) = gamma * (x - mean) / sqrt(variance + epsilon) + beta
#
# where x is the input, mean and variance are taken over the features of that input, gamma and
# beta are learnable parameters, and epsilon is a small constant that avoids division by zero.

# normalized_shape is the number of features per input.
layer_norm_1 = nn.LayerNorm(5)
print("layer_norm: ", layer_norm_1)
print("-" * 150)
# gamma (weight) and beta (bias) both have one entry per feature, so the model learns a separate
# scaling factor and a separate shift for every feature.
print("shape of gamma or weights: ", layer_norm_1.weight.size())
print("gamma or weights: ", layer_norm_1.weight)
print("-" * 150)
print("shape of beta or bias: ", layer_norm_1.bias.size())
print("beta or bias: ", layer_norm_1.bias)

layer_norm:  LayerNorm((5,), eps=1e-05, elementwise_affine=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of gamma or weights:  torch.Size([5])
gamma or weights:  Parameter containing:
tensor([1., 1., 1., 1., 1.], requires_grad=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of beta or bias:  torch.Size([5])
beta or bias:  Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)


In [6]:
# Apply Layer Normalization to t1 and check the result against a manual calculation.
# Worked example for the first input, [0, 1, 2, 3, 4] (epsilon ignored for readability):
#   mean = (0 + 1 + 2 + 3 + 4) / 5 = 2
#   standard deviation (population form, i.e. divide by N)
#        = sqrt(((0 - 2)^2 + (1 - 2)^2 + (2 - 2)^2 + (3 - 2)^2 + (4 - 2)^2) / 5) = sqrt(2)
#   normalized = [(0 - 2) / sqrt(2), (1 - 2) / sqrt(2), (2 - 2) / sqrt(2), (3 - 2) / sqrt(2), (4 - 2) / sqrt(2)]
#              = [-1.41, -0.71, 0, 0.71, 1.41]
t1_normalized_1 = layer_norm_1(t1)
print("shape: ", t1_normalized_1.shape)
print("normalized_input: ", t1_normalized_1)

# Actually perform the verification promised above: recompute the formula by hand for every row
# (biased variance over the last dimension, same eps/gamma/beta as the module) and compare.
t1_normalized_manual = (
    layer_norm_1.weight
    * (t1 - t1.mean(dim=-1, keepdim=True))
    / torch.sqrt(t1.var(dim=-1, unbiased=False, keepdim=True) + layer_norm_1.eps)
    + layer_norm_1.bias
)
assert torch.allclose(t1_normalized_1, t1_normalized_manual, atol=1e-5), "LayerNorm output differs from the manual calculation"

shape:  torch.Size([4, 5])
normalized_input:  tensor([[-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
        [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
        [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
        [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142]],
       grad_fn=<NativeLayerNormBackward0>)


In [7]:
# With an integer normalized_shape, LayerNorm normalizes along the last dimension of the input,
# no matter how many leading dimensions the tensor has.
# Next, see what Layer Normalization does to a 3D tensor.
t2 = torch.arange(24, dtype=torch.float32).view(2, 3, 4)
print("shape: ", t2.size())
print("t2: ", t2)

shape:  torch.Size([2, 3, 4])
t2:  tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]],

        [[12., 13., 14., 15.],
         [16., 17., 18., 19.],
         [20., 21., 22., 23.]]])


In [8]:
# normalized_shape=4 matches the size of t2's last dimension.
layer_norm_2_1 = nn.LayerNorm(4)
print("layer_norm: ", layer_norm_2_1)
print("-" * 150)
print("shape of gamma or weights: ", layer_norm_2_1.weight.size())
print("gamma or weights: ", layer_norm_2_1.weight)
print("-" * 150)
print("shape of beta or bias: ", layer_norm_2_1.bias.size())
print("beta or bias: ", layer_norm_2_1.bias)
print("-" * 150)
# Same behaviour as shown for t1: the normalization runs across the last dimension.
t2_normalized_1 = layer_norm_2_1(t2)
print("shape: ", t2_normalized_1.size())
print("t2_normalized_1: ", t2_normalized_1)

layer_norm:  LayerNorm((4,), eps=1e-05, elementwise_affine=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of gamma or weights:  torch.Size([4])
gamma or weights:  Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of beta or bias:  torch.Size([4])
beta or bias:  Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape:  torch.Size([2, 3, 4])
t2_normalized_1:  tensor([[[-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416]],

        [[-1.3416, -0.4472,  0.4472,  1

In [9]:
# Deliberate failure demo: normalized_shape=3 does not match the size of t2's last dimension (4).
layer_norm_2_error = nn.LayerNorm(normalized_shape=3)
print("layer_norm: ", layer_norm_2_error)
# LayerNorm normalizes across the last dimension(s), so normalized_shape must equal the size of
# the last dimension of the input tensor. When the expected shape (normalized_shape, i.e. the
# number of features) doesn't match, the forward pass raises a RuntimeError.
# The error is caught and printed so that the demonstration stays visible without aborting
# "Restart Kernel -> Run All".
try:
    t2_normalized_error = layer_norm_2_error(t2)
except RuntimeError as error:
    print(f"RuntimeError: {error}")
else:
    # Guard the demonstration itself: reaching this point means no error was raised.
    raise AssertionError("expected LayerNorm(normalized_shape=3) to reject an input whose last dimension is 4")

layer_norm:  LayerNorm((3,), eps=1e-05, elementwise_affine=True)


RuntimeError: Given normalized_shape=[3], expected input with shape [*, 3], but got input of size[2, 3, 4]

In [10]:
# Next: what does it mean to apply Layer Normalization across multiple dimensions? We reuse t2 as the
# input. Because normalized_shape has 2 entries here, LayerNorm normalizes across the *last 2*
# dimensions of the input.
layer_norm_2_2 = nn.LayerNorm((3, 4))
print("layer_norm_2_2: ", layer_norm_2_2)
print("-" * 150)
# The weights and bias have exactly the shape given by normalized_shape: every element of the
# normalized block gets its own gamma and beta, i.e. the model learns a separate scaling factor and
# shift for each of them.
print("shape of gamma or weights: ", layer_norm_2_2.weight.size())
print("gamma or weights: ", layer_norm_2_2.weight)
print("-" * 150)
print("shape of beta or bias: ", layer_norm_2_2.bias.size())
print("beta or bias: ", layer_norm_2_2.bias)
print("-" * 150)
# Taking the last 2 dimensions together splits the input into groups of 2D tensors. Mean and variance
# are computed independently per group and used to normalize only the values inside that group, so
# every element belongs to exactly one group and is normalized by exactly one mean/variance pair.
# For t2:
#   group 1 = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
#   group 2 = [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
#
# For the first group:
#   mean = (0 + 1 + 2 + ... + 10 + 11) / 12 = 5.5
#   variance = ((0 - 5.5)^2 + (1 - 5.5)^2 + ... + (10 - 5.5)^2 + (11 - 5.5)^2) / 12 = 11.92
#   standard deviation = sqrt(variance) = 3.45
#
# Normalized values of the first group (ignoring epsilon, gamma and beta for now):
#   [[(0 - 5.5) / 3.45, (1 - 5.5) / 3.45, (2 - 5.5) / 3.45, (3 - 5.5) / 3.45],
#    [(4 - 5.5) / 3.45, (5 - 5.5) / 3.45, (6 - 5.5) / 3.45, (7 - 5.5) / 3.45],
#    [(8 - 5.5) / 3.45, (9 - 5.5) / 3.45, (10 - 5.5) / 3.45, (11 - 5.5) / 3.45]]
t2_normalized_2 = layer_norm_2_2(t2)
print("shape: ", t2_normalized_2.size())
print("t2_normalized_2: ", t2_normalized_2)

layer_norm_2_2:  LayerNorm((3, 4), eps=1e-05, elementwise_affine=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of gamma or weights:  torch.Size([3, 4])
gamma or weights:  Parameter containing:
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape of beta or bias:  torch.Size([3, 4])
beta or bias:  Parameter containing:
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], requires_grad=True)
------------------------------------------------------------------------------------------------------------------------------------------------------
shape:  torch.Size([2, 3, 4])
t2_normalized_2:  tensor([[[-1.5933, -1.3036, -1.0139, -0.7242],
         [-0.43

In [11]:
# Similarly, if normalized_shape has 3 entries, the normalization is done across the last 3 dimensions.
# This works the same way as applying LayerNorm on the last 2 dimensions, as shown above.

## Layer Normalization In Transformers

In [40]:
# Layer Normalization in transformers is used the same way as shown above. The only difference is that the
# input to the LayerNorm is the output of the Multi-Head Attention or Feed Forward Neural Network. So, the
# input is a 3D tensor and LayerNorm is applied on the last dimension. The normalized_shape is the size of
# the last dimension of the input tensor.
#
# Although the 'Annotated Transformer' uses its own implementation of LayerNorm, we will be using
# PyTorch's implementation of LayerNorm.

In [35]:
# Toy dimensions for the transformer example; d_model is the size of the last (feature) dimension.
batch_size, sequence_length, d_model = 3, 4, 5

In [36]:
# Dummy transformer activations of shape (batch_size, sequence_length, d_model), filled with
# consecutive values starting at 0.
transformer_input = torch.arange(
    batch_size * sequence_length * d_model, dtype=torch.float32
).view(batch_size, sequence_length, d_model)
print("shape: ", transformer_input.size())
print("transformer_input: ", transformer_input)

shape:  torch.Size([3, 4, 5])
transformer_input:  tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.,  9.],
         [10., 11., 12., 13., 14.],
         [15., 16., 17., 18., 19.]],

        [[20., 21., 22., 23., 24.],
         [25., 26., 27., 28., 29.],
         [30., 31., 32., 33., 34.],
         [35., 36., 37., 38., 39.]],

        [[40., 41., 42., 43., 44.],
         [45., 46., 47., 48., 49.],
         [50., 51., 52., 53., 54.],
         [55., 56., 57., 58., 59.]]])


In [38]:
# Normalize over the last dimension of the transformer input, whose size is d_model.
transformer_layer_norm = nn.LayerNorm(d_model)
print("transformer_layer_norm: ", transformer_layer_norm)
print("weights: ", transformer_layer_norm.weight)
print("bias: ", transformer_layer_norm.bias)

transformer_layer_norm:  LayerNorm((5,), eps=1e-05, elementwise_affine=True)
weights:  Parameter containing:
tensor([1., 1., 1., 1., 1.], requires_grad=True)
bias:  Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)


In [39]:
# Run the layer norm on the dummy transformer input; the output keeps the input's shape.
normalized_output = transformer_layer_norm(transformer_input)
print("shape: ", normalized_output.size())
print("normalized_input: ", normalized_output)

shape:  torch.Size([3, 4, 5])
normalized_input:  tensor([[[-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142]],

        [[-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142]],

        [[-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142],
         [-1.4142, -0.7071,  0.0000,  0.7071,  1.4142]]],
       grad_fn=<NativeLayerNormBackward0>)
