<a href="https://colab.research.google.com/github/Madelavishnu/Madelavishnu/blob/main/LayerNormalizaton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🐳 Layer Normalization in Transformers

In [1]:
import torch
from torch import nn

In [2]:
# Toy batch laid out batch-first: B=1 sentence, S=2 tokens, E=3 features.
batch_first = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = batch_first.size()
# Swap the batch and sequence axes with permute(). reshape() would only
# reinterpret the flat memory and mix tokens across sentences as soon as B > 1.
# Keeping the source under its own name also makes this cell safe to re-run.
inputs = batch_first.permute(1, 0, 2)
inputs.size()

torch.Size([2, 1, 3])

In [3]:
# The learnable scale and shift span the last two axes of `inputs`.
# NOTE(review): that covers the batch axis as well as the embedding axis, while
# the class demo further down passes only the last axis - confirm which is intended.
parameter_shape = inputs.shape[-2:]

# Identity affine transform to start with: scale of ones, shift of zeros.
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))

In [6]:
# Both parameters should have exactly the shape they were built from.
(gamma.shape, beta.shape, parameter_shape)

(torch.Size([1, 3]), torch.Size([1, 3]), torch.Size([1, 3]))

In [7]:
# Negative axis indices to reduce over, counted from the last axis backwards:
# one index per entry of parameter_shape (-1, -2, ...).
dims = list(range(-1, -len(parameter_shape) - 1, -1))
dims

[-1, -2]

In [19]:
# Average over the normalized axes; keepdim=True keeps them as size-1 axes so
# the result broadcasts back against `inputs`.
mean = inputs.mean(dim=dims, keepdim=True)
(mean, mean.shape)

(tensor([[[0.2000]],
 
         [[0.2333]]]),
 torch.Size([2, 1, 1]))

In [28]:
# Small constant added under the square root so the later division by `std`
# never divides by zero.
epsilon = 1e-5
# Population variance: mean of squared deviations over the same axes as `mean`.
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [30]:
# Standardize: subtract the mean, then divide by the standard deviation.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [32]:
# Learnable affine step: scale by gamma, then shift by beta. With gamma all
# ones and beta all zeros the values equal `y`.
out = torch.add(torch.mul(gamma, y), beta)
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

# Let's put everything together


In [23]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
  """Layer normalization with a learnable scale (gamma) and shift (beta).

  The trailing ``len(parameters_shape)`` axes of the input are standardized to
  zero mean and unit variance, then transformed as ``gamma * y + beta``.

  Subclassing ``nn.Module`` registers ``gamma`` and ``beta`` as parameters, so
  they show up in ``.parameters()`` / ``state_dict()``, follow ``.to(device)``,
  and the layer can be invoked as ``layer(inputs)``.

  Args:
    parameters_shape: Shape of the trailing input axes to normalize over; also
      the shape of ``gamma`` and ``beta``.
    eps: Constant added to the variance before the square root so the division
      never hits zero.
    verbose: When True (the default, matching the original behavior), forward
      prints every intermediate tensor. Pass False to silence it.
  """

  def __init__(self, parameters_shape , eps = 1e-5, verbose = True):
    # Must run first: nn.Module cannot accept Parameter attributes before it
    # has been initialized.
    super().__init__()
    self.parameters_shape = parameters_shape
    self.eps = eps
    self.verbose = verbose
    self.gamma = nn.Parameter(torch.ones(parameters_shape))
    self.beta = nn.Parameter(torch.zeros(parameters_shape))

  def forward(self, inputs):
    """Normalize ``inputs`` over its trailing axes and apply gamma and beta.

    Args:
      inputs: Tensor whose trailing axes broadcast against ``parameters_shape``.

    Returns:
      A tensor with the same shape as ``inputs``.
    """
    # Negative indices of the axes to reduce over: -1, -2, ... one per entry
    # of parameters_shape.
    dims = [-(i+1) for i in range(len(self.parameters_shape))]

    mean = torch.mean(inputs, dim = dims, keepdim = True)
    if self.verbose:
      print(f"mean size : {mean.size()}\nmean:{mean}")

    # Population variance (divide by N), computed from the centered input.
    var = ((inputs - mean)**2).mean(dim = dims, keepdim = True)
    std = (var +self.eps).sqrt()
    if self.verbose:
      print(f"standard deviation size:{std.size()}\nstd :{std}")

    y = (inputs-mean)/std
    if self.verbose:
      print(f"y: {y.size()}\n {y}")

    out = self.gamma *y +self.beta
    if self.verbose:
      print(f"out size :{out.size()}\n{out}")

    return out

In [24]:
SEED = 0
batch_size = 3
sentence_len = 5
emb_dim = 8
# Seed the global generator so Restart & Run All draws the same tensor every time.
torch.manual_seed(SEED)
# Random batch laid out as (sentence_len, batch_size, emb_dim).
inputs = torch.randn(sentence_len, batch_size, emb_dim)
print(f"inputs size :{inputs.size()}\n {inputs}")

inputs size :torch.Size([5, 3, 8])
 tensor([[[ 0.1578, -0.0063, -0.6864, -0.3699,  0.4621, -1.4733, -0.2831,
          -0.1553],
         [-0.4221,  1.2015,  0.5613, -1.9831, -0.7751, -0.4068,  0.7893,
          -0.1701],
         [ 1.1578, -0.8943,  2.2790,  1.0251, -1.3740,  0.1210,  0.7259,
           0.8086]],

        [[-0.1330, -1.6829, -0.4352,  0.8030,  0.1125, -0.5690, -1.5121,
          -1.4894],
         [ 0.2070,  0.4706, -1.6977,  0.8182, -1.4005, -0.3583,  0.1588,
           1.2888],
         [ 0.1236, -0.4120, -0.1626, -1.5393, -0.0802, -0.6680, -0.0327,
          -0.4106]],

        [[ 0.3315, -1.4713, -2.3360,  0.3725,  2.0596, -0.2009,  0.5601,
          -1.4081],
         [-1.0530,  0.0803, -0.1793,  0.7817,  0.3521, -0.7367, -0.8444,
           0.3163],
         [ 0.9567, -0.4104,  0.0084, -0.0930,  0.3840,  0.1952,  0.2243,
           1.1295]],

        [[-0.4410, -1.1722,  0.1514,  0.2176, -0.5945, -1.0387,  0.5285,
           0.3629],
         [ 1.3458, -0.7498, 

In [25]:
# Normalize over the last axis of `inputs` only.
layer_norm = LayerNormalization(parameters_shape=inputs.shape[-1:])


In [26]:
# Run the layer on the random batch; forward prints each intermediate tensor
# (mean, standard deviation, normalized values, final output) as it goes.
out = layer_norm.forward(inputs)

mean size : torch.Size([5, 3, 1])
mean:tensor([[[-0.2943],
         [-0.1506],
         [ 0.4811]],

        [[-0.6133],
         [-0.0641],
         [-0.3977]],

        [[-0.2616],
         [-0.1604],
         [ 0.2993]],

        [[-0.2482],
         [ 0.3836],
         [ 0.2631]],

        [[ 0.2931],
         [ 0.3284],
         [-0.2593]]])
standard deviation size:torch.Size([5, 3, 1])
std :tensor([[[0.5509],
         [0.9425],
         [1.0964]],

        [[0.8304],
         [0.9720],
         [0.4922]],

        [[1.3182],
         [0.6157],
         [0.4857]],

        [[0.6113],
         [0.9565],
         [0.8593]],

        [[0.9878],
         [0.8583],
         [1.0292]]])
y: torch.Size([5, 3, 8])
 tensor([[[ 0.8205,  0.5227, -0.7117, -0.1372,  1.3729, -2.1399,  0.0203,
           0.2524],
         [-0.2880,  1.4347,  0.7554, -1.9443, -0.6626, -0.2718,  0.9973,
          -0.0207],
         [ 0.6172, -1.2545,  1.6398,  0.4961, -1.6921, -0.3284,  0.2232,
           0.2987]],