# Layer Normalization

In [1]:
import torch
from torch import nn

In [4]:
# Toy batch; B, S, E presumably stand for batch, sequence and embedding sizes,
# so the literal below has shape (B, S, E) = (1, 2, 3).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
print(inputs.size())
B, S, E = inputs.size()
# Bring the second axis to the front -> (S, B, E). `transpose` really swaps the
# two axes; `reshape` only reinterprets the flat buffer and would interleave
# values from different batch items as soon as B > 1.
inputs = inputs.transpose(0, 1)
inputs, inputs.size()

torch.Size([1, 2, 3])


(tensor([[[0.2000, 0.1000, 0.3000]],
 
         [[0.5000, 0.1000, 0.1000]]]),
 torch.Size([2, 1, 3]))

In [6]:
# Shape of the learnable affine parameters: the last two axes of `inputs`.
parameter_shape = inputs.shape[-2:]
parameter_shape

torch.Size([1, 3])

In [8]:
# Learnable scale, initialised to all ones.
gamma = nn.Parameter(torch.ones(*parameter_shape), requires_grad=True)
gamma

Parameter containing:
tensor([[1., 1., 1.]], requires_grad=True)

In [9]:
# Learnable shift, initialised to all zeros.
beta = nn.Parameter(torch.zeros(*parameter_shape), requires_grad=True)

In [10]:
# Negative indices of the trailing axes covered by `parameter_shape`,
# counted from the last axis backwards: -1, -2, ...
dims = list(range(-1, -len(parameter_shape) - 1, -1))
dims

[-1, -2]

In [13]:
# Mean over the normalised axes; keepdim=True keeps the result broadcastable
# against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean, mean.size()

(tensor([[[0.2000]],
 
         [[0.2333]]]),
 torch.Size([2, 1, 1]))

In [15]:
# Small constant added to the variance before the square root.
epsilon = 1e-5
# Mean of squared deviations over the same axes as `mean`.
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
print(var)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0067]],

        [[0.0356]]])


tensor([[[0.0817]],

        [[0.1886]]])

In [17]:
# Standardise: subtract the mean, divide by the standard deviation.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [18]:
# Affine step: scale by gamma, then shift by beta.
out = y * gamma + beta
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

In [19]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Normalise a tensor over its trailing ``len(parameters_shape)`` axes.

    The input is standardised (zero mean, unit variance, with ``eps`` added to
    the variance) over those axes, then scaled by ``gamma`` and shifted by
    ``beta``. Both are learnable parameters of shape ``parameters_shape``,
    initialised to ones and zeros respectively.

    Subclassing ``nn.Module`` registers ``gamma`` and ``beta`` so that
    ``.parameters()``, ``.to(...)`` and ``state_dict()`` see them.

    Args:
        parameters_shape: Shape of the trailing axes to normalise over
            (for example ``x.size()[-2:]``).
        eps: Constant added to the variance before taking the square root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Return the normalised, scaled and shifted version of ``input``.

        Intermediate tensors are printed for inspection. The result has the
        same shape as ``input``.
        """
        # Negative indices of the axes being normalised: -1, -2, ...
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the argument itself, not a notebook-level global of similar name.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [20]:
batch_size = 3
sentence_length = 5
embedding_dim = 8
# Seed the global RNG so the random demo tensor is identical on every
# Restart & Run All.
torch.manual_seed(0)
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 0.5448, -0.1171,  0.7527,  0.9059,  0.2313,  0.8846,  0.9386,
          -0.6847],
         [-0.5084,  0.8072,  0.4242, -1.2370, -1.0891, -0.3457, -1.9590,
          -0.2516],
         [ 0.4447, -0.0488, -2.0287,  0.8738, -0.5496, -0.8255,  0.4173,
          -0.6267]],

        [[ 0.7027,  0.3805, -1.2301, -1.4588,  1.6288,  1.4541, -2.3146,
           0.0253],
         [ 1.5661, -1.7011,  0.6189,  1.7349, -1.6977, -1.3949, -1.3301,
           0.3633],
         [-0.3668,  0.5905, -0.0704,  0.8422, -1.4134, -0.9281, -1.3643,
          -1.7392]],

        [[-0.5054, -0.5495, -1.9035, -1.2181, -0.0794,  0.7089, -0.3922,
          -0.6577],
         [-0.3423,  0.2957,  0.5221, -0.9552,  2.4069, -0.8084,  0.3504,
           0.2769],
         [ 0.3362,  1.6006, -2.2938,  0.1955,  0.5853, -0.2018, -0.7920,
           1.7263]],

        [[ 1.3085,  1.4727, -1.3062, -1.6162, -0.7670,  0.9304, -0.5287,
           0.3105],
         [-0.5746,  0.0801, 

In [23]:
# Normalise over the last two axes of `inputs` and display the result.
norm = LayerNormalization(parameters_shape=inputs.shape[-2:])
norm.forward(inputs)

Mean 
 (torch.Size([5, 1, 1])): 
 tensor([[[-0.1270]],

        [[-0.2959]],

        [[-0.0706]],

        [[ 0.0548]],

        [[ 0.3332]]])
Standard Deviation 
 (torch.Size([5, 1, 1])): 
 tensor([[[0.8649]],

        [[1.2368]],

        [[1.0536]],

        [[1.0299]],

        [[1.2342]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 0.7767,  0.0114,  1.0170,  1.1942,  0.4143,  1.1695,  1.2320,
          -0.6449],
         [-0.4410,  1.0800,  0.6372, -1.2834, -1.1124, -0.2530, -2.1182,
          -0.1441],
         [ 0.6609,  0.0904, -2.1988,  1.1571, -0.4887, -0.8077,  0.6293,
          -0.5779]],

        [[ 0.8074,  0.5469, -0.7553, -0.9402,  1.5562,  1.4149, -1.6321,
           0.2597],
         [ 1.5055, -1.1361,  0.7396,  1.6419, -1.1334, -0.8886, -0.8361,
           0.5330],
         [-0.0573,  0.7167,  0.1823,  0.9202, -0.9035, -0.5111, -0.8638,
          -1.1669]],

        [[-0.4127, -0.4545, -1.7397, -1.0891, -0.0083,  0.7398, -0.3052,
          -0.5573],
         [-0.257

tensor([[[ 0.7767,  0.0114,  1.0170,  1.1942,  0.4143,  1.1695,  1.2320,
          -0.6449],
         [-0.4410,  1.0800,  0.6372, -1.2834, -1.1124, -0.2530, -2.1182,
          -0.1441],
         [ 0.6609,  0.0904, -2.1988,  1.1571, -0.4887, -0.8077,  0.6293,
          -0.5779]],

        [[ 0.8074,  0.5469, -0.7553, -0.9402,  1.5562,  1.4149, -1.6321,
           0.2597],
         [ 1.5055, -1.1361,  0.7396,  1.6419, -1.1334, -0.8886, -0.8361,
           0.5330],
         [-0.0573,  0.7167,  0.1823,  0.9202, -0.9035, -0.5111, -0.8638,
          -1.1669]],

        [[-0.4127, -0.4545, -1.7397, -1.0891, -0.0083,  0.7398, -0.3052,
          -0.5573],
         [-0.2578,  0.3476,  0.5625, -0.8396,  2.3515, -0.7003,  0.3996,
           0.3298],
         [ 0.3862,  1.5862, -2.1101,  0.2525,  0.6225, -0.1246, -0.6847,
           1.7055]],

        [[ 1.2174,  1.3768, -1.3216, -1.6225, -0.7980,  0.8502, -0.5666,
           0.2483],
         [-0.6111,  0.0245,  0.6735,  1.4463, -1.3517,  1.7238, 