In [19]:
import torch
import torch.nn.functional as F
from torch import Tensor, nn

In [2]:
# Normalization layers under test. With the default affine settings each layer
# owns two learnable tensors (weight and bias); their shape is noted per layer.
bn1 = nn.BatchNorm1d(num_features=10)         # weight, bias: (10,)
bn2 = nn.BatchNorm2d(num_features=3)          # weight, bias: (3,)
ln1 = nn.LayerNorm(normalized_shape=10)       # weight, bias: (10,)
ln2 = nn.LayerNorm(normalized_shape=(3, 4))   # weight, bias: (3, 4)

In [3]:
# Fix the global RNG seed so the random inputs below are reproducible.
torch.random.manual_seed(0)

# The tensors are drawn one after another in exactly this order. Every draw
# advances the generator, so reordering the shapes would change the values.
x11, x12, x2, x31, x32 = (
    torch.randn(*shape)
    for shape in [
        (16, 10),         # x11: 2-D input for BatchNorm1d
        (16, 10, 20),     # x12: 3-D input for BatchNorm1d
        (16, 3, 32, 32),  # x2:  4-D input for BatchNorm2d
        (16, 60, 10),     # x31: input for LayerNorm(10)
        (16, 60, 3, 4),   # x32: input for LayerNorm((3, 4))
    ]
)

In [4]:
# Run every input through its layer. The two bn1 calls stay in their original
# order (x11 first, then x12).
output11, output12 = bn1(x11), bn1(x12)
output2 = bn2(x2)
output31, output32 = ln1(x31), ln2(x32)

# Print each output's shape, one per line.
for output in (output11, output12, output2, output31, output32):
    print(output.shape)

torch.Size([16, 10])
torch.Size([16, 10, 20])
torch.Size([16, 3, 32, 32])
torch.Size([16, 60, 10])
torch.Size([16, 60, 3, 4])


# Batch Normalization

$$
    y = \frac{x-E[x]}{\sqrt{Var[x] + \epsilon}} \cdot \gamma + \beta
$$

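+ The statistics are calculated separately for each feature (channel), over the batch dimension and any remaining dimensions.
+ If there are 10 features, the calculation above is carried out 10 times in parallel, once per feature.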

## 1D

### vector data

In [28]:
# Manual BatchNorm1d for the 2-D input x11: each feature column is
# standardized with statistics taken over the batch dimension (dim 0).
mean_estimate = x11.mean(dim=0, keepdim=True)
# unbiased=False selects the biased (population) variance, which is what
# batch norm uses to normalize the current batch.
var_estimate = x11.var(dim=0, keepdim=True, unbiased=False)

my_output11 = (x11 - mean_estimate) / torch.sqrt(var_estimate + bn1.eps)
# Apply the learnable scale (gamma = bn1.weight) and shift (beta = bn1.bias) as
# in the formula above, so the check does not depend on their initial values.
# Both are 1-D over the features and broadcast against the last dim of x11.
my_output11 = my_output11 * bn1.weight + bn1.bias

# Mean squared difference to the layer's own output, rounded to 6 decimals.
round(F.mse_loss(output11, my_output11).item(), 6)

0.0

### matrix data

In [31]:
# Manual BatchNorm1d for the 3-D input x12: features live on dim 1, so the
# statistics are pooled over the remaining dims 0 and 2.
mean_estimate = x12.mean(dim=[0, 2], keepdim=True)
# unbiased=False selects the biased (population) variance, which is what
# batch norm uses to normalize the current batch.
var_estimate = x12.var(dim=[0, 2], keepdim=True, unbiased=False)

my_output12 = (x12 - mean_estimate) / torch.sqrt(var_estimate + bn1.eps)
# Apply the learnable scale (gamma = bn1.weight) and shift (beta = bn1.bias) so
# the check does not depend on their initial values. They are 1-D over the
# features, so reshape them to line up with dim 1 of x12 before broadcasting.
my_output12 = my_output12 * bn1.weight.view(1, -1, 1) + bn1.bias.view(1, -1, 1)

# Mean squared difference to the layer's own output, rounded to 6 decimals.
round(F.mse_loss(output12, my_output12).item(), 6)

0.0

## 2D

In [32]:
# Manual BatchNorm2d for the 4-D input x2: channels live on dim 1, so the
# statistics are pooled over the remaining dims 0, 2 and 3.
mean_estimate = x2.mean(dim=[0, 2, 3], keepdim=True)
# unbiased=False selects the biased (population) variance, which is what
# batch norm uses to normalize the current batch.
var_estimate = x2.var(dim=[0, 2, 3], keepdim=True, unbiased=False)

my_output2 = (x2 - mean_estimate) / torch.sqrt(var_estimate + bn2.eps)
# Apply the learnable scale (gamma = bn2.weight) and shift (beta = bn2.bias) so
# the check does not depend on their initial values. They are 1-D over the
# channels, so reshape them to line up with dim 1 of x2 before broadcasting.
my_output2 = my_output2 * bn2.weight.view(1, -1, 1, 1) + bn2.bias.view(1, -1, 1, 1)

# Mean squared difference to the layer's own output, rounded to 6 decimals.
round(F.mse_loss(output2, my_output2).item(), 6)

0.0

# Layer Normalization

+ Seems to be better suited to NLP data: each sample is normalized over its own features, independently of the other samples in the batch.

## vector data

In [38]:
# Manual LayerNorm(10) for x31: every vector along the last dimension is
# standardized on its own, so the statistics are taken over dim 2 only.
mean_estimate = x31.mean(dim=2, keepdim=True)
# unbiased=False selects the biased (population) variance used by layer norm.
var_estimate = x31.var(dim=2, keepdim=True, unbiased=False)

my_output31 = (x31 - mean_estimate) / torch.sqrt(var_estimate + ln1.eps)
# Apply the learnable elementwise scale (ln1.weight) and shift (ln1.bias) so the
# check does not depend on their initial values. Both have the normalized
# shape and broadcast against the last dimension of x31.
my_output31 = my_output31 * ln1.weight + ln1.bias

# Mean squared difference to the layer's own output, rounded to 6 decimals.
round(F.mse_loss(output31, my_output31).item(), 6)

0.0

In [44]:
# Same check for one position only: the vector at index [0, 0] is normalized
# using nothing but its own mean and biased variance.
x31_sample = x31[0, 0]
my_output31_sample = (x31_sample - x31_sample.mean()) / torch.sqrt(
    x31_sample.var(unbiased=False) + ln1.eps
)
# Apply the learnable elementwise scale (ln1.weight) and shift (ln1.bias) so the
# check does not depend on their initial values.
my_output31_sample = my_output31_sample * ln1.weight + ln1.bias

# Mean squared difference to the matching slice of the layer's output.
round(F.mse_loss(output31[0, 0], my_output31_sample).item(), 6)

0.0

In [43]:
# Show the LayerNorm output at index [0, 0] of output31.
output31[0][0]

tensor([ 0.1511,  2.1891, -0.2169, -1.0667, -0.4760,  0.6376, -1.7909,  0.3655,
         0.1511,  0.0560], grad_fn=<SelectBackward0>)

## matrix data

In [40]:
# Manual LayerNorm((3, 4)) for x32: the trailing two dimensions together form
# the normalized shape, so the statistics are pooled over dims 2 and 3.
mean_estimate = x32.mean(dim=[2, 3], keepdim=True)
# unbiased=False selects the biased (population) variance used by layer norm.
var_estimate = x32.var(dim=[2, 3], keepdim=True, unbiased=False)

my_output32 = (x32 - mean_estimate) / torch.sqrt(var_estimate + ln2.eps)
# Apply the learnable elementwise scale (ln2.weight) and shift (ln2.bias) so the
# check does not depend on their initial values. Both have the normalized
# shape and broadcast against the trailing two dimensions of x32.
my_output32 = my_output32 * ln2.weight + ln2.bias

# Mean squared difference to the layer's own output, rounded to 6 decimals.
round(F.mse_loss(output32, my_output32).item(), 6)

0.0