In [2]:
'''
This section explains how to use LayerNorm; every example is taken from NLP.
The normalized_shape argument of LayerNorm selects the dimension(s) over which
the mean and variance are computed.
'''
import torch as t

# Tensor layout: (batch_size, seq_len, emb_dim).
a = t.randn((2, 5, 10))

# elementwise_affine controls whether the layer owns a learnable per-element
# scale and shift; it is switched off here so the output can be compared with a
# hand-written normalisation in a later cell.
# normalized_shape is the embedding size (10), so the mean and variance are
# taken over the last dimension only and each has one value per (batch, token)
# position, i.e. shape (2, 5). The layer output is then
# (a - Mean) / sqrt(Var + eps), where eps is the small stabiliser of nn.LayerNorm.
ln = t.nn.LayerNorm(normalized_shape=a.shape[-1], elementwise_affine=False)
print(a)

tensor([[[ 0.0681,  0.2976, -0.9554, -0.8669, -0.1848,  1.4334, -0.2745,
          -0.2192, -0.1230, -0.5360],
         [ 1.0103, -1.3483,  1.8934,  0.0286,  0.6348,  0.0994,  1.3042,
           0.3001,  0.7366,  0.1204],
         [-1.6593,  0.5226,  0.3781, -0.9619, -1.0888, -0.3680,  1.8228,
           1.2497, -0.1636,  0.6319],
         [-0.3766,  1.4101,  0.7446, -0.7036, -0.1220,  2.4454, -1.9116,
          -0.7363,  0.8857, -1.8490],
         [-0.5305, -0.9742,  0.5887,  0.4808, -0.9718,  0.3426,  1.1690,
          -1.0753,  1.3477,  0.7213]],

        [[-0.3370, -0.4236, -2.3417,  0.1807, -0.3706, -0.2489, -0.3786,
          -0.0561, -0.0498,  0.2140],
         [-1.4466,  0.2551,  0.5458, -0.0162,  1.1776, -1.0256, -1.4984,
          -0.7556,  0.0465, -0.0612],
         [ 0.3875,  0.6042, -0.3165, -1.6586,  0.3647, -1.6719, -1.0361,
          -1.2787,  1.6920, -1.2060],
         [ 0.0542, -0.1122, -0.8964, -0.5361, -0.5247, -0.3065, -0.2570,
          -0.8534, -0.8811, -0.5434],

In [3]:
# Run the LayerNorm module `ln` on `a` and print the normalised tensor.
out = ln(a)
print(out)

tensor([[[ 0.3194,  0.6784, -1.2816, -1.1432, -0.0762,  2.4550, -0.2166,
          -0.1300,  0.0205, -0.6256],
         [ 0.6408, -2.1984,  1.7039, -0.5409,  0.1889, -0.4556,  0.9946,
          -0.2141,  0.3113, -0.4304],
         [-1.6383,  0.4698,  0.3302, -0.9645, -1.0871, -0.3907,  1.7260,
           1.1723, -0.1932,  0.5754],
         [-0.2680,  1.0799,  0.5778, -0.5147, -0.0760,  1.8609, -1.4261,
          -0.5394,  0.6843, -1.3789],
         [-0.7341, -1.2427,  0.5490,  0.4253, -1.2400,  0.2669,  1.2142,
          -1.3587,  1.4191,  0.7010]],

        [[ 0.0641, -0.0616, -2.8445,  0.8152,  0.0153,  0.1918,  0.0038,
           0.4717,  0.4807,  0.8635],
         [-1.4025,  0.6395,  0.9884,  0.3140,  1.7465, -0.8972, -1.4646,
          -0.5733,  0.3892,  0.2599],
         [ 0.7415,  0.9425,  0.0885, -1.1563,  0.7204, -1.1686, -0.5789,
          -0.8040,  1.9515, -0.7365],
         [ 1.7218,  1.1910, -1.3100, -0.1609, -0.1245,  0.5715,  0.7292,
          -1.1728, -1.2612, -0.1841],

In [9]:
# Reproduce LayerNorm by hand: mean and (biased) variance of `a` over the last
# dimension. keepdim=True keeps a trailing size-1 axis so both statistics
# broadcast against `a`.
m = a.mean(dim=-1, keepdim=True)
var = a.var(dim=-1, unbiased=False, keepdim=True)
print(m)
print(var)
# nn.LayerNorm adds eps to the variance inside the square root; using the
# module's own ln.eps makes this expression agree with ln(a) instead of
# drifting in the last printed digit.
(a - m) / t.sqrt(var + ln.eps)

tensor([[[-0.1360],
         [ 0.4779],
         [ 0.0363],
         [-0.0213],
         [ 0.1098]],

        [[-0.3812],
         [-0.2779],
         [-0.4120],
         [-0.4856],
         [-0.5018]]])
tensor([[[0.4087],
         [0.6901],
         [1.0713],
         [1.7570],
         [0.7609]],

        [[0.4750],
         [0.6945],
         [1.1623],
         [0.0983],
         [0.5463]]])


tensor([[[ 0.3194,  0.6784, -1.2816, -1.1432, -0.0762,  2.4550, -0.2166,
          -0.1300,  0.0205, -0.6256],
         [ 0.6408, -2.1984,  1.7039, -0.5409,  0.1889, -0.4557,  0.9946,
          -0.2141,  0.3113, -0.4304],
         [-1.6383,  0.4698,  0.3302, -0.9645, -1.0871, -0.3907,  1.7260,
           1.1723, -0.1932,  0.5754],
         [-0.2680,  1.0799,  0.5778, -0.5147, -0.0760,  1.8609, -1.4261,
          -0.5394,  0.6843, -1.3789],
         [-0.7341, -1.2427,  0.5490,  0.4253, -1.2400,  0.2669,  1.2142,
          -1.3587,  1.4191,  0.7010]],

        [[ 0.0641, -0.0616, -2.8445,  0.8152,  0.0153,  0.1918,  0.0038,
           0.4717,  0.4807,  0.8635],
         [-1.4025,  0.6396,  0.9884,  0.3140,  1.7465, -0.8973, -1.4646,
          -0.5733,  0.3892,  0.2599],
         [ 0.7415,  0.9425,  0.0885, -1.1563,  0.7204, -1.1686, -0.5789,
          -0.8040,  1.9515, -0.7365],
         [ 1.7219,  1.1911, -1.3100, -0.1609, -0.1246,  0.5715,  0.7292,
          -1.1729, -1.2613, -0.1841],

In [20]:
# Now build a LayerNorm that keeps its affine parameters (the default).
# The two parameters, gamma (scale) and beta (shift), have the same shape as
# normalized_shape. Why that shape? They are applied element by element as
#     (x - Mean) / sqrt(Var + eps) * gamma + beta
# so each position of the normalised dimension gets its own scale and shift.
b = t.randn((2, 5, 10))
ln2 = t.nn.LayerNorm(normalized_shape=10)
out = ln2(b)

# Each parameter prints with requires_grad=True, i.e. it is learnable.
print(*ln2.parameters(), sep="\n")

Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
