In [None]:
import torch
import torch.nn as nn

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Every vector along the last axis is shifted to zero mean and scaled to
    unit variance, then passed through a learnable element-wise affine
    transform: ``gamma`` (scale, initialised to ones) and ``beta`` (shift,
    initialised to zeros).

    Args:
        hidden_size: Number of elements in ``gamma`` and ``beta``; the last
            dimension of the input must be broadcast-compatible with it.
        eps: Constant added to the variance before taking the square root so
            the division stays finite when a vector has zero variance.
            Defaults to ``1e-5``.
    """

    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        # nn.Parameter tracks gradients by default, so no explicit flag is needed.
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, X):
        """Normalize ``X`` along its last dimension and apply the affine transform.

        Args:
            X: Floating-point tensor; statistics are computed over ``dim=-1``.

        Returns:
            Tensor with the same shape as ``X`` (given a matching last dimension).
        """
        mean = X.mean(dim=-1, keepdim=True)
        # Biased (population) variance: divides by N rather than N - 1, so it
        # is still defined when the last dimension holds a single element.
        var = X.var(dim=-1, keepdim=True, unbiased=False)

        return (X - mean) / torch.sqrt(var + self.eps) * self.gamma + self.beta


# Test: normalize a lower-triangular 0/1 tensor and compare with PyTorch's reference.
X = torch.tril(torch.ones(2, 3, 4))
print("输入张量 X:")
print(X)

layer_norm = LayerNorm(X.shape[-1])
print(f'归一化后的张量: {layer_norm(X)}')

# With gamma = 1 and beta = 0 the result must agree with the built-in
# functional layer norm (which also defaults to eps = 1e-5).
assert torch.allclose(
    layer_norm(X),
    nn.functional.layer_norm(X, (X.shape[-1],)),
    atol=1e-6,
)


输入张量 X:
tensor([[[1., 0., 0., 0.],
         [1., 1., 0., 0.],
         [1., 1., 1., 0.]],

        [[1., 0., 0., 0.],
         [1., 1., 0., 0.],
         [1., 1., 1., 0.]]])
归一化后的张量: tensor([[[ 1.7320, -0.5773, -0.5773, -0.5773],
         [ 1.0000,  1.0000, -1.0000, -1.0000],
         [ 0.5773,  0.5773,  0.5773, -1.7320]],

        [[ 1.7320, -0.5773, -0.5773, -0.5773],
         [ 1.0000,  1.0000, -1.0000, -1.0000],
         [ 0.5773,  0.5773,  0.5773, -1.7320]]], grad_fn=<AddBackward0>)


In [56]:
# RMSNorm: a refinement of LayerNorm
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension of the input.

    Each vector along the last axis is divided by its root mean square and
    then scaled element-wise by the learnable ``gamma`` (initialised to ones).
    Unlike ``LayerNorm`` there is no mean subtraction and no bias term.

    Args:
        hidden_size: Number of elements in ``gamma``; the last dimension of
            the input must be broadcast-compatible with it.
        eps: Constant added to the mean of squares before the square root so
            the division stays finite for an all-zero vector. Defaults to
            ``1e-6``.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        # nn.Parameter tracks gradients by default, so no explicit flag is needed.
        self.gamma = nn.Parameter(torch.ones(hidden_size))

    def forward(self, X):
        """Scale ``X`` by the inverse RMS of its last dimension, then by ``gamma``.

        Args:
            X: Tensor whose last dimension is normalized.

        Returns:
            Tensor with the same shape as ``X`` (given a matching last dimension).
        """
        # Sum / N (rather than .mean) so integer inputs are promoted to float
        # by the true division instead of raising.
        mean_square = torch.sum(X ** 2, dim=-1, keepdim=True) / X.shape[-1]
        rms = torch.sqrt(mean_square + self.eps)
        return X / rms * self.gamma

X = torch.randn(2, 3, 4)  # example input of shape (batch_size, sequence_length, hidden_size)
rms_norm = RMSNorm(hidden_size=X.shape[-1])
normalized_X = rms_norm(X)
print("输入 X 的形状:", X.shape)
print("归一化后的 X 的形状:", normalized_X.shape)

# Printing shapes alone would not catch a wrong normalization, so also check
# the values against the closed-form definition (gamma is all ones here).
assert normalized_X.shape == X.shape
assert torch.allclose(
    normalized_X,
    X / torch.sqrt(X.pow(2).mean(dim=-1, keepdim=True) + 1e-6),
    atol=1e-6,
)

输入 X 的形状: torch.Size([2, 3, 4])
归一化后的 X 的形状: torch.Size([2, 3, 4])
