###   权值初始化

```

在搭建好网络模型之后，一个重要的步骤就是对网络模型中的权值进行初始化。适当的权值初始化可以加快模型的收敛，而不恰当的权值初始化可能引发梯度消失或者梯度爆炸，最终导致模型无法收敛。下面分 3 部分介绍。第一部分介绍不恰当的权值初始化是如何引发梯度消失与梯度爆炸的，第二部分介绍常用的 Xavier 方法与 Kaiming 方法，第三部分介绍 PyTorch 中的 10 种初始化方法。

### 梯度消失与梯度爆炸
考虑一个 3 层的全连接网络。

$H{1}=X \times W{1}$，$H{2}=H{1} \times W{2}$，$Out=H{2} \times W_{3}$

![image](https://image.zhangxiann.com/20200630085446.png "image")

$\begin{aligned} \Delta \mathrm{W}{2} &=\frac{\partial \mathrm{Loss}}{\partial \mathrm{W}{2}}=\frac{\partial \mathrm{Loss}}{\partial \mathrm{out}}  \frac{\partial \mathrm{out}}{\partial \mathrm{H}_{2}}  \frac{\partial \mathrm{H}{2}}{\partial \mathrm{w}{2}} \ &=\frac{\partial \mathrm{Loss}}{\partial \mathrm{out}}  \frac{\partial \mathrm{out}}{\partial \mathrm{H}_{2}}  \mathrm{H}_{1} \end{aligned}$

所以$\Delta \mathrm{W}{2}$依赖于前一层的输出$H{1}$。如果$H{1}$ 趋近于零，那么$\Delta \mathrm{W}{2}$也接近于 0，造成梯度消失。如果$H{1}$ 趋近于无穷大，那么$\Delta \mathrm{W}{2}$也接近于无穷大，造成梯度爆炸。要避免梯度爆炸或者梯度消失，就要严格控制网络层输出的数值范围。

In [21]:
# -*- coding: utf-8 -*-
"""
通用函数
"""


import torch
import random
import numpy as np
from PIL import Image
import torchvision.transforms as transforms


def transform_invert(img_, transform_train):
    """
    将data 进行反transfrom操作
    :param img_: tensor
    :param transform_train: torchvision.transforms
    :return: PIL image
    """
    if 'Normalize' in str(transform_train):
        norm_transform = list(filter(lambda x: isinstance(x, transforms.Normalize), transform_train.transforms))
        mean = torch.tensor(norm_transform[0].mean, dtype=img_.dtype, device=img_.device)
        std = torch.tensor(norm_transform[0].std, dtype=img_.dtype, device=img_.device)
        img_.mul_(std[:, None, None]).add_(mean[:, None, None])

    img_ = img_.transpose(0, 2).transpose(0, 1)  # C*H*W --> H*W*C
    if 'ToTensor' in str(transform_train):
        img_ = img_.detach().numpy() * 255

    if img_.shape[2] == 3:
        img_ = Image.fromarray(img_.astype('uint8')).convert('RGB')
    elif img_.shape[2] == 1:
        img_ = Image.fromarray(img_.astype('uint8').squeeze())
    else:
        raise Exception("Invalid img shape, expected 1 or 3 in axis 2, but got {}!".format(img_.shape[2]) )

    return img_


def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


In [22]:
# 下面构建 100 层全连接网络，先不适用非线性激活函数，每层的权重初始化为服从$N(0,1)$的正态分布，输出数据使用随机初始化的数据。
import torch
import torch.nn as nn


set_seed(1)  # 设置随机种子


class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)


        return x

    def initialize(self):
        for m in self.modules():
            # 判断这一层是否为线性层，如果为线性层则初始化权值
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight.data)    # normal: mean=0, std=1

layer_nums = 100
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<MmBackward0>)


也就是数据太大(梯度爆炸)或者太小(梯度消失)了。接下来我们在forward()函数中判断每一次前向传播的输出的标准差是否为 nan，如果是 nan 则停止前向传播。

In [23]:
# 下面构建 100 层全连接网络，先不适用非线性激活函数，每层的权重初始化为服从$N(0,1)$的正态分布，输出数据使用随机初始化的数据。
import torch
import torch.nn as nn


set_seed(1)  # 设置随机种子


class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        for m in self.modules():
            # 判断这一层是否为线性层，如果为线性层则初始化权值
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight.data)    # normal: mean=0, std=1

layer_nums = 100
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)

layer:0, std:15.959932327270508
layer:1, std:256.6237487792969
layer:2, std:4107.24560546875
layer:3, std:65576.8125
layer:4, std:1045011.875
layer:5, std:17110408.0
layer:6, std:275461440.0
layer:7, std:4402537984.0
layer:8, std:71323615232.0
layer:9, std:1148104736768.0
layer:10, std:17911758454784.0
layer:11, std:283574846619648.0
layer:12, std:4480599540629504.0
layer:13, std:7.196814275405414e+16
layer:14, std:1.1507761512626258e+18
layer:15, std:1.853110740188555e+19
layer:16, std:2.9677722308204246e+20
layer:17, std:4.780376223769898e+21
layer:18, std:7.613223480799065e+22
layer:19, std:1.2092652108825478e+24
layer:20, std:1.9232569606642055e+25
layer:21, std:3.134467063655912e+26
layer:22, std:5.014437175989598e+27
layer:23, std:8.066615144249704e+28
layer:24, std:1.2392661553516338e+30
layer:25, std:1.9455688099759845e+31
layer:26, std:3.0238180658999113e+32
layer:27, std:4.950357571077011e+33
layer:28, std:8.150924530001331e+34
layer:29, std:1.3229830735592165e+36
layer:30, s

上述是没有使用非线性变换的实验结果，如果在forward()中添加非线性变换tanh，每一层的输出方差还是会越来越小，会导致梯度消失。因此出现了 Xavier 初始化方法与 Kaiming 初始化方法。

### Xavier 方法与 Kaiming 方法

Xavier 方法

Xavier 是 2010 年提出的，针对有非线性激活函数时的权值初始化方法，目标是保持数据的方差维持在 1 左右，主要针对饱和激活函数如 sigmoid 和 tanh 等。同时考虑前向传播和反向传播，需要满足两个等式：$\boldsymbol{n}{\boldsymbol{i}} * \boldsymbol{D}(\boldsymbol{W})=\mathbf{1}$和$\boldsymbol{n}{\boldsymbol{i+1}} * \boldsymbol{D}(\boldsymbol{W})=\mathbf{1}$，可得：$D(W)=\frac{2}{n{i}+n{i+1}}$。为了使 Xavier 方法初始化的权值服从均匀分布，假设$W$服从均匀分布$U[-a, a]$，那么方差 $D(W)=\frac{(-a-a)^{2}}{12}=\frac{(2 a)^{2}}{12}=\frac{a^{2}}{3}$，令$\frac{2}{n{i}+n{i+1}}=\frac{a^{2}}{3}$，解得：$\boldsymbol{a}=\frac{\sqrt{6}}{\sqrt{n{i}+n{i+1}}}$，所以$W$服从分布$U\left[-\frac{\sqrt{6}}{\sqrt{n{i}+n{i+1}}}, \frac{\sqrt{6}}{\sqrt{n{i}+n{i+1}}}\right]$

In [24]:
# 下面构建 100 层全连接网络，先不适用非线性激活函数，每层的权重初始化为服从$N(0,1)$的正态分布，输出数据使用随机初始化的数据。
import torch
import torch.nn as nn


set_seed(1)  # 设置随机种子


class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        for m in self.modules():
            # 判断这一层是否为线性层，如果为线性层则初始化权值
            if isinstance(m, nn.Linear):
                a = np.sqrt(6 / (self.neural_num + self.neural_num))
                # 把 a 变换到 tanh，计算增益
                tanh_gain = nn.init.calculate_gain('tanh')
                a *= tanh_gain
                nn.init.uniform_(m.weight.data, -a, a)

layer_nums = 100
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)

layer:0, std:1.6565721035003662
layer:1, std:2.7454750537872314
layer:2, std:4.626405239105225
layer:3, std:7.675012111663818
layer:4, std:13.035057067871094
layer:5, std:21.013654708862305
layer:6, std:35.22600555419922
layer:7, std:59.19900131225586
layer:8, std:97.7145767211914
layer:9, std:160.1146697998047
layer:10, std:268.224853515625
layer:11, std:453.31414794921875
layer:12, std:764.7789306640625
layer:13, std:1246.519287109375
layer:14, std:2084.412841796875
layer:15, std:3492.153076171875
layer:16, std:5920.9140625
layer:17, std:9637.7041015625
layer:18, std:16439.984375
layer:19, std:28287.46484375
layer:20, std:46427.4375
layer:21, std:77196.75
layer:22, std:126378.0234375
layer:23, std:211954.15625
layer:24, std:346803.1875
layer:25, std:582732.3125
layer:26, std:981679.0625
layer:27, std:1634631.125
layer:28, std:2753212.25
layer:29, std:4657875.0
layer:30, std:7713320.5
layer:31, std:12499062.0
layer:32, std:21196508.0
layer:33, std:35260592.0
layer:34, std:59367156.0
l

### PyTorch 也提供了 Xavier 初始化方法，可以直接调用：
```python
tanh_gain = nn.init.calculate_gain('tanh')
nn.init.xavier_uniform_(m.weight.data, gain=tanh_gain)

### 常用初始化方法
```
1.Xavier 均匀分布
2.Xavier 正态分布
3.Kaiming 均匀分布
4.Kaiming 正态分布
5.均匀分布
6.正态分布
7.常数分布
8.正交矩阵初始化
9.单位矩阵初始化
10.稀疏矩阵初始化