## 权值初始化方法

### 梯度爆炸实验

In [2]:
import os
import torch
import random
import numpy as np
import torch.nn as nn

In [3]:
class MLP(nn.Module):
    """Deep stack of square, bias-free linear layers, each followed by ReLU.

    Used to demonstrate exploding activations: ``initialize`` draws every
    weight from a standard normal distribution (mean 0, std 1).

    Args:
        neural_num: width of every layer (input size == output size).
        layers: number of stacked linear layers.
    """

    def __init__(self, neural_num, layers):
        super().__init__()
        stacked = []
        for _ in range(layers):
            stacked.append(nn.Linear(neural_num, neural_num, bias=False))
        self.linears = nn.ModuleList(stacked)
        self.neural_num = neural_num

    def forward(self, x):
        """Pass ``x`` through every linear layer, applying ReLU after each one."""
        hidden = x
        for layer in self.linears:
            hidden = torch.relu(layer(hidden))
        return hidden

    def initialize(self):
        """Re-draw the weights of every ``nn.Linear`` sub-module from N(0, 1)."""
        linear_layers = [mod for mod in self.modules() if isinstance(mod, nn.Linear)]
        for layer in linear_layers:
            nn.init.normal_(layer.weight.data)    # normal: mean=0, std=1

In [4]:
# Experiment setup: 100 stacked layers of width 256, fed a batch of 16 samples.
layer_nums = 100
neural_nums = 256
batch_size = 16

# Build the network and overwrite its default weights via MLP.initialize().
net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)  # output is NaN

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<ReluBackward0>)


### 通过使用合适的初始化方差避免梯度爆炸

In [9]:
class MLP(nn.Module):
    """Deep stack of square, bias-free linear layers with no activation.

    ``initialize`` draws weights from a zero-mean normal distribution with
    std = sqrt(1 / neural_num), the scale chosen in this notebook to keep the
    output from blowing up.

    Args:
        neural_num: width of every layer (input size == output size).
        layers: number of stacked linear layers.
    """

    def __init__(self, neural_num, layers):
        super().__init__()
        stacked = []
        for _ in range(layers):
            stacked.append(nn.Linear(neural_num, neural_num, bias=False))
        self.linears = nn.ModuleList(stacked)
        self.neural_num = neural_num

    def forward(self, x):
        """Pass ``x`` through every linear layer in order (purely linear map)."""
        hidden = x
        for layer in self.linears:
            hidden = layer(hidden)
        return hidden

    def initialize(self):
        """Re-draw every ``nn.Linear`` weight from N(0, 1 / neural_num)."""
        linear_layers = [mod for mod in self.modules() if isinstance(mod, nn.Linear)]
        for layer in linear_layers:
            # std = sqrt(1 / n), i.e. variance 1 / n
            nn.init.normal_(layer.weight.data, std=np.sqrt(1/self.neural_num))

In [10]:
# Same experiment setup as before: 100 layers of width 256, batch of 16.
layer_nums = 100
neural_nums = 256
batch_size = 16

# Build the network and apply the variance-scaled initialization.
net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)  # output stays finite (the recorded run shows no NaN)

tensor([[-0.0790,  0.5840, -0.2799,  ...,  0.1225,  0.2502,  0.1782],
        [-0.2215,  0.8377, -0.5547,  ...,  0.2425, -0.7018,  1.1625],
        [ 0.2962,  0.0258, -0.0139,  ..., -0.0770, -0.1920, -0.2415],
        ...,
        [-0.3592,  0.2852, -0.4397,  ..., -0.2645, -0.6763, -0.2311],
        [ 0.4422,  0.5176,  0.0465,  ...,  0.0168,  0.3626,  0.4444],
        [-0.5831, -1.7075,  0.4821,  ...,  0.4343, -1.6414, -1.0328]],
       grad_fn=<MmBackward>)


### 梯度消失现象

In [18]:
class MLP(nn.Module):
    """Deep stack of square, bias-free linear layers, each followed by ReLU.

    Keeps the std = sqrt(1 / neural_num) initialization; combined with ReLU
    this is the configuration the notebook uses to show vanishing outputs.

    Args:
        neural_num: width of every layer (input size == output size).
        layers: number of stacked linear layers.
    """

    def __init__(self, neural_num, layers):
        super().__init__()
        stacked = []
        for _ in range(layers):
            stacked.append(nn.Linear(neural_num, neural_num, bias=False))
        self.linears = nn.ModuleList(stacked)
        self.neural_num = neural_num

    def forward(self, x):
        """Pass ``x`` through every linear layer, applying ReLU after each one."""
        hidden = x
        for layer in self.linears:
            # the ReLU here is what causes the vanishing behaviour
            hidden = torch.relu(layer(hidden))
        return hidden

    def initialize(self):
        """Re-draw every ``nn.Linear`` weight from N(0, 1 / neural_num)."""
        linear_layers = [mod for mod in self.modules() if isinstance(mod, nn.Linear)]
        for layer in linear_layers:
            # std = sqrt(1 / n), i.e. variance 1 / n
            nn.init.normal_(layer.weight.data, std=np.sqrt(1 / self.neural_num))

In [19]:
# Same experiment setup as before: 100 layers of width 256, batch of 16.
layer_nums = 100
neural_nums = 256
batch_size = 16

# Build the ReLU network and apply the std = sqrt(1/n) initialization.
net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)  # output is close to 0

tensor([[0.0000e+00, 7.6612e-16, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 6.2243e-16, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 5.8723e-16, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 5.3192e-16, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 7.0749e-16, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 6.4460e-16, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], grad_fn=<ReluBackward0>)


**增益计算**

    torch.nn.init.calculate_gain(nonlinearity, param=None)
    
    - 计算特定激活函数方差的变化尺度
    
    - nonlinearity  激活函数名称
    
    - param    激活函数的参数

In [29]:
# Empirical gain of tanh: ratio of the input std to the output std,
# measured on 10000 standard-normal samples.
x = torch.randn(10000)
out = x.tanh()

gain = x.std().div(out.std())
print('gain:{}'.format(gain))

# Reference gain that PyTorch reports for tanh, for comparison.
tanh_gain = nn.init.calculate_gain('tanh')
print('tanh_gain in PyTorch:', tanh_gain)

gain:1.6008168458938599
tanh_gain in PyTorch: 1.6666666666666667


**Xavier初始化方法**

    torch.nn.init.xavier_uniform_(tensor, gain=1)

    torch.nn.init.xavier_normal_(tensor, gain=1)
    
**功能**

    使各层输出（前向传播）与梯度（反向传播）的方差尽量保持一致，避免信号逐层放大或衰减
    
**原理**

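    xavier_uniform_ 从均匀分布 U(-a, a) 中采样，其中 a = gain * sqrt(6 / (fan_in + fan_out))；xavier_normal_ 则从均值为 0、标准差为 gain * sqrt(2 / (fan_in + fan_out)) 的正态分布中采样。该方法针对 sigmoid、tanh 等饱和激活函数推导，一般不适合与 relu 结合使用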


**初始化的实现**

    def initialize(self):
        """Hand-written Xavier-uniform initialization for every ``nn.Linear`` sub-module.

        Weights are drawn from U(-a, a) with
        a = tanh_gain * sqrt(6 / (fan_in + fan_out)); both fan_in and fan_out
        are taken to be ``self.neural_num`` here.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # How Xavier initialization works: compute the uniform bound from the fan sizes
                a = np.sqrt(6 / (self.neural_num + self.neural_num))
                # Scale the bound by the gain of the tanh activation.
                tanh_gain = nn.init.calculate_gain('tanh')
                a *= tanh_gain
                nn.init.uniform_(m.weight.data, -a, a)

In [25]:
class MLP(nn.Module):
    """Deep stack of square, bias-free linear layers, each followed by ReLU.

    ``initialize`` applies Xavier-uniform initialization scaled by the gain
    PyTorch reports for ReLU.

    Args:
        neural_num: width of every layer (input size == output size).
        layers: number of stacked linear layers.
    """

    def __init__(self, neural_num, layers):
        super().__init__()
        stacked = []
        for _ in range(layers):
            stacked.append(nn.Linear(neural_num, neural_num, bias=False))
        self.linears = nn.ModuleList(stacked)
        self.neural_num = neural_num

    def forward(self, x):
        """Pass ``x`` through every linear layer, applying ReLU after each one."""
        hidden = x
        for layer in self.linears:
            hidden = torch.relu(layer(hidden))  # paired with relu
        return hidden

    def initialize(self):
        """Xavier-uniform init (with the ReLU gain) for every ``nn.Linear`` sub-module."""
        linear_layers = [mod for mod in self.modules() if isinstance(mod, nn.Linear)]
        for layer in linear_layers:
            # The gain is looked up for 'relu', matching the activation in forward().
            relu_gain = nn.init.calculate_gain('relu')
            nn.init.xavier_uniform_(layer.weight.data, gain=relu_gain)

In [26]:
# Same experiment setup as before: 100 layers of width 256, batch of 16.
layer_nums = 100
neural_nums = 256
batch_size = 16

# Build the ReLU network and apply the Xavier-uniform initialization.
net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)  # output keeps a usable scale (the recorded run shows neither NaN nor collapse to ~0)

tensor([[1.6384, 0.4085, 0.1534,  ..., 1.2654, 0.0000, 0.2298],
        [0.9316, 0.1920, 0.1118,  ..., 0.7108, 0.0000, 0.1555],
        [0.7651, 0.2095, 0.0279,  ..., 0.5919, 0.0000, 0.1411],
        ...,
        [0.8130, 0.1709, 0.0551,  ..., 0.6211, 0.0000, 0.1722],
        [1.3152, 0.3116, 0.1029,  ..., 0.9901, 0.0000, 0.1942],
        [1.3572, 0.3087, 0.0561,  ..., 1.0867, 0.0000, 0.2738]],
       grad_fn=<ReluBackward0>)


**kaiming初始化方法**

    torch.nn.init.kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
    
    torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
    
**特点**

    基于relu激活函数提出的初始化方法

In [27]:
class MLP(nn.Module):
    """Deep stack of square, bias-free linear layers, each followed by ReLU.

    ``initialize`` applies Kaiming-normal initialization with PyTorch's
    default arguments.

    Args:
        neural_num: width of every layer (input size == output size).
        layers: number of stacked linear layers.
    """

    def __init__(self, neural_num, layers):
        super().__init__()
        stacked = []
        for _ in range(layers):
            stacked.append(nn.Linear(neural_num, neural_num, bias=False))
        self.linears = nn.ModuleList(stacked)
        self.neural_num = neural_num

    def forward(self, x):
        """Pass ``x`` through every linear layer, applying ReLU after each one."""
        hidden = x
        for layer in self.linears:
            hidden = torch.relu(layer(hidden))  # paired with relu
        return hidden

    def initialize(self):
        """Kaiming-normal init (library defaults) for every ``nn.Linear`` sub-module."""
        linear_layers = [mod for mod in self.modules() if isinstance(mod, nn.Linear)]
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight.data)

In [28]:
# Same experiment setup as before: 100 layers of width 256, batch of 16.
layer_nums = 100
neural_nums = 256
batch_size = 16

# Build the ReLU network and apply the Kaiming-normal initialization.
net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

output = net(inputs)
print(output)  # output keeps a usable scale (the recorded run shows neither NaN nor collapse to ~0)

tensor([[0.0000, 0.1933, 0.0000,  ..., 0.2536, 0.0000, 0.0000],
        [0.0000, 0.2283, 0.0000,  ..., 0.2499, 0.0000, 0.0000],
        [0.0000, 0.2110, 0.0000,  ..., 0.2771, 0.0000, 0.0000],
        ...,
        [0.0000, 0.2107, 0.0000,  ..., 0.2763, 0.0000, 0.0000],
        [0.0000, 0.2317, 0.0000,  ..., 0.3331, 0.0000, 0.0000],
        [0.0000, 0.2147, 0.0000,  ..., 0.2976, 0.0000, 0.0000]],
       grad_fn=<ReluBackward0>)


**均匀分布**

    torch.nn.init.uniform_(tensor, a=0, b=1)
    
**正态分布**

    torch.nn.init.normal_(tensor, mean=0, std=1)
    
**常数**

    torch.nn.init.constant_(tensor, val)
    
**单位矩阵初始化**

    torch.nn.init.eye_(tensor)
    
**正交初始化**

    torch.nn.init.orthogonal_(tensor, gain=1)
    
**稀疏初始化**

    torch.nn.init.sparse_(tensor, sparsity, std=0.01)

### 单层初始化

In [5]:
# Initialize a single layer by calling the nn.init functions directly on its parameters.
conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
# Overwrite the convolution weights in place with Xavier-uniform values.
nn.init.xavier_uniform_(conv1.weight)
print(conv1.weight)
# Fill every bias entry with the constant 0.1.
nn.init.constant_(conv1.bias, 0.1)
print(conv1.bias)

Parameter containing:
tensor([[[[ 0.0175,  0.0093,  0.0396,  ...,  0.0313, -0.0003,  0.0153],
          [-0.0039, -0.0056,  0.0201,  ...,  0.0214, -0.0345,  0.0068],
          [-0.0064,  0.0138, -0.0350,  ..., -0.0128, -0.0297, -0.0345],
          ...,
          [ 0.0383,  0.0420, -0.0271,  ..., -0.0331,  0.0039, -0.0321],
          [-0.0080, -0.0219,  0.0263,  ..., -0.0294,  0.0237, -0.0319],
          [-0.0212,  0.0002,  0.0208,  ...,  0.0399, -0.0014,  0.0211]],

         [[ 0.0337, -0.0185, -0.0076,  ..., -0.0284,  0.0043, -0.0256],
          [-0.0373, -0.0103,  0.0378,  ..., -0.0105, -0.0104,  0.0053],
          [ 0.0426, -0.0061, -0.0005,  ..., -0.0348, -0.0038, -0.0127],
          ...,
          [ 0.0080,  0.0298,  0.0316,  ..., -0.0164, -0.0237,  0.0207],
          [-0.0329,  0.0420, -0.0300,  ..., -0.0288,  0.0389, -0.0265],
          [-0.0415, -0.0118, -0.0039,  ..., -0.0111,  0.0357,  0.0143]],

         [[ 0.0122, -0.0120,  0.0289,  ..., -0.0065, -0.0219, -0.0176],
        

        0.1000], requires_grad=True)


### 模型初始化

In [6]:
from torch.nn import init


# Per-module initializer, meant to be passed to ``net.apply(...)``.
def weights_init(m):
    """Initialize the parameters of a single module in place.

    Dispatches on the module type:

    - ``nn.Conv2d``: Xavier-uniform weights, bias filled with 0.1.
    - ``nn.BatchNorm2d``: weight set to 1, bias set to 0.
    - ``nn.Linear``: weights drawn from a normal distribution with mean 0 and
      std 0.01, bias set to 0.

    Any other module type is left untouched. Parameters that do not exist
    (``bias=False`` layers, ``BatchNorm2d(affine=False)``) are skipped instead
    of raising ``AttributeError`` on ``None``.

    Args:
        m: the module to initialize (``Module.apply`` passes each sub-module in turn).
    """
    if isinstance(m, nn.Conv2d):
        init.xavier_uniform_(m.weight.data)
        if m.bias is not None:
            init.constant_(m.bias.data, 0.1)
    elif isinstance(m, nn.BatchNorm2d):
        # With affine=False both weight and bias are None.
        if m.weight is not None:
            m.weight.data.fill_(1)
        if m.bias is not None:
            m.bias.data.zero_()
    elif isinstance(m, nn.Linear):
        m.weight.data.normal_(0, 0.01)
        if m.bias is not None:
            m.bias.data.zero_()


# Backward-compatible alias: the function was originally defined under this
# misspelled name, while callers use ``weights_init``.
weigth_init = weights_init

In [8]:
# NOTE(review): `Net` is not defined in the visible cells — it has to be defined or imported before this cell runs.
# NOTE(review): make sure `weights_init` resolves to the init function defined earlier (check the spelling of its `def`).
net = Net()
net.apply(weights_init) # apply() recursively visits every module inside the network and calls the given function on each of them.