In [10]:
import os
import torch
import random
import numpy as np
import torch.nn as nn


import sys



class MLP(nn.Module):
    """Stack of equally sized, bias-free linear layers with no activation.

    Used to watch how the standard deviation of the activations changes from
    layer to layer under a given weight initialisation.
    """

    def __init__(self, neural_num, layers):
        """Build ``layers`` bias-free ``neural_num`` -> ``neural_num`` linear layers."""
        super().__init__()
        stack = [nn.Linear(neural_num, neural_num, bias=False) for _ in range(layers)]
        self.linears = nn.ModuleList(stack)
        self.neural_num = neural_num

    def forward(self, x):
        """Feed ``x`` through the layers in order, printing each output's std.

        Stops at the first layer whose output std is NaN and returns the
        activations computed so far.
        """
        for idx, layer in enumerate(self.linears):
            x = layer(x)
            layer_std = x.std()
            print("layer:{}, std:{}".format(idx, layer_std))
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(idx))
                return x

        return x

    def initialize(self):
        """Re-draw the weights of every linear layer from a standard normal."""
        linear_layers = (mod for mod in self.modules() if isinstance(mod, nn.Linear))
        for layer in linear_layers:
            nn.init.normal_(layer.weight.data)  # defaults: mean=0, std=1

# Switch for the forward-pass experiment (set to 0 to skip it).
flag = 1

if flag:
    # Depth of the network, width of every layer, number of input samples.
    layer_nums, neural_nums, batch_size = 100, 256, 16

    net = MLP(neural_nums, layer_nums)
    net.initialize()

    # Inputs drawn from a standard normal distribution (mean=0, std=1).
    inputs = torch.randn(batch_size, neural_nums)

    output = net(inputs)
    print(output)

# ======================================= calculate gain =======================================

# Switch for the gain estimate (set to 0 to skip it).
flag = 1

if flag:
    # Empirical tanh gain: std of the input divided by std of the output.
    x = torch.randn(10000)
    out = torch.tanh(x)

    gain = torch.std(x) / torch.std(out)
    print(f'gain:{gain}')

    # Value PyTorch itself reports for tanh.
    tanh_gain = nn.init.calculate_gain('tanh')
    print('tanh_gain in PyTorch:', tanh_gain)


layer:0, std:15.693282127380371
layer:1, std:257.3644104003906
layer:2, std:4144.98974609375
layer:3, std:66771.90625
layer:4, std:1068706.0
layer:5, std:17136650.0
layer:6, std:278820256.0
layer:7, std:4468012544.0
layer:8, std:69514346496.0
layer:9, std:1092346314752.0
layer:10, std:17636903616512.0
layer:11, std:289505256931328.0
layer:12, std:4537680192864256.0
layer:13, std:7.217537922105344e+16
layer:14, std:1.1572829923563274e+18
layer:15, std:1.860563449903946e+19
layer:16, std:3.016055744637929e+20
layer:17, std:4.767136766765335e+21
layer:18, std:7.661239058586238e+22
layer:19, std:1.2002903654875299e+24
layer:20, std:1.9519994090662047e+25
layer:21, std:3.230122286115252e+26
layer:22, std:5.012900045699424e+27
layer:23, std:8.096685285066025e+28
layer:24, std:1.2773981691117822e+30
layer:25, std:2.043817002410246e+31
layer:26, std:3.1763388552963288e+32
layer:27, std:5.224243758158646e+33
layer:28, std:8.233667451875134e+34
layer:29, std:1.3369123352671744e+36
layer:30, std:

可以看到 output 的标准差逐层增大，到第 31 层时数值已经非常大，之后每一层的值都是 nan，也就是数据可能非常大或者非常小，超出了数据类型的表示范围。

![image.png](attachment:image.png)

![image.png](attachment:image.png)

可以看到每一层的标准差大约扩大了 16 倍，也就是每层节点数量（256）开根号后的值：$\sqrt{256} = 16$。

In [11]:
import os
import torch
import random
import numpy as np
import torch.nn as nn


import sys



class MLP(nn.Module):
    """Bias-free linear stack (no activation) with variance-scaled normal init.

    Each layer maps ``neural_num`` features to ``neural_num`` features; the
    forward pass reports the std of every layer's output.
    """

    def __init__(self, neural_num, layers):
        """Build ``layers`` bias-free ``neural_num`` -> ``neural_num`` linear layers."""
        super().__init__()
        stack = [nn.Linear(neural_num, neural_num, bias=False) for _ in range(layers)]
        self.linears = nn.ModuleList(stack)
        self.neural_num = neural_num

    def forward(self, x):
        """Apply the layers in order, printing the std after each one.

        Returns early with the current activations if a layer's output std
        is NaN.
        """
        for idx, layer in enumerate(self.linears):
            x = layer(x)
            layer_std = x.std()

            print("layer:{}, std:{}".format(idx, layer_std))
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(idx))
                return x

        return x

    def initialize(self):
        """Draw every linear weight from a normal with std = sqrt(1 / neural_num)."""
        weight_std = np.sqrt(1 / self.neural_num)
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.normal_(module.weight.data, std=weight_std)

# Switch for the forward-pass experiment (set to 0 to skip it).
flag = 1

if flag:
    # Depth of the network, width of every layer, number of input samples.
    layer_nums, neural_nums, batch_size = 100, 256, 16

    net = MLP(neural_nums, layer_nums)
    net.initialize()

    # Inputs drawn from a standard normal distribution (mean=0, std=1).
    inputs = torch.randn(batch_size, neural_nums)

    output = net(inputs)
    print(output)

# ======================================= calculate gain =======================================

# Switch for the gain estimate (set to 0 to skip it).
flag = 1

if flag:
    # Empirical tanh gain: std of the input divided by std of the output.
    x = torch.randn(10000)
    out = torch.tanh(x)

    gain = torch.std(x) / torch.std(out)
    print(f'gain:{gain}')

    # Value PyTorch itself reports for tanh.
    tanh_gain = nn.init.calculate_gain('tanh')
    print('tanh_gain in PyTorch:', tanh_gain)


layer:0, std:0.9809120297431946
layer:1, std:0.9776952266693115
layer:2, std:0.9906021356582642
layer:3, std:0.9860875010490417
layer:4, std:0.9707934260368347
layer:5, std:0.9612700343132019
layer:6, std:0.9706950783729553
layer:7, std:0.9686921238899231
layer:8, std:0.96637362241745
layer:9, std:0.9612252712249756
layer:10, std:0.9523577690124512
layer:11, std:0.951198399066925
layer:12, std:0.9416150450706482
layer:13, std:0.9486039280891418
layer:14, std:0.9446241855621338
layer:15, std:0.9556341171264648
layer:16, std:0.9560676217079163
layer:17, std:0.9399266839027405
layer:18, std:0.9507010579109192
layer:19, std:0.925083339214325
layer:20, std:0.9344996809959412
layer:21, std:0.9225611686706543
layer:22, std:0.9260435700416565
layer:23, std:0.9156519770622253
layer:24, std:0.925703227519989
layer:25, std:0.9534597992897034
layer:26, std:0.9697750210762024
layer:27, std:0.9732164144515991
layer:28, std:0.994419276714325
layer:29, std:1.0291340351104736
layer:30, std:1.0113346576

修改权重的标准差分布：nn.init.normal_(m.weight.data, std=np.sqrt(1/ self.neural_num))，使得每一层输出的标准差保持在 1 附近


![image.png](attachment:image.png)

In [13]:
import os
import torch
import random
import numpy as np
import torch.nn as nn


import sys



class MLP(nn.Module):
    """Bias-free linear stack with a tanh after every layer.

    Weights use a normal initialisation with std = sqrt(1 / neural_num); the
    forward pass reports the std of every layer's activated output.
    """

    def __init__(self, neural_num, layers):
        """Build ``layers`` bias-free ``neural_num`` -> ``neural_num`` linear layers."""
        super().__init__()
        stack = [nn.Linear(neural_num, neural_num, bias=False) for _ in range(layers)]
        self.linears = nn.ModuleList(stack)
        self.neural_num = neural_num

    def forward(self, x):
        """Apply linear + tanh per layer, printing the std after each one.

        Returns early with the current activations if a layer's output std
        is NaN.
        """
        for idx, layer in enumerate(self.linears):
            x = torch.tanh(layer(x))
            layer_std = x.std()

            print("layer:{}, std:{}".format(idx, layer_std))
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(idx))
                return x

        return x

    def initialize(self):
        """Draw every linear weight from a normal with std = sqrt(1 / neural_num)."""
        weight_std = np.sqrt(1 / self.neural_num)
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.normal_(module.weight.data, std=weight_std)

# Switch for the forward-pass experiment (set to 0 to skip it).
flag = 1

if flag:
    # Depth of the network, width of every layer, number of input samples.
    layer_nums, neural_nums, batch_size = 100, 256, 16

    net = MLP(neural_nums, layer_nums)
    net.initialize()

    # Inputs drawn from a standard normal distribution (mean=0, std=1).
    inputs = torch.randn(batch_size, neural_nums)

    output = net(inputs)
    print(output)

# ======================================= calculate gain =======================================

# Switch for the gain estimate (set to 0 to skip it).
flag = 1

if flag:
    # Empirical tanh gain: std of the input divided by std of the output.
    x = torch.randn(10000)
    out = torch.tanh(x)

    gain = torch.std(x) / torch.std(out)
    print(f'gain:{gain}')

    # Value PyTorch itself reports for tanh.
    tanh_gain = nn.init.calculate_gain('tanh')
    print('tanh_gain in PyTorch:', tanh_gain)


layer:0, std:0.626349925994873
layer:1, std:0.4836758077144623
layer:2, std:0.4007655382156372
layer:3, std:0.35104629397392273
layer:4, std:0.3106172978878021
layer:5, std:0.283788800239563
layer:6, std:0.26573288440704346
layer:7, std:0.2489820271730423
layer:8, std:0.24524204432964325
layer:9, std:0.2323124259710312
layer:10, std:0.2209138125181198
layer:11, std:0.20790161192417145
layer:12, std:0.1978636234998703
layer:13, std:0.1948394775390625
layer:14, std:0.18970972299575806
layer:15, std:0.18426227569580078
layer:16, std:0.1811440885066986
layer:17, std:0.17229381203651428
layer:18, std:0.1688223034143448
layer:19, std:0.1612623929977417
layer:20, std:0.15471863746643066
layer:21, std:0.15183833241462708
layer:22, std:0.1485540270805359
layer:23, std:0.14486639201641083
layer:24, std:0.14289799332618713
layer:25, std:0.1417391151189804
layer:26, std:0.138749897480011
layer:27, std:0.13706253468990326
layer:28, std:0.13847070932388306
layer:29, std:0.1339610368013382
layer:30, 

考虑带有激活函数（tanh）的全连接网络，可以看到各层输出的标准差逐渐变小，出现梯度消失的现象

layer:0, std:0.6297637224197388
layer:1, std:0.4919561445713043
layer:2, std:0.413028359413147
layer:3, std:0.3573648929595947
layer:4, std:0.3211178779602051
layer:5, std:0.28825968503952026
layer:6, std:0.27039477229118347
layer:7, std:0.24841433763504028
layer:8, std:0.23883788287639618
layer:9, std:0.2206953912973404
layer:10, std:0.21062862873077393
layer:11, std:0.20042605698108673
layer:12, std:0.19451259076595306
layer:13, std:0.18645402789115906
layer:14, std:0.1818782389163971
layer:15, std:0.1768379807472229
layer:16, std:0.17014896869659424
layer:17, std:0.16486340761184692
layer:18, std:0.1615687757730484
layer:19, std:0.158018559217453
layer:20, std:0.1488620936870575
layer:21, std:0.14283587038516998
layer:22, std:0.14085212349891663
layer:23, std:0.13648323714733124
layer:24, std:0.1309627741575241
layer:25, std:0.131567120552063
layer:26, std:0.12906454503536224
layer:27, std:0.1281326860189438
layer:28, std:0.12676246464252472
layer:29, std:0.12355190515518188
layer:3

Xavier初始化

![image.png](attachment:image.png)

Xavier初始化，手动初始化

In [14]:
import os
import torch
import random
import numpy as np
import torch.nn as nn


import sys



class MLP(nn.Module):
    """Bias-free linear stack with tanh activations and hand-computed Xavier init.

    The forward pass reports the std of every layer's activated output.
    """

    def __init__(self, neural_num, layers):
        """Build ``layers`` bias-free ``neural_num`` -> ``neural_num`` linear layers."""
        super().__init__()
        stack = [nn.Linear(neural_num, neural_num, bias=False) for _ in range(layers)]
        self.linears = nn.ModuleList(stack)
        self.neural_num = neural_num

    def forward(self, x):
        """Apply linear + tanh per layer, printing the std after each one.

        Returns early with the current activations if a layer's output std
        is NaN.
        """
        for idx, layer in enumerate(self.linears):
            x = torch.tanh(layer(x))
            layer_std = x.std()

            print("layer:{}, std:{}".format(idx, layer_std))
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(idx))
                return x

        return x

    def initialize(self):
        """Fill every linear weight from U(-a, a), the Xavier-uniform bound.

        ``a = sqrt(6 / (fan_in + fan_out)) * gain`` with
        ``fan_in = fan_out = neural_num`` and ``gain`` being PyTorch's
        recommended gain for tanh.
        """
        # The bound is identical for all layers, so compute it once.
        bound = np.sqrt(6 / (self.neural_num + self.neural_num))
        bound = bound * nn.init.calculate_gain('tanh')
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.uniform_(module.weight.data, -bound, bound)

# Switch for the forward-pass experiment (set to 0 to skip it).
flag = 1

if flag:
    # Depth of the network, width of every layer, number of input samples.
    layer_nums, neural_nums, batch_size = 100, 256, 16

    net = MLP(neural_nums, layer_nums)
    net.initialize()

    # Inputs drawn from a standard normal distribution (mean=0, std=1).
    inputs = torch.randn(batch_size, neural_nums)

    output = net(inputs)
    print(output)

# ======================================= calculate gain =======================================

# Switch for the gain estimate (set to 0 to skip it).
flag = 1

if flag:
    # Empirical tanh gain: std of the input divided by std of the output.
    x = torch.randn(10000)
    out = torch.tanh(x)

    gain = torch.std(x) / torch.std(out)
    print(f'gain:{gain}')

    # Value PyTorch itself reports for tanh.
    tanh_gain = nn.init.calculate_gain('tanh')
    print('tanh_gain in PyTorch:', tanh_gain)

layer:0, std:0.7616087198257446
layer:1, std:0.689491331577301
layer:2, std:0.6750748753547668
layer:3, std:0.6658603549003601
layer:4, std:0.6552332639694214
layer:5, std:0.656459629535675
layer:6, std:0.6558413505554199
layer:7, std:0.6555054783821106
layer:8, std:0.6524512767791748
layer:9, std:0.6569223999977112
layer:10, std:0.6535593271255493
layer:11, std:0.6544753909111023
layer:12, std:0.6586756706237793
layer:13, std:0.6570520401000977
layer:14, std:0.6585668325424194
layer:15, std:0.6527202725410461
layer:16, std:0.6531647443771362
layer:17, std:0.6501080989837646
layer:18, std:0.6510689854621887
layer:19, std:0.6499229073524475
layer:20, std:0.6534895896911621
layer:21, std:0.6518459320068359
layer:22, std:0.655834972858429
layer:23, std:0.6604207158088684
layer:24, std:0.6566107273101807
layer:25, std:0.650166928768158
layer:26, std:0.6476086378097534
layer:27, std:0.6511222720146179
layer:28, std:0.6472839713096619
layer:29, std:0.6535072922706604
layer:30, std:0.64756649

Xavier初始化，pytorch 内置的初始化方法

In [15]:
import os
import torch
import random
import numpy as np
import torch.nn as nn


import sys



class MLP(nn.Module):
    """Bias-free linear stack with tanh activations and built-in Xavier init.

    Each layer maps ``neural_num`` features to ``neural_num`` features; the
    forward pass reports the std of every layer's activated output.
    """

    def __init__(self, neural_num, layers):
        """Build ``layers`` bias-free ``neural_num`` -> ``neural_num`` linear layers."""
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        """Apply linear + tanh per layer, printing the std after each one.

        Stops at the first layer whose output std is NaN and returns the
        activations computed so far.
        """
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            x = torch.tanh(x)

            layer_std = x.std()
            print("layer:{}, std:{}".format(i, layer_std))
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(i))
                break

        return x

    def initialize(self):
        """Apply ``nn.init.xavier_uniform_`` with the tanh gain to every linear layer.

        The gain is computed here instead of being read from a module-level
        ``tanh_gain`` variable, so the method does not depend on another cell
        having been executed first (it previously raised ``NameError`` on a
        fresh kernel).
        """
        tanh_gain = nn.init.calculate_gain('tanh')
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight.data, gain=tanh_gain)

# Switch for the forward-pass experiment (set to 0 to skip it).
flag = 1

if flag:
    # Depth of the network, width of every layer, number of input samples.
    layer_nums, neural_nums, batch_size = 100, 256, 16

    net = MLP(neural_nums, layer_nums)
    net.initialize()

    # Inputs drawn from a standard normal distribution (mean=0, std=1).
    inputs = torch.randn(batch_size, neural_nums)

    output = net(inputs)
    print(output)

# ======================================= calculate gain =======================================

# Switch for the gain estimate (set to 0 to skip it).
flag = 1

if flag:
    # Empirical tanh gain: std of the input divided by std of the output.
    x = torch.randn(10000)
    out = torch.tanh(x)

    gain = torch.std(x) / torch.std(out)
    print(f'gain:{gain}')

    # Value PyTorch itself reports for tanh.
    tanh_gain = nn.init.calculate_gain('tanh')
    print('tanh_gain in PyTorch:', tanh_gain)

layer:0, std:0.7607788443565369
layer:1, std:0.6905145049095154
layer:2, std:0.6680914759635925
layer:3, std:0.6585116386413574
layer:4, std:0.6576955914497375
layer:5, std:0.6566594839096069
layer:6, std:0.6520739793777466
layer:7, std:0.6564092636108398
layer:8, std:0.6483346819877625
layer:9, std:0.6555622816085815
layer:10, std:0.6452757716178894
layer:11, std:0.6522858142852783
layer:12, std:0.6593003869056702
layer:13, std:0.6533424258232117
layer:14, std:0.6581434607505798
layer:15, std:0.6526800990104675
layer:16, std:0.6536223888397217
layer:17, std:0.6455777883529663
layer:18, std:0.6522363424301147
layer:19, std:0.6618847250938416
layer:20, std:0.6567735075950623
layer:21, std:0.6577458381652832
layer:22, std:0.6539013981819153
layer:23, std:0.6588467359542847
layer:24, std:0.6615076661109924
layer:25, std:0.6632481813430786
layer:26, std:0.6553447842597961
layer:27, std:0.6494207978248596
layer:28, std:0.657860517501831
layer:29, std:0.6600174307823181
layer:30, std:0.65510

虽然 Xavier 针对饱和的激活函数（sigmoid、tanh）提出了有效的初始化方法，但随着 AlexNet 之后 ReLU 函数的广泛使用，Xavier 初始化对这类非饱和激活函数不再适用；针对 ReLU 及其变种这类非饱和激活函数，提出了 Kaiming 初始化方法


![image.png](attachment:image.png)

In [17]:
import os
import torch
import random
import numpy as np
import torch.nn as nn


import sys



class MLP(nn.Module):
    """Bias-free linear stack with ReLU activations and Kaiming-normal init.

    The forward pass reports the std of every layer's activated output.
    """

    def __init__(self, neural_num, layers):
        """Build ``layers`` bias-free ``neural_num`` -> ``neural_num`` linear layers."""
        super().__init__()
        stack = [nn.Linear(neural_num, neural_num, bias=False) for _ in range(layers)]
        self.linears = nn.ModuleList(stack)
        self.neural_num = neural_num

    def forward(self, x):
        """Apply linear + ReLU per layer, printing the std after each one.

        Returns early with the current activations if a layer's output std
        is NaN.
        """
        for idx, layer in enumerate(self.linears):
            x = torch.relu(layer(x))
            layer_std = x.std()

            print("layer:{}, std:{}".format(idx, layer_std))
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(idx))
                return x

        return x

    def initialize(self):
        """Initialise every linear weight with PyTorch's built-in Kaiming-normal.

        The hand-written Kaiming form this stands in for is
        ``nn.init.normal_(w, std=np.sqrt(2 / self.neural_num))``.
        """
        linear_layers = (mod for mod in self.modules() if isinstance(mod, nn.Linear))
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight.data)


# Switch for the forward-pass experiment (set to 0 to skip it).
flag = 1

if flag:
    # Depth of the network, width of every layer, number of input samples.
    layer_nums, neural_nums, batch_size = 100, 256, 16

    net = MLP(neural_nums, layer_nums)
    net.initialize()

    # Inputs drawn from a standard normal distribution (mean=0, std=1).
    inputs = torch.randn(batch_size, neural_nums)

    output = net(inputs)
    print(output)

# ======================================= calculate gain =======================================

# Switch for the gain estimate (set to 0 to skip it).
flag = 1

if flag:
    # Empirical tanh gain: std of the input divided by std of the output.
    x = torch.randn(10000)
    out = torch.tanh(x)

    gain = torch.std(x) / torch.std(out)
    print(f'gain:{gain}')

    # Value PyTorch itself reports for tanh.
    tanh_gain = nn.init.calculate_gain('tanh')
    print('tanh_gain in PyTorch:', tanh_gain)

layer:0, std:0.8176382184028625
layer:1, std:0.8264673352241516
layer:2, std:0.8489429950714111
layer:3, std:0.9077277779579163
layer:4, std:0.9197606444358826
layer:5, std:0.881599485874176
layer:6, std:0.8742204308509827
layer:7, std:0.8941138386726379
layer:8, std:0.9417102932929993
layer:9, std:0.9661489725112915
layer:10, std:1.089404821395874
layer:11, std:1.2634639739990234
layer:12, std:1.269165277481079
layer:13, std:1.298508644104004
layer:14, std:1.2737568616867065
layer:15, std:1.4499799013137817
layer:16, std:1.221543550491333
layer:17, std:1.1838197708129883
layer:18, std:1.0438848733901978
layer:19, std:1.0343197584152222
layer:20, std:1.0427991151809692
layer:21, std:1.0278586149215698
layer:22, std:1.0794451236724854
layer:23, std:1.0894345045089722
layer:24, std:1.0889734029769897
layer:25, std:0.9979874491691589
layer:26, std:0.878072202205658
layer:27, std:0.882300615310669
layer:28, std:0.8978822231292725
layer:29, std:0.8269890546798706
layer:30, std:0.91801422834