# BatchNorm批量归一化层

## 对卷积层做批量归一化，一般是在卷积之后，激活函数之前。而如果卷积的输出有多通道，则分别对每个通道做批量归一化

In [1]:
import sys
sys.path.append('../')

In [2]:
import gluonbook as gb
import mxnet as mx
from mxnet import gluon,nd,autograd,init
from mxnet.gluon import nn,data as gdata,loss as gloss

  from ._conv import register_converters as _register_converters


In [3]:
#预测的时候使用移动平均估计来估算整个训练集的样本均值和方差
def batch_norm(X,gamma,beta,moving_mean, moving_var, eps, momentum):
    if not autograd.is_training():
        #如果是预测模式下，使用移动平均
        X_hat = (X-moving_mean)/nd.sqrt(moving_var+eps)
    else:
        assert len(X.shape) in (2,4)
        #使用全连接层的情况
        if len(X.shape)==2:
            mean = X.mean(axis = 0)
            var = ((X-mean)**2).mean(axis=0)
        else:
            #使用二维卷积的情况下，分别对通道维上（axis=1）上做均值和方差
            mean = X.mean(axis = (0,2,3),keepdims =True)
            var  = ((X-mean)**2).mean(axis=(0,2,3),keepdims = True)
        #训练模式下使用均值和方差做归一化
        X_hat = (X-mean)/nd.sqrt(var+eps)
        
        #更新移动方差的值
        moving_mean = momentum*moving_mean+(1.0-momentum)*mean
        moving_var = momentum*moving_var + (1.0-momentum)*var
    #这里使用了广播机制，不要考虑batch_size这一维度
    Y = gamma*X_hat+beta
    #print('gamma.shape',gamma.shape,'beta.shape',beta.shape,'X_hat.shape',X_hat.shape)
    return Y,moving_mean,moving_var

In [4]:
class BatchNorm(nn.Block):
    def __init__(self,num_features,num_dims,**kwarg):
        super(BatchNorm,self).__init__(**kwarg)
        if num_dims==2:
            shape =(1,num_features)
        else:
            shape = (1,num_features,1,1)
        
        self.gamma = self.params.get('gamma',shape=shape,init=init.One())
        self.beta = self.params.get('beta',shape=shape,init=init.Zero())
        self.moving_mean = nd.zeros(shape=shape,ctx=mx.gpu())
        self.moving_var = nd.zeros(shape=shape,ctx=mx.gpu())
        
    def forward(self,X):
        #对输入进行批量归一化运算
        Y,self.moving_mean,self.moving_var = batch_norm(X,self.gamma.data(),self.beta.data(),
                                                        self.moving_mean,self.moving_var,eps=1e-5,momentum=0.9)
        return Y

In [5]:
X = nd.random.uniform(shape=(10,4,96,96),ctx=mx.gpu())
bc = BatchNorm(X.shape[1],len(X.shape))
bc.initialize(ctx=mx.gpu())

In [6]:
bc(X)


[[[[2.11445984e+02 5.50527039e+01 1.21748482e+02 ... 2.45513954e+01
    2.43180939e+02 1.33797531e+02]
   [1.75504944e+02 1.36014297e+02 2.09094219e+01 ... 1.80901520e+02
    2.03125610e+02 1.92475677e+02]
   [2.33037354e+02 2.36600311e+02 1.51984329e+02 ... 2.89971191e+02
    2.35748962e+02 4.58838196e+01]
   ...
   [1.98529221e+02 2.82599030e+02 2.65139038e+02 ... 7.44706345e+01
    2.74270172e+02 2.75404022e+02]
   [2.63827454e+02 3.72663498e+01 1.83355331e+02 ... 2.42908463e+02
    2.69487274e+02 6.64557343e+01]
   [9.77139740e+01 1.58002625e+02 3.24777222e+00 ... 9.99013214e+01
    4.68813477e+01 7.71815948e+01]]

  [[3.04227631e+02 2.28910309e+02 1.41808792e+02 ... 1.06261757e+02
    1.02973022e+02 2.15423248e+02]
   [2.70982361e+02 2.66822693e+02 2.75968903e+02 ... 2.47513092e+02
    2.77490509e+02 2.13826599e+02]
   [1.76617451e+01 8.64161301e+00 3.13797150e+02 ... 9.28982010e+01
    2.13478027e+02 2.48735641e+02]
   ...
   [7.96426239e+01 2.36160385e+02 1.44544174e+02 ... 2.2

##  使用BN的 LeNet

In [7]:
LeNet_BatchNorm = nn.Sequential()


In [8]:
LeNet_BatchNorm.add(nn.Conv2D(6,kernel_size=5),
                    BatchNorm(6,4),
                    nn.Activation('sigmoid'),
                    nn.MaxPool2D(pool_size=2,strides=2),
                    nn.Conv2D(16,kernel_size=5),
                    BatchNorm(16,4),
                    nn.Activation('sigmoid'),
                    nn.MaxPool2D(pool_size=2, strides=2),
                    nn.Dense(120),
                    BatchNorm(120, num_dims=2),
                    nn.Activation('sigmoid'),
                    nn.Dense(84),
                    BatchNorm(84, num_dims=2),
                    nn.Activation('sigmoid'),
                    nn.Dense(10)
                    )

In [9]:
lr,batch_size,num_epochs,ctx=1.0,256,10,gb.try_gpu()
LeNet_BatchNorm.initialize(init=init.Xavier(),ctx=mx.gpu(),force_reinit=True)
trainer = gluon.Trainer(LeNet_BatchNorm.collect_params(),'sgd',{'learning_rate':lr})
train_iter,test_iter = gb.load_data_fashion_mnist(batch_size,resize=None)

In [10]:
gb.train_ch5(LeNet_BatchNorm,train_iter,test_iter,batch_size,trainer,ctx,num_epochs)

training on gpu(0)
epoch 1, loss 0.6593, train acc 0.765, test acc 0.701, time 8.6 sec
epoch 2, loss 0.3950, train acc 0.857, test acc 0.813, time 8.3 sec
epoch 3, loss 0.3455, train acc 0.875, test acc 0.834, time 8.3 sec
epoch 4, loss 0.3202, train acc 0.884, test acc 0.875, time 8.3 sec
epoch 5, loss 0.3046, train acc 0.888, test acc 0.834, time 8.3 sec
epoch 6, loss 0.2887, train acc 0.896, test acc 0.886, time 8.4 sec
epoch 7, loss 0.2754, train acc 0.899, test acc 0.844, time 8.4 sec
epoch 8, loss 0.2675, train acc 0.903, test acc 0.883, time 8.5 sec
epoch 9, loss 0.2607, train acc 0.904, test acc 0.830, time 8.4 sec
epoch 10, loss 0.2533, train acc 0.907, test acc 0.874, time 8.3 sec


## 加入BN层之后，模型的收敛速度提高

## 使用Gluon接口来搭建 LeNet

In [11]:
LeNet = nn.Sequential()

In [16]:
LeNet.add(nn.Conv2D(channels=6,kernel_size=5),
          nn.BatchNorm(),
          nn.Activation('sigmoid'),
          nn.MaxPool2D(pool_size=2,strides=2),
          nn.Conv2D(channels=16,kernel_size=5),
          nn.BatchNorm(),
          nn.Activation('sigmoid'),
          nn.MaxPool2D(pool_size=2,strides=2),
          nn.Dense(120),
          nn.BatchNorm(),
          nn.Activation('sigmoid'),
          nn.Dense(64),
          nn.BatchNorm(),
          nn.Activation('sigmoid'),
          nn.Dense(10)
        )

In [19]:
LeNet.initialize(init=init.Xavier(),ctx=ctx,force_reinit=True)
trainer = gluon.Trainer(LeNet.collect_params(),'sgd',{'learning_rate':lr})
gb.train_ch5(LeNet,train_iter,test_iter,batch_size,trainer,ctx,num_epochs)

training on gpu(0)
epoch 1, loss 0.6162, train acc 0.782, test acc 0.840, time 5.7 sec
epoch 2, loss 0.3909, train acc 0.860, test acc 0.842, time 5.7 sec
epoch 3, loss 0.3447, train acc 0.875, test acc 0.854, time 5.7 sec
epoch 4, loss 0.3230, train acc 0.883, test acc 0.868, time 5.9 sec
epoch 5, loss 0.3038, train acc 0.888, test acc 0.880, time 6.0 sec
epoch 6, loss 0.2913, train acc 0.893, test acc 0.859, time 6.4 sec
epoch 7, loss 0.2776, train acc 0.898, test acc 0.895, time 6.5 sec
epoch 8, loss 0.2705, train acc 0.900, test acc 0.876, time 5.7 sec
epoch 9, loss 0.2624, train acc 0.902, test acc 0.893, time 5.7 sec
epoch 10, loss 0.2519, train acc 0.906, test acc 0.890, time 5.7 sec



这是数据的原因。当维度为2的时候，维度是(batch, num_features)，每一列都认为是一列特征，不同特征间的分布、相关性都是未知的，所以一般不对所有特征做BN，但是可以认为相同的特征来自相同的分布，所以在一个batch里面每一个feature单独做BN。
当维度为4的时候，（batch, channel, height, weight)，不同的channel数据的分布可能不同，所以不对不同通道的数据做BN，同时对于图像数据，都是像素点，会假设相同通道的像素点取自相同的分布，所以对于四维的数据，同一个batch 里面，每个通道单独做BN。

个人理解：不同的BN，基于不同的数据特征假设。