In [1]:
# Batch normalization layer, implemented from scratch
import d2lzh as d2l
from mxnet import autograd,gluon,init,nd
from mxnet.gluon import nn
def batch_norm(X,gamma,beta,moving_mean,moving_var,eps,momentum):
    """Batch-normalize ``X`` and return the updated running statistics.

    The branch taken depends on ``autograd.is_training()``:

    * prediction mode: ``X`` is standardized with the supplied
      ``moving_mean`` / ``moving_var``, which are returned unchanged;
    * training mode: ``X`` is standardized with the statistics of the
      current batch, and the running statistics are updated as
      ``momentum * old + (1 - momentum) * batch``.

    Args:
        X: input batch; in training mode it must be 2-D (statistics are
            taken over axis 0) or 4-D (statistics are taken over axes
            0, 2 and 3, i.e. one value per entry of axis 1).
        gamma: scale applied to the standardized input.
        beta: shift added after scaling.
        moving_mean: running mean used in prediction mode.
        moving_var: running variance used in prediction mode.
        eps: small constant added to the variance before the square root.
        momentum: weight given to the previous running statistics.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where
        ``Y = gamma * X_hat + beta``.

    Raises:
        AssertionError: in training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not autograd.is_training():
        # Prediction mode: use the running statistics that were passed in.
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2,4), 'batch_norm expects a 2-D or 4-D input'
        if len(X.shape) == 2:
            # Fully connected case: one mean/variance per feature column.
            mean = X.mean(axis=0)
            var = ((X - mean) ** 2).mean(axis=0)
        else:
            # 2-D convolution case: one mean/variance per channel (axis 1).
            # keepdims=True keeps the result broadcastable against X.
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        # Training mode: standardize with the statistics of this batch.
        X_hat = (X - mean) / nd.sqrt(var + eps)
        # The running statistics are bookkeeping, not part of the loss.
        # Updating them outside of autograd recording keeps each new value
        # from being recorded as a function of the previous iteration's
        # value (which would otherwise chain the iterations together).
        with autograd.pause():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta # scale and shift
    return Y, moving_mean, moving_var



In [2]:
class BatchNorm(nn.Block):
    """Batch normalization block built on top of ``batch_norm``.

    ``gamma`` (scale) and ``beta`` (shift) are registered through
    ``self.params`` and initialized to ones and zeros respectively.
    The running mean and variance are kept as plain NDArray attributes;
    they are not registered through ``self.params``.
    NOTE(review): because of that they are presumably not covered by
    ``collect_params()`` -- confirm before relying on parameter saving.

    Args:
        num_features: number of features (2-D input) or channels (4-D input).
        num_dims: 2 for fully connected inputs; any other value selects the
            4-D ``(1, num_features, 1, 1)`` parameter shape.
        eps: constant added to the variance for numerical stability.
        momentum: weight given to the previous running statistics.
        **kwargs: forwarded to ``nn.Block``.
    """
    def __init__(self,num_features,num_dims,eps=1e-5,momentum=0.9,**kwargs):
        super(BatchNorm,self).__init__(**kwargs)
        if num_dims == 2:
            shape = (1,num_features)
        else:
            shape = (1,num_features,1,1)
        self._eps = eps
        self._momentum = momentum
        # Learnable scale and shift, initialized to 1 and 0 respectively.
        self.gamma = self.params.get('gamma',shape=shape,init = init.One())
        self.beta = self.params.get('beta',shape=shape,init=init.Zero())
        # Running statistics (not created through self.params). The variance
        # starts at one, not zero, so that a forward pass in prediction mode
        # before any training step does not divide by sqrt(eps).
        self.moving_mean = nd.zeros(shape)
        self.moving_var = nd.ones(shape)

    def forward(self,X):
        """Normalize ``X`` and store the updated running statistics."""
        # Move the running statistics to the context X lives on.
        if self.moving_mean.context != X.context:
            self.moving_mean = self.moving_mean.copyto(X.context)
            self.moving_var = self.moving_var.copyto(X.context)
        # Keep the running statistics returned by batch_norm for later calls.
        Y,self.moving_mean,self.moving_var = batch_norm(X,self.gamma.data(),self.beta.data(),
                                                       self.moving_mean,self.moving_var,
                                                       eps=self._eps,momentum=self._momentum)
        return Y
    

In [4]:
# LeNet-style network using the hand-written BatchNorm block: each
# convolution / dense layer (except the output layer) is followed by
# batch normalization and then a sigmoid activation.
net = nn.Sequential()
# Convolution stage 1
net.add(nn.Conv2D(6, kernel_size=5),
        BatchNorm(6, num_dims=4),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Convolution stage 2
net.add(nn.Conv2D(16, kernel_size=5),
        BatchNorm(16, num_dims=4),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Dense stage 1
net.add(nn.Dense(120),
        BatchNorm(120, num_dims=2),
        nn.Activation('sigmoid'))
# Dense stage 2
net.add(nn.Dense(84),
        BatchNorm(84, num_dims=2),
        nn.Activation('sigmoid'))
# Output layer
net.add(nn.Dense(10))

In [5]:
# Hyperparameters and device selection.
lr = 1.0
num_epochs = 5
batch_size = 256
ctx = d2l.try_gpu()

# Initialize the network on the chosen context and train it with plain SGD.
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch5(
    net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs
)

training on gpu(0)
epoch 1, loss 0.6659, train acc 0.762, test acc 0.811, time 7.6 sec
epoch 2, loss 0.3903, train acc 0.860, test acc 0.833, time 6.6 sec
epoch 3, loss 0.3462, train acc 0.874, test acc 0.833, time 6.7 sec
epoch 4, loss 0.3180, train acc 0.884, test acc 0.822, time 6.6 sec
epoch 5, loss 0.3029, train acc 0.890, test acc 0.886, time 6.6 sec


In [9]:
# Show the scale (gamma) and shift (beta) of the first BatchNorm block
# (index 1 of the network), each flattened to one dimension.
print(
    net[1].gamma.data().reshape((-1,)),
    net[1].beta.data().reshape((-1,)),
)


[2.1301334 1.5873271 1.5992993 1.588231  1.0599676 1.3605769]
<NDArray 6 @gpu(0)> 
[ 1.361746    0.04332499  0.22823651  0.850483   -0.69039077 -1.5197439 ]
<NDArray 6 @gpu(0)>


In [10]:
# Concise Gluon implementation: nn.BatchNorm is constructed without the
# feature count or number of dimensions, relying on deferred
# initialization to obtain them.
net = nn.Sequential()
# Convolution stage 1
net.add(nn.Conv2D(6, kernel_size=5),
        nn.BatchNorm(),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Convolution stage 2
net.add(nn.Conv2D(16, kernel_size=5),
        nn.BatchNorm(),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Dense stage 1
net.add(nn.Dense(120),
        nn.BatchNorm(),
        nn.Activation('sigmoid'))
# Dense stage 2
net.add(nn.Dense(84),
        nn.BatchNorm(),
        nn.Activation('sigmoid'))
# Output layer
net.add(nn.Dense(10))

In [11]:
# Train the Gluon version. ctx, lr, batch_size, num_epochs and the data
# iterators are reused from the earlier training cell.
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch5(
    net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs
)

training on gpu(0)
epoch 1, loss 0.6467, train acc 0.773, test acc 0.853, time 5.3 sec
epoch 2, loss 0.3973, train acc 0.856, test acc 0.860, time 4.8 sec
epoch 3, loss 0.3500, train acc 0.873, test acc 0.859, time 4.9 sec
epoch 4, loss 0.3220, train acc 0.884, test acc 0.860, time 4.9 sec
epoch 5, loss 0.3033, train acc 0.890, test acc 0.873, time 4.8 sec
