[View in Colaboratory](https://colab.research.google.com/github/x110/DLToolboxImg/blob/master/BatchNormMxnet.ipynb)

# BatchNorm Layer in Mxnet

In [0]:
!pip install mxnet

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/71/64/49c5125befd5e0f0e17f115d55cb78080adacbead9d19f253afd0157656a/mxnet-1.3.0.post0-py2.py3-none-manylinux1_x86_64.whl (27.7MB)
[K    100% |████████████████████████████████| 27.8MB 1.2MB/s 
Installing collected packages: mxnet
Successfully installed mxnet-1.3.0.post0


In [0]:
import mxnet as mx
import numpy as np
np.random.seed(313)

In [0]:
#generatedata
N=10
x = np.random.randn(N,1)
noise = .25*np.random.randn(N,1)
y=3*x+2+noise

In [0]:
#define Iterator
train_iter = mx.io.NDArrayIter(x, y,batch_size=2)
input_shapes = dict(train_iter.provide_data+train_iter.provide_label)
print(input_shapes)

{'data': (2, 1), 'softmax_label': (2, 1)}


In [0]:
#define network
source = mx.sym.Variable("data")
label = mx.sym.Variable("softmax_label")
network = mx.sym.BatchNorm(source)
network=mx.sym.LinearRegressionOutput(network,label)

In [0]:
exe = network.simple_bind(ctx=mx.cpu(), **input_shapes)

In [0]:
arg_arrays = dict(zip(network.list_arguments(), exe.arg_arrays))
data = arg_arrays[train_iter.provide_data[0][0]]
label = arg_arrays[train_iter.provide_label[0][0]]

In [0]:
#forward pass
train_iter.reset()
batch = train_iter.next()
data[:] = batch.data[0]
label[:] = batch.label[0]
y = exe.forward(is_train=True)
y

[
 [[-0.9871021]
  [ 0.9871021]]
 <NDArray 2x1 @cpu(0)>]

In [0]:
#forwardpass manually
xi = data.asnumpy()
a = np.mean(xi)
b = np.var(xi)
xnorm = (xi-a)/np.sqrt(b+1e-5)
beta, alpha = exe.arg_dict['batchnorm0_beta'].asnumpy(),exe.arg_dict['batchnorm0_gamma'].asnumpy()
ynorm = alpha * xnorm+beta
ynorm

array([[-0.9998685],
       [ 0.9998685]], dtype=float32)

In [0]:
exe.arg_dict

{'batchnorm0_beta': 
 [0.]
 <NDArray 1 @cpu(0)>, 'batchnorm0_gamma': 
 [1.]
 <NDArray 1 @cpu(0)>, 'data': 
 [[-0.1593055]
  [ 0.2306561]]
 <NDArray 2x1 @cpu(0)>, 'softmax_label': 
 [[1.3864261]
  [2.8935425]]
 <NDArray 2x1 @cpu(0)>}

In [0]:
#backwardpass
exe.backward()

In [0]:
exe.grad_dict

{'batchnorm0_beta': 
 [-4.2799687]
 <NDArray 1 @cpu(0)>, 'batchnorm0_gamma': 
 [0.]
 <NDArray 1 @cpu(0)>, 'data': 
 [[-0.0303027]
  [ 0.0303027]]
 <NDArray 2x1 @cpu(0)>, 'softmax_label': 
 [[0.]
  [0.]]
 <NDArray 2x1 @cpu(0)>}

In [0]:
#backwardpass manually
2*np.mean((ynorm-label.asnumpy())),2*np.mean((ynorm-label.asnumpy())*xnorm)
#the first gradient is correct but the second is not

(-4.279968738555908, 0.4925558567047119)

I computed the gradients as:
$$E=\frac{1}{N}\sum(ynorm_n-label_n)^2$$
$$\frac{\partial E}{\partial \gamma} = \frac{2}{N}\sum((ynorm_n-label_n)*xnorm)$$
$$ ynorm_n = \gamma  (xnorm_n) + \beta$$