# Weight Initialization and Batch Normalization

In [None]:
# !nvcc --version
!pip install mxnet-cu100

In [None]:
import mxnet as mx
from mxnet import gluon, init, nd, autograd
from mxnet.gluon import data as gdata, nn, loss as gloss, utils as gutils
import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import sys
import time

def try_gpu():
    """Return ``mx.gpu()`` when an array can be created on it, else ``mx.cpu()``."""
    try:
        device = mx.gpu()
        # Creating a tiny array on the device is the actual availability probe;
        # an MXNetError raised here is treated as "no usable GPU".
        nd.array([0], ctx=device)
        return device
    except mx.base.MXNetError:
        return mx.cpu()

def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs):
    """Train ``net`` on ``ctx`` and print metrics after every epoch.

    For each of the ``num_epochs`` epochs, every batch of ``train_iter`` is
    moved to ``ctx``, the summed softmax cross-entropy loss is back-propagated
    and ``trainer.step(batch_size)`` is called. After the epoch, the average
    training loss, training accuracy, test accuracy (via
    ``evaluate_accuracy``) and elapsed time are printed.
    """
    print('training on', ctx)
    criterion = gloss.SoftmaxCrossEntropyLoss()
    report = ('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec')
    for epoch in range(num_epochs):
        tic = time.time()
        loss_total = 0.0
        correct_total = 0.0
        seen = 0
        for X, y in train_iter:
            X = X.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_hat = net(X)
                batch_loss = criterion(y_hat, y).sum()
            batch_loss.backward()
            trainer.step(batch_size)
            # Cast the labels so they can be compared with the argmax output.
            y = y.astype('float32')
            loss_total += batch_loss.asscalar()
            hits = y_hat.argmax(axis=1) == y
            correct_total += hits.sum().asscalar()
            seen += y.size
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        # The reported time includes the test-set evaluation above.
        elapsed = time.time() - tic
        print(report % (epoch + 1, loss_total / seen, correct_total / seen,
                        test_acc, elapsed))
        
def evaluate_accuracy(data_iter, net, ctx=[mx.cpu()]):
    """Return the fraction of examples in ``data_iter`` that ``net`` classifies correctly.

    ``ctx`` may be a single ``mx.Context`` (it is wrapped into a list) or a
    list of contexts; every batch is split across them by ``_get_batch``.
    """
    devices = [ctx] if isinstance(ctx, mx.Context) else ctx
    correct = nd.array([0])
    total = 0
    for batch in data_iter:
        features, labels, _ = _get_batch(batch, devices)
        for X, y in zip(features, labels):
            y = y.astype('float32')
            matches = net(X).argmax(axis=1) == y
            # Accumulate on the CPU, where the running counter lives.
            correct += matches.sum().copyto(mx.cpu())
            total += y.size
        # Block until the accumulated count for this batch is ready.
        correct.wait_to_read()
    return correct.asscalar() / total

def _get_batch(batch, ctx):
    """Split one ``(features, labels)`` batch across the contexts in ``ctx``.

    Labels are cast to the dtype of the features when the two differ.
    Returns a 3-tuple: the feature shards, the label shards, and
    ``features.shape[0]``.
    """
    features, labels = batch
    same_dtype = labels.dtype == features.dtype
    labels = labels if same_dtype else labels.astype(features.dtype)
    feature_shards = gutils.split_and_load(features, ctx)
    label_shards = gutils.split_and_load(labels, ctx)
    return feature_shards, label_shards, features.shape[0]

def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join(
        '~', '.mxnet', 'datasets', 'fashion-mnist')):
    """Build the Fashion-MNIST training and test ``DataLoader`` objects.

    Each image goes through an optional ``Resize(resize)`` (only when
    ``resize`` is truthy), then ``ToTensor()`` and ``Normalize(0.13, 0.31)``.
    The training loader shuffles, the test loader does not. Returns
    ``(train_iter, test_iter)``.
    """
    transforms = gdata.vision.transforms
    root = os.path.expanduser(root)  # Resolve a leading '~' to the home dir.

    steps = [transforms.Resize(resize)] if resize else []
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(0.13, 0.31))
    pipeline = transforms.Compose(steps)

    train_set = gdata.vision.FashionMNIST(root=root, train=True)
    test_set = gdata.vision.FashionMNIST(root=root, train=False)

    # No worker processes on Windows ('win32'); four everywhere else.
    workers = 0 if sys.platform.startswith('win32') else 4
    loaders = tuple(
        gdata.DataLoader(dataset.transform_first(pipeline), batch_size,
                         shuffle=shuffle, num_workers=workers)
        for dataset, shuffle in ((train_set, True), (test_set, False)))
    return loaders

Now use a different weight initialization method and batch normalization so that the 60-layer model below can be trained.

In [None]:
# Learning rate, number of training epochs, and the device returned by
# try_gpu().
lr, num_epochs, ctx = 1e-3, 20, try_gpu()
#############################################################################
# TODO: Add batch normalization so that the model can be trained.           #
#############################################################################
net = nn.Sequential()

# 60 hidden Dense layers (100 units, tanh) followed by a 10-unit output layer.
for i in range(60):
    net.add(nn.Dense(100, activation="tanh"))    
net.add(nn.Dense(10))
#############################################################################

# https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.initializer.Initializer
#############################################################################
# TODO: Use a different weight initialization method so that the model can  #
# be trained.                                                               #
#############################################################################
net.initialize(force_reinit=True, ctx=ctx, init=init.Normal())
#############################################################################

In [None]:

# https://mxnet.incubator.apache.org/versions/0.11.0/api/python/optimization.html#mxnet.optimizer.Optimizer
# Adam optimizer using the learning rate `lr` and explicit beta1/beta2 values.
adam_optimizer = mx.optimizer.Adam(learning_rate=lr, beta1=0.8, beta2=0.9)
batch_size = 128
# Fashion-MNIST loaders (no resize) with the batch size chosen above.
train_iter, test_iter = load_data_fashion_mnist(batch_size)
# The trainer updates every parameter of `net` with the Adam optimizer.
trainer = gluon.Trainer(net.collect_params(), optimizer=adam_optimizer)
# Train for `num_epochs` epochs on `ctx`; metrics are printed after each epoch.
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
          num_epochs)

Before doing the homework, here are some tips you need to know.

1. Get the output of every layer.

In [None]:
# Small demo network: 10 hidden Dense layers (10 units, tanh) followed by a
# 10-unit output layer, initialized with init.Normal().
net = nn.Sequential()
for i in range(10):
    net.add(nn.Dense(10, activation="tanh"))    
net.add(nn.Dense(10))
net.initialize(force_reinit=True, init=init.Normal())

In [None]:
# One random sample of shape (1, 10), drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(1,10))
# Forward pass through the whole network; only the final output is shown.
y = net(x)
print(y)

You can use `net[i]` to get the output of every layer.

In [None]:
x = nd.random.uniform(-1,1,(1,10))
# Apply layers 0..9 one at a time. Each print shows the input of layer i,
# i.e. the raw input for i == 0 and the output of layer i - 1 afterwards.
# The output of layer 9 is left in `x` but is not printed by this loop.
for i in range(10):
    print(x)
    x = net[i](x)

2. How to use the BatchNorm layer.

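When you run the forward pass outside of `with autograd.record():`, the network runs in inference mode, so the BatchNorm layer normalizes with its running statistics instead of the statistics of the current batch and may appear to have no effect. Compare the output variance printed by the next two cells.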

In [None]:
import numpy as np

# One tanh Dense layer (100 units) followed by a BatchNorm layer.
net = nn.Sequential()
net.add(nn.Dense(100, activation="tanh"))
net.add(nn.BatchNorm())
net.initialize(force_reinit=True, init=init.Normal())

# Batch of 128 samples with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(128,100))
# Forward pass performed OUTSIDE autograd.record().
y = net(x)

# Compare the variance of the input with the variance of the network output.
print("x var:" + str(np.var(x.asnumpy())))
print("y var:" + str(np.var(y.asnumpy())))

In [None]:
import numpy as np

# Same architecture as the previous cell: tanh Dense (100 units) + BatchNorm.
net = nn.Sequential()
net.add(nn.Dense(100, activation="tanh"))
net.add(nn.BatchNorm())
net.initialize(force_reinit=True, init=init.Normal())

# Batch of 128 samples with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(128,100))
# Forward pass performed INSIDE autograd.record() this time.
with autograd.record():
    y = net(x)

# Compare the variance of the input with the variance of the network output.
print("x var:" + str(np.var(x.asnumpy())))
print("y var:" + str(np.var(y.asnumpy())))

# Homework
1. Plot the distribution of the output values and of the output gradients in every layer, for two weight initialization methods: normal and Xavier.
2. Plot the variance of the output and of the output gradient in every layer, for two weight initialization methods: normal and Xavier.
3. Plot the variance of the output and of the output gradient in every layer, without and with BatchNorm in every layer.
4. Plot the variance of the output in every layer, for two weight initialization methods (normal and Xavier), with a BatchNorm layer after every layer.

## 1. Plot the distribution of the output values and of the output gradients in every layer (for two weight initialization methods: normal and Xavier)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from mxnet import nd, gluon, init, autograd
import mxnet as mx
from mxnet import gluon, init, nd, autograd
from mxnet.gluon import data as gdata, nn, loss as gloss, utils as gutils

In [None]:
# Four tanh Dense layers with 200 units each and no bias term.
net = nn.Sequential()
net.add(nn.Dense(200, activation='tanh', use_bias=False),
        nn.Dense(200, activation='tanh', use_bias=False),
        nn.Dense(200, activation='tanh', use_bias=False),
        nn.Dense(200, activation='tanh', use_bias=False)
       )

# NOTE(review): `interval` is not used in this cell; presumably it is meant for
# the plotting code to be written below (e.g. number of histogram bins) - confirm.
interval = 50
# One random sample with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(1,100))
net.initialize(force_reinit=True, init=init.Normal())
###########################################################################################
# TODO: Normal initialization - plot the distribution of the output and of the
# output gradient of every layer.
###########################################################################################


In [None]:
# Four tanh Dense layers with 200 units each and no bias term.
net = nn.Sequential()
net.add(nn.Dense(200, activation='tanh', use_bias=False),
        nn.Dense(200, activation='tanh', use_bias=False),
        nn.Dense(200, activation='tanh', use_bias=False),
        nn.Dense(200, activation='tanh', use_bias=False)
       )

# NOTE(review): `interval` is not used in this cell; presumably it is meant for
# the plotting code to be written below (e.g. number of histogram bins) - confirm.
interval = 50
# One random sample with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(1,100))
net.initialize(force_reinit=True, init=init.Xavier())
###########################################################################################
# TODO: Xavier initialization - plot the distribution of the output and of the
# output gradient of every layer.
###########################################################################################  


## 2. Plot the variance of the output and of the output gradient in every layer (for two weight initialization methods: normal and Xavier)

In [None]:
# Number of stacked hidden layers in this experiment.
layer_num = 60
# `layer_num` tanh Dense layers with 100 units each and no bias term.
net = nn.Sequential()
for i in range(layer_num):
    net.add(nn.Dense(100, activation="tanh", use_bias=False))
# One random sample with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(1,100))
net.initialize(force_reinit=True, init=init.Normal())
###########################################################################################
# TODO: Normal initialization - plot the variance of the output and of the
# output gradient of every layer.
###########################################################################################


In [None]:
# Number of stacked hidden layers in this experiment.
layer_num = 60
# `layer_num` tanh Dense layers with 100 units each and no bias term.
net = nn.Sequential()
for i in range(layer_num):
    net.add(nn.Dense(100, activation="tanh", use_bias=False))
# One random sample with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(1,100))
net.initialize(force_reinit=True, init=init.Xavier())
###########################################################################################
# TODO: Xavier initialization - plot the variance of the output and of the
# output gradient of every layer.
###########################################################################################


In [None]:
###########################################################################################
# TODO: """ Compare two results """
###########################################################################################

## 3. Plot the variance of the output and of the output gradient in every layer (Xavier weight initialization, with and without a BatchNorm layer)

In [None]:
# Number of stacked hidden layers in this experiment.
layer_num = 500
# `layer_num` tanh Dense layers (100 units, no bias) and no BatchNorm layers.
net = nn.Sequential()
for i in range(layer_num):
    net.add(nn.Dense(100, activation="tanh", use_bias=False))
net.initialize(force_reinit=True, init=init.Xavier())
# Batch of 32 samples with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(32, 100))
###########################################################################################
# TODO: Without BatchNorm - plot the variance of the output and of the output
# gradient of every layer.
###########################################################################################


In [None]:
# Number of stacked (Dense, BatchNorm) pairs in this experiment.
layer_num = 500
# Every tanh Dense layer (100 units, no bias) is followed by a BatchNorm layer,
# so `net` holds 2 * layer_num child blocks.
net = nn.Sequential()
for i in range(layer_num):
    net.add(nn.Dense(100, activation="tanh", use_bias=False))    
    net.add(nn.BatchNorm())
net.initialize(force_reinit=True, init=init.Xavier())
# Batch of 32 samples with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(32, 100))
###########################################################################################
# TODO: With BatchNorm - plot the variance of the output and of the output
# gradient of every layer.
###########################################################################################


In [None]:
###########################################################################################
# TODO: """ Compare two results"""
###########################################################################################


## 4. Plot the variance of the output in every layer (for two weight initialization methods, normal and Xavier, with a BatchNorm layer after every layer)

In [None]:
# Number of stacked (Dense, BatchNorm) pairs in this experiment.
layer_num = 100
# Every tanh Dense layer (100 units, no bias) is followed by a BatchNorm layer,
# so `net` holds 2 * layer_num child blocks.
net = nn.Sequential()
for i in range(layer_num):
    net.add(nn.Dense(100, activation="tanh", use_bias=False))    
    net.add(nn.BatchNorm())
net.initialize(force_reinit=True, init=init.Normal())
# Batch of 128 samples with 100 features, drawn uniformly between -1 and 1.
x = nd.random.uniform(-1,1,(128, 100))
###########################################################################################
# TODO: Normal initialization - plot the variance of the output of every layer.
###########################################################################################


In [None]:
# Number of stacked (Dense, BatchNorm) pairs in this experiment.
layer_num = 100
# Every tanh Dense layer (100 units, no bias) is followed by a BatchNorm layer,
# so `net` holds 2 * layer_num child blocks.
net = nn.Sequential()
for i in range(layer_num):
    net.add(nn.Dense(100, activation="tanh", use_bias=False))
    net.add(nn.BatchNorm())
net.initialize(force_reinit=True, init=init.Xavier())
# Use the same input shape (128, 100) as the Normal-initialization cell of this
# section, so that the two variance plots differ only in the initializer and
# not also in the batch size seen by the BatchNorm layers.
x = nd.random.uniform(-1,1,(128, 100))
###########################################################################################
# TODO: Xavier initialization - plot the variance of the output of every layer.
###########################################################################################
