# Things you should know about normalization

## Start from a common problem in NN training

The mean and variance (var) of the data may differ from batch to batch, which can cause training problems:
1. When the mean and var diverge too much between batches, the features end up on very different scales, and so do the parameters.
2. Parameters on very different scales make gradient-descent optimization harder.

## method for batch normalization

suppose we have data $Z_{ij}$ of shape (i: feature_dim, j: batch_size)

to normalize on the second dimension (the batch dimension, of size $N$): $Z'_{ij} = \frac{Z_{ij}-\frac{1}{N}\sum_k Z_{ik}}{\sqrt{\frac{1}{N}\sum_k\left(Z_{ik}-\frac{1}{N}\sum_l Z_{il}\right)^2 + \epsilon}}$

this is the same as

$Z'_{ij} = \frac{Z_{ij}-\mu_i}{\sqrt{\sigma_i^2 + \epsilon}}$, where $\mu_i = \sum_j Z_{ij}/N$, $\sigma_i^2 = \sum_j(Z_{ij}-\mu_i)^2/N$

To still allow the mean and var to be adjusted, a learnable scale $G_i$ and shift $B_i$ are applied:

$Z''_{ij} = G_i Z'_{ij} + B_i$

We call the function $Z''_{ij} = f_{G_i, B_i}(Z_{ij})$ batch normalization. It has one pair of parameters $(G_i, B_i)$ per feature, that is to say each feature dimension is normalized independently.

## where to add batch norm in NN?

Usually batch normalization is added before the activation function, after the linear layer.

$y = Activation(BatchNorm(WX))$

There are additional normalization parameters $G_i$, $B_i$ to train. This part is named normalized activation according to the original paper.

## does it work?

Normalized data makes gradient-descent learning easier. Taking this view for each layer, we would like to normalize the input of every layer by the mean and var of the whole dataset during training.

But it is impractical to calculate the mean and var over all the data, since each update would then require a full pass over the dataset. The covariance also adds difficulty to normalization, since it might not be invertible. Thus, BN makes two assumptions: 1. each dimension of the data is independent. 2. the batch data is a good approximation of the full data in terms of mean and var.

Suppose we want to predict a from the data [a, a+1, a+2], given the sequence.

This makes a bad case for applying batch normalization, because the mean and var of the data are not the same from batch to batch and the dimensions are not independent.

## a test experiment: 

To apply batch normalization to a convolutional neural network: suppose we have a p*q feature map after convolution, then do we have p*q channels, and are they normalized separately?


There is a codebase that does a really similar thing: https://github.com/udacity/deep-learning/blob/master/batch-norm/Batch_Normalization_Lesson.ipynb

## reference

[Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/pdf/1502.03167)



In [None]:
# -*- coding: utf-8 -*-
# @File  : mnist.py
# @Author: lizhen
# @Date  : 2020/2/4
# @Desc  : 工具类，用于解析mnist数据集

import urllib.request # python3
import os.path
import gzip
import pickle
import os
import numpy as np

# http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
# http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
# http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
# http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz


# Base URL under which the four gzip-compressed MNIST archives are fetched.
url_base = "http://yann.lecun.com/exdb/mnist/"

# Maps each dataset part to the archive file name appended to ``url_base``.
key_file = dict(
    train_img='train-images-idx3-ubyte.gz',
    train_label='train-labels-idx1-ubyte.gz',
    test_img='t10k-images-idx3-ubyte.gz',
    test_label='t10k-labels-idx1-ubyte.gz',
)

# Downloaded archives and the pickled cache are stored next to this module.
dataset_dir = os.path.dirname(os.path.abspath(__file__))
save_file = dataset_dir + "/mnist.pkl"

# Dataset dimensions; train_num/test_num are not referenced in the visible code.
train_num = 60000
test_num = 10000
img_dim = (1, 28, 28)  # channels, height, width
img_size = 28 * 28     # length of one flattened image


def _download(file_name):
    """
    Download one MNIST archive into ``dataset_dir`` unless it already exists.

    The data is first written to a temporary ``.part`` file and only moved to
    its final name once the transfer has finished, so an interrupted download
    never leaves a truncated archive that a later run would mistake for a
    complete one.

    :param file_name: name of the archive, appended to ``url_base``
    :return: None
    """
    file_path = os.path.join(dataset_dir, file_name)

    if os.path.exists(file_path):
        return

    print("downloading"+file_name+ "...")
    tmp_path = file_path + ".part"
    try:
        urllib.request.urlretrieve(url_base + file_name, tmp_path)
        # Atomic rename: the final path only ever holds a complete file.
        os.replace(tmp_path, file_path)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("Done.")

def download_mnist():
    """
    Fetch every archive listed in ``key_file`` by delegating to ``_download``.

    :return: None
    """
    for part in key_file:
        _download(key_file[part])

def _load_label(file_name):
    """
    Parse a gzip-compressed label file into a numpy array.

    :param file_name: archive name, looked up inside ``dataset_dir``
    :return: 1-D ``uint8`` array built from the decompressed bytes after
             skipping the first 8 bytes (presumably the file header)
    """
    archive = "/".join((dataset_dir, file_name))

    print("converting {} to numpy Array.".format(file_name))
    with gzip.open(archive) as stream:
        raw = stream.read()
    labels = np.frombuffer(raw, dtype=np.uint8, offset=8)
    print("Done")

    return labels

def _load_img(file_name):
    """
    Parse a gzip-compressed image file into a 2-D numpy array.

    :param file_name: archive name, looked up inside ``dataset_dir``
    :return: ``uint8`` array of shape (N, img_size), built from the
             decompressed bytes after skipping the first 16 bytes
             (presumably the file header)
    """
    archive = "/".join((dataset_dir, file_name))

    print("converting {}to numpy Array".format(file_name))
    with gzip.open(archive) as stream:
        raw = stream.read()
    pixels = np.frombuffer(raw, dtype=np.uint8, offset=16)
    # One row per image: N x (W*H*C) = N x 28*28*1
    images = pixels.reshape(-1, img_size)
    print("Done")

    return images

def _convert_numpy():
    """
    Parse all four image/label archives and collect them as numpy arrays.

    :return: dict with keys 'train_img', 'train_label', 'test_img',
             'test_label' mapping to the parsed arrays
    """
    loaders = (
        ('train_img', _load_img),
        ('train_label', _load_label),
        ('test_img', _load_img),
        ('test_label', _load_label),
    )
    return {part: load(key_file[part]) for part, load in loaders}

def init_mnist():
    """
    Build the local MNIST cache:

    1. download the archives,
    2. read them as binary data and convert them to numpy ndarrays,
    3. serialize the resulting dict to ``save_file`` with pickle.

    :return: None
    """
    print("download mnist dataset...")
    download_mnist()

    print("convert to numpy array...")
    arrays = _convert_numpy()

    print("creating pickle file ...")
    with open(save_file, 'wb') as sink:
        # HIGHEST_PROTOCOL is what a negative protocol number selects.
        pickle.dump(arrays, sink, protocol=pickle.HIGHEST_PROTOCOL)
    print("Done!")

def _change_one_hot_label(Y, num_classes=10):
    """
    Convert integer class labels to one-hot rows.

    :param Y: array-like of integer labels; it is flattened before use
    :param num_classes: width of each one-hot row (defaults to 10)
    :return: float array of shape (number of labels, num_classes) with a
             single 1 per row at the column given by the label
    """
    labels = np.asarray(Y).ravel()
    T = np.zeros((labels.size, num_classes))
    if labels.size:
        # Set T[row, label] = 1 for every row in a single indexing operation.
        T[np.arange(labels.size), labels] = 1
    return T

def load_mnist(normalize=True, flatten=True, one_hot_label=False):
    """
    Load MNIST from the pickle cache, creating the cache first if needed.

    :param normalize: convert images to float32 and scale them to 0.0~1.0
    :param flatten: keep each image as a 1-D row; when False the images are
                    reshaped to (N, 1, 28, 28)
    :param one_hot_label: convert labels to one-hot rows
    :return: (train images, train labels), (test images, test labels)
    """
    if not os.path.exists(save_file):
        init_mnist()

    # The pickle is the cache written by init_mnist().
    with open(save_file, 'rb') as source:
        dataset = pickle.load(source)

    image_keys = ('train_img', 'test_img')
    label_keys = ('train_label', 'test_label')

    if normalize:
        for key in image_keys:
            scaled = dataset[key].astype(np.float32)
            scaled /= 255.0
            dataset[key] = scaled

    if one_hot_label:
        for key in label_keys:
            dataset[key] = _change_one_hot_label(dataset[key])

    if not flatten:
        for key in image_keys:
            dataset[key] = dataset[key].reshape(-1, 1, 28, 28)  # NCHW

    train = (dataset['train_img'], dataset['train_label'])
    test = (dataset['test_img'], dataset['test_label'])
    return train, test

# When run as a script, download MNIST and build the pickle cache.
if __name__ == '__main__':
    init_mnist()

In [None]:
# -*- coding: utf-8 -*-
# @File  : optimizer.py
# @Author: lizhen
# @Date  : 2020/2/7
# @Desc  : 对应opt
import numpy as np
class BaseOpts:
    """Common interface for the optimizers below; subclasses override ``update``."""

    def update(self, params, grads):
        """
        Apply one optimisation step. The base implementation does nothing.

        :param params: dict of parameters
        :param grads: dict of gradients with the same keys
        :return: None
        """
        return None

class SGD(BaseOpts):
    """
    Stochastic Gradient Descent: every parameter moves against its gradient,
    scaled by the learning rate.
    """

    def __init__(self, lr=0.01):
        """
        :param lr: learning rate
        """
        self.lr = lr

    def update(self, params, grads):
        """
        Update ``params`` in place: ``params[k] -= lr * grads[k]``.

        :param params: dict of parameters
        :param grads: dict of gradients with the same keys
        """
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step


class Momentum(BaseOpts):
    """
    Momentum SGD: keeps a velocity per parameter and adds it to the parameter.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        """
        :param lr: learning rate
        :param momentum: factor applied to the previous velocity
        """
        self.lr = lr
        self.momentum = momentum
        self.v = None  # velocities, created on the first update

    def update(self, params, grads):
        """
        Update ``params`` in place using ``v = momentum * v - lr * grad``.

        :param params: dict of parameters
        :param grads: dict of gradients with the same keys
        """
        if self.v is None:
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity


class Nesterov(BaseOpts):
    """
    Nesterov's Accelerated Gradient: http://arxiv.org/abs/1212.0901

    Uses the reformulation in which the gradient is evaluated at the current
    parameters:

        v_t     = mu * v_{t-1} - lr * g
        theta_t = theta_{t-1} + mu**2 * v_{t-1} - (1 + mu) * lr * g
                = theta_{t-1} + mu * v_t - lr * g
    """

    def __init__(self, lr=0.01, momentum=0.9):
        """
        :param lr: learning rate
        :param momentum: momentum coefficient (mu)
        """
        self.lr = lr
        self.momentum = momentum
        self.v = None  # velocities, created on the first update

    def update(self, params, grads):
        """
        Update ``params`` in place with one Nesterov step.

        :param params: dict of parameters
        :param grads: dict of gradients with the same keys
        """
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)

        for key in params.keys():
            # v_t = mu * v_{t-1} - lr * g
            self.v[key] *= self.momentum
            self.v[key] -= self.lr * grads[key]
            # The parameter step must be built from mu**2 * v_{t-1}. Since v
            # has already been advanced, use mu * v_t - lr * g, which expands
            # to mu**2 * v_{t-1} - (1 + mu) * lr * g.
            params[key] += self.momentum * self.v[key]
            params[key] -= self.lr * grads[key]


class AdaGrad(BaseOpts):
    """
    AdaGrad: scales each step by the inverse square root of the accumulated
    squared gradients.
    """

    def __init__(self, lr=0.01):
        """
        :param lr: learning rate
        """
        self.lr = lr
        self.h = None  # running sums of squared gradients, created lazily

    def update(self, params, grads):
        """
        Update ``params`` in place.

        :param params: dict of parameters
        :param grads: dict of gradients with the same keys
        """
        if self.h is None:
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.h[name] += grad * grad  # sum of squares
            # 1e-7 keeps the division finite while h is still zero
            params[name] -= self.lr * grad / (np.sqrt(self.h[name]) + 1e-7)


class RMSprop(BaseOpts):
    """
    RMSprop: like AdaGrad, but the squared gradients are tracked with an
    exponentially decaying average.
    """

    def __init__(self, lr=0.01, decay_rate = 0.99):
        """
        :param lr: learning rate
        :param decay_rate: weight kept from the previous average
        """
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None  # decaying averages of squared gradients, created lazily

    def update(self, params, grads):
        """
        Update ``params`` in place.

        :param params: dict of parameters
        :param grads: dict of gradients with the same keys
        """
        if self.h is None:
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        fresh_weight = 1 - self.decay_rate
        for name in params:
            grad = grads[name]
            self.h[name] *= self.decay_rate
            self.h[name] += fresh_weight * grad * grad
            # 1e-7 keeps the division finite while h is still zero
            params[name] -= self.lr * grad / (np.sqrt(self.h[name]) + 1e-7)


class Adam(BaseOpts):
    """
    Adam (http://arxiv.org/abs/1412.6980v8)

    Tracks decaying averages of the gradient (m) and of its square (v) and
    folds the bias correction into the per-step learning rate.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        """
        :param lr: base learning rate
        :param beta1: decay factor of the first-moment average
        :param beta2: decay factor of the second-moment average
        """
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0   # number of update() calls so far
        self.m = None   # first-moment averages, created lazily
        self.v = None   # second-moment averages, created lazily

    def update(self, params, grads):
        """
        Update ``params`` in place with one Adam step.

        :param params: dict of parameters
        :param grads: dict of gradients with the same keys
        """
        if self.m is None:
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        self.iter += 1
        second_correction = np.sqrt(1.0 - self.beta2**self.iter)
        first_correction = 1.0 - self.beta1**self.iter
        lr_t = self.lr * second_correction / first_correction

        for name in params:
            grad = grads[name]
            # Incremental form of m = beta1*m + (1-beta1)*g and
            # v = beta2*v + (1-beta2)*g**2
            self.m[name] += (1 - self.beta1) * (grad - self.m[name])
            self.v[name] += (1 - self.beta2) * (grad**2 - self.v[name])

            params[name] -= lr_t * self.m[name] / (np.sqrt(self.v[name]) + 1e-7)

In [None]:
# -*- coding: utf-8 -*-
# @File  : gradient.py
# @Author: lizhen
# @Date  : 2020/1/27
# @Desc  : 梯度
import numpy as np

def numerical_gradient(f, x):
    '''
    Numerical differentiation: central-difference gradient of f at x.

    Each element of x is temporarily shifted by +h and -h in place, f(x) is
    evaluated for both, and the element is then restored.

    :param f: callable taking the array x and returning a scalar
    :param x: numpy array; modified during the call and restored afterwards
    :return: array shaped like x holding (f(x+h) - f(x-h)) / (2*h) per element
    '''
    step = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    def evaluate_with(position, value):
        # Overwrite one element of x and evaluate f on the modified array.
        x[position] = value
        return f(x)

    cursor = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    for _ in cursor:
        position = cursor.multi_index
        original = x[position]

        upper = evaluate_with(position, original + step)  # f(x+h)
        lower = evaluate_with(position, original - step)  # f(x-h)
        grad[position] = (upper - lower) / (2 * step)

        x[position] = original  # restore the original value

    return grad

In [None]:
# -*- coding: utf-8 -*-
# @File  : Nets.py
# @Author: lizhen
# @Date  : 2020/2/15
# @Desc  : 网络层的基类
class Net:
    """Base class for networks; every method is a placeholder returning None."""

    def loss(self, x, t):
        '''
        Intended to compute the discrepancy between x and t.

        :param x:
        :param t:
        :return: None in this base class
        '''
        return None

    def gradient(self, x, t):
        '''
        Placeholder for the gradient computation.

        :param x:
        :param t:
        :return: None in this base class
        '''
        return None

    def numerical_gradient(self, x, t):
        """
        Intended to call loss() to obtain the loss value and to compute the
        gradient from it by numerical differentiation.

        :param x:
        :param t:
        :return: None in this base class
        """
        return None


In [None]:
# -*- coding: utf-8 -*-
# @File  : multi_layer_net_extend.py
# @Author: lizhen
# @Date  : 2020/2/4
# @Desc  : 多层神经网络，有全连接层


from collections import OrderedDict
from src.common.layers import *
from src.nets import BaseNets

class MultiLayerNetExtend(BaseNets):
    """
    Fully connected multi-layer network with optional batch normalization,
    dropout and L2 weight decay.

    Every hidden stage is Affine -> [BatchNorm] -> activation -> [Dropout];
    a final Affine layer feeds ``SoftmaxWithLoss``. The layer classes are not
    defined in this file (presumably provided by ``src.common.layers``).
    """

    def __init__(self, input_size, hidden_size_list, output_size,
                 activation='relu', weight_init_std='relu', weight_decay_lambda=0,
                 use_dropout = False, dropout_ration = 0.5, use_batchnorm=False):
        """
        :param input_size: number of input features
        :param hidden_size_list: list with the width of each hidden layer
        :param output_size: number of outputs
        :param activation: 'relu' or 'sigmoid'
        :param weight_init_std: 'relu'/'he', 'sigmoid'/'xavier', or a value
                                used directly as the weight scale
        :param weight_decay_lambda: strength of the L2 penalty
        :param use_dropout: add a Dropout layer after every activation
        :param dropout_ration: argument passed to each Dropout layer
        :param use_batchnorm: add a BatchNormalization layer before every activation
        """
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size_list = hidden_size_list
        self.hidden_layer_num = len(hidden_size_list)
        self.use_dropout = use_dropout
        self.weight_decay_lambda = weight_decay_lambda
        self.use_batchnorm = use_batchnorm
        self.params = {}

        # Initialise the weights and biases
        self.__init_weight(weight_init_std)

        # Build the layers; they share the arrays stored in self.params
        activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        for idx, width in enumerate(hidden_size_list, start=1):
            tag = str(idx)
            self.layers['Affine' + tag] = Affine(self.params['W' + tag],
                                                 self.params['b' + tag])

            if self.use_batchnorm:
                gamma = np.ones(width)
                beta = np.zeros(width)
                self.params['gamma' + tag] = gamma
                self.params['beta' + tag] = beta
                self.layers['BatchNorm' + tag] = BatchNormalization(gamma, beta)

            self.layers['Activation_function' + tag] = activation_layer[activation]()

            if self.use_dropout:
                self.layers['Dropout' + tag] = Dropout(dropout_ration)

        tag = str(self.hidden_layer_num + 1)
        self.layers['Affine' + tag] = Affine(self.params['W' + tag], self.params['b' + tag])

        self.last_layer = SoftmaxWithLoss()

    def __init_weight(self, weight_init_std):
        '''
        Fill ``self.params`` with 'W<i>' and 'b<i>' for every affine layer.

        :param weight_init_std: 'relu'/'he' -> sqrt(2/fan_in),
                                'sigmoid'/'xavier' -> sqrt(1/fan_in),
                                anything else is used as the scale itself
        :return: None
        '''
        sizes = [self.input_size] + self.hidden_size_list + [self.output_size]
        mode = str(weight_init_std).lower()
        for idx, (fan_in, fan_out) in enumerate(zip(sizes, sizes[1:]), start=1):
            if mode in ('relu', 'he'):
                scale = np.sqrt(2.0 / fan_in)  # recommended initial value when using ReLU
            elif mode in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / fan_in)  # recommended initial value when using sigmoid
            else:
                scale = weight_init_std
            self.params['W' + str(idx)] = scale * np.random.randn(fan_in, fan_out)
            self.params['b' + str(idx)] = np.zeros(fan_out)

    def predict(self, x, train_flg=False):
        """
        Run x through every layer in order.

        :param x: network input
        :param train_flg: forwarded to Dropout and BatchNorm layers only
        :return: output of the last Affine layer
        """
        for name, layer in self.layers.items():
            takes_flag = "Dropout" in name or "BatchNorm" in name
            x = layer.forward(x, train_flg) if takes_flag else layer.forward(x)

        return x

    def loss(self, x, t, train_flg=False):
        """
        Loss of the last layer plus the L2 weight-decay penalty.

        :param x: network input
        :param t: targets
        :param train_flg: forwarded to ``predict``
        :return: last_layer.forward(y, t) + 0.5 * lambda * sum(W**2) over all W
        """
        y = self.predict(x, train_flg)

        weight_decay = sum(
            0.5 * self.weight_decay_lambda * np.sum(self.params['W' + str(idx)]**2)
            for idx in range(1, self.hidden_layer_num + 2)
        )

        return self.last_layer.forward(y, t) + weight_decay

    def accuracy(self, X, T):
        """
        Fraction of rows whose arg-max prediction equals the label.

        :param X: inputs
        :param T: labels, either class indices (1-D) or one row per sample
        :return: accuracy as a float
        """
        predicted = np.argmax(self.predict(X, train_flg=False), axis=1)
        if T.ndim != 1:
            T = np.argmax(T, axis=1)

        return np.sum(predicted == T) / float(X.shape[0])

    def numerical_gradient(self, X, T):
        """
        Gradients of every parameter by numerical differentiation.

        :param X: inputs
        :param T: targets
        :return: dict keyed like ``self.params``
        """
        def loss_W(_):
            # The argument is ignored: the loss reads the parameters in place.
            return self.loss(X, T, train_flg=True)

        grads = {}
        last = self.hidden_layer_num + 1
        for idx in range(1, last + 1):
            prefixes = ['W', 'b']
            if self.use_batchnorm and idx != last:
                prefixes += ['gamma', 'beta']
            for prefix in prefixes:
                key = prefix + str(idx)
                grads[key] = numerical_gradient(loss_W, self.params[key])

        return grads

    def gradient(self, x, t):
        """
        Gradients of every parameter by backpropagation.

        :param x: inputs
        :param t: targets
        :return: dict keyed like ``self.params``; weight gradients include the
                 weight-decay term
        """
        # forward
        self.loss(x, t, train_flg=True)

        # backward
        dout = self.last_layer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        grads = {}
        last = self.hidden_layer_num + 1
        for idx in range(1, last + 1):
            tag = str(idx)
            affine = self.layers['Affine' + tag]
            grads['W' + tag] = affine.dW + self.weight_decay_lambda * self.params['W' + tag]
            grads['b' + tag] = affine.db

            if self.use_batchnorm and idx != last:
                batch_norm = self.layers['BatchNorm' + tag]
                grads['gamma' + tag] = batch_norm.dgamma
                grads['beta' + tag] = batch_norm.dbeta

        return grads

In [None]:
# coding: utf-8

import numpy as np
import matplotlib.pyplot as plt

# Load MNIST with pixel values scaled to 0.0~1.0
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

# Use only the first 1000 samples as training data
x_train, t_train = x_train[:1000], t_train[:1000]

# Hyper-parameters shared by every training run below
max_epochs = 20
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.01


def __train(weight_init_std):
    '''
    Train a network with batch normalization and one without, side by side
    on the same random mini-batches.

    :param weight_init_std: weight initialisation passed to both networks
    :return: (train_acc_list, bn_train_acc_list), the training accuracy of
             the plain and the batch-norm network, one entry per epoch
             (``max_epochs`` entries each)
    '''
    # With batch-norm layers
    bn_network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10,
                                    weight_init_std=weight_init_std, use_batchnorm=True)
    # Without batch-norm layers
    network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10,
                                weight_init_std=weight_init_std)
    # One optimizer per network: both networks use the same parameter names,
    # so a shared stateful optimizer would mix their per-parameter state.
    trainees = (
        (bn_network, SGD(lr=learning_rate)),
        (network, SGD(lr=learning_rate)),
    )

    train_acc_list = []
    bn_train_acc_list = []

    # Integer division: with a float the modulo test below would only ever
    # be true at i == 0 when train_size is not a multiple of batch_size.
    iter_per_epoch = max(train_size // batch_size, 1)
    epoch_cnt = 0

    for i in range(1000000000):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        for _network, _optimizer in trainees:
            grads = _network.gradient(x_batch, t_batch)
            _optimizer.update(_network.params, grads)

        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            bn_train_acc = bn_network.accuracy(x_train, t_train)
            train_acc_list.append(train_acc)
            bn_train_acc_list.append(bn_train_acc)

            print("epoch:" + str(epoch_cnt) + " | " + str(train_acc) + " - " + str(bn_train_acc))

            epoch_cnt += 1
            if epoch_cnt >= max_epochs:
                break

    return train_acc_list, bn_train_acc_list


# 3.绘图
# 3. Plotting: one figure (and one image file) per initial weight scale
weight_scale_list = np.logspace(0, -4, num=16)
x = np.arange(max_epochs)

for i, w in enumerate(weight_scale_list):
    print( "============== " + str(i+1) + "/" + str(len(weight_scale_list)) + " ==============")
    train_acc_list, bn_train_acc_list = __train(w)  # train both networks

    # Start from an empty figure so curves from earlier weight scales do not
    # pile up in the same axes.
    plt.figure()
    plt.title("W:%3f"%(w))
    plt.plot(x, bn_train_acc_list, label=str(i)+'epoch Batch Normalization', markevery=2)
    plt.plot(x, train_acc_list, linestyle="--", label=str(i)+'epoch Normal(without BatchNorm)', markevery=2)

    plt.ylabel("accuracy")
    plt.xlabel("epochs")
    plt.legend(loc='lower right')
    # Save inside the loop: the file name depends on i, so every run gets
    # its own image instead of a single file after the last run.
    plt.savefig("test"+str(i)+".jpg")
    plt.close()