# MLP from scratch

#### 在这一系列 notebook 中，我们会使用 Numpy 从零搭建最简单、同时也最直观的神经网络，并实现各种神经网络技巧，内容包括：

* relu
* softmax
* dropout
* maxNorm
* batchNorm

* 首先导入库 Numpy以及 plt

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

* 定义抽象神经网络层，该层什么东西也不做，接下来relu，dense层继承该类

In [2]:
class Layer(object):
    """Identity building block that the other layers in this file derive from.

    ``forward`` hands its input back untouched and ``backward`` hands the
    incoming gradient back untouched, so a bare ``Layer`` is a no-op in both
    directions.
    """

    def __init__(self):
        # The base layer has no parameters and no state.
        pass

    def forward(self, input):
        """Return ``input`` unchanged."""
        return input

    def backward(self, input, grad_output):
        """Return the gradient of the loss with respect to ``input``.

        Args:
            input: the value that was given to ``forward`` (unused here).
            grad_output: gradient of the loss with respect to this layer's
                output.

        Returns:
            ``grad_output`` itself: the forward pass is the identity, so the
            chain rule leaves the gradient unchanged.
        """
        # Returning None here would break the gradient chain for any layer
        # that relies on the base implementation.
        return grad_output

* 定义一个激活层

In [4]:
# ReLU activation layer
class ReLU(Layer):
    """Element-wise rectified linear unit: ``max(0, x)``."""

    def __init__(self):
        # Stateless layer; nothing to initialise.
        pass

    def forward(self, input):
        """Return the element-wise maximum of 0 and ``input``."""
        rectified = np.maximum(0, input)
        return rectified

    def backward(self, input, grad_output):
        """Return ``grad_output`` masked to the positions where ``input > 0``.

        The boolean mask acts as the local derivative: 1 where the input was
        strictly positive, 0 everywhere else.
        """
        return grad_output * (input > 0)
    
# Sigmoid activation layer
class Sigmoid(Layer):
    """Element-wise logistic sigmoid activation, ``1 / (1 + exp(-x))``."""

    def __init__(self):
        # Stateless layer; nothing to initialise.
        pass

    def _sigmoid(self, x):
        """Return the logistic sigmoid of ``x`` without overflowing.

        Uses the identity ``1 / (1 + exp(-x)) == exp(-log(1 + exp(-x)))``.
        ``np.logaddexp(0, -x)`` evaluates ``log(exp(0) + exp(-x))`` without
        forming ``exp(-x)`` directly, so very negative ``x`` no longer
        triggers an overflow warning.
        """
        return np.exp(-np.logaddexp(0, -x))

    def forward(self, input):
        """Return the sigmoid of ``input``."""
        return self._sigmoid(input)

    def backward(self, input, grad_output):
        """Return ``grad_output`` scaled by the sigmoid derivative ``s * (1 - s)``.

        Args:
            input: the value that was given to ``forward``.
            grad_output: gradient of the loss with respect to this layer's
                output.
        """
        # Evaluate the activation once and reuse it for both factors.
        activated = self._sigmoid(input)
        sigmoid_grad = activated * (1 - activated)
        return grad_output * sigmoid_grad

# Tanh activation layer
class Tanh(Layer):
    """Element-wise hyperbolic tangent activation."""

    def __init__(self):
        # Stateless layer; nothing to initialise.
        pass

    def _tanh(self, x):
        """Thin wrapper around ``np.tanh``."""
        return np.tanh(x)

    def forward(self, input):
        """Return ``tanh(input)``."""
        return self._tanh(input)

    def backward(self, input, grad_output):
        """Return ``grad_output`` scaled by the tanh derivative ``1 - tanh(input)**2``."""
        activated = self._tanh(input)
        local_slope = 1 - activated ** 2
        return grad_output * local_slope

* 定义Dense

> 根据论文
[xavier](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) 而实现的一种初始化方法，该论文认为初始化数值与输入节点个数和输出节点个数有关

$$ \sqrt{\frac{2}{in+out}} $$

> 每一个前向传播为 

$$  a = X \cdot W + b $$

> 而反向传播则是将后一层的残差传播过来，进行修正

$$ \frac{dL}{dw} = \frac{dL}{d Dense} \frac{d Dense}{dw} $$ 这里的 $\frac{d Dense}{dw}$ 为 $x^T$，$\frac{dL}{d Dense}$ 就是后面一层传过来的残差，同时在该层需要将残差往前传播。

* 使用随机梯度下降算法来进行学习

In [5]:
class Dense(Layer):
    """Fully connected layer: ``output = input . weights + biases``.

    The layer owns its parameters and applies a plain gradient-descent step
    to them inside ``backward``.
    """

    def __init__(self, input_units, output_units, learning_rate=0.1, init='xavier'):
        """Create the ``(input_units, output_units)`` weights and zero biases.

        Args:
            input_units: number of input features.
            output_units: number of output features.
            learning_rate: step size used for the update in ``backward``.
            init: ``'xavier'`` scales standard-normal weights by
                ``sqrt(2 / (input_units + output_units))``; any other value
                scales them by ``0.01``.
        """
        self.learning_rate = learning_rate
        scale = np.sqrt(2./(input_units+output_units)) if init == 'xavier' else 0.01
        self.weights = np.random.randn(input_units, output_units) * scale
        self.biases = np.zeros(output_units)

    def forward(self, input):
        """Return the affine transform ``np.dot(input, weights) + biases``."""
        projected = np.dot(input, self.weights)
        return projected + self.biases

    def backward(self, input, grad_output):
        """Update the parameters and return the gradient w.r.t. ``input``.

        Args:
            input: the value that was given to ``forward``.
            grad_output: gradient of the loss with respect to this layer's
                output.
        """
        # Parameter gradients, averaged over the first axis of the input
        # (presumably the batch dimension).
        rows = input.shape[0]
        weight_step = np.dot(input.T, grad_output) / rows
        bias_step = grad_output.mean(axis=0)

        # The gradient handed to the previous layer must use the weights as
        # they were during the forward pass, so compute it before updating.
        upstream_grad = np.dot(grad_output, self.weights.T)

        # Plain gradient-descent step.
        self.weights = self.weights - self.learning_rate * weight_step
        self.biases = self.biases - self.learning_rate * bias_step
        return upstream_grad

* 计算交叉熵
$$\frac{\partial CE(y,y^{pred})}{\partial a_i} = \left\{\begin{array}{ll}
    y^{pred}_i - 1, & i=k\\
    y^{pred}_i, & i \neq k
\end{array}\right.$$

其中 $a_i$ 为第 $i$ 类的 logit，$k$ 为正确类别，$y^{pred} = softmax(a)$。
    
$$ loss = - \log \frac{e^{a_{correct}}}{\sum_i e^{a_i}} $$

$$ loss = - a_{correct} + \log \sum_i e^{a_i} $$

In [6]:
def softmax_crossentropy_with_logits(logits, reference_answers):
    """Return the per-row softmax cross-entropy computed directly from logits.

    For each row the loss is ``-a_correct + log(sum_i exp(a_i))``.

    Args:
        logits: 2-D array of raw scores, one row per sample.
        reference_answers: integer index of the correct column for each row.

    Returns:
        1-D array with one loss value per row of ``logits``.
    """
    logits = np.asarray(logits)

    # Subtracting the row maximum leaves the loss unchanged (the shift cancels
    # between the two terms) but keeps exp() from overflowing on large logits.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    shifted_for_answers = shifted[np.arange(len(shifted)), reference_answers]

    xentropy = -shifted_for_answers + np.log(np.sum(np.exp(shifted), axis=-1))
    return xentropy
def grad_softmax_crossentropy_with_logits(logits, reference_answers):
    """Return the gradient of the softmax cross-entropy with respect to ``logits``.

    The gradient for each row is ``softmax(logits) - one_hot(answer)``.

    Args:
        logits: 2-D array of raw scores, one row per sample.
        reference_answers: integer index of the correct column for each row.

    Returns:
        Array with the same shape as ``logits``.
    """
    logits = np.asarray(logits)

    # Softmax is invariant to a per-row shift; subtracting the row maximum
    # prevents exp() from overflowing and producing inf/inf = nan.
    exp_shifted = np.exp(logits - logits.max(axis=-1, keepdims=True))
    softmax = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

    # One-hot encoding of the correct class for every row.
    ones_for_answers = np.zeros_like(softmax)
    ones_for_answers[np.arange(len(logits)), reference_answers] = 1

    return softmax - ones_for_answers

* 神经网络的前向传播

In [17]:
def forward(network, X):
    """Feed ``X`` through every layer of ``network`` in order.

    Args:
        network: sequence of objects exposing ``forward(input)``.
        X: value passed to the first layer.

    Returns:
        List with the output of each layer, in network order; the last entry
        is the output of the final layer.
    """
    outputs = []
    current = X
    for layer in network:
        # Each layer consumes what the previous one produced.
        current = layer.forward(current)
        outputs.append(current)

    assert len(outputs) == len(network)
    return outputs

* 神经网络预测

In [18]:
def predict(network, X):
    """Return the argmax along the last axis of the final layer's output for ``X``."""
    activations = forward(network, X)
    final_scores = activations[-1]
    return final_scores.argmax(axis=-1)

In [19]:
## Training
def train(network, X, y):
    """Run one forward/backward pass on ``(X, y)`` and return the mean loss.

    Each layer's ``backward`` is called in reverse network order; it receives
    the input that layer saw in the forward pass and the gradient coming from
    the layer after it, and returns the gradient for the layer before it.
    """
    activations = forward(network, X)
    logits = activations[-1]

    loss = softmax_crossentropy_with_logits(logits, y)
    grad = grad_softmax_crossentropy_with_logits(logits, y)

    # Layer 0 consumed X; layer i consumed the output of layer i - 1.
    # zip() stops at len(network), dropping the final activation (the logits).
    inputs_per_layer = [X] + activations
    layers_with_inputs = list(zip(network, inputs_per_layer))

    for layer, layer_input in reversed(layers_with_inputs):
        grad = layer.backward(layer_input, grad)

    return np.mean(loss)

## 实现dropout

* 根据论文[dropout](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf) 使用dropout能够防止过拟合

In [20]:
class Dropout(Layer):
    """Inverted dropout layer.

    In any mode other than ``'test'`` each element is zeroed with probability
    ``p`` and the survivors are scaled by ``1 / (1 - p)``. In ``'test'`` mode
    the layer is the identity in both directions.
    """

    def __init__(self, p):
        """Store the drop probability ``p``."""
        self.p = p
        self.mode = 'test'
        # No mask exists until a non-test forward pass creates one.
        self.mask = None

    def forward(self, input, mode='test'):
        """Apply dropout to ``input``.

        Args:
            input: array to process.
            mode: ``'test'`` returns ``input`` unchanged; any other value
                samples a fresh mask and applies it.
        """
        self.mode = mode
        if self.mode == 'test':
            # Drop any mask left over from an earlier training pass so that
            # backward cannot reuse it by accident.
            self.mask = None
            return input
        self.mask = (np.random.rand(*input.shape) >= self.p) / (1 - self.p)
        return input * self.mask

    def backward(self, input, grad_output):
        """Return ``grad_output`` masked exactly like the last forward pass.

        When the last forward pass ran in ``'test'`` mode (or none has run
        yet) the layer acted as the identity, so the gradient is returned
        unchanged instead of failing on a missing mask.
        """
        if self.mask is None:
            return grad_output
        return grad_output * self.mask

## 实现各种优化算法，使用低耦合 高内聚

In [25]:
def sgd(w, dw, b, db, config=None):
    """Apply one vanilla gradient-descent step to ``w`` and ``b`` in place.

    Args:
        w: weights, updated in place.
        dw: gradient with respect to ``w``.
        b: biases, updated in place.
        db: gradient with respect to ``b``.
        config: optional dict of hyper-parameters; ``'learning_rate'`` is
            filled in with ``1e-2`` when absent.

    Returns:
        Tuple ``(w, b, config)``.
    """
    config = {} if config is None else config
    # setdefault stores the default in the dict and returns the effective value.
    step_size = config.setdefault('learning_rate', 1e-2)

    w -= step_size * dw
    b -= step_size * db
    return w, b, config

* 同时，Dense内权值更新就要改变一下

In [26]:
class Dense(Layer):
    """Fully connected layer whose parameter update is delegated to an optimiser.

    The optimiser is a callable ``optim(w, dw, b, db, config)`` returning
    ``(w, b, config)``, like ``sgd`` in this file.
    """

    def __init__(self, input_units, output_units, learning_rate=0.1, init='xavier', optim=sgd):
        """Create the ``(input_units, output_units)`` weights and zero biases.

        Args:
            input_units: number of input features.
            output_units: number of output features.
            learning_rate: step size handed to the optimiser through
                ``config['learning_rate']``.
            init: ``'xavier'`` scales standard-normal weights by
                ``sqrt(2 / (input_units + output_units))``; any other value
                scales them by ``0.01``.
            optim: update rule called from ``backward``.
        """
        self.learning_rate = learning_rate
        if init == 'xavier':
            self.weights = np.random.randn(input_units, output_units)*np.sqrt(2./(input_units+output_units))
        else:
            self.weights = np.random.randn(input_units, output_units)*0.01
        self.biases = np.zeros(output_units)

        self.optim = optim
        # Seed the optimiser state with the requested learning rate; with a
        # None config the optimiser would fall back to its own default and
        # silently ignore ``learning_rate``.
        self.config = {'learning_rate': learning_rate}

    def forward(self, input, mode='test'):
        """Return ``np.dot(input, weights) + biases``.

        ``mode`` is accepted for signature parity with ``Dropout.forward`` and
        is not used; it defaults to ``'test'`` so callers that pass only the
        input keep working.
        """
        return np.dot(input, self.weights)+self.biases

    def backward(self, input, grad_output):
        """Update the parameters via ``self.optim`` and return the input gradient.

        Args:
            input: the value that was given to ``forward``.
            grad_output: gradient of the loss with respect to this layer's
                output.
        """
        # Computed first: the optimiser may modify the weights in place.
        grad_input = np.dot(grad_output, self.weights.T)
        grad_weights = np.dot(input.T, grad_output)/input.shape[0]
        grad_biases = grad_output.mean(axis=0)

        # Use the optimiser stored on the instance and the local bias gradient
        # (there is no global ``optim`` and no ``self.grad_biases``).
        self.weights, self.biases, self.config = self.optim(
            self.weights, grad_weights, self.biases, grad_biases, self.config)
        return grad_input

## 实现maxnorm

In [28]:
class Dense(Layer):
    """Fully connected layer with a pluggable optimiser and optional max-norm.

    The optimiser is a callable ``optim(w, dw, b, db, config)`` returning
    ``(w, b, config)``, like ``sgd`` in this file. When ``maxnorm`` is true,
    every column of the weight matrix is rescaled after each update so that
    its L2 norm does not exceed ``maxnorm_limit``.
    """

    def __init__(self, input_units, output_units, learning_rate=0.1, init='xavier',
                 optim=sgd, maxnorm=False, maxnorm_limit=2):
        """Create the ``(input_units, output_units)`` weights and zero biases.

        Args:
            input_units: number of input features.
            output_units: number of output features.
            learning_rate: step size handed to the optimiser through
                ``config['learning_rate']``.
            init: ``'xavier'`` scales standard-normal weights by
                ``sqrt(2 / (input_units + output_units))``; any other value
                scales them by ``0.01``.
            optim: update rule called from ``backward``.
            maxnorm: enable the max-norm constraint on the weights.
            maxnorm_limit: largest allowed L2 norm of a weight column.
        """
        self.learning_rate = learning_rate
        if init == 'xavier':
            self.weights = np.random.randn(input_units, output_units)*np.sqrt(2./(input_units+output_units))
        else:
            self.weights = np.random.randn(input_units, output_units)*0.01
        self.biases = np.zeros(output_units)

        self.optim = optim
        # Seed the optimiser state with the requested learning rate; with a
        # None config the optimiser would fall back to its own default and
        # silently ignore ``learning_rate``.
        self.config = {'learning_rate': learning_rate}
        # Honour the constructor argument (it used to be overwritten by False).
        self.maxnorm = maxnorm
        self.maxnorm_limit = maxnorm_limit

    def forward(self, input, mode='test'):
        """Return ``np.dot(input, weights) + biases``.

        ``mode`` is accepted for signature parity with ``Dropout.forward`` and
        is not used; it defaults to ``'test'`` so callers that pass only the
        input keep working.
        """
        return np.dot(input, self.weights)+self.biases

    def backward(self, input, grad_output):
        """Update the parameters via ``self.optim`` and return the input gradient.

        Args:
            input: the value that was given to ``forward``.
            grad_output: gradient of the loss with respect to this layer's
                output.
        """
        # Computed first: the optimiser may modify the weights in place.
        grad_input = np.dot(grad_output, self.weights.T)
        grad_weights = np.dot(input.T, grad_output)/input.shape[0]
        grad_biases = grad_output.mean(axis=0)

        # Use the optimiser stored on the instance and the local bias gradient
        # (there is no global ``optim`` and no ``self.grad_biases``).
        self.weights, self.biases, self.config = self.optim(
            self.weights, grad_weights, self.biases, grad_biases, self.config)

        if self.maxnorm:
            # L2 norm of every column, kept 2-D so it broadcasts over rows.
            norms = np.sqrt(np.sum(np.square(self.weights), 0, keepdims=True))
            desired = np.clip(norms, 0, self.maxnorm_limit)
            # Columns within the limit are scaled by ~1; the epsilon guards
            # against division by zero for all-zero columns.
            self.weights *= (desired / (1e-7 + norms))

        return grad_input