# 手动实现MLP

Multi-layer perceptron (MLP)

**关键点**

1. 反向传播公式推导
2. 常见激活函数及其梯度
3. 算法整体流程

In [1]:
# Load the MNIST dataset through the Keras dataset helper.
# The result is unpacked further down as ((x, y), (x_test, y_test)).
from tensorflow.keras import datasets
mnist = datasets.mnist.load_data()

In [2]:
# 定义MLP类
import random
import numpy as np
import math
class MLP:
    """Fully connected feed-forward network trained with mini-batch SGD.

    ``sizes`` is a pair ``(shape, activations)``. ``shape`` lists the layer
    widths, input first and output last. ``activations`` holds one
    activation name per weight layer. A name ``act`` is resolved, each time
    it is needed, to the module-level function ``act`` and (for
    back-propagation) to its Jacobian function ``d_<act>``.

    Attributes:
        w: list of weight matrices, ``w[k]`` has shape
            ``(shape[k], shape[k + 1])``.
        b: list of bias vectors, ``b[k]`` has shape ``(shape[k + 1],)``.
        activations: the activation names as passed in.
        layers: ``len(shape)``, i.e. the number of weight layers plus one.
    """

    def __init__(self, sizes):
        """Draw standard-normal weights and zero biases for every layer.

        Raises:
            ValueError: if fewer activations than weight layers are given.
                ``zip`` would otherwise silently skip the trailing layers.
                Surplus activation names are ignored.
        """
        shape, self.activations = sizes
        if len(self.activations) < len(shape) - 1:
            raise ValueError(
                'expected {0} activations for {1} layer sizes, got {2}'.format(
                    len(shape) - 1, len(shape), len(self.activations)))
        self.w = [np.random.randn(c1, c2)
                  for c1, c2 in zip(shape[:-1], shape[1:])]
        self.b = [np.zeros(c) for c in shape[1:]]
        self.layers = len(shape)

    @staticmethod
    def _resolve(name):
        """Return the module-level callable called ``name``.

        A plain namespace lookup replaces ``eval`` so that an activation
        name can only ever select an existing function, never run an
        arbitrary expression. Unknown names still raise ``NameError``.
        """
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                "name {0!r} is not defined".format(name)) from None

    def forward(self, x):
        """Return the network output for the input ``x``."""
        h = x
        for w, b, act in zip(self.w, self.b, self.activations):
            h = self._resolve(act)(h @ w + b)
        return h

    def backprop(self, data):
        """Compute the loss gradient for one ``(x, y)`` sample.

        A forward pass records each layer's pre-activation and output; the
        error signal is then pushed back from the output layer, multiplying
        by the activation Jacobian of each layer on the way.

        Args:
            data: pair ``(x, y)``; ``y`` is passed to ``crossentropy`` and
                ``d_corssentropy`` as the target label.

        Returns:
            ``(grads_w, grads_b, loss)`` where the two lists are laid out
            like ``self.w`` and ``self.b``.
        """
        grads_w = [np.zeros(w.shape) for w in self.w]
        grads_b = [np.zeros(b.shape) for b in self.b]

        x, y = data
        # Forward pass, keeping pre-activations (u) and layer outputs (h).
        pre_acts = []
        outputs = [x]
        h = x
        for w, b, act in zip(self.w, self.b, self.activations):
            u = h @ w + b
            h = self._resolve(act)(u)
            pre_acts.append(u)
            outputs.append(h)

        loss = crossentropy(y, h)

        # Jacobian of every activation, evaluated at its pre-activation.
        jacobians = [self._resolve('d_' + act)(u)
                     for u, act in zip(pre_acts, self.activations)]

        # Output layer: dL/du = dL/dh @ dh/du.
        delta = d_corssentropy(y, h) @ jacobians[-1]
        grads_w[-1] = np.outer(outputs[-2], delta)
        grads_b[-1] = delta

        # Hidden layers, last to first. ``k`` counts from the end, and
        # ``w_next`` is the weight matrix that follows layer ``-k``.
        for k, w_next in zip(range(2, self.layers), reversed(self.w[1:])):
            delta = (delta @ w_next.T) @ jacobians[-k]
            grads_w[-k] = np.outer(outputs[-k - 1], delta)
            grads_b[-k] = delta

        return grads_w, grads_b, loss

    def train(self, train_data, test_data, lr=0.1, epoch=10, batch_size=100):
        """Fit the network with mini-batch SGD, printing progress.

        Args:
            train_data: pair ``(inputs, labels)``.
            test_data: pair ``(inputs, labels)`` used for the accuracy
                that is printed while training.
            lr: learning rate.
            epoch: number of passes over the training data.
            batch_size: samples per parameter update. The final batch may
                be smaller and is averaged over its own length.
        """
        x, y = train_data
        data = list(zip(x, y))
        # Shuffled once up front: every epoch sees the same batches.
        random.shuffle(data)
        batches = [data[k:k + batch_size]
                   for k in range(0, len(data), batch_size)]

        # Report every ``batch_size // 5`` batches; never let the interval
        # reach zero (it did for batch_size < 5, raising ZeroDivisionError).
        log_every = max(1, batch_size // 5)

        for j in range(epoch):
            for i, batch in enumerate(batches):
                acc_w = [np.zeros(w.shape) for w in self.w]
                acc_b = [np.zeros(b.shape) for b in self.b]

                tot_loss = 0
                for sample in batch:
                    grads_w, grads_b, loss = self.backprop(sample)
                    for total, grad in zip(acc_w, grads_w):
                        total += grad
                    for total, grad in zip(acc_b, grads_b):
                        total += grad
                    tot_loss += loss

                if i % log_every == 0:
                    print('Current loss is {0}'.format(tot_loss))
                    x_test, y_test = test_data
                    predict = self.predict(x_test)
                    hits = np.asarray(predict) == np.asarray(y_test)
                    accuracy = np.sum(hits) / len(y_test)
                    print('Step {0} : accuracy is {1} % '.format(j, accuracy * 100))

                # Gradient step on the batch-averaged gradient.
                n = len(batch)
                self.w = [w - lr * (dw / n) for w, dw in zip(self.w, acc_w)]
                self.b = [b - lr * (db / n) for b, db in zip(self.b, acc_b)]

    def predict(self, x):
        """Return, for each sample in ``x``, the index of the largest output."""
        return [np.argmax(self.forward(t)) for t in x]
        
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def d_sigmoid(x):
    """Jacobian of ``sigmoid`` at ``x``.

    The element-wise slope ``s * (1 - s)`` is multiplied into an identity
    matrix of size ``x.shape[-1]``, which for a vector ``x`` puts the slope
    on the diagonal.
    """
    s = sigmoid(x)
    slope = s * (1 - s)
    return np.eye(x.shape[-1]) * slope

def softmax(x):
    """Return the softmax of ``x`` taken over its last axis.

    The maximum is subtracted before exponentiating. This leaves the
    result unchanged mathematically but keeps ``exp`` from overflowing to
    ``inf`` (and the ratio from becoming ``nan``) for large logits.

    Args:
        x: array-like of logits; a vector, or a batch with the classes on
            the last axis.

    Returns:
        Array of the same shape as ``x`` whose last axis sums to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def d_softmax(x):
    """Jacobian of ``softmax`` at ``x``.

    For a vector ``x`` with probabilities ``s`` this is
    ``diag(s) - s s^T``: the probabilities on the diagonal minus the
    outer product of the probabilities with themselves.
    """
    probs = softmax(x)

    # Identity scaled column-wise by the probabilities.
    diagonal = np.eye(x.shape[-1])
    diagonal *= probs

    # Pairwise products s_i * s_j.
    pairwise = probs[:, np.newaxis] * probs[np.newaxis, :]

    return diagonal - pairwise

def relu(x):
    """Rectified linear unit: keep positive entries, replace the rest by 0."""
    is_positive = x > 0
    return np.where(is_positive, x, 0)

def d_relu(x):
    """Jacobian of ``relu`` at ``x``.

    The slope is 1 where ``x > 0`` and 0 elsewhere; it is multiplied into
    an identity matrix of size ``x.shape[-1]``.
    """
    slope = np.where(x > 0, 1., 0.)
    identity = np.eye(x.shape[-1])
    return identity * slope


def crossentropy(p,q):
    """Cross-entropy loss for a single sample.

    Args:
        p: the true label, used as an index into ``q`` (not one-hot).
        q: the predicted probabilities.

    Returns:
        The negative log of the probability predicted for ``p``.
    """
    prob_of_truth = q[p]
    return np.negative(np.log(prob_of_truth))


def d_corssentropy(p, q, eps=1e-12):
    """Gradient of the cross-entropy loss with respect to ``q``.

    Only the entry of the true label is non-zero, where it equals
    ``-1 / q[p]``. The probability is floored at ``eps`` before dividing:
    a probability that has underflowed to 0 would otherwise yield ``-inf``,
    which turns into ``nan`` as soon as it is multiplied by a zero.

    Args:
        p: the true label, used as an index into ``q`` (not one-hot).
        q: array of predicted probabilities.
        eps: smallest probability used in the division.

    Returns:
        Array shaped like ``q``.
    """
    # NOTE: the name is a misspelling of "crossentropy"; it is kept because
    # existing callers use it.
    grad = np.zeros(q.shape)
    grad[p] = -1.0 / np.maximum(q[p], eps)
    return grad

In [3]:
# Unpack the train/test splits, then flatten every 28x28 image into a
# 784-vector and divide by 255 to normalise the pixel values.
(x, y), (x_test, y_test) = mnist
x, x_test = (images.reshape([-1, 28 * 28]) / 255. for images in (x, x_test))

# Bundle inputs and labels in the (inputs, labels) form MLP.train expects.
train_data = (x, y)
test_data = (x_test, y_test)

In [None]:
# Build a 784-128-64-32-10 network: three sigmoid layers followed by a
# softmax output layer.
ann = MLP([[784,128,64,32,10],['sigmoid','sigmoid','sigmoid','softmax']])

In [None]:
# Train for 10 epochs with the default learning rate (0.1) and batch size
# (100); loss and test accuracy are printed as training progresses.
ann.train(train_data,test_data,epoch=10)



Current loss is 1164.3824541627728
Step 0 : accuracy is 9.68 % 
Current loss is 278.59111409368955
Step 0 : accuracy is 12.94 % 
Current loss is 232.9602061534984
Step 0 : accuracy is 17.14 % 
Current loss is 222.3801554470345
Step 0 : accuracy is 21.65 % 
Current loss is 213.96921020310037
Step 0 : accuracy is 25.230000000000004 % 
Current loss is 209.47310943948463
Step 0 : accuracy is 28.939999999999998 % 
Current loss is 198.02824734364307
Step 0 : accuracy is 32.23 % 
Current loss is 168.80944277846925
Step 0 : accuracy is 35.75 % 
Current loss is 178.8659935916214
Step 0 : accuracy is 38.48 % 
Current loss is 159.47079087770618
Step 0 : accuracy is 40.93 % 
Current loss is 165.67221057662172
Step 0 : accuracy is 43.04 % 
Current loss is 150.53254871784273
Step 0 : accuracy is 45.32 % 
Current loss is 168.39612546593682
Step 0 : accuracy is 47.02 % 
Current loss is 169.61453108032552
Step 0 : accuracy is 49.0 % 
Current loss is 154.07976197159346
Step 0 : accuracy is 50.49 % 
Curr

In [None]:
# from sklearn import datasets
# moon = datasets.make_moons()

In [None]:


# ann = MLP([[2,3,2],['sigmoid','sigmoid']])
# # ann.train(moon,moon,epoch=20,batch_size = 10)