![](./img/04_1.png)

# 反向传播：“Flat” BP实现

In [None]:
def f(w0, x0, w1, x1, w2):
    """Evaluate one sigmoid neuron and backpropagate through it by hand.

    The forward pass computes ``L = sigmoid(w0 * x0 + w1 * x1 + w2)``. The
    backward pass then walks the same gates in reverse order, multiplying the
    upstream gradient by each gate's local gradient (chain rule).

    Args:
        w0: Scalar weight applied to ``x0``.
        x0: First scalar input.
        w1: Scalar weight applied to ``x1``.
        x1: Second scalar input.
        w2: Scalar bias added to the weighted sum.

    Returns:
        A pair ``(L, grads)``. ``L`` is the sigmoid output and ``grads`` is the
        tuple ``(grad_w0, grad_x0, grad_w1, grad_x1, grad_w2)`` holding the
        derivative of ``L`` with respect to each argument, in parameter order.
    """
    import math  # local import keeps this snippet runnable on its own

    # Forward pass: compute the output.
    s0 = w0 * x0
    s1 = w1 * x1
    s2 = s0 + s1
    s3 = s2 + w2
    L = 1.0 / (1.0 + math.exp(-s3))  # sigmoid

    # Backward pass: compute the gradients, last gate first.
    grad_L = 1.0
    grad_s3 = grad_L * (1 - L) * L  # sigmoid: dL/ds3 = (1 - L) * L
    grad_w2 = grad_s3  # add gate passes the gradient through unchanged
    grad_s2 = grad_s3
    grad_s0 = grad_s2  # add gate
    grad_s1 = grad_s2
    grad_w1 = grad_s1 * x1  # multiply gate: gradient is scaled by the other operand
    grad_x1 = grad_s1 * w1
    grad_w0 = grad_s0 * x0  # multiply gate
    grad_x0 = grad_s0 * w0

    return L, (grad_w0, grad_x0, grad_w1, grad_x1, grad_w2)

# 反向传播：模块化BP实现

图（或者网）对象 （伪代码）

In [None]:
class ComputationlGraph(object):
    """Sketch of a computational graph driving forward and backward passes.

    This is illustrative pseudo-code. It relies on ``self.graph`` being set
    elsewhere and exposing ``nodes_topologically_sorted()``, which must return
    a sequence of gate objects that each provide ``forward()`` and
    ``backward()``.
    """

    def forward(self, inputs):
        """Run every gate in topological order and return the loss.

        Returns the value produced by the final gate's ``forward()``, or
        ``None`` when the graph has no gates.
        """
        # 1. [pass inputs to input gates...]
        # 2. forward the computational graph
        loss = None
        for gate in self.graph.nodes_topologically_sorted():
            # NOTE(review): assumes gate.forward() returns the gate's output;
            # confirm against the real gate API.
            loss = gate.forward()
        return loss  # the final gate in the graph outputs the loss

    def backward(self):
        """Run every gate in reverse topological order and return the gradients.

        Returns the value produced by the last ``backward()`` call, i.e. the
        gate that comes first in topological order, or ``None`` when the
        graph has no gates.
        """
        inputs_gradients = None
        for gate in reversed(self.graph.nodes_topologically_sorted()):
            # little piece of backprop (chain rule applied)
            # NOTE(review): assumes gate.backward() returns that gate's input
            # gradients; confirm against the real gate API.
            inputs_gradients = gate.backward()
        return inputs_gradients

门/节点/函数对象：实际的 PyTorch 代码

In [None]:
class Multiply(torch.autograd.Function):
    """Element-wise product ``z = x * y`` as a custom autograd function.

    Invoke it through ``Multiply.apply(x, y)``; autograd then calls
    ``backward`` with the upstream gradient when gradients are requested.
    """

    @staticmethod
    def forward(ctx, x, y):
        """Compute ``x * y`` and cache both operands for the backward pass."""
        # Both operands are needed later: each one is the local gradient
        # with respect to the other.
        ctx.save_for_backward(x, y)
        z = x * y
        return z

    @staticmethod
    def backward(ctx, grad_z):
        """Return the gradients for ``x`` and ``y`` given upstream ``grad_z``."""
        x, y = ctx.saved_tensors
        grad_x = y * grad_z  # dz/dx * dL/dz: local gradient times upstream gradient
        grad_y = x * grad_z  # dz/dy * dL/dz
        return grad_x, grad_y

# 神经网络：结构

示例：神经网络的前向计算

一个神经元（单层神经网络）前向传播的 Python 代码

In [None]:
class Neuron:
    """A single sigmoid neuron.

    The instance must carry two attributes before ``neuron_tick`` is called:
    ``weights`` (a 1-D numpy array) and ``bias`` (a number).
    """

    def neuron_tick(self, inputs):
        """Return the neuron's firing rate for ``inputs``.

        Assumes ``inputs`` and ``self.weights`` are 1-D numpy arrays and
        ``self.bias`` is a number. The result is the sigmoid of the weighted
        sum plus bias, so it lies between 0 and 1.
        """
        cell_body_sum = np.sum(inputs * self.weights) + self.bias
        firing_rate = 1.0 / (1.0 + math.exp(-cell_body_sum))  # sigmoid activation function
        return firing_rate

训练一个两层的神经网络一共需要大约20行代码

In [4]:
import numpy as np
from numpy.random import randn

# Network definition: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data followed by random initial weights.
x = randn(N, D_in)
y = randn(N, D_out)
w1 = randn(D_in, H)
w2 = randn(H, D_out)

num_steps = 2000
learning_rate = 1e-4

for t in range(num_steps):
    # Forward pass: sigmoid hidden layer, linear output, summed squared error.
    hidden_input = np.dot(x, w1)
    h = 1 / (1 + np.exp(-hidden_input))
    y_pred = np.dot(h, w2)
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule from the loss back to each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h.T, grad_y_pred)
    grad_h = np.dot(grad_y_pred, w2.T)
    grad_hidden_input = grad_h * h * (1 - h)  # sigmoid local gradient is h * (1 - h)
    grad_w1 = np.dot(x.T, grad_hidden_input)

    # Gradient-descent step, updating the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 24816.604092663685
1 17995.346249413164
2 14794.786604278359
3 13028.271137146934
4 11921.096676879233
5 11126.083842078951
6 10523.340864851027
7 10032.517589867593
8 9619.232300574906
9 9260.016199603855
10 8960.378437489224
11 8709.237172413352
12 8484.941380995022
13 8275.830463786804
14 8072.940855904405
15 7873.898946606173
16 7668.884214209295
17 7450.883318051843
18 7240.163710303433
19 7054.481909204179
20 6891.474235912703
21 6742.546267831541
22 6606.56584431391
23 6478.982325660225
24 6353.775949993937
25 6228.63353318842
26 6108.166185308429
27 5995.039366208986
28 5888.134324928532
29 5786.292104899743
30 5688.754803296582
31 5595.977081214813
32 5507.079470031262
33 5419.183278536286
34 5329.915339634587
35 5244.3695965446495
36 5166.116530247995
37 5091.987102707701
38 5020.515459060762
39 4950.821755288049
40 4882.118018866076
41 4813.691351434925
42 4745.057004425835
43 4676.098150973716
44 4606.961537030913
45 4537.9787715758375
46 4469.666909240738
47 4402.3841679