In [1]:
import numpy as np

### 转换工具

In [2]:
def dictionary_to_vector(parameters):
    """Flatten a parameter dictionary into a single column vector.

    Layers are visited in the order ``W1, b1, W2, b2, ...``. Each array is
    unrolled with ``np.reshape(..., (-1, 1))`` and stacked below the
    previous ones.

    Args:
        parameters: dict holding the arrays ``"W1", "b1", ..., "WL", "bL"``.
            The number of layers is taken to be ``len(parameters) // 2``.

    Returns:
        theta: array of shape ``(n, 1)`` containing every parameter value.
        shape_caches: list with the original shape of each flattened array,
            in flattening order (``W1, b1, W2, b2, ...``).
    """
    num_layers = len(parameters) // 2

    # Start from an empty float column so that an empty dictionary still
    # yields a (0, 1) float array and integer inputs are promoted to float.
    pieces = [np.empty((0, 1))]
    shape_caches = []
    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            array = parameters[prefix + str(layer)]
            pieces.append(np.reshape(array, (-1, 1)))
            shape_caches.append(array.shape)

    # Concatenate once at the end: concatenating inside the loop would copy
    # the ever-growing vector again for every single array.
    theta = np.concatenate(pieces, axis=0)
    return theta, shape_caches

def vector_to_dictionary(theta, shape_caches):
    """Rebuild the parameter dictionary from a flattened vector.

    This is the inverse of ``dictionary_to_vector``: consecutive slices of
    ``theta`` are reshaped back to the recorded shapes and stored under the
    keys ``W1, b1, W2, b2, ...``.

    Args:
        theta: array whose leading axis enumerates the parameter values in
            flattening order (``dictionary_to_vector`` produces ``(n, 1)``).
        shape_caches: list of shapes in the order ``W1, b1, W2, b2, ...``.
            Shapes of any rank are accepted. The number of layers is taken
            to be ``len(shape_caches) // 2``.

    Returns:
        dict mapping ``"W1", "b1", ..., "WL", "bL"`` to arrays of the
        recorded shapes.
    """
    num_layers = len(shape_caches) // 2
    names = [prefix + str(layer)
             for layer in range(1, num_layers + 1)
             for prefix in ("W", "b")]

    parameters = {}
    pos = 0
    for name, shape in zip(names, shape_caches):
        # Element count for a shape of any rank, not only 2-D matrices.
        size = int(np.prod(shape))
        parameters[name] = theta[pos:pos + size].reshape(shape)
        pos += size

    return parameters

def gradients_to_vector(gradients):
    """Flatten a gradient dictionary into a single column vector.

    Gradients are stacked in the order ``dW1, db1, dW2, db2, ...`` so that
    the result lines up element by element with the vector produced by
    ``dictionary_to_vector`` for the matching parameters. Entries are looked
    up by key, so the insertion order of ``gradients`` does not matter.

    Args:
        gradients: dict holding ``"dW1", "db1", ..., "dWL", "dbL"`` arrays.
            The number of layers is taken to be ``len(gradients) // 2``.

    Returns:
        Array of shape ``(n, 1)`` containing every gradient value.
    """
    num_layers = len(gradients) // 2

    # Start from an empty float column so that an empty dictionary still
    # yields a (0, 1) float array.
    pieces = [np.empty((0, 1))]
    for layer in range(1, num_layers + 1):
        pieces.append(np.reshape(gradients["dW" + str(layer)], (-1, 1)))
        pieces.append(np.reshape(gradients["db" + str(layer)], (-1, 1)))

    # Concatenate once instead of re-copying the growing vector per array.
    return np.concatenate(pieces, axis=0)

### 正反向传播函数

In [3]:
def sigmoid(Z):
    """Logistic sigmoid activation, ``1 / (1 + exp(-Z))``, element-wise."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator

In [4]:
def relu(Z):
    """ReLU activation: element-wise maximum of 0 and ``Z``."""
    rectified = np.maximum(0, Z)
    return rectified

In [5]:
def forward_propagate(parameters, X):
    """Run the forward pass: ReLU -> ReLU -> ... -> Sigmoid.

    Args:
        parameters: dict holding ``"W1", "b1", ..., "WL", "bL"``; the number
            of layers is ``len(parameters) // 2``.
        X: network input; it must be compatible with ``W1 @ X``.

    Returns:
        AL: activation of the last (sigmoid) layer.
        caches: dict mapping ``"A0"`` (the input) through ``"AL"`` to the
            activation of each layer.
    """
    last = len(parameters) // 2

    caches = {'A0': X}
    activation = X

    # Hidden layers 1 .. L-1 use ReLU.
    for layer in range(1, last):
        tag = str(layer)
        pre_activation = parameters['W' + tag] @ activation + parameters['b' + tag]
        activation = relu(pre_activation)
        caches['A' + tag] = activation

    # The output layer L uses the sigmoid.
    tag = str(last)
    pre_activation = parameters['W' + tag] @ activation + parameters['b' + tag]
    activation = sigmoid(pre_activation)
    caches['A' + tag] = activation

    return activation, caches

In [6]:
def compute_cost(AL, Y):
    """Mean binary cross-entropy between predictions ``AL`` and labels ``Y``.

    A term whose label weight is exactly zero contributes 0 to the cost,
    even when the matching logarithm is ``-inf`` (``AL`` saturated at exactly
    0 or 1). Such a sample therefore still counts in the mean; masking the
    resulting ``0 * -inf = NaN`` with ``nanmean`` would silently drop it from
    the denominator and inflate the cost.

    Args:
        AL: predicted probabilities (output of the sigmoid layer).
        Y: target labels, broadcastable against ``AL``.

    Returns:
        The scalar cost ``-mean(Y * log(AL) + (1 - Y) * log(1 - AL))``.
        NaNs present in the inputs propagate to the result instead of being
        ignored.
    """
    Y = np.asarray(Y)
    # log(0) and 0 * -inf are expected for saturated predictions; those
    # entries are replaced by np.where below, so silence their warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        positive_term = np.where(Y == 0, 0.0, Y * np.log(AL))
        negative_term = np.where(Y == 1, 0.0, (1 - Y) * np.log(1 - AL))
    cost = -np.mean(positive_term + negative_term)
    return cost

In [7]:
def backward_propagate(parameters, Y, caches):
    """Run the backward pass: Sigmoid -> ReLU -> ... -> ReLU.

    Args:
        parameters: dict holding ``"W1", "b1", ..., "WL", "bL"``.
        Y: target labels, broadcastable against ``caches["AL"]``.
        caches: dict of activations ``"A0" .. "AL"`` as returned by
            ``forward_propagate``.

    Returns:
        dict with ``"dWl"`` and ``"dbl"`` for every layer ``l``, each
        averaged over the second axis of ``dZ`` (the sample axis).
    """
    num_layers = len(parameters) // 2
    grads = {}

    def store_layer_gradients(layer, dZ):
        # dW and db for one layer, averaged over the samples.
        grads['dW' + str(layer)] = dZ @ caches['A' + str(layer - 1)].T / dZ.shape[1]
        grads['db' + str(layer)] = np.mean(dZ, axis=1, keepdims=True)

    # Output layer: the derivative of the cross-entropy cost with respect to
    # AL times the derivative of the sigmoid simplifies to AL - Y.
    delta = caches['A' + str(num_layers)] - Y
    store_layer_gradients(num_layers, delta)

    # Hidden layers, from L-1 down to 1.
    for layer in range(num_layers - 1, 0, -1):
        back_signal = parameters['W' + str(layer + 1)].T @ delta
        # ReLU derivative: 1 where the activation is positive, else 0.
        delta = back_signal * (caches['A' + str(layer)] > 0)
        store_layer_gradients(layer, delta)

    return grads

### 梯度检验

In [8]:
def gradient_check(parameters, gradients, X, Y, epsilon = 1e-7, threshold=None):
    """Compare analytic gradients with a centred finite-difference estimate.

    Every parameter is perturbed by ``+epsilon`` and ``-epsilon`` in turn,
    the cost is re-evaluated with ``forward_propagate`` and ``compute_cost``,
    and the relative distance between the two gradient vectors is computed:

        ||grad - gradapprox|| / (||grad|| + ||gradapprox||)

    A coloured verdict (ANSI escape codes) is printed.

    Args:
        parameters: dict holding ``"W1", "b1", ..., "WL", "bL"``.
        gradients: dict holding ``"dW1", "db1", ..., "dWL", "dbL"``.
        X: network input passed to ``forward_propagate``.
        Y: target labels passed to ``compute_cost``.
        epsilon: finite-difference step size.
        threshold: largest difference still reported as correct. Defaults to
            ``epsilon``, which keeps the previous behaviour.

    Returns:
        The relative difference. It is 0 when both gradient vectors are
        exactly zero, and NaN when the cost or the gradients contain NaN
        (which is reported as a mistake).
    """
    if threshold is None:
        threshold = epsilon

    # Variable setup
    theta, shape_caches = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = theta.shape[0]
    gradapprox = np.zeros((num_parameters, 1))

    def cost_at(vector):
        # Cost of the network whose flattened parameters are `vector`.
        AL, _ = forward_propagate(vector_to_dictionary(vector, shape_caches), X)
        return compute_cost(AL, Y)

    # Compute the gradient approximation. `theta` is a fresh local array, so
    # it is perturbed in place and restored instead of being copied twice
    # for every parameter.
    for i in range(num_parameters):
        original_value = theta[i, 0]

        theta[i, 0] = original_value + epsilon
        J_plus = cost_at(theta)

        theta[i, 0] = original_value - epsilon
        J_minus = cost_at(theta)

        theta[i, 0] = original_value
        gradapprox[i] = (J_plus - J_minus) / epsilon / 2.

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    if denominator == 0:
        # Both vectors are exactly zero, hence so is the numerator: they
        # agree, and 0/0 must not turn the result into NaN.
        difference = numerator
    else:
        difference = numerator / denominator

    # Written as "<=" so that a NaN difference is reported as a mistake
    # rather than slipping through a "difference > threshold" test.
    if difference <= threshold:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")

    return difference

In [9]:
def gradient_check_n_test_case():
    """Build the fixed 3-layer test case used for gradient checking.

    Seeds NumPy's global random generator with 2, then draws the input and
    the weights/biases of a 4 -> 5 -> 3 -> 1 network from the standard
    normal distribution.

    Returns:
        x: input of shape ``(4, 3)``.
        y: labels ``[1, 1, 0]`` (shape ``(3,)``).
        parameters: dict with ``"W1", "b1", "W2", "b2", "W3", "b3"``.
    """
    np.random.seed(2)
    x = np.random.randn(4, 3)
    y = np.array([1, 1, 0])

    # Unit counts per layer, input included. Each layer draws W first and
    # then b, so the random stream is consumed in the order W1, b1, W2, ...
    layer_sizes = [4, 5, 3, 1]
    parameters = {}
    for layer in range(1, len(layer_sizes)):
        fan_out, fan_in = layer_sizes[layer], layer_sizes[layer - 1]
        parameters["W" + str(layer)] = np.random.randn(fan_out, fan_in)
        parameters["b" + str(layer)] = np.random.randn(fan_out, 1)

    return x, y, parameters

In [10]:
# Build the fixed test case, run one forward and one backward pass, then
# compare the analytic gradients with their numerical approximation.
X, Y, parameters = gradient_check_n_test_case()
AL, caches = forward_propagate(parameters, X)
grads = backward_propagate(parameters, Y, caches)
difference = gradient_check(parameters, grads, X, Y)

Your backward propagation works perfectly fine! difference = 1.34885360864e-08


### 错误的反向传播函数

In [11]:
def backward_propagate(parameters, Y, caches):
    """Deliberately WRONG backward pass, used to show a failing gradient check.

    It mirrors the Sigmoid -> ReLU -> ... -> ReLU backward pass, except that
    ``dW`` of the hidden layers is not divided by the number of samples.
    This definition shadows any earlier ``backward_propagate`` in the
    notebook.

    Args:
        parameters: dict holding ``"W1", "b1", ..., "WL", "bL"``.
        Y: target labels, broadcastable against ``caches["AL"]``.
        caches: dict of activations ``"A0" .. "AL"``.

    Returns:
        dict with ``"dWl"`` and ``"dbl"`` for every layer ``l``; the
        hidden-layer ``dW`` entries are intentionally not averaged.
    """
    depth = len(parameters) // 2
    grads = {}

    # Output layer: the derivative of the cross-entropy cost with respect to
    # AL times the derivative of the sigmoid simplifies to AL - Y.
    out_tag = str(depth)
    delta = caches['A' + out_tag] - Y
    grads['dW' + out_tag] = delta @ caches['A' + str(depth - 1)].T / delta.shape[1]
    grads['db' + out_tag] = np.mean(delta, axis=1, keepdims=True)

    # Hidden layers, from L-1 down to 1.
    for layer in range(depth - 1, 0, -1):
        back_signal = parameters['W' + str(layer + 1)].T @ delta
        # ReLU derivative: 1 where the activation is positive, else 0.
        delta = back_signal * (caches['A' + str(layer)] > 0)
        # INTENTIONAL BUG: the division by the number of samples is missing.
        grads['dW' + str(layer)] = delta @ caches['A' + str(layer - 1)].T
        grads['db' + str(layer)] = np.mean(delta, axis=1, keepdims=True)

    return grads

In [12]:
# Repeat the check on the same test case. `backward_propagate` now refers to
# the redefinition from the previous cell, which omits the division by the
# sample count for the hidden-layer dW, so the check is expected to fail.
X, Y, parameters = gradient_check_n_test_case()
AL, caches = forward_propagate(parameters, X)
grads = backward_propagate(parameters, Y, caches)
difference = gradient_check(parameters, grads, X, Y)

There is a mistake in the backward propagation! difference = 0.478408736962
