# Building your Deep Neural Network: Step by Step

- more than one layer
- activation function: LeRU

# 1 - Packages

In [24]:
import numpy as np
import h5py
import matplotlib.pyplot as plt

import scipy
from PIL import Image
import imageio

from tools.activation_function import sigmoid, sigmoid_derivative, ReLU, ReLU_derivative, tanh, tanh_derivative

%matplotlib inline

# keep all the random function calls consistent
np.random.seed(1)

# 2 - Initalization

## 2.1 - Layers & Parameters

1. hidden layers: 2

2. initialize weights and bias in each layers
例如: [6, 10, 5, 1]: 输入层6 features, 输出1个, 10为hidden_1, 5为hidden_2

**一列为一组数据**

In [25]:
# # 测试数据
# X = np.random.rand(4, 20)
# Y = np.ones(shape = (1, 20))

# print(X.shape)
# print(Y.shape)
def load_dataset():  
    train_dataset = h5py.File('data/train_catvnoncat.h5', "r")
    # 209 samples, 64 * 64 pixels
    train_X = np.array(train_dataset["train_set_x"][:]) # (209, 64, 64, 3) 
    train_y = np.array(train_dataset["train_set_y"][:]) # (209,)
  
    test_dataset = h5py.File('data/test_catvnoncat.h5', "r")  
    # 50 samples
    test_X = np.array(test_dataset["test_set_x"][:]) # (50, 64, 64, 3)
    test_y = np.array(test_dataset["test_set_y"][:]) # (50,)
  
    # label
    classes = np.array(test_dataset["list_classes"][:]) # [b'non-cat' b'cat'] 
    
    # y = [...] => y = [[...]]
    train_y = np.array([train_y]) # train_y = train_y.reshape((1, train_y.shape[0]))
    test_y = np.array([test_y]) # test_y = test_y.reshape((1, test_y.shape[0])) 
    
    return train_X, train_y, test_X, test_y, classes

train_X, train_y, test_X, test_y, classes = load_dataset()

X = train_X.reshape(train_X.shape[0], -1).T 
X = X / 255
Y = train_y

print(X.shape)
print(Y.shape)

(12288, 209)
(1, 209)


In [26]:
# layer_list = [3, 2, 1]
layer_list = [2, 1]

In [27]:
def init_layers (X, layer_list):
    L_len = len(layer_list)
    
    parameters = {
        'n_input': X.shape[0] # layer0 - features_number
    }

    for i in range(L_len):
        num = layer_list[i]
        if i == L_len - 1:
            parameters['n_ouput' + str(i + 1)] = num
        else:
            parameters['n_hidden_' + str(i + 1)] = num
    return parameters

layer_parameters_obj = init_layers(X, layer_list)
# eg: {'n_input': 3, 'n_hidden_1': 5, 'n_hidden_2': 4, 'n_ouput3': 1}

layer_parameters_values = list(layer_parameters_obj.values())
# eg: [3, 5, 3, 1]

In [28]:
def init_parameters (X, layer_parameters_values):
    m = X.shape[1]
    N_Layers = layer_parameters_values
    # [3, 5, 4, 1]: 输入层3 features, 输出1个, 5为hidden_1, 4为hidden_2
    parameters = {}
    
    # Weights and bias
    for i in range(len(N_Layers) - 1):
        n_prev = N_Layers[i]
        n_next = N_Layers[i + 1]
        # 一列为一组数据
        parameters['W' + str(i + 1)] = np.random.randn(n_prev, n_next) * np.sqrt(1 / m)
        parameters['b' + str(i + 1)] = np.zeros(shape = (n_next, 1)) 
    return parameters

weights_bias_parameters = init_parameters (X, layer_parameters_values)

print('权重和偏置:', weights_bias_parameters)

print('W1', weights_bias_parameters['W1'].shape)
print('b1', weights_bias_parameters['b1'].shape)

# print('W2', weights_bias_parameters['W2'].shape)
# print('b2', weights_bias_parameters['b2'].shape)

# print('W3', weights_bias_parameters['W3'].shape)
# print('b3', weights_bias_parameters['b3'].shape)

权重和偏置: {'W1': array([[ 0.11235832, -0.04231608],
       [-0.0365344 , -0.07421879],
       [ 0.0598615 , -0.15920076],
       ...,
       [ 0.0741329 ,  0.0510344 ],
       [ 0.07773701,  0.04771853],
       [-0.03377794,  0.01436066]]), 'b1': array([[0.],
       [0.]]), 'W2': array([[-0.02464854],
       [-0.01352169]]), 'b2': array([[0.]])}
W1 (12288, 2)
b1 (2, 1)


## 2.2 - Forward Propagation

$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}$

$A = \sigma(Z)$


```
X: 3 * 20
W1 = 3 * 5  b2 = 5 * 1

Z1/A1 = 5 * 20
Z1 = np.dot(W1.T, X) + b2
A1 = ReLU(Z1)

最后
A3 = sigmoid(Z3)
```


In [29]:
def forward_propagation (X, layer_parameters_values, weights_bias_parameters):
    """
    - X 输入; 
    - layer_parameters_values 每层的节点个数
    - weights_bias_parameters: 从第1层开始的weights 和 bias
    """
    parameters = { "A0": X }
    L_num = len(layer_parameters_values)
    
    """
    W1 = weights_bias_parameters['W1']
    b1 = weights_bias_parameters['b1']
    
    output_Z = np.dot(W1.T, input_A)
    output_A = ReLU(output_Z)
    """
    for l in range(1, L_num):
        W = weights_bias_parameters['W' + str(l)]
        b = weights_bias_parameters['b' + str(l)]
        A_prev = parameters['A' + str(l - 1)]
        
        output_Z = np.dot(W.T, A_prev) + b
        parameters['Z' + str(l)] = output_Z

        
        # 最后一个结点是否为1个, 如果为1个, 那么最后一个activation function 为 sigmoid
        if l == L_num - 1 and layer_parameters_values[-1] == 1:
            output_A = sigmoid(output_Z)
            parameters['A' + str(l)] = output_A
        else:
            # output_A = ReLU(output_Z)
            output_A = np.tanh(output_Z)
            parameters['A' + str(l)] = output_A

    return parameters
    
A_parameters = forward_propagation(X, layer_parameters_values, weights_bias_parameters)

print(A_parameters)

# print('A0:', A_parameters['A0'])
# print('A1:', A_parameters['A1'])
# print('A2:', A_parameters['A2'])
# print('A3:', A_parameters['A3'])

# print('A0.shape:', A_parameters['A0'].shape)
# print('A1.shape:', A_parameters['A1'].shape)
# print('A2.shape:', A_parameters['A2'].shape)
# print('A3.shape:', A_parameters['A3'].shape)

{'A0': array([[0.06666667, 0.76862745, 0.32156863, ..., 0.56078431, 0.08627451,
        0.03137255],
       [0.12156863, 0.75294118, 0.27843137, ..., 0.60784314, 0.09411765,
        0.10980392],
       [0.21960784, 0.74509804, 0.26666667, ..., 0.64705882, 0.09019608,
        0.20784314],
       ...,
       [0.        , 0.32156863, 0.54117647, ..., 0.33333333, 0.01568627,
        0.        ],
       [0.        , 0.31372549, 0.55294118, ..., 0.41960784, 0.01960784,
        0.        ],
       [0.        , 0.31764706, 0.55686275, ..., 0.58431373, 0.        ,
        0.        ]]), 'Z1': array([[ 8.08707859e-01,  3.77833861e+00,  6.05494096e+00,
         8.07423130e-01, -5.62032145e-01,  3.52891484e+00,
         2.84775459e+00,  6.42424414e+00,  3.02484971e+00,
         1.53988872e+00,  5.48759395e+00,  5.78117653e+00,
         3.35013285e+00,  5.02593043e+00,  5.31348722e+00,
         9.44727359e+00,  6.80118346e+00,  8.99663024e+00,
         3.35373014e+00,  1.75105550e+00,  6.31352915e+

# 2.3 - Cost Function

In [30]:
def cost_function (A, Y):
    m = A.shape[1]
    epsilon = 1e-5
    J = (1 / m) * np.sum(-Y * np.log(A + epsilon) - (1 - Y) * np.log(1 - A + epsilon))

    # it turns [[ 10 ]] to 10
    return np.squeeze(J)

# 2.4 - Backward Propagation

### 1. J_W3, dJ_b3

#### 1.1 dJ_dA3
- dJ_dA3 = -Y / A3 + (1 - Y) / (1 - A3)

#### 1.2 dJ_dZ3
- dJ_dZ3 = dJ_dA3 * sigmoid_derivative(A3) = A3 - Y

if sigmoid:
- sigmoid_derivative(A3) = A3 * (1 - A3)
- dJ_dZ3 = dJ_dA3 * sigmoid_derivative(A3) = A3 - Y
- (或1.2) dJ_dZ3 = dJ_dA3 * ReLU_derivative(A3)
 
if ReLU:
- dJ_dZ3 = dJ_dA3 * ReLU_derivative(A3)

#### 1.3 dJ_dW3 / dJ_db3
- 上一层A值 * 本层的差值.T
- dJ_dW3 = (1 / m) * np.dot(A2, dJ_dZ3.T)
- dJ_db3 = (1 / m) * np.sum(dJ_dZ3, axis = 1, keepdims = True)

### 2. dJ_W2, dJ_b2

- dZ3_dA2 = W3
- dA2_dZ2 = ReLU_derivative(A2)

#### 2.1 dJ_dZ2
- dJ_dZ2 = dJ_dZ3(上层到Z差值) * dZ3_dA2(上一层权重) * dA2_dZ2(这层导数)

#### 2.2 dJ_W2 / dJ_b2
- dJ_W2 = 1 / m * (A1 * dJ_dZ2.T)
- dJ_b2 =  1 / m * np.sum(dJ_dZ2, axis = 1, keepdims = True)
```

In [31]:
def backward_last_layer (A_parameters, layer_parameters_values):
    L_num = len(layer_parameters_values)
    m = Y.shape[1]
    
    # 最后一个 A, 倒数第二个 A_prev
    A = A_parameters['A' + str(L_num - 1)]
    A_prev = A_parameters['A' + str(L_num - 2)]
    
    dJ_dZ_last = []
    if (layer_parameters_values[-1] == 1):
        # sigmoid: output_layer 结点数为1
        dJ_dZ_last = A - Y
    else: 
        # ReLu:
        dJ_dA_last = -Y / A + (1 - Y) / (1 - A)
        # dJ_dZ_last = dJ_dA_last * ReLU_derivative(A)
        J_dZ_last = dJ_dA_last * tanh_derivative(A)

    diff = dJ_dZ_last

    dJ_dW_last = (1 / m) * (np.dot(A_prev, diff.T))
    dJ_db_last = (1 / m) * np.sum(diff, axis = 1, keepdims = True)
    
    return dJ_dW_last, dJ_db_last, dJ_dZ_last

In [32]:
def backward_propagation (weights_bias_parameters, layer_parameters_values, A_parameters, Y):
    L = len(layer_parameters_values) # 层数
    m = Y.shape[1] # 样本个数
    
    J = cost_function(A_parameters['A' + str(L - 1)], Y) # 计算代价函数值
    
    # L - 1: 最后一层特殊处理
    dJ_dW_last, dJ_db_last, dJ_dZ_last = backward_last_layer(A_parameters, layer_parameters_values)
    derivatives_parameters = {
        "dJ_dW" + str(L - 1): dJ_dW_last,
        "dJ_db" + str(L - 1): dJ_db_last,
        "dJ_dZ" + str(L - 1): dJ_dZ_last
    }

    # L = 4, L - 1最后一层已经判断完成, 需要倒着从 L - 2开始 (eg: 2 -> 1)
    for i in reversed(range(1, L - 1)): # range(1, 3), 只会输出1, 2
        # 1. 上一层权重 dZ3_dA2
        W_next = weights_bias_parameters['W' + str(i + 1)] # 4 * 1
        A_current = A_parameters['A' + str(i)] # 4 * 20
        A_prev = A_parameters['A' + str(i - 1)] # 5 * 20
        
        # 2. 上一层差值 dJ_dZ3
        diff_next = derivatives_parameters['dJ_dZ' + str(i + 1)] # 1 * 20
        
        # 3. 这层导数 dA2_dZ2
        # relu_derivative_current = ReLU_derivative(A_current) # 4 * 20
        tanh_derivative_current = tanh_derivative(A_current)
        
        # 4. 该层差值 dJ_dZ = np.dot(上一层weights, 上一层差值) * 当前层导数
        # print('上一层weights:', W_next.shape) # (2, 1)
        # print('上一层差值:', diff_next.shape) # (1, 20)
        # print('当前层导数:', relu_derivative_current.shape) # 2 * 20

        # dJ_dZ = np.dot(W_next, diff_next) * relu_derivative_current
        dJ_dZ = np.dot(W_next, diff_next) * tanh_derivative_current
        
        diff = dJ_dZ

        derivatives_parameters['dJ_dZ' + str(i)] = dJ_dZ
        derivatives_parameters['dJ_dW' + str(i)] = (1 / m) * np.dot(A_prev, diff.T)
        derivatives_parameters['dJ_db' + str(i)] = (1 / m) * np.sum(diff, axis = 1, keepdims = True)
    
    print(derivatives_parameters)
    return derivatives_parameters

In [33]:
derivatives_parameters = backward_propagation (weights_bias_parameters, layer_parameters_values, A_parameters, Y)

{'dJ_dW2': array([[ 0.14308621],
       [-0.01599918]]), 'dJ_db2': array([[0.15030268]]), 'dJ_dZ2': array([[ 0.49849169,  0.4910791 , -0.50363795,  0.49399097,  0.50061354,
         0.49228291,  0.49691946, -0.5081895 ,  0.49267741,  0.49100843,
         0.49048469, -0.50908046,  0.49637627, -0.50447725, -0.50327033,
         0.49499212,  0.4910244 ,  0.49126414,  0.49716924, -0.50249349,
         0.49246183,  0.49159803,  0.49708178,  0.49269287, -0.50335707,
        -0.49873812,  0.49106323, -0.50286139,  0.49294114, -0.5087465 ,
         0.49581367,  0.4972175 ,  0.49232193,  0.4972175 ,  0.49716962,
         0.49530178,  0.49336098,  0.49151505, -0.5034918 ,  0.4970811 ,
         0.4904754 , -0.49995543, -0.50946973,  0.49316854,  0.4966107 ,
         0.49265642,  0.49683277, -0.50343123,  0.49721721,  0.49665881,
        -0.5041586 ,  0.4968956 ,  0.49650832,  0.49701468, -0.50844367,
         0.49126807, -0.50830564, -0.50527914,  0.49796986, -0.50477337,
        -0.50299548, -0.

# 2.5 - Train

In [34]:
def train (weights_bias_parameters, derivatives_parameters, layer_parameters_values):
    # hyperparameters
    alpha = 0.01
    interation = 500
    
    # parameters
    L = len(layer_parameters_values)
    J_arr = []
    
    for loop_index in range(interation):
        for layer in reversed(range(1, L)): # 1到L-1
            weights_bias_parameters['W' + str(layer)] -= alpha * derivatives_parameters['dJ_dW' + str(layer)]
            weights_bias_parameters['b' + str(layer)] -= alpha * derivatives_parameters['dJ_db' + str(layer)]
            
    return weights_bias_parameters

weights_bias_parameters = train (weights_bias_parameters, derivatives_parameters, layer_parameters_values)

# 2.6 - Predict

In [35]:
def predict (X, layer_parameters_values, weights_bias_parameters, Y):
    L = len(layer_parameters_values)
    A_parameters = forward_propagation (X, layer_parameters_values, weights_bias_parameters)
    
    A_last = A_parameters['A' + str(L - 1)]
    print('最后的值:', A_last)
    
    m = A_last.shape[1]
    Y_predict = np.zeros((1, m))
    for i in range(m):
        Y_predict[0, i] =  1 if A_last[0, i] > 0.5 else 0
    print('最后的值处理:', Y_predict)

    J_A_last = cost_function(A_last, Y)
    print('代价函数值: ', J_A_last)
    
    accuracy = (1 - np.mean(abs(Y_predict - Y))) * 100
    print('accuracy: %', format(accuracy))

predict (X, layer_parameters_values, weights_bias_parameters, Y)

最后的值: [[0.19787826 0.19386422 0.1938604  0.19731146 0.23861506 0.19388766
  0.19391341 0.19386032 0.19388475 0.19447659 0.19386044 0.19386043
  0.19387846 0.19386044 0.19386046 0.19386038 0.19386038 0.19386038
  0.19386565 0.19399637 0.19386039 0.19387402 0.19386038 0.19386038
  0.19386041 0.38359999 0.19386058 0.19386428 0.19386041 0.19386069
  0.19698289 0.19385481 0.19386076 0.19385317 0.19393412 0.19385992
  0.19386048 0.19386038 0.18904257 0.1938584  0.19386241 0.21367371
  0.19386038 0.19386038 0.19390591 0.19385953 0.19386031 0.19454219
  0.1979426  0.19386025 0.19386038 0.19386041 0.19386038 0.19386038
  0.19386073 0.19386039 0.1938652  0.19386038 0.20195225 0.19402224
  0.19387857 0.19386581 0.19386106 0.19386311 0.19386038 0.19375933
  0.19386278 0.19385973 0.19386038 0.19386038 0.1938604  0.19386878
  0.19210491 0.19388258 0.19386731 0.19386038 0.19386894 0.19386082
  0.19386243 0.195541   0.1938614  0.19386057 0.19443151 0.19301016
  0.19386037 0.1938605  0.19386067 0.19385