# Residual Networks

In [44]:
import numpy as np
import h5py

import tensorflow as tf

from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
from keras.models import Model, load_model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

from keras.initializers import glorot_uniform
import scipy.misc
from matplotlib.pyplot import imshow
%matplotlib inline

import keras.backend as K
# Treat image tensors as (batch, height, width, channels). The
# BatchNormalization layers in this notebook normalize over axis 3, which is
# the channel axis under this layout.
K.set_image_data_format('channels_last')
# Pin the global Keras learning-phase flag to 1 for the whole session.
# NOTE(review): 1 presumably selects "training" behavior, which would make
# BatchNormalization use per-batch statistics even in evaluate()/predict()
# -- confirm against the Keras backend docs before relying on test metrics.
K.set_learning_phase(1)

# 1 - The problem of very deep neural networks

网络越深，梯度消失（vanishing gradients）的情况就越明显。

* The main benefit of a very deep network is that it can represent very complex functions. It can also learn features at many different levels of abstraction, from edges (at the shallower layers, closer to the input) to very complex features (at the deeper layers, closer to the output). 

* However, using a deeper network doesn't always help. A huge barrier to training them is vanishing gradients: very deep networks often have a gradient signal that goes to zero quickly, thus making gradient descent prohibitively slow. 

* More specifically, during gradient descent, as you backprop from the final layer back to the first layer, you are multiplying by the weight matrix on each step, and thus the gradient can decrease exponentially quickly to zero (or, in rare cases, grow exponentially quickly and "explode" to take very large values).  


# 2 - Building a Residual Network
$Z_1 \rightarrow \mathrm{ReLU}(Z_1) \rightarrow \mathrm{MaxPool}(A_1) \rightarrow \mathrm{Conv}(A_1) \rightarrow Z_2 \rightarrow \mathrm{ReLU}(A_1 + Z_2) \rightarrow \dots$

## 2.1 - Initializing data

In [45]:
def load_dataset(train_path = 'datasets/train_happy.h5', test_path = 'datasets/test_happy.h5'):
    """
    Load the training and test images/labels from two HDF5 files.

    Arguments:
    train_path -- path of the HDF5 file holding 'train_set_x' and 'train_set_y'
    test_path -- path of the HDF5 file holding 'test_set_x' and 'test_set_y'

    Returns:
    train_X -- contents of 'train_set_x' as a NumPy array
               (the original notes record a shape of (600, 64, 64, 3))
    train_y -- contents of 'train_set_y' reshaped to a row vector (1, m_train)
    test_X -- contents of 'test_set_x' as a NumPy array
              (the original notes record a shape of (150, 64, 64, 3))
    test_y -- contents of 'test_set_y' reshaped to a row vector (1, m_test)

    The 'list_classes' dataset stored in the test file is not read: it was
    never part of the return value.
    """
    # `with` closes each file even if a dataset is missing. Slicing with [:]
    # copies the data into memory first, so the arrays stay valid after close.
    with h5py.File(train_path, 'r') as train_data:
        train_X = np.array(train_data['train_set_x'][:])
        train_y = np.array(train_data['train_set_y'][:])

    with h5py.File(test_path, 'r') as test_data:
        test_X = np.array(test_data['test_set_x'][:])
        test_y = np.array(test_data['test_set_y'][:])

    # Labels are returned as a single row: (1, number_of_examples)
    train_y = train_y.reshape((1, train_y.shape[0]))
    test_y = test_y.reshape((1, test_y.shape[0]))
    return train_X, train_y, test_X, test_y

In [64]:
# Read the raw image and label arrays from disk
train_X, train_y, test_X, test_y = load_dataset()

# Normalize image data: divide every pixel value by 255
train_X, test_X = train_X / 255.0, test_X / 255.0

# load_dataset returns the labels as (1, m) rows; transpose them to (m, 1)
train_y, test_y = train_y.T, test_y.T

# print(train_X.shape) # (600, 64, 64, 3)
# print(train_y.shape) # (600, 1)

## 2.2 - Identity block

Identity block = 残差模块

![image](https://wx3.sinaimg.cn/mw1024/701c57e5ly1ge9afo75lij21eu0eqjue.jpg)


**这里我们用3个深度块儿为一组计算**
1. $X => Conv2d => BN => RELU => A1$
2. $A1 => Conv2d => BN => RELU => A2$
3. $A2 => Conv2d => BN => Z3 => RELU(Z3 + X) => A3$

In [None]:
def identity_block (X, f, filters_num, key):
    """
    Unfinished draft of the identity block (this cell was never executed).

    The draft stopped right after unpacking `filters_num` and never built
    any layer. The complete implementation is the `identity_block` defined
    in the next code cell, which replaces this name when the notebook is
    run from top to bottom.

    X = (m, n_H, n_W, n_C)
    filters_num = { f1, f2, f3 } in each layer for output name
    key = key in ResNet
    """
    # Fail with an explicit message instead of the NameError that the
    # dangling statement in the draft used to trigger.
    raise NotImplementedError(
        'Draft identity_block is incomplete; use the full definition in the next cell.'
    )

In [110]:
def identity_block (X, f, filters_num, key):
    """
    Identity (residual) block: three CONV -> BN stages, with the block input
    added back onto the third stage's output before the final ReLU.

    Arguments:
    X -- block input. A Keras tensor is used as-is, so the block is wired
         into the caller's graph and several blocks can be stacked. A NumPy
         array of shape (m, n_H, n_W, n_C) is also accepted: only its shape
         is read, to create a placeholder Input of shape (n_H, n_W, n_C).
    f -- kernel size of the middle CONV (the window is f x f)
    filters_num -- three integers (n_C_1, n_C_2, n_C_3): the number of
         filters of the three CONV layers. n_C_3 has to equal the channel
         count of X, otherwise the skip connection cannot be added.
    key -- identifier inserted into the layer names
         ('CONV<key>2a', 'BN<key>2a', ..., 'CONV<key>2c', 'BN<key>2c')

    Returns:
    A3 -- output tensor of the block, e.g. shape (?, 4, 4, 6) for a
          (3, 4, 4, 6) input with filters_num = [2, 4, 6]
    """
    # Only a raw array needs a placeholder. The earlier version replaced
    # every input with a fresh Input, which cut the block off from whatever
    # tensor the caller passed in.
    if isinstance(X, np.ndarray):
        X = Input(shape = X.shape[1:]) # (n_H, n_W, n_C)

    # Name prefixes shared by the layers of this block
    conv_name_base = 'CONV' + str(key)
    bn_name_base = 'BN' + str(key)

    # Number of filters of each CONV layer
    n_C_1, n_C_2, n_C_3 = filters_num

    def conv_bn (tensor, filters, kernel_size, padding, suffix):
        # CONV with Glorot (Xavier) uniform initialization, then batch
        # normalization over the channel axis (axis 3).
        Z_conv = Conv2D(
            filters = filters,
            kernel_size = kernel_size,
            padding = padding,
            kernel_initializer = glorot_uniform(seed=0),
            name = conv_name_base + suffix
        )(tensor)
        return BatchNormalization(
            axis = 3,
            name = bn_name_base + suffix
        )(Z_conv)

    # ========= Layer 1: 1x1 CONV -> BN -> RELU =========
    Z1_bn = conv_bn(X, n_C_1, (1, 1), 'valid', '2a')
    A1 = Activation('relu')(Z1_bn)

    # ========= Layer 2: f x f CONV ('same' keeps n_H, n_W) -> BN -> RELU =========
    Z2_bn = conv_bn(A1, n_C_2, (f, f), 'same', '2b')
    A2 = Activation('relu')(Z2_bn)

    # ========= Layer 3: 1x1 CONV -> BN, then skip connection -> RELU =========
    Z3_bn = conv_bn(A2, n_C_3, (1, 1), 'valid', '2c')

    # Z3_bn and X have the same shape. Use the Add layer rather than `+` so
    # the sum is produced by a Keras layer and stays part of the layer graph.
    Z3 = Add()([Z3_bn, X])
    A3 = Activation('relu')(Z3)

    return A3

In [113]:
# Seeded random batch: m = 3 examples, n_H = n_W = 4, n_C = 6
np.random.seed(1)
X = np.random.standard_normal(size = (3, 4, 4, 6))

# The last filter count (6) equals n_C, so the skip connection shapes match
A3 = identity_block(X, f = 2, filters_num = [2, 4, 6], key = 1)

# Expected form: Tensor("activation_16/Relu:0", shape=(?, 4, 4, 6), dtype=float32)
print(A3) 

Tensor("activation_19/Relu:0", shape=(?, 4, 4, 6), dtype=float32)
