<a href="https://www.kaggle.com/code/kisarak/deep-neural-network-from-scratch?scriptVersionId=219612326" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases_v4 import *
import copy
# Seed NumPy's global RNG so the np.random.randn draws used for weight initialisation are reproducible.
np.random.seed(1)

## 1. initialize_parameters
## (layer_dims [list]) -> parameters [dict]

Implement initialization for an L-layer Neural Network. 

In [20]:
def initialize_parameters(layer_dims):
    """Create the weight matrices and bias vectors for every layer.

    Arguments:
    layer_dims -- sequence of layer sizes, input layer first

    Returns:
    parameters -- dict with keys "W1", "b1", "W2", "b2", ...; each Wl has shape
                  (layer_dims[l], layer_dims[l-1]) and holds standard-normal draws
                  scaled by 0.01, each bl has shape (layer_dims[l], 1) and is all zeros
    """
    parameters = {}
    consecutive_sizes = zip(layer_dims, layer_dims[1:])

    # Layers are numbered from 1; layer 0 is the input and has no parameters.
    for idx, (n_prev, n_curr) in enumerate(consecutive_sizes, start=1):
        parameters[f"W{idx}"] = 0.01 * np.random.randn(n_curr, n_prev)
        parameters[f"b{idx}"] = np.zeros((n_curr, 1))

    return parameters

## 2. layer_forward
## (A_prev, W, b, activation) -> A, cache(in_cache, z_cache)

Build a single layer pass of forward propagation.

The layer forward module (vectorized over all the examples) computes the following equations:

$$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}$$

$$A^{[l]} = g(Z^{[l]})$$

Returning `cache` from the activation function is generally preferred, and sometimes necessary for backpropagation, even though you could technically store Z outside the function.

Functions like `relu` should ideally be self-contained. By returning cache, the function ensures it provides all the information needed for both the forward and backward passes.

In [None]:
def sigmoid(Z):
    """Apply the logistic sigmoid 1 / (1 + exp(-Z)) element-wise.

    Returns:
    A -- the sigmoid of Z
    cache -- Z itself, returned so the backward pass can reuse it
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator, Z

def relu(Z):
    """Apply the rectified linear unit max(0, Z) element-wise.

    Returns:
    A -- Z with every negative entry replaced by 0
    cache -- Z itself, returned so the backward pass can reuse it
    """
    rectified = np.maximum(0, Z)
    return rectified, Z

In [None]:
def layer_forward(A_prev, W, b, activation):
    """Run one layer of forward propagation: Z = W . A_prev + b, then A = g(Z).

    Arguments:
    A_prev -- activations coming from the previous layer (or the input data)
    W -- weight matrix of this layer
    b -- bias vector of this layer
    activation -- name of the activation g to apply: "sigmoid" or "relu"

    Returns:
    A -- output of the activation function
    cache -- tuple (in_cache, Z_cache), where in_cache is (A_prev, W, b) and Z_cache is
             the cache returned by the activation function

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Reject unknown names up front; otherwise neither branch below would run and the
    # function would fail later with an unrelated unbound-variable error.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got {!r}".format(activation)
        )

    Z = np.dot(W, A_prev) + b
    in_cache = (A_prev, W, b)

    if activation == "sigmoid":
        A, Z_cache = sigmoid(Z)
    else:
        A, Z_cache = relu(Z)

    cache = (in_cache, Z_cache)

    return A, cache

## 3. model_forward
## X, parameters -> AL, caches(cache 1 to L)

In [None]:
def model_forward(X, parameters):
    """Run forward propagation through the whole network.

    Layers 1 .. L-1 use the "relu" activation and layer L uses "sigmoid", where L is
    half the number of entries in `parameters` (one W and one b per layer).

    Arguments:
    X -- input data fed to the first layer
    parameters -- dict holding "W1", "b1", ..., "WL", "bL"

    Returns:
    AL -- output of the last (sigmoid) layer
    caches -- list with the cache returned by layer_forward for each layer, in layer order
    """
    num_layers = len(parameters) // 2
    caches = []
    activation_out = X

    # Hidden layers.
    for idx in range(1, num_layers):
        activation_out, layer_cache = layer_forward(
            activation_out,
            parameters['W' + str(idx)],
            parameters['b' + str(idx)],
            'relu',
        )
        caches.append(layer_cache)

    # Output layer.
    AL, layer_cache = layer_forward(
        activation_out,
        parameters['W' + str(num_layers)],
        parameters['b' + str(num_layers)],
        'sigmoid',
    )
    caches.append(layer_cache)

    return AL, caches

## 4. compute_cost
Compute the cross-entropy cost $J$ using the following formula: $$J = -\frac{1}{m} \sum\limits_{i = 1}^{m} \left[y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)\right]$$


In [23]:
def compute_cost(AL, Y):
    """Compute the binary cross-entropy cost averaged over the m examples.

    Arguments:
    AL -- predicted probabilities, same shape as Y
    Y -- true labels (0 or 1); m is taken from Y.shape[1]

    Returns:
    cost -- the cross-entropy cost with singleton dimensions squeezed away
    """
    m = Y.shape[1]

    # Keep predictions strictly inside (0, 1): a saturated prediction would otherwise
    # give log(0) = -inf, and 0 * -inf turns the whole cost into nan.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    cost = -1 / m * (np.dot(Y, np.log(AL_safe).T) + np.dot(1 - Y, np.log(1 - AL_safe).T))

    return np.squeeze(cost)

## 5. layer_backward

If $g(\;)$ is the activation function, 
`sigmoid_backward` and `relu_backward` compute $$dZ^{[l]} = dA^{[l]} * g'(Z^{[l]})$$  

For layer $l$, the linear part is: $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$

Suppose you have already computed the derivative $dZ^{[l]} = \frac{\partial \mathcal{L} }{\partial Z^{[l]}}$.

The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$ are computed using the input $dZ^{[l]}$.

$$ dW^{[l]} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} $$
$$ db^{[l]} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)} $$
$$ dA^{[l-1]} = W^{[l] T} dZ^{[l]} $$

Note: We use the notation d(variable) to denote the partial derivative of $\mathcal L$ with respect to (variable), for example:

$$dX \rightarrow \mathcal L _X = \frac{\partial \mathcal{L} }{\partial X}$$

In [22]:
def relu_backward(dA, cache):
    """Backward pass of a ReLU unit.

    Arguments:
    dA -- gradient with respect to the ReLU output
    cache -- the Z that was fed to the ReLU in the forward pass

    Returns:
    dZ -- a copy of dA with every entry zeroed where Z <= 0
    """
    pre_activation = cache

    # Work on a copy so the caller's dA is left untouched.
    grad = np.array(dA, copy=True)

    # The ReLU is flat for non-positive inputs, so no gradient flows through them.
    inactive = pre_activation <= 0
    grad[inactive] = 0

    return grad

def sigmoid_backward(dA, cache):
    """Backward pass of a sigmoid unit.

    Arguments:
    dA -- gradient with respect to the sigmoid output
    cache -- the Z that was fed to the sigmoid in the forward pass

    Returns:
    dZ -- dA * s * (1 - s), where s is the sigmoid of Z
    """
    # Recompute the forward activation from the cached Z.
    sig = 1 / (1 + np.exp(-cache))
    return dA * sig * (1 - sig)

In [27]:
def layer_backward(dA, cache, activation):
    """Run one layer of backward propagation (activation part, then linear part).

    Arguments:
    dA -- gradient with respect to this layer's activation output
    cache -- tuple (linear_cache, activation_cache) as stored during the forward pass;
             linear_cache is (A_prev, W, b) and activation_cache is handed to the
             activation's backward function
    activation -- name of the activation used by this layer: "sigmoid" or "relu"

    Returns:
    dA_prev -- gradient with respect to the previous layer's activation, W.T . dZ
    dW -- gradient with respect to W, (1/m) dZ . A_prev.T
    db -- gradient with respect to b, (1/m) times the sum of dZ over axis 1

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    linear_cache, activation_cache = cache

    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        # Without this branch dZ would be left unbound and the failure would surface
        # further down as an unrelated unbound-variable error.
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got {!r}".format(activation)
        )

    # The bias itself is not needed to compute the gradients.
    A_prev, W, _ = linear_cache
    m = A_prev.shape[1]

    dW = (1 / m) * np.dot(dZ, A_prev.T)
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [28]:
def model_backward(AL, Y, caches):
    """Run backward propagation through the whole network.

    The last layer is treated as a sigmoid layer and every earlier layer as a relu
    layer.

    Arguments:
    AL -- output of the forward pass
    Y -- true labels; reshaped to AL's shape before use
    caches -- list of per-layer caches, in layer order, as consumed by layer_backward

    Returns:
    grads -- dict with "dA{l-1}", "dW{l}" and "db{l}" for every layer l = 1 .. L,
             where L = len(caches)
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)

    # Derivative of the binary cross-entropy loss with respect to AL.
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer (sigmoid).
    dA_prev, dW, db = layer_backward(dAL, caches[L - 1], activation="sigmoid")
    grads["dA" + str(L - 1)] = dA_prev
    grads["dW" + str(L)] = dW
    grads["db" + str(L)] = db

    # Hidden layers (relu), walking from layer L-1 down to layer 1. caches is 0-indexed,
    # so caches[l] belongs to layer l+1.
    for l in reversed(range(L - 1)):
        dA_prev, dW, db = layer_backward(grads["dA" + str(l + 1)], caches[l], activation="relu")
        grads["dA" + str(l)] = dA_prev
        grads["dW" + str(l + 1)] = dW
        grads["db" + str(l + 1)] = db

    return grads

In [29]:
def update_parameters(params, grads, learning_rate):
    """Apply one gradient-descent step to every W and b.

    Arguments:
    params -- dict holding "W1", "b1", ..., "WL", "bL"; it is deep-copied, not modified
    grads -- dict holding the matching "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size multiplying each gradient

    Returns:
    parameters -- new dict in which every entry is value - learning_rate * gradient
    """
    parameters = copy.deepcopy(params)
    num_layers = len(parameters) // 2  # one W and one b per layer

    for idx in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(idx)
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]

    return parameters

In [None]:
# Run update_parameters with a learning rate of 0.1 on the values returned by
# update_parameters_test_case (not defined in this file; presumably provided by the
# testCases_v4 star import -- confirm).
update_parameters(*update_parameters_test_case(), 0.1)