In [43]:
import numpy as np
import copy

### Forward propagation Model 

**Forward propagation for single layer:**

$Z^{[l]} = W^{[l]}.A^{[l-1]} + b^{[l]}$

$A^{[l]} = g(Z^{[l]})$

We should cache $A^{[l-1]}$, $W^{[l]}$, $b^{[l]}$, and $Z^{[l]}$ so they can be reused in backward propagation:

linear_cache = ($A^{[l-1]}$, $W^{[l]}$, $b^{[l]}$)

cache = (linear_cache, $Z^{[l]}$)

**relu activation function**

$g(Z) = \max(0, Z)$ for $Z = W.A + b$

**sigmoid function** 

$g(Z) = \sigma(Z) = \frac{1}{1 + e^{-Z}}$ for $Z = W.A + b$

In [3]:
def relu(Z):
    """Rectified linear unit: element-wise max(0, Z)."""
    rectified = np.maximum(0, Z)
    return rectified

def sigmoid(Z):
    """Logistic sigmoid applied element-wise: 1 / (1 + e^(-Z))."""
    exp_neg = np.exp(-Z)
    return 1 / (1 + exp_neg)

In [4]:
def forward_prop(A_prev, W, b, activation):
    """
    Forward propagation for a single layer: Z = W.A_prev + b, then A = g(Z).

    Arguments:
    A_prev -- activations from the previous layer (or the input data)
    W -- weight matrix of the current layer
    b -- bias vector of the current layer
    activation -- name of the activation to apply: "relu" or "sigmoid"

    Returns:
    A -- output of the activation function
    cache -- tuple ((A_prev, W, b), Z) kept for the backward pass

    Raises:
    ValueError -- if activation is neither "relu" nor "sigmoid"
    """
    # Reject unknown names up front; otherwise A would never be assigned and
    # the function would fail with an unrelated UnboundLocalError.
    if activation not in ("relu", "sigmoid"):
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got %r" % (activation,)
        )

    Z = np.dot(W, A_prev) + b
    linear_cache = (A_prev, W, b)

    if activation == "relu":
        A = relu(Z)
    else:
        A = sigmoid(Z)

    cache = (linear_cache, Z)

    return A, cache
        

### Implementation of initialization for an L-layer Neural Network. 


In [5]:
def initialize_parameters_deep(layer_dims):
    """
    Build the initial weights and biases for every layer of the network.

    Arguments:
    layer_dims -- list with the number of units in each layer, input layer first

    Returns:
    parameters -- dictionary with keys "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1]),
                          drawn from a standard normal and scaled by 0.01
                    bl -- zero bias vector of shape (layer_dims[l], 1)
    """
    np.random.seed(3)  # fixed seed: every call produces the same weights

    parameters = {}

    # Walk over consecutive (previous layer size, current layer size) pairs.
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        weights = np.random.randn(n_curr, n_prev) * 0.01
        biases = np.zeros((n_curr, 1))

        assert weights.shape == (n_curr, n_prev)
        assert biases.shape == (n_curr, 1)

        parameters['W' + str(layer)] = weights
        parameters['b' + str(layer)] = biases

    return parameters

### L-Layer Model

Implementation of the forward propagation for L-Layer Model.

**Instructions**: In the code below, the variable `AL` will denote $A^{[L]} = \sigma(Z^{[L]}) = \sigma(W^{[L]} A^{[L-1]} + b^{[L]})$. (This is sometimes also called `Yhat`, i.e., this is $\hat{Y}$.) 

In [33]:
def L_model_forward(X, parameters):
    """
    Run the full forward pass: relu for layers 1..L-1, sigmoid for layer L.

    Arguments:
    X -- input data fed to the first layer
    parameters -- dictionary holding "W1", "b1", ..., "WL", "bL"

    Returns:
    AL -- activation of the final (sigmoid) layer
    caches -- list with one cache per layer, in layer order, as returned by forward_prop
    """
    num_layers = len(parameters) // 2  # every layer contributes one W and one b
    caches = []
    activation = X

    # Hidden layers: 1 .. L-1 (relu).
    for layer in range(1, num_layers):
        W = parameters['W' + str(layer)]
        b = parameters['b' + str(layer)]
        activation, layer_cache = forward_prop(activation, W, b, "relu")
        caches.append(layer_cache)

    # Output layer: L (sigmoid).
    W_out = parameters['W' + str(num_layers)]
    b_out = parameters['b' + str(num_layers)]
    AL, output_cache = forward_prop(activation, W_out, b_out, "sigmoid")
    caches.append(output_cache)

    return AL, caches

### Cost Function
Compute the cross-entropy cost $J$, using the following formula:

$$J = - \frac{1}{m} \sum\limits_{i = 1}^{m} \large{(} \small y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L] (i)}\right) \large{)}$$

In [8]:
def compute_cost(AL, Y, eps=1e-15):
    """
    Computes the cross-entropy cost given in the equation above.

    Arguments:
    AL -- predicted probabilities, same shape as Y
    Y -- true labels (0 or 1); the number of examples is read from Y.shape[1]
    eps -- clipping margin: AL is restricted to [eps, 1 - eps] before the log

    Returns:
    cost -- cross-entropy cost as a Python float
    """
    m = Y.shape[1] # number of examples

    # A saturated sigmoid can output exactly 0 or 1; log(0) = -inf and
    # 0 * -inf = nan would then poison the cost. Clipping keeps it finite.
    # The float64 cast ensures 1 - eps is representable and distinct from 1.
    AL_safe = np.clip(np.asarray(AL, dtype=np.float64), eps, 1 - eps)

    logprobs = np.multiply(np.log(AL_safe), Y) + np.multiply(np.log(1 - AL_safe), 1 - Y)
    cost = - (1/m) * np.sum(logprobs)

    return float(cost)

### Backward Propagation Module

**Backward Propagation for single layer:**

$dZ^{[l]} = dA^{[l]}*g^{[l]'}(Z^{[l]})$

$dW^{[l]} = \frac{1}{m}dZ^{[l]} A^{[l-1]T}$

$db^{[l]} = \frac{1}{m} np.sum(dZ^{[l]}, axis = 1, keepdims=True)$

$dA^{[l-1]} = W^{[l]T}dZ^{[l]}$

The derivative $g^{[l]'}$ for the relu function is:

$$g^{[l]'}(Z) = \begin{cases}
      1 & \text{if}\ Z > 0 \\
      0 & \text{otherwise}
    \end{cases}$$

The derivative $g^{[l]'}$ for the sigmoid function is:

$$s = sigmoid(Z)$$

$$g^{[l]'}(Z) = s * (1-s)$$

In [37]:
def relu_backward(dA, Z):
    """
    Backward pass through a relu unit.

    Arguments:
    dA -- gradient of the cost with respect to the relu output
    Z -- pre-activation values that were fed to relu in the forward pass

    Returns:
    dZ -- copy of dA with entries zeroed wherever Z <= 0
    """
    # Work on a copy so the caller's dA is left untouched.
    grad = np.array(dA, copy=True)

    # relu has zero slope for non-positive inputs, so no gradient flows there.
    inactive = Z <= 0
    grad[inactive] = 0

    assert (grad.shape == Z.shape)

    return grad

def sigmoid_backward(dA, Z):
    """
    Backward pass through a sigmoid unit.

    Arguments:
    dA -- gradient of the cost with respect to the sigmoid output
    Z -- pre-activation values that were fed to sigmoid in the forward pass

    Returns:
    dZ -- dA scaled by the sigmoid derivative s * (1 - s), where s = sigmoid(Z)
    """
    activation = sigmoid(Z)
    complement = 1 - activation
    grad = dA * activation * complement

    assert (grad.shape == Z.shape)

    return grad

In [39]:
def backward_prop(dA, cache, activation):
    """
    Backward propagation for a single layer.

    Arguments:
    dA -- gradient of the cost with respect to this layer's activation
    cache -- tuple ((A_prev, W, b), Z) stored during the forward pass
    activation -- name of the activation used in this layer: "relu" or "sigmoid"

    Returns:
    dA_prev -- gradient with respect to the previous layer's activation (W^T . dZ)
    dW -- gradient with respect to W ((1/m) dZ . A_prev^T)
    db -- gradient with respect to b ((1/m) * row-wise sum of dZ, dims kept)

    Raises:
    ValueError -- if activation is neither "relu" nor "sigmoid"
    """
    # Reject unknown names up front; otherwise dZ would never be assigned and
    # the function would fail with an unrelated UnboundLocalError.
    if activation not in ("relu", "sigmoid"):
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got %r" % (activation,)
        )

    linear_cache, Z = cache
    A_prev, W, _ = linear_cache  # the cached bias is not needed for the gradients

    m = A_prev.shape[1]  # number of examples (columns of A_prev)

    if activation == "relu":
        dZ = relu_backward(dA, Z)
    else:
        dZ = sigmoid_backward(dA, Z)

    dW = (1 / m) * np.dot(dZ, A_prev.T)
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db
    

###  L-Model Backward 

**Initializing backpropagation**:


To backpropagate through this network, we know that the output is:

$A^{[L]} = \sigma(Z^{[L]})$

The code thus needs to compute `dAL` $= \frac{\partial \mathcal{L}}{\partial A^{[L]}}$

To do so, we use this formula:
```python
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)) # derivative of cost with respect to AL
```

In [13]:
def L_model_backward(AL, Y, caches):
    """
    Backward pass for the whole network: sigmoid output layer, relu hidden layers.

    Arguments:
    AL -- output of the forward pass (predicted probabilities)
    Y -- true labels; reshaped to AL's shape before use
    caches -- list of per-layer caches from the forward pass, in layer order

    Returns:
    grads -- dictionary with "dA0".."dA{L-1}", "dW1".."dWL" and "db1".."dbL"
    """
    grads = {}
    L = len(caches) # the number of layers
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL

    # A saturated sigmoid can return exactly 0 or 1, which makes the divisions
    # below produce inf/nan and poisons every gradient. Clip away from the
    # boundaries first (float64 so that 1 - eps is distinct from 1).
    eps = 1e-15
    AL_safe = np.clip(np.asarray(AL, dtype=np.float64), eps, 1 - eps)

    # Derivative of the cross-entropy cost with respect to AL.
    dAL = - (np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))

    # Lth layer (SIGMOID)
    dA_prev, dW, db = backward_prop(dAL, caches[L-1], "sigmoid")
    grads["dA" + str(L-1)] = dA_prev
    grads["dW" + str(L)] = dW
    grads["db" + str(L)] = db

    # Layers L-1 .. 1 (RELU); caches[l] belongs to layer l + 1.
    for l in reversed(range(L-1)):
        dA_prev, dW, db = backward_prop(grads["dA" + str(l+1)], caches[l], "relu")
        grads["dA" + str(l)] = dA_prev
        grads["dW" + str(l + 1)] = dW
        grads["db" + str(l + 1)] = db

    return grads

### Update Parameters Using Gradient Descent

**General gradient descent rule**: $\theta = \theta - \alpha \frac{\partial J }{ \partial \theta }$ where $\alpha$ is the learning rate and $\theta$ represents a parameter.

In [44]:
def update_parameters(params, grads, learning_rate):
    """
    Apply one gradient-descent step: theta = theta - learning_rate * d(theta).

    Arguments:
    params -- dictionary with "W1", "b1", ..., "WL", "bL"; left unmodified
    grads -- dictionary with the matching "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size of the update

    Returns:
    parameters -- deep copy of params with every W and b updated
    """
    updated = copy.deepcopy(params)  # never mutate the caller's dictionary
    num_layers = len(updated) // 2   # one W and one b per layer

    for layer in range(1, num_layers + 1):
        suffix = str(layer)
        updated["W" + suffix] -= learning_rate * grads["dW" + suffix]
        updated["b" + suffix] -= learning_rate * grads["db" + suffix]

    return updated

In [66]:
def L_layer_model(X, Y, dim_layers, num_iterations = 1000, learning_rate = 0.01, print_cost = False):
    """
    Train the L-layer network with batch gradient descent.

    Arguments:
    X -- input data passed to L_model_forward
    Y -- true labels passed to compute_cost and L_model_backward
    dim_layers -- list of layer sizes passed to initialize_parameters_deep
    num_iterations -- number of gradient-descent steps
    learning_rate -- step size used by update_parameters
    print_cost -- if True, print cost and accuracy every 1000 iterations and
                  on the final iteration

    Returns:
    parameters -- parameters after the last update
    costs -- cost recorded every 1000 iterations and on the final iteration
    """
    # NOTE: initialize_parameters_deep reseeds NumPy itself, so this seed does
    # not influence the initial weights; kept for backward compatibility.
    np.random.seed(1)
    costs = []
    parameters = initialize_parameters_deep(dim_layers)

    for i in range(num_iterations):

        AL, caches = L_model_forward(X, parameters)
        cost = compute_cost(AL, Y)

        grads = L_model_backward(AL, Y, caches)

        parameters = update_parameters(parameters, grads, learning_rate)

        # Checkpoint every 1000 iterations and on the last one. The last
        # iteration index is num_iterations - 1, not num_iterations.
        is_checkpoint = i % 1000 == 0 or i == num_iterations - 1
        if is_checkpoint:
            costs.append(cost)
            # Printing stays gated by print_cost on every checkpoint,
            # including the final iteration.
            if print_cost:
                # Same "> 0.5" decision threshold as predict().
                accuracy = np.mean((AL > 0.5) == Y) * 100
                print("Cost after iteration %i: %f, Accuracy: %.2f%%" % (i, cost, accuracy))

    return parameters, costs
    

### Predict

$y_{prediction} = \mathbb 1 * \text{(activation > 0.5)} = \begin{cases}
      1 & \text{if}\ activation > 0.5 \\
      0 & \text{otherwise}
    \end{cases}$ 

In [68]:
def predict(X, parameters):
    """
    Predict binary labels for X with a trained network.

    Arguments:
    X -- input data passed to L_model_forward
    parameters -- trained parameters dictionary

    Returns:
    Array of 0/1 labels: 1 where the output activation is > 0.5, else 0
    """
    probabilities, _ = L_model_forward(X, parameters)
    labels = np.where(probabilities > 0.5, 1, 0)
    return labels

In [67]:
# Toy training set: one row per feature, one column per training example.
X = np.array([
    [0, 1, 4, 5, 2, 5, 3],
    [2, 3, 5, 6, 2, 7, 4],
])  # input features, shape = (nx, m)
Y = np.array([[0, 0, 1, 1, 0, 1, 0]])  # output labels, one per example

# Layer sizes: 2 inputs -> 5 hidden units -> 1 output unit.
parameters, costs = L_layer_model(
    X,
    Y,
    [2, 5, 1],
    num_iterations=10000,
    learning_rate=0.01,
    print_cost=True,
)



Cost after iteration 0: 0.693310, Accuracy: 57.14%
Cost after iteration 1000: 0.472283, Accuracy: 85.71%
Cost after iteration 2000: 0.262052, Accuracy: 85.71%
Cost after iteration 3000: 0.175653, Accuracy: 100.00%
Cost after iteration 4000: 0.127086, Accuracy: 100.00%
Cost after iteration 5000: 0.099495, Accuracy: 100.00%
Cost after iteration 6000: 0.080136, Accuracy: 100.00%
Cost after iteration 7000: 0.065600, Accuracy: 100.00%
Cost after iteration 8000: 0.054514, Accuracy: 100.00%
Cost after iteration 9000: 0.045941, Accuracy: 100.00%
Cost after iteration 9999: 0.039232, Accuracy: 100.00%


In [54]:
# Display the first-layer weight matrix stored in the parameters dictionary.
parameters["W1"]

array([[ 0.01781109,  0.00429925],
       [ 0.00096497, -0.01863493],
       [-0.00277388, -0.00354759],
       [-0.00082741, -0.00627001],
       [-0.00043818, -0.00477218]])

In [69]:
# Classify the training inputs with the learned parameters and show the labels.
predictions = predict(X, parameters)

print("Predictions:", predictions, sep="\n")

Predictions:
[[0 0 1 1 0 1 0]]
