# Building your Deep Neural Network: Step by Step

## 1 - Packages

In [3]:
# Import necessary libraries
import numpy as np  # For numerical operations and matrix manipulations
import h5py  # For working with HDF5 file formats (used in storing large datasets)
import matplotlib.pyplot as plt  # For data visualization
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward  # Activation functions and their gradients

# Seed NumPy's global generator so the random values in this notebook repeat across runs
np.random.seed(1)

# Matplotlib defaults applied to every figure drawn below
plt.rcParams.update({
    'figure.figsize': (5.0, 4.0),        # default plot size
    'image.interpolation': 'nearest',    # show images without interpolation
    'image.cmap': 'gray',                # grayscale colormap by default
})

# Test-case helpers: the *_test_case() functions called in the check cells below supply predefined inputs
from testCases_v4a import *  # Importing test cases for validation of neural network models


## 3 - Initialization

### 3.1 - 2-layer Neural Network

In [6]:
import numpy as np  # Import numpy for numerical computations

# FUNCTION: initialize_parameters
def initialize_parameters(n_x, n_h, n_y):
    """
    Build the weights and biases of a two-layer network (one hidden layer).

    Weights are sampled from a standard normal distribution and scaled by 0.01;
    biases are all zeros. The global NumPy seed is reset to 1 on every call, so
    repeated calls with the same sizes return the same values.

    Arguments:
    n_x -- size of the input layer (number of features)
    n_h -- size of the hidden layer (number of hidden units)
    n_y -- size of the output layer (number of output classes)

    Returns:
    parameters -- Python dictionary with keys, in this order:
                  "W1" -- weight matrix of shape (n_h, n_x)
                  "b1" -- bias vector of shape (n_h, 1)
                  "W2" -- weight matrix of shape (n_y, n_h)
                  "b2" -- bias vector of shape (n_y, 1)
    """
    # Re-seed the global generator: this makes the draws below deterministic
    np.random.seed(1)

    # Expected shape of every array, in the order the arrays are created
    expected_shapes = (
        ("W1", (n_h, n_x)),
        ("b1", (n_h, 1)),
        ("W2", (n_y, n_h)),
        ("b2", (n_y, 1)),
    )

    # The dict entries are evaluated top to bottom, so W1 is sampled before W2
    parameters = {
        "W1": np.random.randn(n_h, n_x) * 0.01,
        "b1": np.zeros((n_h, 1)),
        "W2": np.random.randn(n_y, n_h) * 0.01,
        "b2": np.zeros((n_y, 1)),
    }

    # Sanity check on every array's dimensions
    for name, expected in expected_shapes:
        actual = parameters[name].shape
        assert actual == expected, f"{name} shape should be {expected}, but got {actual}"

    return parameters


In [7]:
# Build a small network: 3 input features, 2 hidden units, 1 output unit
parameters = initialize_parameters(3, 2, 1)

# Show every initialized array, one per line
print(
    f"W1 = {parameters['W1']}",  # Weight matrix for layer 1
    f"b1 = {parameters['b1']}",  # Bias vector for layer 1
    f"W2 = {parameters['W2']}",  # Weight matrix for layer 2
    f"b2 = {parameters['b2']}",  # Bias vector for layer 2
    sep="\n",
)


W1 = [[ 0.01624345 -0.00611756 -0.00528172]
 [-0.01072969  0.00865408 -0.02301539]]
b1 = [[0.]
 [0.]]
W2 = [[ 0.01744812 -0.00761207]]
b2 = [[0.]]


### 3.2 - L-layer Neural Network

In [9]:
import numpy as np  # Ensure numpy is imported

# FUNCTION: initialize_parameters_deep
def initialize_parameters_deep(layer_dims):
    """
    Create the weights and biases for a network with an arbitrary number of layers.

    For every layer l >= 1 the weight matrix is sampled from a standard normal
    distribution scaled by 0.01 and the bias vector is all zeros. The global
    NumPy seed is reset to 3 on every call, so the result is deterministic.

    Arguments:
    layer_dims -- list containing the dimensions of each layer in the network (including input and output layers)

    Returns:
    parameters -- dictionary with keys "W1", "b1", ..., "WL", "bL", where L = len(layer_dims) - 1:
                  Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                  bl -- bias vector of shape (layer_dims[l], 1)
    """
    np.random.seed(3)  # Fixed seed: identical values on every call
    parameters = {}

    # Index 0 is the input layer, which has no parameters of its own
    for l in range(1, len(layer_dims)):
        n_prev = layer_dims[l - 1]
        n_curr = layer_dims[l]

        weights = np.random.randn(n_curr, n_prev) * 0.01
        biases = np.zeros((n_curr, 1))

        # Shape sanity checks before storing
        assert weights.shape == (n_curr, n_prev), f"Shape mismatch in W{l}"
        assert biases.shape == (n_curr, 1), f"Shape mismatch in b{l}"

        parameters[f'W{l}'] = weights
        parameters[f'b{l}'] = biases

    return parameters


In [10]:
# Three layer sizes -> two parameterized layers (5 -> 4 -> 3)
parameters = initialize_parameters_deep([5, 4, 3])

# Display the weights and biases of both layers
print(f"W1 = {parameters['W1']}")  # Weight matrix for layer 1
print(f"b1 = {parameters['b1']}")  # Bias vector for layer 1
print(f"W2 = {parameters['W2']}")  # Weight matrix for layer 2
print(f"b2 = {parameters['b2']}")  # Bias vector for layer 2


W1 = [[ 0.01788628  0.0043651   0.00096497 -0.01863493 -0.00277388]
 [-0.00354759 -0.00082741 -0.00627001 -0.00043818 -0.00477218]
 [-0.01313865  0.00884622  0.00881318  0.01709573  0.00050034]
 [-0.00404677 -0.0054536  -0.01546477  0.00982367 -0.01101068]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[-0.01185047 -0.0020565   0.01486148  0.00236716]
 [-0.01023785 -0.00712993  0.00625245 -0.00160513]
 [-0.00768836 -0.00230031  0.00745056  0.01976111]]
b2 = [[0.]
 [0.]
 [0.]]


## 4 - Forward propagation module

### 4.1 - Linear Forward

In [13]:
import numpy as np  # Ensure numpy is imported

# FUNCTION: linear_forward
def linear_forward(A, W, b):
    """
    Compute the linear (pre-activation) step of one layer: Z = W.A + b.

    Arguments:
    A -- Activations from the previous layer (or input data), of shape (size of previous layer, number of examples)
    W -- Weights matrix, numpy array of shape (size of current layer, size of previous layer)
    b -- Bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- Pre-activation values, of shape (size of current layer, number of examples)
    cache -- The tuple (A, W, b), kept so the backward pass can reuse the inputs
    """
    # Keep references to the inputs for backward propagation
    cache = (A, W, b)

    # b has a single column and is broadcast across all examples
    Z = np.dot(W, A) + b

    # One row per unit of this layer, one column per example
    expected_shape = (W.shape[0], A.shape[1])
    assert Z.shape == expected_shape, "Shape mismatch: Z has incorrect dimensions"

    return Z, cache


In [14]:
# Fetch sample inputs from the test-case helper
A, W, b = linear_forward_test_case()

# Run the linear step; the returned cache is what the backward pass would consume
Z, linear_cache = linear_forward(A=A, W=W, b=b)

print(f"Z = {Z}")

Z = [[ 3.26295337 -1.23429987]]


### 4.2 - Linear Activation Forward

In [16]:
# FUNCTION: linear_activation_forward
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the linear -> activation layer.

    Arguments:
    A_prev -- Activations from previous layer (or input data), of shape (size of previous layer, number of examples)
    W -- Weights matrix, numpy array of shape (size of current layer, size of previous layer)
    b -- Bias vector, numpy array of shape (size of the current layer, 1)
    activation -- The activation function to be used, either "sigmoid" or "relu"

    Returns:
    A -- The output of the activation function (post-activation value)
    cache -- Tuple (linear_cache, activation_cache), stored for backpropagation

    Raises:
    ValueError -- if `activation` is neither "sigmoid" nor "relu"
    """
    # Reject unknown activations before doing any work. Without this check an
    # unsupported name would fall through and fail later with an unrelated
    # "local variable referenced before assignment" error. The message matches
    # the one raised by linear_activation_backward.
    if activation not in ("sigmoid", "relu"):
        raise ValueError("Invalid activation function specified.")

    # The linear step is identical for both activations
    Z, linear_cache = linear_forward(A_prev, W, b)  # Z is the pre-activation parameter

    # Each activation helper is unpacked as (post-activation value, cache)
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    # Ensure the output's shape matches the expected dimensions
    assert A.shape == (W.shape[0], A_prev.shape[1]), "Shape mismatch: A has incorrect dimensions"

    # Store both linear cache and activation cache for backward pass
    cache = (linear_cache, activation_cache)

    return A, cache


In [17]:
# Sample A_prev, W and b from the test-case helper
A_prev, W, b = linear_activation_forward_test_case()

# Same inputs, sigmoid activation
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "sigmoid")
print(f"With sigmoid: A = {A}")

# Same inputs, ReLU activation (A and the cache are overwritten)
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "relu")
print(f"With ReLU: A = {A}")


With sigmoid: A = [[0.96890023 0.11013289]]
With ReLU: A = [[3.43896131 0.        ]]


### 4.3 - L-Layer Model

In [19]:
def L_model_forward(X, parameters):
    """
    Run the full forward pass: [LINEAR->RELU]*(L-1) followed by LINEAR->SIGMOID.

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    parameters -- dictionary holding "W1", "b1", ..., "WL", "bL"

    Returns:
    AL -- post-activation output of the last layer, asserted to have shape (1, number of examples)
    caches -- list with one cache per layer (as returned by linear_activation_forward), ordered from layer 1 to layer L
    """
    # Two entries (W and b) per layer
    num_layers = len(parameters) // 2
    caches = []

    # Hidden layers 1 .. L-1 use ReLU; the input data feeds the first one
    activation_in = X
    for layer in range(1, num_layers):
        activation_in, layer_cache = linear_activation_forward(
            activation_in,
            parameters[f'W{layer}'],
            parameters[f'b{layer}'],
            activation="relu",
        )
        caches.append(layer_cache)

    # Output layer L uses sigmoid
    AL, output_cache = linear_activation_forward(
        activation_in,
        parameters[f'W{num_layers}'],
        parameters[f'b{num_layers}'],
        activation="sigmoid",
    )
    caches.append(output_cache)

    # A single output row, one column per example
    assert AL.shape == (1, X.shape[1]), f"Expected AL to be of shape (1, {X.shape[1]})"

    return AL, caches


In [20]:
# Demonstrate L_model_forward on the provided test case

# Input data X and a ready-made parameter dictionary
X, parameters = L_model_forward_test_case_2hidden()

# Full forward pass
AL, caches = L_model_forward(X, parameters)

# Final activation and how many per-layer caches were collected
print("AL =", AL)
print("Length of caches list =", len(caches))


AL = [[0.03921668 0.70498921 0.19734387 0.04728177]]
Length of caches list = 3


## 5 - Cost function

In [22]:
import numpy as np

def compute_cost(AL, Y):
    """
    Compute the cross-entropy cost for binary classification.

    cost = -(1/m) * sum( Y * log(AL) + (1 - Y) * log(1 - AL) )

    Predictions are clipped into the open interval (0, 1) before the logarithms
    are taken, so a saturated prediction (exactly 0.0 or 1.0) produces a finite
    cost instead of inf/NaN. Values strictly between 0 and 1 are left untouched;
    values outside [0, 1] are clipped to the nearest bound as well.

    Arguments:
    AL -- probability vector from the model's output, shape (1, number of examples)
    Y -- true label vector (1 if true, 0 if false), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost as a 0-dimensional numpy value
    """

    m = Y.shape[1]  # Number of examples

    # log(0) is -inf and 0 * -inf is NaN, which would poison the whole sum even
    # for a perfectly classified example. Clip to the closest representable
    # float64 values inside (0, 1) to keep every term finite.
    lowest = np.nextafter(0.0, 1.0)   # smallest positive float64
    highest = np.nextafter(1.0, 0.0)  # largest float64 below 1.0
    AL_safe = np.clip(np.asarray(AL, dtype=np.float64), lowest, highest)

    # Compute the cross-entropy cost
    cost = (-1 / m) * (np.dot(Y, np.log(AL_safe).T) + np.dot(1 - Y, np.log(1 - AL_safe).T))

    # Ensure the cost is a scalar (e.g., turn [[17]] into 17)
    cost = np.squeeze(cost)

    # Validate the cost is a scalar value
    assert cost.ndim == 0, "Cost should be a scalar."

    return cost


In [23]:
# Labels and predicted probabilities from the test-case helper (note the order: Y first)
Y, AL = compute_cost_test_case()

# Evaluate the cost on that sample and show it
cost = compute_cost(AL=AL, Y=Y)
print(f"Cost: {cost}")


Cost: 0.2797765635793422


## 6 - Backward propagation module

### 6.1 - Linear backward

In [26]:
def linear_backward(dZ, cache):
    """
    Backward pass through the linear part (Z = W.A_prev + b) of a single layer.

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output of the current layer.
    cache -- Tuple (A_prev, W, b) saved by the forward pass of the current layer.

    Returns:
    dA_prev -- Gradient of the cost with respect to the previous layer's activation (same shape as A_prev).
    dW -- Gradient of the cost with respect to W of the current layer (same shape as W).
    db -- Gradient of the cost with respect to b of the current layer, shape (W.shape[0], 1).
    """
    # b itself is not needed for any of the gradients
    A_prev, W, _ = cache

    # Averaging factor: one column of A_prev per example
    inv_m = 1 / A_prev.shape[1]

    dA_prev = np.dot(W.T, dZ)                           # propagate back to the previous layer
    dW = inv_m * np.dot(dZ, A_prev.T)                   # averaged over the examples
    db = inv_m * np.sum(dZ, axis=1, keepdims=True)      # row-wise mean, kept as a column

    # Each gradient must match the shape of the quantity it belongs to
    assert dA_prev.shape == A_prev.shape, "Shape of dA_prev is incorrect"
    assert dW.shape == W.shape, "Shape of dW is incorrect"
    assert db.shape == (W.shape[0], 1), "Shape of db is incorrect"

    return dA_prev, dW, db


In [27]:
# Sample dZ and the matching linear cache from the test-case helper
dZ, linear_cache = linear_backward_test_case()

# Backward pass through the linear step
dA_prev, dW, db = linear_backward(dZ, linear_cache)

# Show the three gradients, one per line
print(
    f"dA_prev = {dA_prev}",
    f"dW = {dW}",
    f"db = {db}",
    sep="\n",
)


dA_prev = [[-1.15171336  0.06718465 -0.3204696   2.09812712]
 [ 0.60345879 -3.72508701  5.81700741 -3.84326836]
 [-0.4319552  -1.30987417  1.72354705  0.05070578]
 [-0.38981415  0.60811244 -1.25938424  1.47191593]
 [-2.52214926  2.67882552 -0.67947465  1.48119548]]
dW = [[ 0.07313866 -0.0976715  -0.87585828  0.73763362  0.00785716]
 [ 0.85508818  0.37530413 -0.59912655  0.71278189 -0.58931808]
 [ 0.97913304 -0.24376494 -0.08839671  0.55151192 -0.10290907]]
db = [[-0.14713786]
 [-0.11313155]
 [-0.13209101]]


### 6.2 - Linear-Activation backward

In [29]:
def linear_activation_backward(dA, cache, activation):
    """
    Backward pass through one LINEAR->ACTIVATION layer.

    Arguments:
    dA -- Post-activation gradient for the current layer (l).
    cache -- Tuple (linear_cache, activation_cache) saved during forward propagation.
    activation -- Activation function used in this layer: "sigmoid" or "relu".

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation of the previous layer (l-1).
    dW -- Gradient of the cost with respect to W (current layer l).
    db -- Gradient of the cost with respect to b (current layer l).

    Raises:
    ValueError -- if `activation` is neither "relu" nor "sigmoid".
    """
    linear_cache, activation_cache = cache

    # Guard clause: only the two activations below are supported
    if activation not in ("relu", "sigmoid"):
        raise ValueError("Invalid activation function specified.")

    # Select the matching activation-gradient helper and undo the activation
    activation_grad = relu_backward if activation == "relu" else sigmoid_backward
    dZ = activation_grad(dA, activation_cache)

    # Then undo the linear step
    grad_A_prev, grad_W, grad_b = linear_backward(dZ, linear_cache)

    return grad_A_prev, grad_W, grad_b


In [30]:
# Sample upstream gradient and layer cache from the test-case helper
dAL, linear_activation_cache = linear_activation_backward_test_case()

# Backward pass assuming a sigmoid layer; the trailing "" leaves a blank line after the block
dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, "sigmoid")
print(
    "Activation: sigmoid",
    f"dA_prev = {dA_prev}",
    f"dW = {dW}",
    f"db = {db}",
    "",
    sep="\n",
)

# Backward pass assuming a ReLU layer (overwrites the gradients above)
dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, "relu")
print(
    "Activation: relu",
    f"dA_prev = {dA_prev}",
    f"dW = {dW}",
    f"db = {db}",
    sep="\n",
)


Activation: sigmoid
dA_prev = [[ 0.11017994  0.01105339]
 [ 0.09466817  0.00949723]
 [-0.05743092 -0.00576154]]
dW = [[ 0.10266786  0.09778551 -0.01968084]]
db = [[-0.05729622]]

Activation: relu
dA_prev = [[ 0.44090989  0.        ]
 [ 0.37883606  0.        ]
 [-0.2298228   0.        ]]
dW = [[ 0.44513824  0.37371418 -0.10478989]]
db = [[-0.20837892]]


### 6.3 - L-Model Backward

In [56]:
def L_model_backward(AL, Y, caches):
    """
    Implements the backward propagation for the entire network:
    one LINEAR->SIGMOID output layer preceded by (L-1) LINEAR->RELU layers.

    Arguments:
    AL -- Probability vector from forward propagation (output layer)
    Y -- True labels vector (reshaped internally to the shape of AL)
    caches -- List of caches from forward propagation, one per layer, where each
              cache is the (linear_cache, activation_cache) pair of that layer;
              caches[l] belongs to layer l+1

    Returns:
    grads -- A dictionary containing gradients for each layer:
             grads[f"dA{l}"] -> Gradient of the cost with respect to the activations of layer l, for l = 0 .. L-1
                                (dA0 is the gradient with respect to the network input)
             grads[f"dW{l}"] -> Gradient of the cost with respect to the weights of layer l, for l = 1 .. L
             grads[f"db{l}"] -> Gradient of the cost with respect to the biases of layer l, for l = 1 .. L
    """

    grads = {}  # Initialize dictionary to store gradients
    L = len(caches)  # Number of layers in the neural network
    Y = Y.reshape(AL.shape)  # Reshape Y to ensure the same shape as AL

    # Derivative of the binary cross-entropy loss with respect to AL.
    # NOTE: this divides by AL and by (1 - AL), so it is undefined (inf/NaN)
    # when a prediction is exactly 0 or 1.
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer (layer L, stored at caches[L - 1]) uses the sigmoid activation
    grads[f"dA{L-1}"], grads[f"dW{L}"], grads[f"db{L}"] = linear_activation_backward(
        dAL, caches[L - 1], activation="sigmoid"
    )

    # Remaining ReLU layers, walked backwards: cache index l = L-2 .. 0,
    # i.e. layers L-1 .. 1. Each step consumes dA{l+1} and produces dA{l}.
    for l in reversed(range(L - 1)):
        grads[f"dA{l}"], grads[f"dW{l+1}"], grads[f"db{l+1}"] = linear_activation_backward(
            grads[f"dA{l+1}"], caches[l], activation="relu"
        )

    return grads


In [62]:
# Predictions, labels and per-layer caches from the test-case helper
AL, Y_assess, caches = L_model_backward_test_case()

# Full backward pass over every cached layer
grads = L_model_backward(AL=AL, Y=Y_assess, caches=caches)

# Print the gradients with the provided helper
print_grads(grads)


dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.12913162 -0.44014127]
 [-0.14175655  0.48317296]
 [ 0.01663708 -0.05670698]]


### 6.4 - Update parameters

In [67]:
def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent: P <- P - learning_rate * dP.

    The `parameters` dictionary is updated in place (each key is rebound) and
    also returned. The arrays originally stored in it are NOT modified; every
    updated value is a newly allocated array.

    Arguments:
    parameters -- Python dictionary containing parameters:
                  "W1", "b1", ..., "WL", "bL" for each layer L in the network
    grads -- Python dictionary containing gradients:
             "dW1", "db1", ..., "dWL", "dbL" from backpropagation
    learning_rate -- Scalar, the learning rate for gradient descent

    Returns:
    parameters -- The same dictionary object, with updated values:
                  parameters["W1"], parameters["b1"], ..., parameters["WL"], parameters["bL"]
    """

    L = len(parameters) // 2  # Number of layers in the neural network (one W and one b each)

    # Update parameters for each layer
    for l in range(1, L + 1):
        # Rebind to a new array instead of using "-=": an in-place subtraction
        # would silently change every other reference to the same array and
        # fails with a casting error when a parameter has an integer dtype.
        parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * grads[f"dW{l}"]
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * grads[f"db{l}"]

    return parameters


In [71]:
# Sample parameters and gradients from the test-case helper
parameters, grads = update_parameters_test_case()

# One gradient-descent step with a learning rate of 0.1
parameters = update_parameters(parameters, grads, 0.1)

# Show the updated W1, b1, W2 and b2
print("W1 =", parameters["W1"])
print("b1 =", parameters["b1"])
print("W2 =", parameters["W2"])
print("b2 =", parameters["b2"])


W1 = [[-0.59562069 -0.09991781 -2.14584584  1.82662008]
 [-1.76569676 -0.80627147  0.51115557 -1.18258802]
 [-1.0535704  -0.86128581  0.68284052  2.20374577]]
b1 = [[-0.04659241]
 [-1.28888275]
 [ 0.53405496]]
W2 = [[-0.55569196  0.0354055   1.32964895]]
b2 = [[-0.84610769]]
