In [1]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import copy
np.random.seed(1)

<h1>Helper functions for L_layers_NN class</h1>

In [2]:
def initialize_parameters_deep(layers_dims):
    """
    Build the initial weights and biases for every layer of the network.

    Weights use He initialization: standard-normal samples scaled by
    sqrt(2 / fan_in), where fan_in is the size of the previous layer.
    Biases start at zero.

    Arguments:
    layers_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing the parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layers_dims[l], layers_dims[l-1])
                    bl -- bias vector of shape (layers_dims[l], 1)
    """
    parameters = {}

    # Walk consecutive (previous layer, current layer) size pairs; layer numbering starts at 1.
    size_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(size_pairs, start=1):
        he_scale = np.sqrt(2.0 / fan_in)
        parameters[f"W{layer}"] = np.random.randn(fan_out, fan_in) * he_scale
        parameters[f"b{layer}"] = np.zeros((fan_out, 1))

    return parameters



    
    


In [3]:
# Quick check of the initializer on a small 5-4-3 network; the bare name on the
# last line makes the notebook display the resulting dictionary.
parameters = initialize_parameters_deep([5,4,3])
parameters

{'W1': array([[ 1.02732621, -0.38690873, -0.33404515, -0.67860494,  0.54733184],
        [-1.45562088,  1.10351585, -0.48142952,  0.20177804, -0.15771567],
        [ 0.92471825, -1.30294739, -0.20391454, -0.2428973 ,  0.71705876],
        [-0.69563232, -0.10905317, -0.55520641,  0.02669832,  0.36860471]]),
 'b1': array([[0.],
        [0.],
        [0.],
        [0.]]),
 'W2': array([[-0.77825528,  0.8094419 ,  0.63752091,  0.35531715],
        [ 0.63700135, -0.48346861, -0.08689651, -0.66168891],
        [-0.18942548,  0.37501795, -0.48907801, -0.28054711]]),
 'b2': array([[0.],
        [0.],
        [0.]])}

In [4]:
def linear_forward(A, W, b):
    """
    Compute the affine (pre-activation) output of one layer: Z = W A + b.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter
    cache -- the tuple (A, W, b), kept so the backward pass can reuse these values
    """
    # b is broadcast across the columns (one column per example) of W A.
    pre_activation = np.matmul(W, A) + b

    return pre_activation, (A, W, b)

In [5]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Run one LINEAR->ACTIVATION step of forward propagation.

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- name of the activation to apply: "relu", "sigmoid" or "linear"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- the tuple (linear_cache, activation_cache), where activation_cache is the
             pre-activation Z; stored for computing the backward pass efficiently

    Raises:
    ValueError -- if activation is not one of the supported names
    """
    pre_activation, linear_cache = linear_forward(A_prev, W, b)

    if activation == "linear":
        # Identity activation: the output is the pre-activation itself.
        A = pre_activation
    elif activation == "sigmoid":
        A = 1 / (1 + np.exp(-pre_activation))
    elif activation == "relu":
        A = np.maximum(0, pre_activation)
    else:
        raise ValueError(f"Unknown activation: {activation}")

    return A, (linear_cache, pre_activation)


        
    

In [6]:
def L_model_forward(X, parameters):
    """
    Forward propagation through the whole network:
    [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()

    Returns:
    AL -- activation value from the output (last) layer
    caches -- list with one cache per layer, as returned by linear_activation_forward()
              (there are L of them, indexed from 0 to L-1)
    """
    # Each layer contributes one "W" and one "b" entry to the dictionary.
    num_layers = len(parameters) // 2

    caches = []
    activations = X

    # Hidden layers 1 .. L-1 use ReLU.
    for layer in range(1, num_layers):
        activations, hidden_cache = linear_activation_forward(
            activations,
            parameters[f"W{layer}"],
            parameters[f"b{layer}"],
            activation="relu",
        )
        caches.append(hidden_cache)

    # The output layer L uses a sigmoid.
    AL, output_cache = linear_activation_forward(
        activations,
        parameters[f"W{num_layers}"],
        parameters[f"b{num_layers}"],
        activation="sigmoid",
    )
    caches.append(output_cache)

    return AL, caches

In [7]:
def compute_cost(AL, Y, lambd, params):
    """
    Compute the L2-regularized binary cross-entropy cost.

    cost = -(1/m) * sum(Y*log(AL) + (1-Y)*log(1-AL)) + (lambd / (2*m)) * sum over all W of ||W||^2

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)
    lambd -- L2 regularization strength; 0 disables the penalty
    params -- python dictionary of parameters; every entry whose key starts with "W"
              contributes its sum of squares to the penalty (biases are not penalized)

    Returns:
    cost -- regularized cross-entropy cost, as a 0-dimensional numpy array
    """
    m = Y.shape[1]

    # A saturated sigmoid can output exactly 0 or 1; log(0) is -inf and
    # 0 * -inf is NaN, which would poison the cost. Clipping keeps the
    # logarithms finite and leaves values inside (eps, 1 - eps) unchanged.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    cross_entropy = -(1 / m) * (Y @ np.log(AL_safe).T + (1 - Y) @ np.log(1 - AL_safe).T)

    # Only the weight matrices are regularized.
    L2_term = sum(
        np.sum(np.square(value))
        for key, value in params.items()
        if key.startswith("W")
    )

    cost = cross_entropy + (lambd / (2 * m)) * L2_term

    assert cost.shape == (1, 1)

    return np.squeeze(cost)

In [8]:
def linear_backward(dZ, cache):
    """
    Backward pass through the linear part of a single layer (layer l).

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    # The cached bias is not needed to compute any of the gradients.
    A_prev, W, _ = cache
    num_examples = A_prev.shape[1]

    # Gradients of the parameters are averaged over the examples (columns).
    dW = np.matmul(dZ, A_prev.T) / num_examples
    db = dZ.sum(axis=1, keepdims=True) / num_examples

    # The gradient passed to the previous layer is not averaged.
    dA_prev = np.matmul(W.T, dZ)

    return dA_prev, dW, db
    
    

In [9]:
def relu_derivative(Z):
    """
    Derivative of ReLU evaluated element-wise at Z.

    Returns an array of floats that is 1.0 where Z is strictly positive and
    0.0 elsewhere (so the derivative at exactly 0 is taken to be 0).
    """
    is_positive = Z > 0

    return is_positive.astype(float)

def sigmoid_derivative(Z):
    """
    Derivative of the sigmoid evaluated element-wise at Z.

    Uses the identity sigmoid'(Z) = sigmoid(Z) * (1 - sigmoid(Z)).
    """
    exp_neg = np.exp(-Z)
    sigmoid = 1 / (1 + exp_neg)

    return sigmoid * (1 - sigmoid)

In [10]:
def linear_activation_backward(dA, cache, activation):
    """
    Backward pass through one LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) stored during forward propagation;
             activation_cache is the pre-activation Z
    activation -- name of the activation used in this layer: "relu", "sigmoid" or "linear"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError -- if activation is not one of the supported names
    """
    linear_cache, Z = cache

    # Chain rule: dZ = dA * g'(Z), where g is the layer's activation.
    if activation == "linear":
        # The identity activation has derivative 1, so the gradient passes through.
        dZ = dA
    elif activation == "sigmoid":
        dZ = dA * sigmoid_derivative(Z)
    elif activation == "relu":
        dZ = dA * relu_derivative(Z)
    else:
        raise ValueError(f"Unknown activation {activation}")

    return linear_backward(dZ, linear_cache)

In [11]:
def loss_function_derivative(AL, Y):
    """
    Derivative of the binary cross-entropy loss with respect to the output activation AL.

    dAL = -(Y / AL) + (1 - Y) / (1 - AL), computed element-wise.

    Arguments:
    AL -- predicted probabilities (output of the last layer)
    Y -- true labels, same shape as AL (or broadcastable to it)

    Returns:
    dAL -- gradient of the loss with respect to AL, always finite
    """
    # A saturated sigmoid can output exactly 0 or 1, which would divide by
    # zero here (inf, or NaN for 0/0) and then spread NaNs through every
    # gradient. Clipping bounds the result and leaves values inside
    # (eps, 1 - eps) unchanged.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    dAL = - (Y / AL_safe) + ((1 - Y) / (1 - AL_safe))

    return dAL

In [12]:
def L_model_backward(AL, Y, caches):
    """
    Backward propagation through the whole [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID network.

    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = gradient w.r.t. the activations of layer l (l = 0 is the input)
             grads["dW" + str(l)] = gradient w.r.t. the weights of layer l
             grads["db" + str(l)] = gradient w.r.t. the biases of layer l
    """
    num_layers = len(caches)
    grads = {}

    # Output layer (sigmoid): start from the derivative of the loss w.r.t. AL.
    upstream, dW, db = linear_activation_backward(
        loss_function_derivative(AL, Y), caches[num_layers - 1], "sigmoid"
    )
    grads[f"dA{num_layers - 1}"] = upstream
    grads[f"dW{num_layers}"] = dW
    grads[f"db{num_layers}"] = db

    # Hidden layers (ReLU), from the last hidden layer back to the first.
    # caches[idx] belongs to layer idx + 1, and its dA output feeds layer idx.
    for idx in range(num_layers - 2, -1, -1):
        upstream, dW, db = linear_activation_backward(upstream, caches[idx], "relu")
        grads[f"dA{idx}"] = upstream
        grads[f"dW{idx + 1}"] = dW
        grads[f"db{idx + 1}"] = db

    return grads


In [13]:
def update_parameters_with_regularization(params, grads, learning_rate, lambd):
    """
    One gradient-descent step with L2 regularization (weight decay) on the weights.

    W := W - learning_rate * (dW + (lambd / m) * W)
    b := b - learning_rate * db

    Arguments:
    params -- python dictionary containing your parameters; it is deep-copied, not modified
    grads -- python dictionary containing your gradients, output of L_model_backward
    learning_rate -- step size of the update
    lambd -- L2 regularization strength

    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """
    parameters = copy.deepcopy(params)
    num_layers = len(parameters) // 2  # one "W" and one "b" entry per layer

    # The number of examples is read from the column count of a stored activation gradient.
    m = grads[f"dA{num_layers - 1}"].shape[1]

    for layer in range(1, num_layers + 1):
        w_key = f"W{layer}"
        b_key = f"b{layer}"

        # Only the weights receive the L2 penalty gradient; biases are not regularized.
        regularized_dW = grads[f"dW{layer}"] + (lambd / m) * parameters[w_key]
        parameters[w_key] -= learning_rate * regularized_dW
        parameters[b_key] -= learning_rate * grads[f"db{layer}"]

    return parameters

In [14]:
# TODO: continue the implementation following the Coursera lab (using the same seed), making sure to understand each step.
# Optionally, go back to the previous functions and print the layer indices to see which layers are being processed.

In [15]:
#https://www.coursera.org/learn/neural-networks-deep-learning/programming/Sfu8g/deep-neural-network-application

In [16]:
def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False, lambd = 0.01):
    """
    Train an L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID,
    using batch gradient descent with L2 regularization.

    Arguments:
    X -- input data, of shape (n_x, number of examples)
    Y -- true "label" vector (containing 1 if cat, 0 if non-cat), of shape (1, number of examples)
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 100 steps and on the last iteration
    lambd -- L2 regularization strength, used in both the cost and the update

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    costs -- list of (iteration, cost) tuples recorded every 100 iterations
    """
    parameters = initialize_parameters_deep(layers_dims)
    costs = []
    last_iteration = num_iterations - 1

    for iteration in range(num_iterations):
        # Forward pass and cost for the current parameters.
        AL, caches = L_model_forward(X, parameters)
        cost = compute_cost(AL, Y, lambd, parameters)

        # Backward pass followed by one regularized gradient-descent step.
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters_with_regularization(parameters, grads, learning_rate, lambd)

        # The reported cost is the one computed before this iteration's update.
        at_checkpoint = iteration % 100 == 0
        if print_cost and (at_checkpoint or iteration == last_iteration):
            print("Cost after iteration {}: {}".format(iteration, np.squeeze(cost)))
        if at_checkpoint:
            costs.append((iteration, cost))

    return parameters, costs
        

In [17]:
def graph(costs):
    """
    Plot the training cost against the iteration number.

    Arguments:
    costs -- non-empty iterable of (iteration, cost) pairs, in the format of the
             costs list returned by L_layer_model()

    Raises:
    ValueError -- if costs is empty (there is nothing to plot)
    """
    costs = list(costs)
    if not costs:
        # zip(*[]) would otherwise fail with an opaque "not enough values to unpack" error.
        raise ValueError("costs is empty; nothing to plot")

    # Split the (iteration, cost) pairs into two parallel sequences.
    iteration_list, cost_values = zip(*costs)

    plt.figure(figsize=(8, 5))
    plt.plot(iteration_list, cost_values)
    plt.xlabel("Iteration")
    plt.ylabel("Cost")
    plt.title("Cost Over Time")
    plt.grid(True)
    plt.show()