## Gradient Checking

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [2]:
# Loading the data (cat/non-cat)
def load_dataset():
    """Loads the Cat vs Non-Cat dataset.

    Reads '../datasets/train_catvnoncat.h5' and '../datasets/test_catvnoncat.h5'
    (paths are relative to the current working directory) and copies the
    stored datasets into in-memory NumPy arrays.

    Returns
    -------
    train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes: Arrays
    Dataset split into train and test, plus the class list read from the
    test file's "list_classes" entry.
    """
    # Use context managers so each HDF5 handle is closed as soon as its data
    # has been copied out; np.array(...[:]) materialises the data in memory,
    # so nothing returned depends on the file staying open.
    with h5py.File('../datasets/train_catvnoncat.h5', "r") as train_dataset:
        train_set_x_orig = np.array(train_dataset["train_set_x"][:])
        train_set_y_orig = np.array(train_dataset["train_set_y"][:])

    with h5py.File('../datasets/test_catvnoncat.h5', "r") as test_dataset:
        test_set_x_orig = np.array(test_dataset["test_set_x"][:])
        test_set_y_orig = np.array(test_dataset["test_set_y"][:])
        classes = np.array(test_dataset["list_classes"][:])

    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [3]:
# Calculating sigmoid
def sigmoid(Z):
    """Logistic sigmoid 1 / (1 + exp(-Z)), evaluated element-wise with NumPy."""
    exp_neg = np.exp(-Z)
    return 1/(1+exp_neg)

In [4]:
# Calculating tanh
def tanh(Z):
    """Hyperbolic tangent of Z, evaluated element-wise (thin wrapper over np.tanh)."""
    activated = np.tanh(Z)
    return activated

In [5]:
# Calculating relu
def relu(Z):
    """Rectified linear unit: element-wise maximum of Z and 0."""
    floor = 0
    return np.maximum(Z, floor)

In [6]:
# Calculating leaky-relu
def leaky_relu(Z, alpha = 0.01):
    """Leaky ReLU activation, evaluated element-wise.

    Argument:
    Z -- array of pre-activation values
    alpha -- slope applied where Z is not positive (default 0.01)

    Returns:
    Array holding Z where Z > 0 and Z * alpha elsewhere
    """
    # The original computed this expression but never returned it, so the
    # function always yielded None.
    return np.where(Z > 0, Z, Z * alpha)

In [7]:
# Calculating first derivative of sigmoid
def dif_sigmoid(Z):
    """First derivative of the logistic sigmoid, evaluated at pre-activation Z.

    Argument:
    Z -- array of pre-activation values (back-propagation in this notebook
         calls the derivative helpers with Z[l], not with the activations)

    Returns:
    sigmoid(Z) * (1 - sigmoid(Z)), element-wise
    """
    # The previous body returned 1 - Z**2, which is not the sigmoid
    # derivative. The sigmoid is computed inline so this helper has no
    # dependency on other definitions.
    s = 1/(1+np.exp(-Z))
    return s * (1 - s)

In [8]:
# Calculating first derivative of tanh
def dif_tanh(Z):
    """First derivative of tanh at Z: 1 - tanh(Z)**2, element-wise."""
    squared = tanh(Z)**2
    return 1-squared

In [9]:
# Calculating first derivative of relu
def dif_relu(Z):
    """First derivative of ReLU: 1 where Z > 0 and 0 elsewhere, in Z's dtype."""
    is_positive = np.greater(Z, 0)
    return is_positive.astype(Z.dtype)

In [10]:
# Calculating first derivative of leaky relu
def dif_leaky_relu(Z, alpha = 0.01):
    """First derivative of leaky ReLU, evaluated element-wise.

    Argument:
    Z -- array of pre-activation values
    alpha -- slope used where Z < 0 (default 0.01)

    Returns:
    Array shaped like Z holding alpha where Z < 0 and 1 elsewhere
    (so Z == 0 maps to 1). Floating-point inputs keep their dtype; any other
    dtype is promoted to float64.
    """
    Z = np.asarray(Z)
    # np.ones_like(Z) alone would inherit an integer dtype from integer input,
    # and writing a fractional alpha into it silently truncates to 0.
    out_dtype = Z.dtype if np.issubdtype(Z.dtype, np.floating) else np.float64
    dz = np.ones_like(Z, dtype=out_dtype)
    dz[Z < 0] = alpha
    return dz

In [11]:
#Get activation function
def activation_fun(name):
    """Return the activation function registered under `name`.

    Recognised names are 'relu', 'sigmoid', 'leaky_relu' and 'tanh'; any other
    value falls back to tanh.
    """
    known = (
        ('relu', relu),
        ('sigmoid', sigmoid),
        ('leaky_relu', leaky_relu),
        ('tanh', tanh),
    )
    for label, func in known:
        if name==label:
            return func
    # Unrecognised names default to tanh.
    return tanh

In [12]:
#Get first derivative of activation function
def derivative_activation(name):
    """Return the derivative function for the activation registered under `name`.

    Recognised names are 'relu', 'sigmoid', 'leaky_relu' and 'tanh'; any other
    value falls back to the tanh derivative.
    """
    known = (
        ('relu', dif_relu),
        ('sigmoid', dif_sigmoid),
        ('leaky_relu', dif_leaky_relu),
        ('tanh', dif_tanh),
    )
    for label, func in known:
        if name==label:
            return func
    # Unrecognised names default to the tanh derivative.
    return dif_tanh

In [13]:
# Printing the shape of the training and testing data
train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes = load_dataset()
# Report the shape of every array returned by the loader, one per line.
for label, array in (
    ('train_set_x_orig shape', train_set_x_orig),
    ('train_set_y_orig', train_set_y_orig),
    ("test_set_x_orig", test_set_x_orig),
    ("test_set_y_orig", test_set_y_orig),
    ('classes', classes),
):
    print(label, array.shape)

train_set_x_orig shape (209, 64, 64, 3)
train_set_y_orig (209,)
test_set_x_orig (50, 64, 64, 3)
test_set_y_orig (50,)
classes (2,)


In [14]:
# Reshape the training and test examples
def preprocess(train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig):
    """Flatten and scale the image arrays and turn the labels into columns.

    Argument:
    train_set_x_orig -- training inputs; the first axis indexes samples
    train_set_y_orig -- training labels
    test_set_x_orig -- test inputs; the first axis indexes samples
    test_set_y_orig -- test labels

    Returns:
    train_x, test_x -- inputs reshaped to (n_samples, n_features) and divided by 255
    train_y, test_y -- labels reshaped to (n_samples, 1)

    Note the return order: both x arrays come before both y arrays. The
    resulting shapes are also printed.
    """
    def _flatten_and_scale(images):
        # Collapse every axis after the first into one feature axis. Using the
        # product of the trailing dimensions (instead of hard-coding three of
        # them) gives the same result for 4-D input and also supports other
        # ranks, e.g. single-channel images.
        n_samples = images.shape[0]
        n_features = int(np.prod(images.shape[1:]))
        return images.reshape(n_samples, n_features)/255.

    train_x = _flatten_and_scale(train_set_x_orig)
    test_x = _flatten_and_scale(test_set_x_orig)
    train_y = train_set_y_orig.reshape(-1,1)
    test_y = test_set_y_orig.reshape(-1,1)
    print('train_x shape', train_x.shape)
    print('train_y',train_y.shape)
    print("test_x",test_x.shape)
    print("test_y",test_y.shape)
    return train_x,test_x,train_y,test_y

In [15]:
# Defining the initial weights and biases, stored in dictionaries w and b
def initial_weights(X,Y,h_nodes,hidden_layer):
    """
    This function draws random initial weights and biases for every layer.

    Each layer's values are sampled from a standard normal distribution and
    scaled by sqrt(2 / fan_in), where fan_in is the number of inputs feeding
    the layer. The global NumPy random state is reseeded with 42 on every
    call, so repeated calls with the same sizes return identical values.

    Argument:
    X -- training dataset; only X.shape[1] (number of input features) is used
    Y -- training labels; not used, kept so existing callers keep working
    h_nodes -- number of nodes in each layer, list (as called in this notebook
               the last entry is the output width)
    hidden_layer -- number of hidden layers; not used, kept for compatibility

    Returns:
    w -- dict mapping layer index to a weight array of shape (fan_in, h_nodes[i])
    b -- dict mapping layer index to a bias array of shape (1, h_nodes[i])
    """
    np.random.seed(42)

    w = {}
    b = {}

    fan_in = X.shape[1]
    for i, fan_out in enumerate(h_nodes):
        scale = np.sqrt(2/fan_in)
        # Draw order (weights first, then biases) is part of the reproducible
        # output for the fixed seed, so it must not be swapped.
        w[i] = np.random.randn(fan_in, fan_out)*scale
        b[i] = np.random.randn(1, fan_out) * scale
        # This layer's width is the next layer's fan-in.
        fan_in = fan_out

    return w,b

In [16]:
# FORWARD PROPAGATION
def forward_propagate(X,w,b, activation,hidden_layer,h_nodes):
    """
    This function performs forward propagation through every layer.

    Argument:
    X -- input dataset, fed to layer 0
    w -- dict of weights, keyed by layer index
    b -- dict of biases, keyed by layer index
    activation -- list of activation names, one per layer
    hidden_layer -- number of hidden layers, integer; not used, kept for compatibility
    h_nodes -- number of nodes in each layer, list; only its length is used

    Returns:
    Z -- dict of pre-activations: np.dot(layer input, w[l]) + b[l] for each layer l
    A -- dict of activations for each layer; the highest key holds the network output
    """
    Z = {}
    A = {}
    layer_input = X
    for l in range(len(h_nodes)):
        Z[l] = (np.dot(layer_input,w[l])+b[l])
        A[l] = activation_fun(activation[l])(Z[l])
        # The activations of this layer feed the next one.
        layer_input = A[l]

    return Z, A

In [17]:
# Calculating loss using the cost function
def costfunction(Y,A):
    """
    This function calculates the binary cross-entropy loss between the
    predicted and actual output.

    Argument:
    Y -- actual output; Y.shape[0] is used as the number of samples
    A -- dict of layer activations; the entry with key len(A)-1 is taken as
         the prediction (assumed to hold probabilities in [0, 1])

    Returns:
    cost -- -1/m * sum(Y*log(p) + (1-Y)*log(1-p)), with p the clipped prediction
    """
    m = Y.shape[0]
    last_index = len(A)-1
    # Keep predictions strictly inside (0, 1). Without this, a saturated
    # prediction produces 0*log(0) = NaN; the old code wrapped the already
    # reduced scalar in np.nansum, which turned that NaN into a cost of 0.0
    # and silently discarded every other sample's contribution.
    # Predictions are converted to float64 so that 1 - eps stays below 1.
    eps = 1e-15
    y_hat = np.clip(np.asarray(A[last_index], dtype=np.float64), eps, 1 - eps)
    cost = -1/m*np.sum(Y*np.log(y_hat) + (1-Y)*np.log(1-y_hat))
    return cost

In [29]:
# BACKWARD PROPAGATION (TO FIND GRADIENT)
def back_prpagate(X,Y,Z,A,w,b,activation, hidden_layer,h_nodes):
    """Performs backward propagation and calculates the gradients for every layer

    The output-layer error is taken as A[last] - Y, which is the gradient of
    the cross-entropy cost with respect to Z[last] when the output activation
    is a sigmoid; the output layer's entry in `activation` is not consulted.

    Arguments:
    X -- array_like Data
    Y -- array_like True labels
    Z -- intermediate dot products (pre-activations), dict
    A -- layer activations, dict
    w -- dict of weights
    b -- dict of bias; not used, kept for compatibility
    activation -- list of activation names, one per layer
    hidden_layer -- number of hidden layers, integer; not used, kept for compatibility
    h_nodes -- number of nodes in each layer, list; only its length is used

    Returns:
    dz -- derivative of the cost w.r.t. each layer's pre-activation, dict
    dw -- derivative of weight, dict
    db -- derivative of bias, dict
    """
    m = X.shape[0]
    last = len(h_nodes)-1
    dz = {}
    dw = {}
    db = {}
    for l in range(last, -1, -1):
        if l==last:
            dz[l] = (A[l] - Y)
        else:
            dz[l] = ((np.dot(dz[l+1], w[l+1].T)) * derivative_activation(activation[l])(Z[l]))

        # Layer 0 is fed by X, every other layer by the previous activations.
        # The output-layer branch used to read A[l-1] unconditionally, which
        # raised KeyError(-1) for a network with a single layer.
        if l!=0:
            input_X = A[l-1]
        else:
            input_X = X
        dw[l] = (1./m * np.dot(input_X.T, dz[l]))
        db[l] = (1./m * np.sum(dz[l],axis = 0,keepdims=True))

    return dz,dw,db

In [30]:
def dict_to_vector(w,b):
    """Flatten the per-layer weight and bias dicts into one column vector.

    Layers are visited in key order 0 .. len(w)-1; for each layer the weights
    (flattened in row-major order) come first, followed by that layer's biases.

    Argument:
    w -- dict of weight arrays, keyed 0 .. len(w)-1
    b -- dict of bias arrays with the same keys

    Returns:
    vect -- array of shape (n_parameters, 1); shape (0, 1) when w is empty
    """
    pieces = []
    for key in range(len(w)):
        pieces.append(np.reshape(w[key], (-1,1)))
        pieces.append(np.reshape(b[key], (-1,1)))

    if not pieces:
        # np.concatenate rejects an empty sequence; return an empty column so
        # the result is always an array.
        return np.zeros((0, 1))

    # A single concatenate avoids re-copying the growing vector once per layer.
    return np.concatenate(pieces, axis = 0)

In [31]:
def vector_to_dict(theta,w):
    """Split a flat parameter vector back into per-layer weight and bias dicts.

    Argument:
    theta -- flat parameter vector laid out layer by layer, weights then biases
    w -- dict of weight arrays used only as a shape template, keyed 0 .. len(w)-1

    Returns:
    Two dicts keyed by layer index: weights reshaped to the template's shape
    and biases reshaped to (1, n_out).
    """
    weights = {}
    biases = {}
    offset = 0
    for layer in range(len(w)):
        n_in, n_out = w[layer].shape
        w_end = offset + n_in * n_out
        b_end = w_end + n_out
        weights[layer] = theta[offset:w_end].reshape((n_in, n_out))
        biases[layer] = theta[w_end:b_end].reshape((1, n_out))
        offset = b_end

    return weights, biases

In [32]:
# NOTE(review): the arrays loaded here are not used by the rest of this cell,
# which runs on the small hand-written dataset defined below.
train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes = load_dataset()
# NOTE(review): num_iterations and learning_rate are not referenced in the
# visible cells -- presumably left over from a training loop; confirm.
num_iterations = 10
learning_rate = 0.05

# Four 2-feature samples labelled with the XOR of their inputs.
train_x = np.array([[0,0],[0,1],[1,0],[1,1]])
train_y = np.array([[0],[1],[1],[0]])

# Network layout: four relu layers (20, 16, 8, 4 nodes) followed by a sigmoid
# output layer whose width equals the number of label columns.
hidden_layer = 4
h_nodes = [20,16,8,4,train_y.shape[1]]
activation = ['relu','relu','relu','relu','sigmoid']
w,b = initial_weights(train_x,train_y,h_nodes,hidden_layer)

X = train_x
Y = train_y

# One forward/backward pass at the initial parameters gives the analytic
# gradients that the next cell compares against finite differences.
Z,A = forward_propagate(X,w,b,activation,hidden_layer,h_nodes)
cost = costfunction(Y,A)
dz,dw,db = back_prpagate(X,Y,Z,A,w,b,activation,hidden_layer,h_nodes)

# Flatten parameters and gradients with the same layout so entry i of
# vector_grad is the gradient of entry i of vector_param.
vector_param = dict_to_vector(w,b)
vector_grad = dict_to_vector(dw,db)

In [36]:
# Work on copies so the flattened parameters/gradients from the previous cell
# stay untouched.
parameters_values = vector_param.copy()
grad = vector_grad.copy()

# Perturbation size for the central finite difference.
epsilon = 1e-7
num_parameters = parameters_values.shape[0]
J_plus = np.zeros((num_parameters, 1))
J_minus = np.zeros((num_parameters, 1))
gradapprox = np.zeros((num_parameters, 1))

# Compute gradapprox: one pair of forward passes per parameter.
for i in range(num_parameters):

    # Cost with parameter i nudged up by epsilon (all others unchanged).
    thetaplus =  np.copy(parameters_values)                                       # fresh copy of all parameters
    thetaplus[i][0] = thetaplus[i][0] + epsilon                                   # perturb only entry i
    w_1 ,b_1 = vector_to_dict(thetaplus,w)                                        # w is used as the shape template
    _,A_1 = forward_propagate(X,w_1,b_1 ,activation,hidden_layer,h_nodes)         # forward pass at theta + eps
    J_plus[i] =  costfunction(Y, A_1)  # cost at theta + eps
    
    
    # Cost with parameter i nudged down by epsilon.
    thetaminus = np.copy(parameters_values)                                       # fresh copy of all parameters
    thetaminus[i][0] = thetaminus[i][0] - epsilon                                 # perturb only entry i
    w_2, b_2 = vector_to_dict(thetaminus,w)
    _, A_2 = forward_propagate(X, w_2,b_2,activation,hidden_layer,h_nodes)        # forward pass at theta - eps
    J_minus[i] =  costfunction(Y,A_2)

    # Compute gradapprox[i] as the central difference (J+ - J-) / (2 * eps)
    gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

# Compare gradapprox to backward propagation gradients by computing the
# relative difference ||grad - gradapprox|| / (||grad|| + ||gradapprox||).
# NOTE(review): if both norms are exactly zero the division below yields NaN,
# and the comparison then falls through to the success message.
numerator = np.linalg.norm(grad - gradapprox)                                     # norm of the discrepancy
denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)                   # normalising scale
difference = numerator / denominator                                              # relative difference

# A relative difference above 1e-7 is reported as a back-propagation mistake.
if difference > 1e-7:
    print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
else:
    print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")


[92mYour backward propagation works perfectly fine! difference = 2.2528533009336904e-09[0m
