# Improvement: Gradient Checking

See [this](https://mrq02.github.io/DL/imp/#gc) for details.

In [1]:
import numpy as np

### 1) **1D GC**
<img src="images/1Dgrad_kiank.png" style="width:600px;height:250px;">
<caption><center> <u><b>Figure 1</b></u>: <b>1D linear model</b><br> </center></caption>

In [2]:
def forwardprop(x,theta):
    return theta*x

def backprop(x,theta):
    return x

In [3]:
def gradient_check(x,theta,epsilon=1e-7):
    # Compute the approx gradient
    thetap = theta+epsilon
    thetam = theta-epsilon
    Jp = forwardprop(x,thetap)
    Jm = forwardprop(x,thetam)
    gradapprox = (Jp-Jm)/(2*epsilon)
    
    # Compute the actual gradient
    grad = backprop(x,theta)
    
    # Compute the diff between the 2 gradients
    num = np.linalg.norm(grad-gradapprox)
    denom = np.linalg.norm(grad)+np.linalg.norm(gradapprox)
    diff = num/denom
    if diff<1e-7:
        print("The gradient is correct.")
    else:
        print("The gradient is wrong.")
    return diff

In [4]:
x, theta = 10, 8
diff = gradient_check(x,theta)
print("diff = "+str(diff))

The gradient is correct.
diff = 1.2623786476544242e-09


### 2) **N-D GC**
<img src="images/NDgrad_kiank.png" style="width:600px;height:400px;">
<caption><center> <u><b>Figure 2</b></u>: <b>deep neural network</b><br><i>LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID</i></center></caption>

In [5]:
def forwardprop_n(X,Y,params):
    # Get the params
    m = X.shape[1]
    W1 = params["W1"]
    b1 = params["b1"]
    W2 = params["W2"]
    b2 = params["b2"]
    W3 = params["W3"]
    b3 = params["b3"]

    # Forward
    Z1 = np.dot(W1,X)+b1
    A1 = np.maximum(0,Z1)
    Z2 = np.dot(W2,A1)+b2
    A2 = np.maximum(0,Z2)
    Z3 = np.dot(W3,A2)+b3
    A3 = 1/(1+np.exp(-Z3))
    
    # Cost
    logprobs = np.multiply(-np.log(A3),Y) + np.multiply(-np.log(1-A3),1-Y)
    cost = 1./m * np.sum(logprobs)
    cache = (Z1,A1,W1,b1,Z2,A2,W2,b2,Z3,A3,W3,b3)
    return cost,cache

def backprop_n(X,Y,cache):
    # Get the params
    m = X.shape[1]
    (Z1,A1,W1,b1,Z2,A2,W2,b2,Z3,A3,W3,b3) = cache
    
    # Backward
    dZ3 = A3-Y
    dW3 = 1./m*np.dot(dZ3,A2.T)
    db3 = 1./m*np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T,dZ3)
    dZ2 = np.multiply(dA2,np.int64(A2>0))
    dW2 = 1./m*np.dot(dZ2,A1.T)
    db2 = 1./m*np.sum(dZ2, axis=1, keepdims=True)
    dA1 = np.dot(W2.T,dZ2)
    dZ1 = np.multiply(dA1,np.int64(A1>0))
    dW1 = 1./m*np.dot(dZ1,X.T)
    db1 = 1./m*np.sum(dZ1, axis=1, keepdims=True)
    
    # Return
    grads = {
        "dZ3": dZ3, "dW3": dW3, "db3": db3,
        "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
        "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1
    }
    return grads

def gradient_check_n_test_case(): 
    np.random.seed(1)
    x = np.random.randn(4,3)
    y = np.array([1, 1, 0])
    W1 = np.random.randn(5,4) 
    b1 = np.random.randn(5,1) 
    W2 = np.random.randn(3,5) 
    b2 = np.random.randn(3,1) 
    W3 = np.random.randn(1,3) 
    b3 = np.random.randn(1,1) 
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3}
    return x, y, parameters

def dict_to_vec(params):
    keys = []
    count = 0
    for key in ["W1","b1","W2","b2","W3","b3"]:
        new_vector = np.reshape(params[key],(-1,1))
        keys += [key]*new_vector.shape[0]
        if count == 0:
            theta = new_vector
        else:
            theta = np.concatenate((theta,new_vector),axis=0)
        count = count+1
    return theta, keys

def vec_to_dict(theta):
    params = {}
    params["W1"] = theta[:20].reshape((5,4))
    params["b1"] = theta[20:25].reshape((5,1))
    params["W2"] = theta[25:40].reshape((3,5))
    params["b2"] = theta[40:43].reshape((3,1))
    params["W3"] = theta[43:46].reshape((1,3))
    params["b3"] = theta[46:47].reshape((1,1))
    return params

def grads_to_vec(grads):
    count = 0
    for key in ["dW1","db1","dW2","db2","dW3","db3"]:
        new_vector = np.reshape(grads[key],(-1,1))
        if count == 0:
            theta = new_vector
        else:
            theta = np.concatenate((theta,new_vector),axis=0)
        count = count+1
    return theta

In [6]:
def gradient_check_n(params,grads,X,Y,epsilon=1e-7):
    # Get the values
    params_vec, _ = dict_to_vec(params)
    grad = grads_to_vec(grads)
    n_params = params_vec.shape[0]
    Jp = np.zeros((n_params,1))
    Jm = np.zeros((n_params,1))
    gradapprox = np.zeros((n_params,1))
    
    # Compute the approx gradient
    for i in range(n_params):
        thetap = np.copy(params_vec)
        thetap[i][0] += epsilon
        Jp[i], _ = forwardprop_n(X,Y,vec_to_dict(thetap))
        thetam = np.copy(params_vec)
        thetam[i][0] -= epsilon
        Jm[i], _ = forwardprop_n(X,Y,vec_to_dict(thetam))
        gradapprox[i] = (Jp[i]-Jm[i])/(2*epsilon)
    
    # Compute the diff between the 2 gradients
    num = np.linalg.norm(grad-gradapprox)
    denom = np.linalg.norm(grad)+np.linalg.norm(gradapprox)
    diff = num/denom
    if diff<1e-7:
        print("The gradient is correct.")
    else:
        print("The gradient is wrong.")
    print("diff = "+str(diff))
    return diff

In [7]:
X,Y,params = gradient_check_n_test_case()
cost,cache = forwardprop_n(X,Y,params)
grads = backprop_n(X,Y,cache)
diff = gradient_check_n(params,grads,X,Y)

The gradient is wrong.
diff = 1.1885552035482147e-07
