In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import ndimage
from PIL import Image
import h5py
import matplotlib.pyplot as plt

from testCases_v4a import *
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(1)

#### Buidling the Helper functions for our model

In [2]:
def initialize_parameters(n_x, n_h, n_y):  # for a two layer Neural Network
    
   
    W1 = np.random.randn(n_h, n_x)
    b1 = np.zeros((n_h,1))
   
    W2 = np.random.randn(n_y, n_h)
    b2 = np.zeros((n_y,1))
    
    
    assert(W1.shape == (n_h, n_x))
    assert(b1.shape == (n_h,1))
    assert(W2.shape == (n_y, n_h))
    assert(b2.shape == (n_y,1))
    
    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    return parameters

In [3]:
parameters = initialize_parameters(12, 5, 1)

print("W1" + str(parameters["W1"]))
print("b1" + str(parameters["b1"]))
print("W2" + str(parameters["W2"]))
print("b2" + str(parameters["b2"]))


W1[[ 1.62434536 -0.61175641 -0.52817175 -1.07296862  0.86540763 -2.3015387
   1.74481176 -0.7612069   0.3190391  -0.24937038  1.46210794 -2.06014071]
 [-0.3224172  -0.38405435  1.13376944 -1.09989127 -0.17242821 -0.87785842
   0.04221375  0.58281521 -1.10061918  1.14472371  0.90159072  0.50249434]
 [ 0.90085595 -0.68372786 -0.12289023 -0.93576943 -0.26788808  0.53035547
  -0.69166075 -0.39675353 -0.6871727  -0.84520564 -0.67124613 -0.0126646 ]
 [-1.11731035  0.2344157   1.65980218  0.74204416 -0.19183555 -0.88762896
  -0.74715829  1.6924546   0.05080775 -0.63699565  0.19091548  2.10025514]
 [ 0.12015895  0.61720311  0.30017032 -0.35224985 -1.1425182  -0.34934272
  -0.20889423  0.58662319  0.83898341  0.93110208  0.28558733  0.88514116]]
b1[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
W2[[-0.75439794  1.25286816  0.51292982 -0.29809284  0.48851815]]
b2[[0.]]


In [4]:
def initialize_parameters_deep(layer_dims): # initializing parameters for a L layer deep neural network

    L = len(layer_dims)  #layer in the network.
    parameters = {}
    
    for l in range(1,L):
        
        parameters["W"+ str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])
        parameters["b"+ str(l)] = np.zeros((layer_dims[l], 1))
        
        
        assert( parameters["W"+ str(l)].shape ==  ( layer_dims[l], layer_dims[l-1]))
        assert( parameters["b"+ str(l)].shape == (layer_dims[l], 1))
        
    return parameters
    

In [5]:
parameters = initialize_parameters_deep([2,3,4,9])

for l in range(1, 4):
    
    print("W"+str(l)+ " = " + str(parameters["W" + str(l)]), '\n')
    print("b"+str(l)+ " = " + str(parameters["b" + str(l)]), '\n')


W1 = [[-0.07557171  1.13162939]
 [ 1.51981682  2.18557541]
 [-1.39649634 -1.44411381]] 

b1 = [[0.]
 [0.]
 [0.]] 

W2 = [[-0.50446586  0.16003707  0.87616892]
 [ 0.31563495 -2.02220122 -0.30620401]
 [ 0.82797464  0.23009474  0.76201118]
 [-0.22232814 -0.20075807  0.18656139]] 

b2 = [[0.]
 [0.]
 [0.]
 [0.]] 

W3 = [[ 0.41005165  0.19829972  0.11900865 -0.67066229]
 [ 0.37756379  0.12182127  1.12948391  1.19891788]
 [ 0.18515642 -0.37528495 -0.63873041  0.42349435]
 [ 0.07734007 -0.34385368  0.04359686 -0.62000084]
 [ 0.69803203 -0.44712856  1.2245077   0.40349164]
 [ 0.59357852 -1.09491185  0.16938243  0.74055645]
 [-0.9537006  -0.26621851  0.03261455 -1.37311732]
 [ 0.31515939  0.84616065 -0.85951594  0.35054598]
 [-1.31228341 -0.03869551 -1.61577235  1.12141771]] 

b3 = [[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] 



Note: if a vector of an intermediate_variable( of the computation graph) is computed for all training examples, then to make sense of it on a smaller scale try making sense of each column of the vector.Each column contains data about a single image and is easlier to comprehend

### Forward Propagation Module

### Linear Forward.

Linear forward modeule computes $$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}\tag{4}$$ also called pre-activation parameter ( vectorised over all training examples )

 

In [6]:
def linear_forward(A_prev, W, b):
    
    Z = np.dot(W, A_prev) + b
    assert( Z.shape == (W.shape[0], A_prev.shape[1]))
    
    cache  = (A_prev, W, b)   # cache is to give some info about the current Z.
    
    return Z, cache

In [42]:
z = np.random.choice(10,(2,2)) + 2
ans, _ = relu(z)
print(z, end = '\n'+'\n')
print(ans)

[[ 4  9]
 [ 9 11]]

[[ 4  9]
 [ 9 11]]


In [30]:
A, W, b = linear_forward_test_case()
Z, cache = linear_forward(A, W, b)

print("Z  " + str(Z),'\n')
print("cache  " + str(cache),'\n')   #try to make some sense of the cache here

print("cache[0] is A_prev used to calculate Z",'\n', "cache[1] and cache[2] are W and b for current layer respectively")

Z  [[ 3.26295337 -1.23429987]] 

cache  (array([[ 1.62434536, -0.61175641],
       [-0.52817175, -1.07296862],
       [ 0.86540763, -2.3015387 ]]), array([[ 1.74481176, -0.7612069 ,  0.3190391 ]]), array([[-0.24937038]])) 

cache[0] is A_prev used to calculate Z 
 cache[1] and cache[2] are W and b for current layer respectively


#### Activation applied on pre-activation_parameter Z , called as activation forward

In [24]:
def activation_forward(Z, activation):
    
    A = np.zeros((Z.shape[0], Z.shape[1]))
    
    if (activation == "sigmoid"):
        A, activation_cache = sigmoid(Z) # Sigmoid function adds its input argument as its cache.
    elif (activation == "relu"):
        A, activation_cache = relu(Z)
        
    return A, activation_cache # this activation_cache has just Z stored in it.

### Linear_activation_forward

Implement the forward propagation for the LINEAR->ACTIVATION layer

In [364]:
def tan_h(Z):
    
    A = np.tanh(Z)
    cache = Z
    
    return A, cache

In [365]:
def linear_activation_forward(A_prev, W, b, activation):
    
    Z , linear_cache = linear_forward(A_prev, W, b)
    assert( Z.shape == (W.shape[0], A_prev.shape[1]))
   
    if activation == "sigmoid":
    
        A, activation_cache = sigmoid(Z)
        
    elif activation == "relu" :
        
        A, activation_cache = relu(Z)
        
    else:
            
        A, activation_cache = tan_h(Z)
        
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)  #note that cache stores both linear and activation_cache.
    
    return A, cache 
        

In [366]:
A_prev, W, b = linear_activation_forward_test_case()

A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "sigmoid")
print("With sigmoid: A =  " + str(A))

A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "relu")
print("With relu: A =  " + str(A))

A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "tanh")
print("With tanh: A =  " + str(A))

With sigmoid: A =  [[0.96890023 0.11013289]]
With relu: A =  [[3.43896131 0.        ]]
With tanh: A =  [[ 0.99794156 -0.96982745]]


Note: The [linear->activation] computation is counted as a single layer not two layers.



### L-Layer model

##### Implementing forward propagation for the whole model

We will create a function that will replicate [Linear-relu] L-1 times followed by [Linear-sigmoid] computation.


In [367]:
def L_model_forward(X, parameters): #the aim of this function is to compute AL i.e activations of the last layer.
    
    caches = []
    L = len(parameters) //2 #layer
    
    A_prev = X #initialising forward propagation.
    
    for l in range(1, L):
        
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
     
        A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "relu")
        caches.append(linear_activation_cache)
        
        A_prev = A
     
    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    AL ,linear_activation_cache = linear_activation_forward(A_prev, WL, bL, "sigmoid")
    
    assert(AL.shape == (1, X.shape[1]))
    caches.append(linear_activation_cache)
    
    
    return AL, caches

In [368]:
X, parameters = L_model_forward_test_case_2hidden() #3 layer nn test_case.
AL, caches = L_model_forward(X, parameters)

print("AL " + str(AL))
print("Length of caches lsit: " + str(len(caches)))

AL [[0.03921668 0.70498921 0.19734387 0.04728177]]
Length of caches lsit: 3


In [369]:
#Making sense of caches list. #complicated case.




### Compute Cost

Formula Used: $ $ $$ J(W, b) = -\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) \tag{7}$$



In [3]:
def compute_cost(AL, Y):
    
    m = Y.shape[1]
    cost = - np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL)) / m
    
    np.squeeze(cost)
    assert( cost.shape == ())
    
    return cost

In [4]:
Y, AL = compute_cost_test_case()

cost = compute_cost(AL, Y)
print("cost :" + str(cost))

NameError: name 'compute_cost_test_case' is not defined

### Backward propagation module

#### Linear Backward

In [372]:
def linear_backward(dZ, linear_cache):
    
    #unpacking values from linear_cache
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    
    dW      = np.dot(dZ, A_prev.T) / m
    db      = np.sum(dZ, axis = 1, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)
    
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)
    assert(dA_prev.shape == A_prev.shape)
    
    
    return dW, db, dA_prev

In [373]:
dZ, linear_cache = linear_backward_test_case()
dW, db, dA_prev = linear_backward(dZ, linear_cache)

print("dA_prev "+ str(dA_prev))
print("dW "+ str(dW))
print("db "+ str(db))

dA_prev [[-1.15171336  0.06718465 -0.3204696   2.09812712]
 [ 0.60345879 -3.72508701  5.81700741 -3.84326836]
 [-0.4319552  -1.30987417  1.72354705  0.05070578]
 [-0.38981415  0.60811244 -1.25938424  1.47191593]
 [-2.52214926  2.67882552 -0.67947465  1.48119548]]
dW [[ 0.07313866 -0.0976715  -0.87585828  0.73763362  0.00785716]
 [ 0.85508818  0.37530413 -0.59912655  0.71278189 -0.58931808]
 [ 0.97913304 -0.24376494 -0.08839671  0.55151192 -0.10290907]]
db [[-0.14713786]
 [-0.11313155]
 [-0.13209101]]


In [386]:
def linear_activation_backward(dA, cache, activation):
    
    #unpack values from cache into linear and activation cache
    linear_cache, activation_cache = cache
   
    Z = activation_cache
    
    if activation == "sigmoid":
        
        dZ      = sigmoid_backward(dA, Z)
        assert(dZ.shape == Z.shape)
        
        dW, db, dA_prev  = linear_backward(dZ, linear_cache)
        
    elif activation == "relu":
        
        dZ      = relu_backward(dA, Z)
        dW, db, dA_prev  = linear_backward(dZ, linear_cache)
        

    return dA_prev, dW, db

In [387]:
dAL, linear_activation_cache = linear_activation_backward_test_case()

dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, "sigmoid")
print("with sigmoid: ")
print("dA_prev"+ str(dA_prev))
print("dW"+ str(dW))
print("db"+ str(db)+'\n')


dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, "relu")
print("with relu: ")
print("dA_prev"+ str(dA_prev))
print("dW"+ str(dW))
print("db"+ str(db))

with sigmoid: 
dA_prev[[ 0.11017994  0.01105339]
 [ 0.09466817  0.00949723]
 [-0.05743092 -0.00576154]]
dW[[ 0.10266786  0.09778551 -0.01968084]]
db[[-0.05729622]]

with relu: 
dA_prev[[ 0.44090989 -0.        ]
 [ 0.37883606 -0.        ]
 [-0.2298228   0.        ]]
dW[[ 0.44513824  0.37371418 -0.10478989]]
db[[-0.20837892]]


In [376]:
def L_model_backward(AL, Y, caches):
    #this function does [L-1]*[Linear_relu_backward]<-[Linear_sigmoid_backward]
    
    grads = {}
    L = len(caches) # each value is a tuple containing linear_cache and activation_cache for a layer.
    Y = Y.reshape(AL.shape)
    
    dAL = - (np.divide(Y, AL) - np.divide(1-Y, 1-AL))
    print(dAL)
    assert(dAL.shape == AL.shape)
    
    current_cache = cache[L-1]
    dA_prev, dW_temp, db_temp = linear_activation_backward(dAL ,caches[L-1], "sigmoid")
    
    grads["dA"+ str(L-1)] = dA_prev
    grads["dW"+ str(L)] = dW_temp
    grads["db"+ str(L)] = db_temp
    

    for l in reversed(range(L-1)): # L-1 index has cache values for Lth layer, u hve 2 start from L-1th layer.
        
        current_cache = caches[l] #caches for current layer.
        dA_prev, dW_temp, db_temp = linear_activation_backward(dA_prev, current_cache, "relu")
        
        # l+1 will give us the right number for the layer
        grads["dA"+str(l)]   = dA_prev
        grads["dW"+str(l+1)] = dW_temp
        grads["db"+str(l+1)] = db_temp
        
    #the resultant dA_prev_temp contains gradients wrt input features( grads wrt to the activations of the layer 0)
    #they are not required in supervised learning.
    
    return grads

In [377]:
AL, Y_assess, caches = L_model_backward_test_case()
grads = L_model_backward(AL, Y_assess, caches)

print_grads(grads)

[[-0.5590876   1.77465392]]
dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.12913162 -0.44014127]
 [-0.14175655  0.48317296]
 [ 0.01663708 -0.05670698]]


### Update parameters

In [378]:
def update_parameters(parameters, grads, learning_rate):
    
    L = len(parameters) //2 # layers
    
    for l in range(L):
        
        parameters["W"+ str(l+1)] = parameters["W"+ str(l+1)] - learning_rate* grads["dW"+str(l+1)]
        parameters["b"+ str(l+1)] = parameters["b"+ str(l+1)] - learning_rate* grads["db"+str(l+1)]
    
    return parameters

In [379]:
parameters, grads = update_parameters_test_case()
parameters = update_parameters(parameters, grads, 0.1)

parameters

{'W1': array([[-0.59562069, -0.09991781, -2.14584584,  1.82662008],
        [-1.76569676, -0.80627147,  0.51115557, -1.18258802],
        [-1.0535704 , -0.86128581,  0.68284052,  2.20374577]]),
 'b1': array([[-0.04659241],
        [-1.28888275],
        [ 0.53405496]]),
 'W2': array([[-0.55569196,  0.0354055 ,  1.32964895]]),
 'b2': array([[-0.84610769]])}

### Conclusion . 

I have successfully implemented all the helper functions required to build a deep neural network

In [380]:
layer_dims = [3, 4, 2] 
#as we grow our neural network in terms of the number of layers we get huge number of parameters to train.
# this is why big models can take very very long to train. as told can be days or months in coding ninjas class

parameters = {}
L = len(layer_dims) # number of layers in our neural network, including the input layer.
for l in range(1,len(layer_dims)): #easiest way to get the parameters initialised for a L layer neural network
    
    parameters["W"+str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
    parameters["b"+str(l)] = np.zeros((layer_dims[l],1))
    
    #dimension checking is paramount here.
    
    assert(parameters["W"+ str(l)].shape ==(layer_dims[l], layer_dims[l-1]))
                                            #gives n[l]  , gives n[l-1]
    
    assert(parameters["b"+ str(l)].shape ==(layer_dims[l],1))
                                           #gives n[l]
    

In [381]:
print("W2",parameters["W2"],'\n')#look how many parameters we got for a 4 layer neural network.
print("W1",parameters["W1"])

W2 [[ 0.01976111 -0.01244123 -0.00626417 -0.00803766]
 [-0.02419083 -0.00923792 -0.01023876  0.01123978]] 

W1 [[-0.01101068 -0.01185047 -0.0020565 ]
 [ 0.01486148  0.00236716 -0.01023785]
 [-0.00712993  0.00625245 -0.00160513]
 [-0.00768836 -0.00230031  0.00745056]]


In [382]:
L = 6
for i in reversed(range (L-1)): # a loop from L-2 to 0
    print(i)

4
3
2
1
0


In [383]:
i

0

In [384]:
cache = ((2,1,2,3), (4,9,98))
linear_cache, activation_cache = cache #this is called unpacking. unpacking values contained in the cache tuple into two variables.

In [385]:
linear_cache, activation_cache

((2, 1, 2, 3), (4, 9, 98))