In [1]:
from keras.datasets import mnist

In [None]:
# Echo the imported dataset module as the cell output (presumably a quick check that the import worked).
mnist

In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load MNIST and unpack it into (inputs, labels) pairs for the training split and the test split.
(X_train_origin, Y_train_origin), (X_test_origin, Y_test_origin) = mnist.load_data()

In [None]:
# Shape of the raw training inputs (first axis is the number of examples).
X_train_origin.shape 

In [None]:
# Shape of the raw training labels (first axis is the number of examples).
Y_train_origin.shape

In [None]:
# Shape of the raw test inputs (first axis is the number of examples).
X_test_origin.shape

In [None]:
# Shape of the raw test labels (first axis is the number of examples).
Y_test_origin.shape

In [None]:
# Display one training sample: render the image, then print its label and raw pixel values.
index = 50
plt.imshow(X_train_origin[index])
print("y = {}".format(str(Y_train_origin[index])))
print("x = {}".format(str(X_train_origin[index])))

In [None]:
## Record how many examples each split holds (first axis of every array).
m_train_x_size = X_train_origin.shape[0]
m_train_y_size = Y_train_origin.shape[0]
m_test_x_size = X_test_origin.shape[0]
m_test_y_size = Y_test_origin.shape[0]

# Every input needs exactly one label; fail early if the splits are inconsistent.
assert m_train_x_size == m_train_y_size, "training inputs and labels differ in length"
assert m_test_x_size == m_test_y_size, "test inputs and labels differ in length"

print ("Number of training examples in x: m_train_x = " + str(m_train_x_size))
print ("Number of training examples in y: m_train_y = " + str(m_train_y_size))
print ("Number of test examples in x: m_test_x = " + str(m_test_x_size))
# Fixed label: this line reports the y (label) count, not the x count.
print ("Number of test examples in y: m_test_y = " + str(m_test_y_size))
# Everything after the first axis is the shape of a single image.
print ("training input image shape in pixel = " + str(X_train_origin.shape[1:]))
print ("test input image shape in pixel = " + str(X_test_origin.shape[1:]))


In [None]:
## Flatten every image into one column:
## (number of images, ...pixel axes...) -> (pixels per image, number of images).
X_train_flatten = np.transpose(X_train_origin.reshape((X_train_origin.shape[0], -1)))
X_test_flatten = np.transpose(X_test_origin.reshape((X_test_origin.shape[0], -1)))

## Labels become a single row: one column per image, shape (1, number of images).
Y_train_flatten = np.transpose(Y_train_origin.reshape((Y_train_origin.shape[0], 1)))
Y_test_flatten = np.transpose(Y_test_origin.reshape((Y_test_origin.shape[0], 1)))


In [None]:
# Scale pixel values into [0, 1] (assumes 8-bit intensities in 0..255 -- TODO confirm against the raw data).
X_train = X_train_flatten/255
X_test = X_test_flatten/255

# Labels are used as-is.
Y_train = Y_train_flatten
Y_test = Y_test_flatten

print(X_train.shape)
print(Y_train.shape)

# Fixed: the second pair reports the test split (the original printed X_train.shape twice).
print(X_test.shape)
print(Y_test.shape)

print("\n")
print(X_train)

### Some helper functions

In [None]:
# initialize parameters..
def initialize_parameters(size_of_input_layer, size_of_hidden_layer_1, size_of_output_layer):
    """Build the starting weights and biases of a network with one hidden layer.

    The global NumPy random generator is re-seeded with 1 on every call, so
    the returned weights are identical from call to call for the same sizes.

    Arguments:
    size_of_input_layer -- number of input features
    size_of_hidden_layer_1 -- number of units in the hidden layer
    size_of_output_layer -- number of units in the output layer

    Returns:
    parameters -- dict with
        "W1": (hidden, input) small random weights
        "b1": (hidden, 1) zeros
        "W2": (output, hidden) small random weights
        "b2": (output, 1) zeros
    """
    np.random.seed(1)

    hidden_weight_shape = (size_of_hidden_layer_1, size_of_input_layer)
    output_weight_shape = (size_of_output_layer, size_of_hidden_layer_1)
    hidden_bias_shape = (size_of_hidden_layer_1, 1)
    output_bias_shape = (size_of_output_layer, 1)

    # Draw order (W1 first, then W2) determines the values produced by the seeded generator.
    hidden_weights = 0.01 * np.random.randn(*hidden_weight_shape)
    output_weights = 0.01 * np.random.randn(*output_weight_shape)

    parameters = {
        "W1": hidden_weights,
        "b1": np.zeros(hidden_bias_shape, dtype=float),
        "W2": output_weights,
        "b2": np.zeros(output_bias_shape, dtype=float),
    }

    # Sanity-check every array against the shape it was requested with.
    expected_shapes = (
        ("W1", hidden_weight_shape),
        ("b1", hidden_bias_shape),
        ("W2", output_weight_shape),
        ("b2", output_bias_shape),
    )
    for key, shape in expected_shapes:
        assert parameters[key].shape == shape

    return parameters
    

#### Forward prop section start..

$A^{[L]} = \sigma(Z^{[L]}) = \sigma(W^{[L]} A^{[L-1]} + b^{[L]})$. 

Cross-entropy cost: $$ J = -\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) \tag{7}$$

In [None]:
# Sigmoid
def Sigmoid(Z):
    """Apply the logistic function element-wise.

    Arguments:
    Z -- pre-activation values (array or scalar)

    Returns:
    activation -- 1 / (1 + exp(-Z)), same shape as Z
    Z -- the input itself, handed back as the cache for back propagation
    """
    denominator = 1 + np.exp(-Z)
    activation = 1 / denominator
    return activation, Z

In [None]:
# Softmax
def Softmax(Z):
    """Column-wise softmax: every column of the result sums to 1.

    Each column is shifted by its OWN maximum before exponentiating. Softmax
    is unchanged by a per-column shift, and this keeps the largest exponent
    at exp(0) = 1 in every column. Shifting by the global maximum instead
    lets a column that sits far below that maximum underflow to all zeros,
    which turns the normalisation into 0/0 = NaN.

    Arguments:
    Z -- pre-activation values, shape (classes, examples)

    Returns:
    res -- softmax probabilities, same shape as Z
    cache -- Z itself, kept for back propagation
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exps = np.exp(shifted)
    res = exps / np.sum(exps, axis=0, keepdims=True)

    cache = Z
    return res, cache

In [None]:
##Activation Functions and results..
def Forward_Activation(A_prev, W, b, activation_function_to_apply):
    """Run one layer forward: linear step followed by the chosen activation.

    Arguments:
    A_prev -- activations of the previous layer, shape (units_prev, examples)
    W -- weights of this layer, shape (units, units_prev)
    b -- bias of this layer
    activation_function_to_apply -- "sigmoid" or "softmax"

    Returns:
    A -- activations of this layer, shape (units, examples)
    cache -- (linear_cache, activation_cache), needed by back propagation

    Raises:
    ValueError -- if the activation name is not supported
    """
    # Reject unknown names up front; previously they fell through both
    # branches and surfaced later as an UnboundLocalError on `A`.
    if activation_function_to_apply not in ("sigmoid", "softmax"):
        raise ValueError(
            "Unsupported activation '{}': expected 'sigmoid' or 'softmax'".format(
                activation_function_to_apply))

    Z, linear_cache = Linear_Forward_Pass_Calculate_Z(A_prev, W, b)

    # activation_cache holds Z; the backward pass re-derives the activation from it.
    if activation_function_to_apply == 'sigmoid':
        A, activation_cache = Sigmoid(Z)
    else:
        A, activation_cache = Softmax(Z)

    assert(A.shape == (W.shape[0], A_prev.shape[1]))

    cache = (linear_cache, activation_cache)

    return A, cache

In [None]:
# Forward pass to get Z where Z = W*A_prev + b ....
def Linear_Forward_Pass_Calculate_Z(A_prev, W, b):
    """Linear part of a layer: Z = W . A_prev + b (W is not transposed).

    Arguments:
    A_prev -- activations of the previous layer, shape (units_prev, examples)
    W -- weights of the current layer, shape (units, units_prev)
    b -- bias of the current layer, added to the product by broadcasting

    Returns:
    Z -- pre-activation values, shape (units, examples)
    cache -- the tuple (A_prev, W, b), reused by the backward pass
    """
    pre_activation = np.dot(W, A_prev) + b

    # One row per unit of this layer, one column per example.
    expected_shape = (W.shape[0], A_prev.shape[1])
    assert pre_activation.shape == expected_shape

    return pre_activation, (A_prev, W, b)

In [None]:
#cost function
def Compute_cost(Y_hat, Y): #Y_hat = prediction, Y = actual Output..
    """Average cross-entropy cost of the predictions.

    Two label layouts are supported:

    * Y has the same shape as Y_hat (binary targets or one-hot rows): the
      element-wise binary cross-entropy is summed and divided by the number
      of examples.
    * Y is a single row of integer class indices while Y_hat has one row per
      class: the categorical cross-entropy -log(Y_hat[label, example]) is
      averaged. Previously this layout was broadcast into the binary
      formula, which multiplies log-probabilities by the label value itself
      and produces a meaningless number.

    Probabilities are clipped away from exactly 0 and 1 so a saturated
    output gives a large finite cost instead of NaN/inf.

    Arguments:
    Y_hat -- predicted probabilities, shape (classes, examples)
    Y -- targets, shape (classes, examples) or (1, examples) of class indices

    Returns:
    cost -- scalar (shape ()) cross-entropy cost

    Raises:
    ValueError -- if integer class labels fall outside [0, classes)
    """
    m = Y.shape[1] # m = number_of_example_data

    eps = 1e-12
    Y_hat = np.clip(Y_hat, eps, 1 - eps)   # guard log(0)

    n_classes = Y_hat.shape[0]
    if Y.shape[0] == 1 and n_classes > 1:
        # Integer class labels: pick the probability assigned to the true class.
        labels = Y.reshape(-1).astype(int)
        if labels.size and (labels.min() < 0 or labels.max() >= n_classes):
            raise ValueError("class labels must lie in [0, {})".format(n_classes))
        cost = (-1/m)*np.sum(np.log(Y_hat[labels, np.arange(m)]))
    else:
        cost = (-1/m)*np.sum(Y*np.log(Y_hat) + (1-Y)*np.log(1-Y_hat))

    cost = np.squeeze(cost)      # To make sure the cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape == ())

    return cost

#### Backprop section start..

Compute the gradients for layer $l$:
$$dZ^{[l]} = dA^{[l]} * g'(Z^{[l]}) \tag{11}$$
 $$ dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} \tag{8}$$
$$ db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}\tag{9}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]} \tag{10}$$


In [None]:
#Backward sigmoid..
def Backward_Sigmoid(dA, cache):
    """Push a gradient back through the sigmoid activation.

    Arguments:
    dA -- gradient of the cost with respect to the sigmoid output
    cache -- the pre-activation Z stored during the forward pass

    Returns:
    dZ -- gradient with respect to Z, i.e. dA * s * (1 - s) with s = sigmoid(Z)
    """
    pre_activation = cache

    # Recompute the activation from Z; its derivative is s * (1 - s).
    sig = 1 / (1 + np.exp(-pre_activation))
    grad = dA * sig * (1 - sig)

    assert grad.shape == pre_activation.shape

    return grad

In [None]:
#Backward softmax..
def Backward_Softmax(dA_Final, cache):
    """Push a gradient back through the column-wise softmax.

    Softmax couples all classes of one example, so the derivative is a full
    Jacobian: dS_i/dZ_j = S_i * (delta_ij - S_j). Multiplying the incoming
    gradient by that Jacobian gives, per column,

        dZ = S * (dA - sum_k(dA_k * S_k))

    The previous version kept only the diagonal term S * (1 - S), which is
    not the gradient of softmax (for instance, a gradient that is constant
    across classes must map to zero because the outputs always sum to 1).

    The softmax is recomputed with a per-column maximum shift so a column
    far below the global maximum cannot underflow into 0/0.

    Arguments:
    dA_Final -- gradient of the cost with respect to the softmax output,
                shape (classes, examples)
    cache -- the pre-activation Z stored during the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z, same shape as Z
    """
    Z = cache

    exps = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    S = exps / np.sum(exps, axis=0, keepdims=True)

    # Jacobian-vector product, evaluated without materialising the Jacobian.
    weighted = np.sum(dA_Final * S, axis=0, keepdims=True)
    dZ = S * (dA_Final - weighted)

    assert (dZ.shape == Z.shape)

    return dZ




In [None]:
#Backward Activation.. 
def Backward_activation(dA, linear_and_activation_cache, backwad_activation_to_apply):
    """Run one layer backward: activation derivative, then the linear step.

    Arguments:
    dA -- gradient of the cost with respect to this layer's activations
    linear_and_activation_cache -- (linear_cache, activation_cache) produced
                                   by the forward pass of the same layer
    backwad_activation_to_apply -- "sigmoid" or "softmax"

    Returns:
    dA_prev -- gradient with respect to the previous layer's activations
    dW -- gradient with respect to this layer's weights
    db -- gradient with respect to this layer's bias

    Raises:
    ValueError -- if the activation name is not supported
    """
    # Reject unknown names up front; previously they skipped both branches
    # and surfaced as an UnboundLocalError at the return statement.
    if backwad_activation_to_apply not in ("sigmoid", "softmax"):
        raise ValueError(
            "Unsupported activation '{}': expected 'sigmoid' or 'softmax'".format(
                backwad_activation_to_apply))

    linear_cache, activation_cache = linear_and_activation_cache

    if backwad_activation_to_apply == "sigmoid":
        dZ = Backward_Sigmoid(dA, activation_cache)
    else:
        dZ = Backward_Softmax(dA, activation_cache)

    # The linear backward step is identical for every activation.
    dA_prev, dW, db = Linear_Backward_Pass_Calculate_dW_A_db(dZ, linear_cache)

    return dA_prev, dW, db

In [None]:
#linear backward pass..
def Linear_Backward_Pass_Calculate_dW_A_db(dZ, linear_cache):
    """Gradients of the linear step Z = W . A_prev + b.

    Arguments:
    dZ -- gradient of the cost with respect to Z, shape (units, examples)
    linear_cache -- the tuple (A_prev, W, b) saved by the forward pass

    Returns:
    dA_prev -- W^T . dZ, same shape as A_prev
    dW -- (1/m) * dZ . A_prev^T, same shape as W
    db -- (1/m) * row sums of dZ, same shape as b
    """
    prev_activations, weights, bias = linear_cache
    scale = 1 / prev_activations.shape[1]   # 1/m, m = number of examples (columns)

    grad_prev = np.dot(weights.T, dZ)
    grad_weights = scale * np.dot(dZ, prev_activations.T)
    grad_bias = scale * np.sum(dZ, axis=1, keepdims=True)

    # Every gradient must have the shape of the quantity it belongs to.
    for grad, reference in ((grad_prev, prev_activations),
                            (grad_weights, weights),
                            (grad_bias, bias)):
        assert grad.shape == reference.shape

    return grad_prev, grad_weights, grad_bias

#### Update parameters

$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]} \tag{16}$$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]} \tag{17}$$

In [None]:
def update_parameters(parameters, gradients, learning_rate): #parameters are, W1, b1, W2, b2 ..[weights and biases]
    """Apply one gradient-descent step to every layer.

    Arguments:
    parameters -- dict holding "W1", "b1", ..., "WL", "bL"
    gradients -- dict holding the matching "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size multiplied into every gradient

    Returns:
    parameters -- the same dict object, with each entry replaced by
                  value - learning_rate * gradient
    """
    layer_count = len(parameters) // 2   # one W and one b per layer

    for layer in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = "{}{}".format(prefix, layer)
            step = learning_rate * gradients["d" + key]
            # Rebind to a new array instead of editing the old one in place.
            parameters[key] = parameters[key] - step

    return parameters

### Build model

Overall steps:
    1. Initialize parameters / Define hyperparameters
    2. Loop for num_iterations:
        a. Forward propagation
        b. Compute cost function
        c. Backward propagation
        d. Update parameters (using parameters, and grads from backprop) 
    3. Use trained parameters to predict labels

In [None]:
def Two_Layer_Neural_Model(X, Y, layers_dimensions, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    """Train a sigmoid -> softmax network with one hidden layer by batch gradient descent.

    Arguments:
    X -- inputs, shape (features, examples)
    Y -- targets: either one row per output unit, shape (outputs, examples),
         or integer class indices of shape (1, examples) / (examples,), which
         are one-hot encoded internally when the output layer has more than
         one unit
    layers_dimensions -- (input size, hidden size, output size)
    learning_rate -- gradient-descent step size
    num_iterations -- number of full-batch updates
    print_cost -- if True, print the cost every 10 iterations

    Returns:
    parameters -- dict with the trained "W1", "b1", "W2", "b2"

    Raises:
    ValueError -- if integer class labels fall outside [0, output size)

    Side effects:
    Re-seeds the global NumPy generator with 1 and shows a plot of the
    recorded costs.
    """
    np.random.seed(1)

    gradients = {}
    costs = []
    cost_interval = 10      # record (and optionally print) the cost every this many iterations

    (size_of_input_layer, size_of_hidden_layer1, size_of_output_layer) = layers_dimensions

    # Integer class labels must be one-hot encoded before they are compared
    # with the softmax output; subtracting the raw label value (0..9) from a
    # probability would drive every output toward the label number.
    if size_of_output_layer > 1:
        if Y.ndim == 1:
            Y = Y.reshape(1, -1)
        if Y.shape[0] == 1:
            labels = Y.reshape(-1).astype(int)
            if labels.size and (labels.min() < 0 or labels.max() >= size_of_output_layer):
                raise ValueError(
                    "class labels must lie in [0, {})".format(size_of_output_layer))
            Y = np.eye(size_of_output_layer)[labels].T      # shape (outputs, examples)

    #initialize
    parameters = initialize_parameters(size_of_input_layer, size_of_hidden_layer1, size_of_output_layer)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Current weights and biases
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]

        #forward prop
        A1, cache1 = Forward_Activation(X, W1, b1, activation_function_to_apply = "sigmoid")
        A2, cache2 = Forward_Activation(A1, W2, b2, activation_function_to_apply = "softmax")

        # Compute cost
        cost = Compute_cost(A2, Y)

        # Initializing backward propagation.
        # NOTE(review): this is the derivative of a squared-error loss, while the
        # reported cost is a cross-entropy; the cost is a progress indicator,
        # not the exact objective being minimised.
        dA2 = 2 * (A2 - Y) / A2.shape[0]

        #calculate gradients..
        dA1, dW2, db2 = Backward_activation(dA2, cache2, backwad_activation_to_apply = "softmax")
        dA0, dW1, db1 = Backward_activation(dA1, cache1, backwad_activation_to_apply = "sigmoid")

        #store gradients..
        gradients['dW1'] = dW1
        gradients['db1'] = db1
        gradients['dW2'] = dW2
        gradients['db2'] = db2

        #update parameters..
        parameters = update_parameters(parameters, gradients, learning_rate)

        # Record the cost every `cost_interval` iterations so the plot below is
        # never empty; printing stays optional.
        if i % cost_interval == 0:
            costs.append(cost)
            if print_cost:
                print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))

    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

In [None]:
### CONSTANTS DEFINING THE MODEL ####
# Layer sizes in the order (input, hidden, output) expected by Two_Layer_Neural_Model.
n_x = 784     # input size: num_px * num_px (presumably 28 * 28 flattened pixels -- confirm against X_train.shape[0])
n_h = 128     # number of units in the hidden layer
n_y = 10      # number of units in the output layer (presumably one per digit class)
layers_dims = (n_x, n_h, n_y)

In [None]:
# Train on the normalised training split, printing the cost as training progresses.
parameters = Two_Layer_Neural_Model(
    X_train,
    Y_train,
    layers_dimensions=layers_dims,
    num_iterations=100,
    print_cost=True,
)

In [None]:
def predict(X, y, parameters):
    """
    Predict labels with the trained network and print the accuracy.

    The forward pass mirrors the training code in this notebook: every layer
    except the last uses a sigmoid activation and the last layer uses softmax.
    (The previous version called `L_model_forward`, which is not defined in
    this notebook, and thresholded the first output row at 0.5, which is a
    binary-classification rule that does not apply to a multi-class softmax.)

    Arguments:
    X -- data set of examples you would like to label, shape (features, examples)
    y -- true labels, shape (1, examples), used only to report the accuracy
    parameters -- parameters of the trained model ("W1", "b1", ..., "WL", "bL")

    Returns:
    p -- predictions for the given dataset X, shape (1, examples): the index of
         the most probable class, or a 0/1 decision when the network has a
         single output unit
    """

    m = X.shape[1]

    n = len(parameters) // 2 # number of layers in the neural network

    # Forward propagation
    A = X
    for l in range(1, n):
        A, _ = Forward_Activation(A, parameters["W" + str(l)], parameters["b" + str(l)],
                                  activation_function_to_apply = "sigmoid")
    probas, _ = Forward_Activation(A, parameters["W" + str(n)], parameters["b" + str(n)],
                                   activation_function_to_apply = "softmax")

    if probas.shape[0] > 1:
        # Multi-class: choose the class with the highest probability.
        p = np.argmax(probas, axis=0).reshape(1, m).astype(float)
    else:
        # Single output unit: keep the original 0/1 thresholding.
        p = (probas > 0.5).astype(float)

    #print results
    print("Accuracy: "  + str(np.sum((p == y)/m)))

    return p
