In [31]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
(X1, Y1), (X2, Y2) = mnist.load_data()


In [2]:

m_train = X1.shape[0]
m_test = X2.shape[0]
X_train = (X1.reshape(X1.shape[0],-1).T)/255
Y_train_temp = Y1.reshape(Y1.shape[0],)
Y_train = np.zeros((Y_train_temp.size,10))
Y_train[np.arange(Y_train_temp.size),Y_train_temp] = 1
Y_train = Y_train.T
X_test = (X2.reshape(X2.shape[0],-1).T)/255
Y_test_temp = Y2.reshape(Y2.shape[0],)
Y_test = np.zeros((Y_test_temp.size,10))
Y_test[np.arange(Y_test_temp.size),Y_test_temp] = 1
Y_test = Y_test.T

In [3]:
def sigmoid(z):
    s = 1/(1+np.exp(-z))
    return s

def leakyrelu(z):
    s = np.where(z>0 , z , z*0.01)
    activation_cache = (z)
    return s, activation_cache

def softmax(z):
    s = np.exp(z)/np.sum(np.exp(z), axis = 0, keepdims = True)
    activation_cache = (z)
    return s, activation_cache

In [4]:
def initialize_parameters(X,Y):

    W1 = np.random.randn(100,X.shape[0])*0.01
    b1 = np.zeros((100,1), dtype = float)
    W2 = np.random.randn(50,W1.shape[0])*0.01
    b2 = np.zeros((50,1), dtype = float)
    W3 = np.random.randn(25,W2.shape[0])*0.01
    b3 = np.zeros((25,1), dtype = float)
    W4 = np.random.randn(10,W3.shape[0])*0.01
    b4 = np.zeros((10,1), dtype = float)

    parameters = {"W1" : W1, "b1" : b1,"W2" : W2, "b2" : b2,"W3" : W3, "b3" : b3,"W4" : W4, "b4" : b4,}

    return parameters


In [5]:
def linear_forward(A, W, b):
    Z = np.dot(W,A) + b
    cache = (A, W, b)
    return Z, cache

In [6]:
def linear_activation_forward(A_prev, W, b, activation):
    if activation == "leakyrelu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = leakyrelu(Z)
    
    if activation == "softmax":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = softmax(Z)
        
    cache = (linear_cache, activation_cache)
    return A, cache


In [7]:
def L_forward(X, parameters):

    caches = []
    A = X
    L = len(parameters)//2

    for l in range (1,L):
        A_prev = A

        A,cache = linear_activation_forward(A_prev,parameters["W" + str(l)], parameters["b" + str(l)], "leakyrelu")
        caches.append(cache)

    AL, cache =  linear_activation_forward(A,parameters["W" + str(L)], parameters["b" + str(L)], "softmax")
    caches.append(cache)

    return AL, caches

In [8]:
def compute_cost(AL, Y):

    m = Y.shape[1]
    cost = - np.sum(Y*np.log(AL))/m
    np.squeeze(cost)

    return cost

In [9]:
def linear_backward(dZ , cache):

    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = np.dot(dZ, A_prev.T)/m
    db = np.sum(dZ, axis = 1, keepdims=True)/m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [10]:
def softmax_backward(AL, Y):
    dZ = AL- Y
    return dZ

In [11]:
def leakyrelu_backward(dA, activation_cache):
    Z = activation_cache
    Z_temp = np.where(Z>0, 1, 0.01)
    dZ = dA * Z_temp

    return dZ

In [12]:
def linear_activation_backward(Y, AL, dA, cache, activation):
    linear_cache, activation_cache = cache

    if activation == "leakyrelu":
        dZ = leakyrelu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)

    elif activation == "softmax":
        dZ = softmax_backward(AL, Y)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)


    return dA_prev, dW, db
        

In [13]:
def L_model_backward(AL, Y, caches):

    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    dAL = -Y/AL

    current_cache = caches[L-1]
    dA_prev_temp, dW_temp, db_temp = linear_activation_backward(Y,AL,dAL,current_cache, "softmax")
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp

    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(Y,AL,dA_prev_temp, current_cache, "leakyrelu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l+1)] = dW_temp
        grads["db" + str(l+1)] = db_temp

    return grads
    

In [14]:
def update_parameters(params,grads,learning_rate):
    parameters = params.copy()

    L = len(parameters)//2

    for l in range(L):
        parameters["W" + str(l+1)] = params["W" + str(l+1)] - learning_rate*grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = params["b" + str(l+1)] - learning_rate*grads["db" + str(l+1)]

    return parameters

In [15]:
def random_mini_batches(X,Y, mini_batch_size = 64):
    m = X.shape[1]
    mini_batches = []

    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]

    inc = mini_batch_size
    num_complete_minibatches = m // mini_batch_size
   
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:,k*inc:(k+1)*inc]
        mini_batch_Y = shuffled_Y[:,k*inc:(k+1)*inc]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: ]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: ]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches

In [16]:

def model(X, Y, learning_rate = 0.0075, num_iterations = 3000, print_cost = False):

    grads = {}
    costs = []
    m = X.shape[1]
    parameters = initialize_parameters(X,Y)

    for i in range(0, num_iterations):
        AL, caches = L_forward(X,parameters)
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters(parameters,grads,learning_rate)
        cost = compute_cost(AL,Y)
        Y_predict = np.zeros(AL.shape)
        Y_predict[np.argmax(AL, axis = 0), np.arange(AL.shape[1])] = 1
        
        if print_cost and i % 100 == 0 or i == num_iterations - 1:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
            print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_predict - Y)) * 100))
        
        if i % 100 == 0 or i == num_iterations:
            costs.append(cost)

    return parameters,costs, Y_predict

In [17]:
parameters, costs, Y_predict = model(X_train,Y_train, 1, 1000, True)

Cost after iteration 0: 2.3025856720705646
train accuracy: 81.586 %
Cost after iteration 100: 2.3011477492197248
train accuracy: 82.24733333333333 %
Cost after iteration 200: 2.301121053339652
train accuracy: 82.24733333333333 %
Cost after iteration 300: 2.3007188769786833
train accuracy: 82.24733333333333 %
Cost after iteration 400: 2.3003352822531506
train accuracy: 82.24733333333333 %
Cost after iteration 500: 1.9361728766589936
train accuracy: 85.089 %
Cost after iteration 600: 2.233536115234909
train accuracy: 84.32833333333333 %
Cost after iteration 700: 1.4268579306420581
train accuracy: 86.99433333333333 %
Cost after iteration 800: 1.0196973537292688
train accuracy: 91.88833333333334 %
Cost after iteration 900: 0.7015172678131936
train accuracy: 94.76466666666667 %
Cost after iteration 999: 0.5511161424634563
train accuracy: 96.222 %


In [18]:
def accuracy_test(parameters, X, Y):
    AL, caches = L_forward(X,parameters)
    Y_predict = np.zeros(AL.shape)
    Y_predict[np.argmax(AL, axis = 0), np.arange(AL.shape[1])] = 1
    print("accuracy: {} %".format(100 - np.mean(np.abs(Y_predict - Y)) * 100))

    return AL, Y_predict

In [30]:
AL, Y_predict = accuracy_test(parameters, X_train, Y_train)
print(Y_train[:,5702])
print(Y_predict[:,5702])
print(Y1[5702])

accuracy: 96.10666666666667 %
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
3
