In [47]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import random
import math
import matplotlib.pyplot as plt
import seaborn as sns


In [48]:
train = pd.read_csv(r"/Users/medhajdubey/Downloads/mnist_train.csv.zip")
test = pd.read_csv(r"/Users/medhajdubey/Downloads/mnist_test.csv.zip")

In [49]:
train.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
test.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
train.shape

(60000, 785)

In [52]:
test.shape

(10000, 785)

In [53]:
train = np.array(train)
test = np.array(test)

In [54]:
train

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [6, 0, 0, ..., 0, 0, 0],
       [8, 0, 0, ..., 0, 0, 0]])

In [55]:
test

array([[7, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [4, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [6, 0, 0, ..., 0, 0, 0]])

In [56]:
m,n = train.shape
m

60000

In [57]:
train = train.T
X_train = train[1:]
Y_train = train[0]

In [58]:
test = test.T
X_test = test[1:]
Y_test = test[0]

In [59]:
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(784, 60000) (784, 10000) (60000,) (10000,)


In [60]:
X_train = X_train/255
X_test = X_test/255

In [61]:
def _init_():
    W1 = np.random.rand(10,784) - 0.5#for first hidden layer
    b1 = np.random.rand(10,1) - 0.5
    W2 = np.random.rand(10,10) - 0.5#for second hidden layer
    b2 = np.random.rand(10,1) - 0.5
    return W1, b1, W2, b2

def ReLu(Z):
    return np.maximum(Z,0)

def softmax(Z):
    return np.exp(Z)/sum(np.exp(Z))

def ReLu_derivative(Z):
    return Z > 0

def forwardpropagation(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLu(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

#creating the y_hat output matrix
#here for each column, it goes to the column specified by the label in Y and sets it to 1
def one_hot_encoded(Y):
    Y_onehot = np.zeros((Y.size, Y.max() +1))
    Y_onehot[np.arange(Y.size), Y] = 1
    Y_hat = Y_onehot.T
    return Y_hat

def backpropagation(Z1, A1, Z2, A2, W2, X, Y):
    Y_hat = one_hot_encoded(Y)
    dZ2 = A2 - Y_hat
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * ReLu_derivative(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

def parameter_updation(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1, b1, W2, b2

def loss_function(A2, Y, m):
    epsilon = 1e-12
    y_pred = np.clip(A2, epsilon, 1-epsilon)
    y_true = one_hot_encoded(Y)
    loss = -np.sum(y_true * np.log(y_pred))/m
    return loss

#returns most likely of the output classes
#suppose the possible outputs are 0,1,2,3,4,5,6,7,8,9
#this function will return the output which has the highest value assigned to it
def predictions(A2):
    return np.argmax(A2, 0)

def accuracy(predictions, Y):
    print(predictions,Y)
    return np.sum(predictions == Y)/Y.size 

def gradient_descent(X, Y, iterations, alpha, cost_list):
    W1, b1, W2, b2 = _init_()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forwardpropagation(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backpropagation(Z1, A1, Z2, A2, W2, X, Y)
        W1, b1, W2, b2 = parameter_updation(W1, b1, W2, b1, dW1, db1, dW2, db2, alpha)
        if (i%100 == 0):
            print("Iteration ",i)
            print("Accuracy: ", accuracy(predictions(A2),Y))
        loss = loss_function(A2, Y, m)
        cost_list.append(loss)
    return W1, b1, W2, b2, cost_list


In [62]:
alpha = 0.1
iterations = 1000
cost_list = []
W1, b1, W2, b2, cost_list = gradient_descent(X_train, Y_train, 1000, 0.1, cost_list)

Iteration  0
[6 7 2 ... 2 0 7] [5 0 4 ... 5 6 8]
Accuracy:  0.08115
Iteration  100
[0 0 6 ... 5 2 7] [5 0 4 ... 5 6 8]
Accuracy:  0.4982
Iteration  200
[3 0 6 ... 5 6 2] [5 0 4 ... 5 6 8]
Accuracy:  0.6837833333333333
Iteration  300
[3 0 6 ... 5 6 2] [5 0 4 ... 5 6 8]
Accuracy:  0.7678666666666667
Iteration  400
[3 0 4 ... 5 0 3] [5 0 4 ... 5 6 8]
Accuracy:  0.8036833333333333
Iteration  500
[0 0 4 ... 5 0 3] [5 0 4 ... 5 6 8]
Accuracy:  0.8227833333333333
Iteration  600
[0 0 4 ... 5 0 3] [5 0 4 ... 5 6 8]
Accuracy:  0.8352166666666667
Iteration  700
[0 0 4 ... 5 0 3] [5 0 4 ... 5 6 8]
Accuracy:  0.8450333333333333
Iteration  800
[0 0 4 ... 5 0 3] [5 0 4 ... 5 6 8]
Accuracy:  0.8523166666666666
Iteration  900
[0 0 4 ... 5 0 3] [5 0 4 ... 5 6 8]
Accuracy:  0.8579833333333333
