In [81]:
import pandas as pd
import numpy as np

In [82]:
# Load the MNIST CSV; the path is resolved relative to the kernel's working directory.
data = pd.read_csv('mnist.csv')

In [83]:
# Peek at the first five rows to check the column layout before converting to an array.
data.head(n=5)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
# Convert the DataFrame to an ndarray; each row is one sample, and the first
# column is the one used as the label below.
data=np.array(data)
m,n=data.shape

# Seed NumPy's global RNG so the shuffle (and anything that later draws from
# the same global generator via np.random.*) is reproducible on
# Restart Kernel -> Run All.
SEED=42
np.random.seed(SEED)
np.random.shuffle(data)  # shuffles the rows in place

# Hold out the first DEV_SIZE shuffled rows as a dev set. Transposing puts one
# sample per column, so row 0 holds the labels and the remaining rows the features.
DEV_SIZE=1000
data_dev=data[:DEV_SIZE].T
Y_dev=data_dev[0]
X_dev=data_dev[1:]

# Everything after the dev rows is used for training.
data_train=data[DEV_SIZE:].T
Y_train=data_train[0]
X_train=data_train[1:]

In [85]:
# Rescale features by 1/255 (presumably 8-bit pixel intensities -> [0, 1]; confirm
# against the dataset). NOTE: this cell is not idempotent - re-running it without
# re-running the split cell divides the data by 255 again.
PIXEL_MAX = 255.
X_train, X_dev = X_train / PIXEL_MAX, X_dev / PIXEL_MAX

In [86]:
def init_params(n_input=784, n_hidden=10, n_output=10):
  """Create initial weights and biases for a two-layer network.

  Weights are drawn from a standard normal using NumPy's global random state
  (``np.random.randn``) and scaled by ``sqrt(1 / fan_in)``; biases start at zero.
  Seed the global RNG beforehand if reproducible parameters are needed.

  Args:
    n_input: number of input features (rows of X). Defaults to 784.
    n_hidden: number of hidden units. Defaults to 10.
    n_output: number of output classes. Defaults to 10.

  Returns:
    Tuple ``(W1, B1, W2, B2)`` with shapes ``(n_hidden, n_input)``,
    ``(n_hidden, 1)``, ``(n_output, n_hidden)`` and ``(n_output, 1)``.
  """
  # W1 is drawn before W2 so the default call consumes the RNG in the same
  # order as the original hard-coded version.
  W1 = np.random.randn(n_hidden, n_input) * np.sqrt(1 / n_input)
  W2 = np.random.randn(n_output, n_hidden) * np.sqrt(1 / n_hidden)
  B1 = np.zeros((n_hidden, 1))
  B2 = np.zeros((n_output, 1))
  return W1, B1, W2, B2

In [87]:
def ReLU(z):
  """Rectified linear unit: element-wise maximum of ``z`` and zero."""
  floor = 0
  return np.maximum(z, floor)

def Softmax(z):
    """Softmax along axis 0 (each column is normalised independently).

    The per-column maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the result mathematically.
    """
    col_max = np.max(z, axis=0, keepdims=True)
    numerators = np.exp(z - col_max)
    return numerators / numerators.sum(axis=0, keepdims=True)

In [88]:
def fwd(W1,B1,W2,B2,X):
  """Forward pass: affine -> ReLU -> affine -> Softmax.

  Returns the tuple ``(Z1, A1, Z2, A2)``: the hidden pre-activation, the hidden
  activation, the output pre-activation and the Softmax output, in that order.
  """
  hidden_pre = np.dot(W1, X) + B1
  hidden_act = ReLU(hidden_pre)
  output_pre = np.dot(W2, hidden_act) + B2
  return hidden_pre, hidden_act, output_pre, Softmax(output_pre)

In [89]:
def one_hot(Y, num_classes=None):
  """One-hot encode integer labels as a ``(num_classes, Y.size)`` matrix.

  Column ``j`` has a single 1 in row ``Y[j]`` and zeros elsewhere.

  Args:
    Y: 1-D array of non-negative integer class labels (used directly as
      indices, so it must have an integer dtype).
    num_classes: number of rows in the result. When ``None`` (the default) it
      is inferred as ``Y.max() + 1``, which matches the previous behaviour but
      under-counts if the highest class is absent from ``Y``; pass it
      explicitly when encoding a subset of the data.

  Returns:
    Float array of shape ``(num_classes, Y.size)``.

  Raises:
    IndexError: if a label is ``>= num_classes``.
    ValueError: if ``Y`` is empty and ``num_classes`` is not given.
  """
  Y = np.asarray(Y)
  if num_classes is None:
    num_classes = Y.max() + 1
  # Build directly in (classes, samples) layout instead of filling the
  # transposed matrix and flipping it afterwards.
  one_hot_Y = np.zeros((num_classes, Y.size))
  one_hot_Y[Y, np.arange(Y.size)] = 1
  return one_hot_Y

In [90]:
def deriv_ReLU(z):
  """Derivative of ReLU as a boolean mask: True where ``z`` is strictly positive."""
  is_active = z > 0
  return is_active

In [91]:
def bck(Z1,A1,Z2,A2,W2,X,Y):
  """Backward pass: gradients of the loss w.r.t. both layers' parameters.

  The output-layer error is ``A2 - one_hot(Y)``. The one-hot target is
  subtracted by indexing into a copy of ``A2`` so that its class count always
  equals ``A2.shape[0]``; inferring it from ``Y.max() + 1`` breaks whenever the
  batch does not contain the highest class label.

  Args:
    Z1: hidden pre-activations, one column per sample.
    A1: hidden activations, same shape as ``Z1``.
    Z2: output pre-activations (unused; kept so the signature mirrors what
      the forward pass returns).
    A2: output-layer activations, shape ``(classes, samples)``.
    W2: output-layer weights.
    X: input features, one column per sample.
    Y: 1-D integer array of labels, one per sample (used as row indices).

  Returns:
    Tuple ``(dW1, dB1, dW2, dB2)``, each averaged over the ``Y.size`` samples.
  """
  m=Y.size
  # dZ2 = A2 - one_hot(Y): subtract 1 at (label, sample) for every sample.
  # Work on a copy so the caller's A2 is left untouched.
  dZ2=A2.copy()
  dZ2[Y,np.arange(m)]-=1.0
  dW2=(1/m)*dZ2.dot(A1.T)
  dB2=(1/m)*np.sum(dZ2,axis=1,keepdims=True)
  # ReLU derivative: pass the error through only where the unit was active.
  dZ1=W2.T.dot(dZ2)*(Z1>0)
  dW1=(1/m)*dZ1.dot(X.T)
  dB1=(1/m)*np.sum(dZ1,axis=1,keepdims=True)
  return dW1,dB1,dW2,dB2

In [92]:
def update_params(W1,B1,W2,B2,dW1,dB1,dW2,dB2,alpha):
  """Apply one gradient-descent step: each parameter minus ``alpha`` times its gradient.

  Returns the updated ``(W1, B1, W2, B2)`` as a tuple; the inputs are not modified.
  """
  params = (W1, B1, W2, B2)
  grads = (dW1, dB1, dW2, dB2)
  return tuple(p - alpha * g for p, g in zip(params, grads))

In [93]:
def get_predictions(A2):
  """Return, for each column of ``A2``, the row index holding the largest value."""
  predicted_labels = np.argmax(A2, axis=0)
  return predicted_labels

def get_accuracy(predictions,Y):
  """Fraction of entries where ``predictions`` equals ``Y``, relative to ``Y.size``."""
  n_correct = np.sum(predictions == Y)
  return n_correct / Y.size

In [94]:
def gradient_descent(X,Y,iterations,alpha,print_every=50):
  """Train the two-layer network with full-batch gradient descent.

  Args:
    X: input features, one column per sample.
    Y: integer labels, one per sample.
    iterations: number of gradient steps to take.
    alpha: learning rate passed to ``update_params``.
    print_every: log the iteration number and training accuracy every this
      many iterations (default 50, matching the previous hard-coded interval).
      Pass 0 or ``None`` to disable logging.

  Returns:
    The trained parameters ``(W1, B1, W2, B2)``.
  """
  W1,B1,W2,B2=init_params()
  for i in range(iterations):
    Z1,A1,Z2,A2=fwd(W1,B1,W2,B2,X)
    dW1,dB1,dW2,dB2=bck(Z1,A1,Z2,A2,W2,X,Y)
    W1,B1,W2,B2=update_params(W1,B1,W2,B2,dW1,dB1,dW2,dB2,alpha)
    # A2 comes from the forward pass above, so the accuracy printed here is
    # for the parameters as they were before this iteration's update.
    if print_every and i%print_every==0:
      print(f'Iteration: {i}')
      print(f"Accuracy: {get_accuracy(get_predictions(A2),Y)}")
  return W1,B1,W2,B2

In [97]:
# Train on the whole training split: 500 gradient steps with learning rate 0.01.
W1, B1, W2, B2 = gradient_descent(X_train, Y_train, iterations=500, alpha=0.01)

Iteration: 0
Accuracy: 0.11966101694915254
Iteration: 50
Accuracy: 0.39552542372881355
Iteration: 100
Accuracy: 0.5300508474576271
Iteration: 150
Accuracy: 0.601271186440678
Iteration: 200
Accuracy: 0.654
Iteration: 250
Accuracy: 0.6891525423728814
Iteration: 300
Accuracy: 0.7111525423728814
Iteration: 350
Accuracy: 0.7303050847457627
Iteration: 400
Accuracy: 0.7449491525423729
Iteration: 450
Accuracy: 0.7573898305084745


In [98]:
# Run the trained parameters on the held-out dev split and report accuracy.
Z1_dev, A1_dev, Z2_dev, A2_dev = fwd(W1, B1, W2, B2, X_dev)
dev_predictions = get_predictions(A2_dev)
acc_dev = get_accuracy(dev_predictions, Y_dev)
print("Dev Set Accuracy:", acc_dev)


Dev Set Accuracy: 0.748
