<a href="https://colab.research.google.com/github/Marvintheandroid42/Deep-Learning-From-Scratch/blob/main/MNIST_NN_From_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
def import_data(label = 2, path = '/content/sample_data/mnist_train_small.csv'):
  """Load an MNIST CSV and turn it into a one-vs-rest binary problem.

  Each row of the CSV is one sample: column 0 holds the digit, the remaining
  columns hold the pixel intensities.

  Args:
    label: digit treated as the positive class (1); every other digit is 0.
    path: location of the CSV file. Defaults to the Colab sample file.

  Returns:
    (X, y) where X has shape (n_pixels, n_samples) with values min-max scaled
    to [0, 1], and y has shape (n_samples, 1) holding 0/1 integers.
  """

  # header=None keeps the first data row as a sample instead of letting pandas
  # consume it as column names (the MNIST CSV is headerless).
  MNIST = np.array(pd.read_csv(path, header=None))

  X = MNIST[:, 1:]
  raw_labels = MNIST[:, 0]

  # Min-max scaling; a constant image set would otherwise divide by zero.
  lo, hi = np.min(X), np.max(X)
  span = hi - lo
  if span == 0:
    X = np.zeros(X.shape)
  else:
    X = (X - lo) / span

  # Build the binary target from a single comparison. Two sequential in-place
  # writes (non-label -> 0, then label -> 1) break for label == 0, because the
  # second write then matches every sample.
  y = (raw_labels == label).astype(raw_labels.dtype)

  return X.T, y.reshape(-1, 1) #(n_pixels, n_samples) (n_samples, 1)

In [None]:
# Load the scaled pixel matrix X and the binary label column y (default positive digit: 2).
# Both are module-level names that the training and evaluation cells read directly.
X, y = import_data()

In [None]:
class Dense():
  """Fully connected layer: ``z = W.T . X + b`` with a built-in SGD step.

  Shapes (n = number of samples in the batch):
    W: (input_dim, output_dim)
    b: (output_dim, 1)
    forward input:  (input_dim, n)   -> output (output_dim, n)
    backward input: (output_dim, n)  -> output (input_dim, n)
  """

  def __init__(self, input_dim, output_dim):
    # Standard-normal draws: centred on 0 with unit variance, NOT bounded to (-1, 1).
    self.W = np.random.randn(input_dim, output_dim)
    self.b = np.random.randn(output_dim, 1)

    self.input_dim, self.output_dim = input_dim, output_dim

  def forward(self, X):
    """Return the weighted sum for a batch and remember the batch for backward().

    If the leading axis of X is not input_dim the batch is transposed first,
    so both (input_dim, n) and (n, input_dim) layouts are accepted.
    """
    features_first = X if X.shape[0] == self.input_dim else X.T

    self.input = features_first

    # (output_dim, input_dim) . (input_dim, n) + (output_dim, 1) -> (output_dim, n)
    return self.W.T.dot(features_first) + self.b

  def backward(self, input_grad, learning_rate):
    """Apply one gradient step and return the gradient w.r.t. this layer's input.

    Args:
      input_grad: gradient flowing in from the next stage, shape (output_dim, n).
      learning_rate: step size for the parameter update.

    Returns:
      Gradient w.r.t. the layer input, shape (input_dim, n), computed with the
      weights as they were before the update.
    """
    n_samples = input_grad.shape[1]

    # Must be evaluated before W is overwritten below.
    output_grad = self.W.dot(input_grad) #(input_dim, n)

    # The dot product sums over the samples; multiplying by 1/n turns that sum
    # into the batch mean that the loss definition carries.
    weight_grad = (1 / n_samples) * self.input.dot(input_grad.T) #(input_dim, output_dim)

    # mean() already averages over the samples, so no extra 1/n factor here.
    bias_grad = input_grad.mean(axis=1).reshape(-1, 1) #(output_dim, 1)

    self.W = self.W - learning_rate * weight_grad
    self.b = self.b - learning_rate * bias_grad

    return output_grad


In [None]:
class Sigmoid():
  """Element-wise logistic activation with its backward pass."""

  def activation(self, x):
    """Return 1 / (1 + exp(-x)) element-wise."""

    return 1 / (1 + np.exp(-1 * x))

  def forward(self, X):
    """Apply the sigmoid to X and remember X for backward().

    X has shape (output_dim of the preceding dense layer, n); the result has
    the same shape.
    """

    self.input = X

    return self.activation(X)

  def backward(self, input_grad): #input_grad is of the shape (output_dim, n)
    """Return input_grad scaled by the sigmoid derivative at the stored input.

    The derivative is s * (1 - s) with s = sigmoid(x). Note that 1 - sigmoid(x)
    equals sigmoid(-x), not sigmoid(1 - x), so the subtraction has to happen
    after the activation is applied.
    """

    s = self.activation(self.input)

    #(output_dim, n) .* (output_dim, n) = (output_dim, n)
    local_grad = s * (1 - s)

    return input_grad * local_grad

In [None]:
class Log_Loss():
  """Binary cross-entropy (log loss) and its gradient w.r.t. the predictions.

  forward() has to be called before backward() on every epoch, because
  backward() reads the predictions and targets that forward() stored.
  """

  # Predictions are clipped to [EPS, 1 - EPS] so that a saturated output
  # (exactly 0 or 1) cannot produce log(0), 0 * inf or a division by zero.
  EPS = 1e-12

  def forward(self, y_hat, y): #both arrays should have the shape (1, n)
    """Return the mean log loss as an array of shape (1,).

    Args:
      y_hat: predicted probabilities, shape (1, n) or (n, 1).
      y: 0/1 targets, shape (1, n) or (n, 1).
    """

    if y.shape[0] != 1:

      y = y.T

    if y_hat.shape[0] != 1:

      y_hat = y_hat.T

    y_hat = np.clip(y_hat, self.EPS, 1 - self.EPS)

    # Store the clipped predictions so backward() stays finite as well.
    self.y_hat = y_hat

    self.y = y

    return -1 * np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat), axis=1)

  def backward(self):
    """Return d(loss)/d(y_hat) per sample, shape (1, n), without the 1/n factor."""

    return -1 * ((self.y / self.y_hat) - ((1 - self.y) / (1 - self.y_hat))) #(1, n) shape


In [None]:
class MSE_Loss():
  """Mean squared error and its gradient w.r.t. the predictions.

  forward() must be called before backward(), which reads the predictions and
  targets that forward() stored.
  """

  def forward(self, y_hat, y):
    """Return mean((y - y_hat)**2) as an array of shape (1,).

    Both arguments may be given as (1, n) or (n, 1); they are brought to (1, n).
    """

    if y.shape[0] != 1:

      y = y.T

    if y_hat.shape[0] != 1:

      y_hat = y_hat.T

    self.y_hat = y_hat

    self.y = y

    return np.mean((y - y_hat)**2, axis=1)

  def backward(self):
    """Return d(loss)/d(y_hat) per sample, shape (1, n), without the 1/n factor.

    d/dy_hat (y - y_hat)**2 = -2 * (y - y_hat) = 2 * (y_hat - y). The sign
    matters: with 2 * (y - y_hat) a descent step would increase the loss.
    """

    return 2 * (self.y_hat - self.y)



In [None]:
# Network: 784 inputs -> 64 sigmoid hidden units -> 1 sigmoid output.
# NOTE(review): Dense draws its weights with np.random.randn and no seed is set
# in the visible cells, so results differ between runs.
l1 = Dense(784, 64)
l2 = Dense(64, 1)
a1 = Sigmoid()
a2 = Sigmoid()

In [None]:
def forward(X):
  """Run X through the module-level stack l1 -> a1 -> l2 -> a2 and return the output."""
  out = X
  for stage in (l1, a1, l2, a2):
    out = stage.forward(out)
  return out

In [None]:
def backward(X, y, ALPHA):
  """Run one full-batch training step on the module-level network.

  Performs a forward pass, evaluates the log loss, then pushes the gradient
  back through a2, l2, a1 and l1; both dense layers update their parameters
  with step size ALPHA.

  Returns:
    The loss measured before this step's parameter update.
  """
  criterion = Log_Loss()
  loss = criterion.forward(forward(X), y)

  # Walk the stack in reverse, handing each stage the gradient of the next one.
  grad = criterion.backward() #(1, n)
  grad = a2.backward(grad)
  grad = l2.backward(grad, ALPHA)
  grad = a1.backward(grad)
  l1.backward(grad, ALPHA) #gradient w.r.t. the raw input is not needed

  return loss

In [None]:
def training(EPOCHS, ALPHA, LAMBDA, verbose=True):
  """Train the module-level network on the module-level X and y.

  The learning rate decays exponentially: epoch i uses ALPHA * LAMBDA**i.

  Args:
    EPOCHS: number of full-batch steps to run.
    ALPHA: initial learning rate.
    LAMBDA: per-epoch multiplicative decay factor.
    verbose: when equal to True, print the loss of every epoch.

  TODO: regularization and early stopping are not implemented yet.
  """
  for epoch in range(EPOCHS):

    step_size = ALPHA * LAMBDA ** epoch
    loss = backward(X, y, step_size)

    if verbose == True:

      print('EPOCH #: ', epoch, '      ', 'LOSS: ', loss)

In [None]:
EPOCHS = 15 #number of full-batch training steps
ALPHA = 5 #initial learning rate
LAMBDA = 0.7 #per-epoch decay factor: epoch i trains with ALPHA * LAMBDA**i, so later epochs take smaller steps

In [None]:
# Train with the hyper-parameters defined above; the loss is printed once per epoch.
training(EPOCHS, ALPHA, LAMBDA)

EPOCH #:  0        LOSS:  [0.61514815]
EPOCH #:  1        LOSS:  [2.42306346]
EPOCH #:  2        LOSS:  [1.44150173]
EPOCH #:  3        LOSS:  [0.82935898]
EPOCH #:  4        LOSS:  [0.44236343]
EPOCH #:  5        LOSS:  [0.2602296]
EPOCH #:  6        LOSS:  [0.22739271]
EPOCH #:  7        LOSS:  [0.22323973]
EPOCH #:  8        LOSS:  [0.22105908]
EPOCH #:  9        LOSS:  [0.21961998]
EPOCH #:  10        LOSS:  [0.21864021]
EPOCH #:  11        LOSS:  [0.21796573]
EPOCH #:  12        LOSS:  [0.21749871]
EPOCH #:  13        LOSS:  [0.21717418]
EPOCH #:  14        LOSS:  [0.21694814]


In [None]:
#UTILS

def logits_to_classes(y_hat, threshold):
  """Turn scores into hard 0/1 class labels.

  Despite the name, the function is fed sigmoid outputs (probabilities) in
  this notebook; any numeric scores work.

  Args:
    y_hat: array of scores.
    threshold: scores >= threshold become 1, all others become 0.

  Returns:
    A new array with the shape and dtype of y_hat. The input is left
    untouched, so the caller's predictions are not overwritten.
  """

  scores = np.asarray(y_hat)

  # One comparison instead of two sequential in-place writes: with in-place
  # writes, the freshly written 1s get re-tested against the threshold and are
  # zeroed again whenever threshold > 1.
  return (scores >= threshold).astype(scores.dtype)

def accuracy(y_hat, y, verbose=True):
  """Return the percentage of predictions equal to their target, to 1 decimal.

  Args:
    y_hat: predicted class labels; any shape holding one value per sample
      ((n, 1), (1, n) or (n,)).
    y: true class labels, same number of elements as y_hat.
    verbose: when truthy, also print the result.

  Returns:
    Accuracy in percent, rounded to one decimal place.

  Raises:
    ValueError: if the inputs are empty or differ in number of elements.
  """

  # Flatten both sides so row vectors, column vectors and 1-D arrays all line
  # up element by element instead of broadcasting against each other.
  predicted = np.asarray(y_hat).reshape(-1)
  actual = np.asarray(y).reshape(-1)

  if predicted.size != actual.size:
    raise ValueError('y_hat and y must contain the same number of elements')

  if actual.size == 0:
    raise ValueError('cannot compute accuracy on empty input')

  n_correct = np.count_nonzero(predicted == actual)

  # Scale to percent first and round last; rounding the fraction and then
  # multiplying by 100 leaks float noise such as 92.80000000000001.
  acc = np.round(100.0 * n_correct / actual.size, decimals=1)

  if verbose:

    print('ACCURACY: ', acc, '%')

  return acc


In [None]:
# Accuracy at a 0.5 decision threshold, measured on the same X and y used for training (no held-out split).
acc = accuracy(logits_to_classes(forward(X), 0.5), y)

ACCURACY:  92.80000000000001 %
