In [None]:
import numpy as np
from tqdm import tqdm
from keras.datasets import fashion_mnist

In [2]:
class Layer:
    """One fully connected layer computing ``h = activation(x @ w.T + b)``.

    Parameters
    ----------
    num_inputs : int
        Width of each input row.
    num_neurons : int
        Number of units (width of each output row).
    activation : str
        ``'ReLU'``, ``'sigmoid'`` or ``'softmax'``.
    weight_init : str
        ``'xavier'`` (case-insensitive) draws weights from the Glorot uniform
        range ``±sqrt(6 / (num_inputs + num_neurons))`` and zeroes the biases.
        Any other value (e.g. ``'random'``) draws weights and biases from a
        standard normal distribution.

    Attributes
    ----------
    w : ndarray, shape (num_neurons, num_inputs)
    b : ndarray, shape (num_neurons,)
    a, h : ndarray
        Pre-activation and activation of the most recent ``forward`` call.
    dw, db : ndarray
        Parameter gradients written by the most recent ``backward`` call.
    """

    def __init__(self, num_inputs, num_neurons, activation, weight_init):
        self.num_inputs = num_inputs
        self.num_neurons = num_neurons
        self.activation_fn = activation
        self.weight_init = weight_init

        if isinstance(weight_init, str) and weight_init.lower() == 'xavier':
            limit = np.sqrt(6.0 / (num_inputs + num_neurons))
            self.w = np.random.uniform(-limit, limit, (num_neurons, num_inputs))
            self.b = np.zeros(num_neurons)
        else:
            # Same draws, in the same order, as the plain random scheme so
            # seeded runs stay reproducible.
            self.w = np.random.randn(num_neurons, num_inputs)
            self.b = np.random.randn(num_neurons)

    def activation(self, x):
        """Apply the configured activation to ``x`` without modifying ``x``.

        ``x`` must be 2-D (rows are samples) for ``'softmax'``.
        Raises ``ValueError`` for an unknown activation name.
        """
        if self.activation_fn == 'ReLU':
            return np.maximum(0, x)
        if self.activation_fn == 'softmax':
            # Subtract the row max for numerical stability. This builds a new
            # array: an in-place `x -= ...` would silently overwrite the
            # caller's pre-activations.
            shifted = x - np.max(x, axis=1, keepdims=True)
            exps = np.exp(shifted)
            return exps / np.sum(exps, axis=1, keepdims=True)
        if self.activation_fn == 'sigmoid':
            # Clipping keeps np.exp from overflowing for extreme inputs.
            return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
        raise ValueError(f"Unsupported activation: {self.activation_fn!r}")

    def grad_activation(self, x):
        """Element-wise derivative of the activation evaluated at ``x``.

        Only defined for ``'ReLU'`` and ``'sigmoid'``; raises ``ValueError``
        for anything else (including ``'softmax'``).
        """
        if self.activation_fn == 'ReLU':
            return 1 * (x > 0)
        if self.activation_fn == 'sigmoid':
            s = self.activation(x)
            return s * (1 - s)
        raise ValueError(
            f"No element-wise gradient for activation: {self.activation_fn!r}"
        )

    def forward(self, cur_input):
        """Run the layer on a 2-D batch of shape (batch, num_inputs).

        Stores the pre-activation in ``self.a`` and the output in ``self.h``
        (both of shape (batch, num_neurons)) and returns ``self.h``.
        """
        self.a = np.dot(cur_input, self.w.T) + self.b
        self.h = self.activation(self.a)
        return self.h

    def backward(self, grad_a, prev_a, prev_h, grad_activation):
        """Back-propagate through this layer.

        Parameters
        ----------
        grad_a : ndarray, shape (batch, num_neurons)
            Loss gradient w.r.t. this layer's pre-activation.
        prev_a, prev_h : ndarray, shape (batch, num_inputs)
            Pre-activation and activation of the layer feeding this one.
        grad_activation : callable
            Derivative of the previous layer's activation.

        Returns
        -------
        ndarray
            Loss gradient w.r.t. the previous layer's pre-activation.
            ``self.dw`` / ``self.db`` are set to the gradients summed over
            the batch (not averaged).
        """
        self.dw = np.dot(grad_a.T, prev_h)
        self.db = np.sum(grad_a, axis=0)
        grad_prev_h = np.dot(grad_a, self.w)
        return grad_prev_h * grad_activation(prev_a)
        

In [3]:
class NeuralNetwork:
    """Feed-forward classifier: hidden ``Layer`` stack followed by a softmax layer.

    Parameters
    ----------
    num_inputs : int
        Number of input features.
    num_classes : int
        Width of the softmax output layer.
    num_hidden_layer : int
        Number of hidden layers (must be >= 1).
    num_neurons : int
        Width of every hidden layer.
    activation : str
        Activation name used by every hidden layer.
    weight_init : str
        Forwarded unchanged to each ``Layer``.

    ``self.layers`` holds ``num_hidden_layer + 1`` layers; all per-layer lists
    passed to the optimiser methods (``dw``, ``db``, moment buffers) are
    indexed the same way.
    """

    def __init__(self, num_inputs, num_classes, num_hidden_layer, num_neurons, activation, weight_init):
        if num_hidden_layer < 1:
            raise ValueError("num_hidden_layer must be at least 1")
        self.num_inputs = num_inputs
        self.num_classes = num_classes
        self.num_hidden_layer = num_hidden_layer
        self.num_neurons = num_neurons
        self.activation = activation
        self.weight_init = weight_init

        self.layers = [Layer(num_inputs, num_neurons, activation, weight_init)]
        for _ in range(num_hidden_layer - 1):
            # Every hidden layer honours the requested activation.
            self.layers.append(Layer(num_neurons, num_neurons, activation, weight_init))
        self.layers.append(Layer(num_neurons, num_classes, 'softmax', weight_init))

    def forward(self, inputs):
        """Propagate a 2-D batch through all layers and return the softmax output.

        The batch is kept in ``self.inputs`` and the output in ``self.y_pred``
        for use by ``backward``.
        """
        self.inputs = inputs
        signal = inputs
        for layer in self.layers:
            signal = layer.forward(signal)
        self.y_pred = signal
        return signal

    def backward(self, outputs):
        """Back-propagate from one-hot targets ``outputs``.

        Must be called after ``forward`` on the matching batch. Leaves the
        batch-summed gradients in each layer's ``dw`` / ``db``.
        """
        # Softmax + cross-entropy: dL/da at the output is (y_pred - y).
        grad_a = self.y_pred - outputs
        for i in range(self.num_hidden_layer, 0, -1):
            below = self.layers[i - 1]
            grad_a = self.layers[i].backward(grad_a, below.a, below.h, below.grad_activation)

        # The first layer's "previous activation" is the raw input batch.
        self.layers[0].dw = np.dot(grad_a.T, self.inputs)
        self.layers[0].db = np.sum(grad_a, axis=0)

    def minibatch_sgd(self, dw, db, eta: float = 0.01):
        """Plain gradient step: ``param -= eta * grad`` for every layer."""
        for j, layer in enumerate(self.layers):
            layer.w -= eta * dw[j]
            layer.b -= eta * db[j]

    def momentum_gd(self, uw, ub, dw, db, eta: float = 0.01, beta: float = 0.9):
        """Momentum step. Updates the velocity lists ``uw``/``ub`` in place and returns them."""
        for j, layer in enumerate(self.layers):
            uw[j] = beta * uw[j] + dw[j]
            ub[j] = beta * ub[j] + db[j]
            layer.w -= eta * uw[j]
            layer.b -= eta * ub[j]
        return uw, ub

    def NAG_gd(self, mw, mb, dw, db, eta: float = 0.01, beta: float = 0.9):
        """Nesterov-style step: the update uses ``beta * m + grad`` with the freshly updated ``m``.

        Updates ``mw``/``mb`` in place and returns them.
        """
        for j, layer in enumerate(self.layers):
            mw[j] = beta * mw[j] + dw[j]
            mb[j] = beta * mb[j] + db[j]
            layer.w -= eta * (beta * mw[j] + dw[j])
            layer.b -= eta * (beta * mb[j] + db[j])
        return mw, mb

    def RMSProp_gd(self, uw, ub, dw, db, eta: float = 0.01, beta: float = 0.9, epsilon: float = 1e-8):
        """RMSProp step. ``uw``/``ub`` hold the running mean of squared gradients."""
        for j, layer in enumerate(self.layers):
            uw[j] = beta * uw[j] + (1 - beta) * dw[j] ** 2
            ub[j] = beta * ub[j] + (1 - beta) * db[j] ** 2
            layer.w -= eta * dw[j] / (np.sqrt(uw[j]) + epsilon)
            layer.b -= eta * db[j] / (np.sqrt(ub[j]) + epsilon)
        return uw, ub

    def Adam_gd(self, mw, mb, uw, ub, dw, db, t, eta: float = 0.01, beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8):
        """Adam step with bias correction by ``1 - beta**t`` (``t`` is the 1-based step count)."""
        m_corr = 1 - beta1 ** t
        u_corr = 1 - beta2 ** t
        for j, layer in enumerate(self.layers):
            mw[j] = beta1 * mw[j] + (1 - beta1) * dw[j]
            mb[j] = beta1 * mb[j] + (1 - beta1) * db[j]
            uw[j] = beta2 * uw[j] + (1 - beta2) * (dw[j] ** 2)
            ub[j] = beta2 * ub[j] + (1 - beta2) * (db[j] ** 2)
            layer.w -= eta * (mw[j] / m_corr) / (np.sqrt(uw[j] / u_corr) + epsilon)
            layer.b -= eta * (mb[j] / m_corr) / (np.sqrt(ub[j] / u_corr) + epsilon)
        return mw, mb, uw, ub

    def NAdam_gd(self, mw, mb, uw, ub, dw, db, t, eta: float = 0.01, beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8):
        """NAdam step.

        Bias correction uses exponent ``t + 1`` for both moments.
        NOTE(review): ``Adam_gd`` uses exponent ``t`` with the same 1-based
        counter — confirm which convention is intended here.
        """
        m_corr = 1 - np.power(beta1, t + 1)
        u_corr = 1 - np.power(beta2, t + 1)
        for j, layer in enumerate(self.layers):
            mw[j] = beta1 * mw[j] + (1 - beta1) * dw[j]
            mb[j] = beta1 * mb[j] + (1 - beta1) * db[j]
            uw[j] = beta2 * uw[j] + (1 - beta2) * dw[j] ** 2
            ub[j] = beta2 * ub[j] + (1 - beta2) * db[j] ** 2
            m_w_hat = mw[j] / m_corr
            m_b_hat = mb[j] / m_corr
            uw_hat = uw[j] / u_corr
            ub_hat = ub[j] / u_corr
            layer.w -= (eta / (np.sqrt(uw_hat) + epsilon)) * (beta1 * m_w_hat + (1 - beta1) * dw[j] / m_corr)
            layer.b -= (eta / (np.sqrt(ub_hat) + epsilon)) * (beta1 * m_b_hat + (1 - beta1) * db[j] / m_corr)
        return mw, mb, uw, ub

    def train(self, X_train, y_train, batch_size, epochs, optimizer, eta: float = 0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Train with mini-batches and print training accuracy after each epoch.

        Parameters
        ----------
        X_train : ndarray, shape (n_samples, num_inputs)
        y_train : ndarray, shape (n_samples, num_classes), one-hot
        batch_size : int, > 0
        epochs : int
        optimizer : str
            One of ``"minibatch_sgd"``, ``"momentum_gd"``, ``"NAG_gd"``,
            ``"RMSProp_gd"``, ``"Adam_gd"``, ``"NAdam_gd"``.
        eta, beta1, beta2, epsilon : float
            Optimiser hyper-parameters; ``beta1`` doubles as the decay for the
            momentum / NAG / RMSProp variants.

        Raises
        ------
        ValueError
            For an unknown optimiser name or a non-positive batch size.
        """
        known = ("minibatch_sgd", "momentum_gd", "NAG_gd", "RMSProp_gd", "Adam_gd", "NAdam_gd")
        if optimizer not in known:
            raise ValueError(f"Unknown optimizer {optimizer!r}; expected one of {known}")
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        # Optimiser state lives for the whole run: resetting it every epoch
        # would discard the accumulated moments and restart bias correction.
        uw = [np.zeros_like(layer.w) for layer in self.layers]
        ub = [np.zeros_like(layer.b) for layer in self.layers]
        mw = [np.zeros_like(layer.w) for layer in self.layers]
        mb = [np.zeros_like(layer.b) for layer in self.layers]
        t = 1

        for _ in range(epochs):
            for start in tqdm(range(0, X_train.shape[0], batch_size)):
                x = X_train[start:start + batch_size]
                y = y_train[start:start + batch_size]
                self.forward(x)
                self.backward(y)

                # Average over the rows actually in this batch (the last
                # batch may be shorter), matching the per-sample mean used
                # by cross_entropy.
                n_batch = x.shape[0]
                dw = [layer.dw / n_batch for layer in self.layers]
                db = [layer.db / n_batch for layer in self.layers]

                if optimizer == "minibatch_sgd":
                    self.minibatch_sgd(dw, db, eta)
                elif optimizer == "momentum_gd":
                    uw, ub = self.momentum_gd(uw, ub, dw, db, eta, beta1)
                elif optimizer == "NAG_gd":
                    mw, mb = self.NAG_gd(mw, mb, dw, db, eta, beta1)
                elif optimizer == "RMSProp_gd":
                    uw, ub = self.RMSProp_gd(uw, ub, dw, db, eta, beta1, epsilon)
                elif optimizer == "Adam_gd":
                    mw, mb, uw, ub = self.Adam_gd(mw, mb, uw, ub, dw, db, t, eta, beta1, beta2, epsilon)
                else:  # "NAdam_gd"
                    mw, mb, uw, ub = self.NAdam_gd(mw, mb, uw, ub, dw, db, t, eta, beta1, beta2, epsilon)
                t += 1
            self.test(X_train, y_train)

    def test(self, X_test, y_test):
        """Print the classification accuracy on ``X_test`` against one-hot ``y_test``."""
        self.forward(X_test)
        # argmax over the output layer's pre-activations picks the predicted class.
        predicted = np.argmax(self.layers[-1].a, axis=1)
        expected = np.argmax(y_test, axis=1)

        print(np.sum(predicted == expected) / expected.shape[0])

    def cross_entropy(self, y_pred, y):
        """Mean cross-entropy between predicted probabilities and one-hot targets.

        Probabilities are clipped away from zero so a confident wrong
        prediction yields a large finite loss instead of ``inf`` / ``nan``.
        """
        safe_pred = np.clip(y_pred, 1e-12, 1.0)
        return -np.sum(y * np.log(safe_pred)) / y_pred.shape[0]


In [None]:
# Baseline run: one hidden ReLU layer of 64 units trained with mini-batch SGD.
BATCH_SIZE = 25
EPOCHS = 1  # NOTE(review): the original cell never set an epoch count — adjust as needed
LEARNING_RATE = 0.01

nn = NeuralNetwork(784, 10, 1, 64, 'ReLU', 'random')
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Flatten each 28x28 image to 784 features and scale pixels to [0, 1].
# The loader returns uint8 arrays, so convert to float first: an in-place
# `/= 255` on uint8 fails because the float result cannot be cast back.
X_train = X_train.reshape(X_train.shape[0], 784).astype(np.float64) / 255
X_test = X_test.reshape(X_test.shape[0], 784).astype(np.float64) / 255

# One-hot encode the integer class labels.
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

# `train` drives the batching loop; `minibatch_sgd` is only the per-step
# parameter update and expects gradients, not data.
nn.train(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
         optimizer="minibatch_sgd", eta=LEARNING_RATE)
nn.test(X_test, y_test)

In [None]:
#     # momentum based gradient descent
# def train_momentum(self, X_train, y_train, epochs, learning_rate, gamma):
#     for i in range(epochs):
#         dw, db = 0, 0
#         for x,y in zip(X_train,y_train):
#             self.forward(x)
#             self.backward(y)
#             dw += self.layers[0].grad_w
#             db += self.layers[0].grad_b
#         for k in range(self.num_hidden_layer+1):
#             uw[k] = gamma*prev_uw + learning_rate*dw
#             ub = gamma*prev_ub + learning_rate*db
#             self.layers[k].weights -= uw
#             self.layers[k].biases -= ub
#         prev_uw = uw
#         prev_ub = ub

# class MGD(Optimiser):
# 	def __init__(self, model : Model = None, learning_rate : float = 0.01, weight_decay : float = 0.0, momentum : float = 0.9):
# 		super().__init__(model, learning_rate, weight_decay)
# 		self.momentum = momentum
# 		self.u_w = [np.zeros_like(self.model.layers[i].weights) for i in range(len(self.model.layers))]
# 		self.u_b = [np.zeros_like(self.model.layers[i].bias) for i in range(len(self.model.layers))]

# 	def step(self) :
# 		i = 0
# 		for layer in self.model.layers:
# 			self.u_w[i] = self.momentum * self.u_w[i] + layer.grad_w
# 			self.u_b[i] = self.momentum * self.u_b[i] + layer.grad_b
# 			layer.weights -= self.learning_rate * (self.u_w[i] + self.weight_decay * layer.weights)
# 			layer.bias -= self.learning_rate * (self.u_b[i] + self.weight_decay * layer.bias)
# 			i += 1

    

        
# def do_stochastic_gradient_descent():
  
#   w,b,eta,max_epochs = -2,-2,1.0,1000
  
#   for i in range(max_epochs):
#     dw,db = 0,0
#     for x,y in zip(X,Y):
#       dw += grad_w(x,w,b,y)
#       db += grad_b(x,w,b,y)    
#       w = w - eta*dw
#       b = b - eta*db