In [81]:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist
import copy

# Load Fashion-MNIST through Keras; load_data() hands back a (train, test)
# pair, each of which is an (images, labels) tuple.
train_split, test_split = fashion_mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [82]:
# Distinct label values that occur in the training labels.
classes = set(y_train)
# One example image per class: the first training sample carrying each label.
variousSamples = [x_train[np.where(y_train == label)[0][0]] for label in classes]

In [83]:
# Flatten every image into a column vector: (N, rows, cols) -> (N, rows*cols, 1).
n_train, n_rows, n_cols = x_train.shape[:3]
x_flatten_train = x_train.reshape(n_train, n_rows * n_cols, 1)

# One-hot encode the labels: start from zeros, then set column y_train[n]
# of row n to 1.
label_rows = np.arange(y_train.shape[0])
y_encoded = np.zeros((label_rows.size, max(classes) + 1))
y_encoded[label_rows, y_train] = 1

In [84]:
# Add a trailing axis so every one-hot label is a column vector:
# (N, C) -> (N, C, 1).  The sizes come from the array itself instead of the
# hard-coded 60000 / 10, and re-running the cell leaves the shape unchanged.
y_encoded = y_encoded.reshape(y_encoded.shape[0], -1, 1)

In [85]:
def normalize_data(x):
  """Convert an array to float32 and divide every element by 255.

  Args:
    x: NumPy array of raw values (presumably 8-bit pixel intensities in
      0..255, which this maps onto 0..1).

  Returns:
    A new float32 array with the same shape as ``x``; ``x`` is not modified.
  """
  return x.astype(np.float32) / 255.0

# Build the normalized inputs directly from the raw images so this cell is
# idempotent: re-running it can no longer divide already-normalized data by
# 255 a second time.  Shape: (N, rows, cols) -> (N, rows*cols, 1).
x_flatten_train = normalize_data(x_train.reshape(x_train.shape[0], -1, 1))

In [86]:
# Hyper-parameters read as module-level globals by the cells below.
layers = 3               # number of hidden layers
lr = 0.01                # learning rate for optimizers without their own `lr` argument
epochs = 2               # passes over the training data
samples = len(y_train)   # number of training examples
nodesPerLayer = []       # layer widths; filled in by the next cell

In [87]:
# Layer widths: 784 inputs, then `layers` hidden layers whose width halves
# each time starting at 512, then 10 outputs.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell cannot pile duplicate widths onto an already-filled list.
nodesPerLayer = [784] + [1024 // (2 ** (depth + 1)) for depth in range(layers)] + [10]

In [88]:
def func(activation,a_k):
  """Apply the hidden-layer activation element-wise.

  Args:
    activation: "tanh" selects the hyperbolic tangent; any other value
      selects the logistic sigmoid.
    a_k: pre-activation values (NumPy array or scalar).

  Returns:
    The activated values, with the same shape as ``a_k``.
  """
  if activation == "tanh":
    return np.tanh(a_k)
  # sigmoid(x) == 0.5 * (1 + tanh(x / 2)).  Unlike 1 / (1 + exp(-x)), this
  # form never evaluates exp() of a large positive number, so strongly
  # negative inputs no longer trigger an overflow warning.
  return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(a_k)))

In [89]:
def derivativeFun(activation,a_k):
  """Derivative of the activation with respect to its input, evaluated at a_k.

  The activation value ``f = func(activation, a_k)`` is computed first and the
  derivative is expressed through it: ``1 - f**2`` for "tanh" and
  ``f - f**2`` for every other activation name (the sigmoid branch of
  ``func``).

  Args:
    activation: activation name, forwarded to ``func``.
    a_k: pre-activation values.

  Returns:
    Element-wise derivative, same shape as ``a_k``.
  """
  activated = func(activation, a_k)
  squared = activated ** 2
  if activation == "tanh":
    return 1 - squared
  return activated - squared

In [90]:
def decision(a_k,classificationFunction):
  """Turn output-layer pre-activations into the network's prediction.

  Args:
    a_k: output pre-activations; a 1-D vector, an (n, 1) column, or an
      (n, B) array holding one column per sample.
    classificationFunction: "crossEntropy" applies a softmax along axis 0;
      any other value returns ``a_k`` unchanged.

  Returns:
    Softmax probabilities with the shape of ``a_k`` (each column sums to 1),
    or ``a_k`` itself when no softmax is requested.
  """
  if classificationFunction != "crossEntropy":
    return a_k
  # Subtract each column's own maximum before exponentiating: the softmax is
  # unchanged, exp() cannot overflow, and no column can underflow to all
  # zeros just because a different column holds much larger values.
  exps = np.exp(a_k - np.max(a_k, axis=0, keepdims=True))
  # Vectorized reduction instead of the Python builtin sum().
  return exps / np.sum(exps, axis=0, keepdims=True)

In [91]:
def OneHotEncode(C):
  """Return an array shaped like ``C`` with a single 1 at C's maximum.

  Args:
    C: NumPy array of scores (any shape).

  Returns:
    A float array of zeros with the same shape as ``C`` and a 1 at the
    position of the first maximum of ``C``.
  """
  oneHot = np.zeros(C.shape)
  # np.argmax yields an index into the *flattened* array, so it has to be
  # written through .flat; using it as a row index is only right for 1-D
  # vectors and (n, 1) columns and fails for wider arrays.
  oneHot.flat[np.argmax(C)] = 1
  return oneHot

In [92]:
def forwardProp(inputX,activation,classificationFunction,weights,bias):
  """Run one forward pass through the fully connected network.

  The number of hidden layers is taken from ``weights`` (every layer except
  the last one is hidden) rather than from a module-level global, so the
  function always agrees with the parameters it is handed and with
  ``backProp``, which derives the depth the same way.

  Args:
    inputX: input column vector for one sample.
    activation: hidden-layer activation name, forwarded to ``func``.
    classificationFunction: output rule, forwarded to ``decision``.
    weights: list of weight matrices, one per layer, output layer last.
    bias: list of bias column vectors aligned with ``weights``.

  Returns:
    ``(PreActivations, PostActivations, yPred)``:
    ``PreActivations`` holds the pre-activation of every layer (output layer
    included); ``PostActivations`` holds the input followed by each hidden
    layer's activation; ``yPred`` is ``decision`` applied to the output
    pre-activation.
  """
  hidden_layers = len(weights) - 1
  h_k = inputX
  PreActivations = []
  PostActivations = [h_k]
  for k in range(hidden_layers):
    a_k = bias[k] + np.matmul(weights[k], h_k)
    PreActivations.append(a_k)
    h_k = func(activation, a_k)
    PostActivations.append(h_k)
  # Output layer: affine map only; decision() turns it into the prediction.
  a_k = bias[hidden_layers] + np.matmul(weights[hidden_layers], h_k)
  PreActivations.append(a_k)
  yPred = decision(a_k, classificationFunction)
  return PreActivations,PostActivations,yPred

In [93]:
def backProp(real, pred, h_k, weights, activation, PreActivations):
    """Back-propagate the output error and return per-layer gradients.

    The error at the output pre-activation is taken as ``pred - real`` (the
    gradient of a softmax output under cross-entropy loss) and pushed
    backwards through the layers.

    Args:
        real: target column vector, shape (n_out, 1).
        pred: predicted column vector, same shape as ``real``.
        h_k: inputs seen by each layer, i.e. ``[input, h_1, ..., h_last_hidden]``
            (the ``PostActivations`` list returned by ``forwardProp``).
        weights: list of weight matrices, output layer last.
        activation: hidden-layer activation name, forwarded to ``derivativeFun``.
        PreActivations: pre-activation of every layer, as returned by
            ``forwardProp``.

    Returns:
        ``(WeightGradients, biasGradients)`` ordered like ``weights``; each
        weight gradient has the shape of its weight matrix and each bias
        gradient is a column of shape (n_layer, 1).
    """
    delta = pred - real
    WeightGradients = []
    biasGradients = []

    for i in range(len(weights) - 1, -1, -1):
        # dL/dW_i = delta . h_{i-1}^T; for a single column this is the outer
        # product the element-wise broadcast used to produce.
        WeightGradients.append(np.matmul(delta, np.transpose(h_k[i])))
        # dL/db_i is delta itself, one entry per neuron.  Reducing over the
        # sample axis (axis=1) keeps the (n, 1) shape; reducing over axis 0
        # would collapse all neurons into a single scalar, which makes every
        # bias of a layer receive the same update and makes the output-layer
        # bias gradient vanish because pred - real sums to about zero.
        biasGradients.append(np.sum(delta, axis=1, keepdims=True))

        if i > 0:
            # Carry the error through W_i and the activation of layer i-1.
            upstream = np.matmul(weights[i].T, delta)
            delta = upstream * derivativeFun(activation, PreActivations[i - 1])

    # Gradients were collected output-first; return them input-first.
    WeightGradients.reverse()
    biasGradients.reverse()
    return WeightGradients, biasGradients


In [103]:
def stochastic_gradient_descent(nodesPerLayer, x_flatten_train, y_encoded, batch_size):
    """Train the network with plain mini-batch gradient descent.

    Each full batch of ``batch_size`` consecutive samples produces one update:
    the per-sample gradients from ``backProp`` are summed, averaged over the
    batch and applied with the learning rate ``lr``.  Hidden layers use the
    sigmoid activation and the output goes through the "crossEntropy"
    decision.  Every full batch of the data is visited in each epoch (the
    loop previously stopped after a fixed 20 batches).

    Args:
        nodesPerLayer: layer widths, input layer first, output layer last.
        x_flatten_train: training inputs, indexed by sample.
        y_encoded: training targets aligned with ``x_flatten_train``.
        batch_size: number of samples per update; samples beyond the last
            full batch are not used.

    Returns:
        ``(weights, bias)``: lists with one entry per layer.

    Notes:
        Reads the module-level globals ``epochs`` and ``lr``.
    """
    # Initialize weights and biases
    weights = [np.random.randn(nodesPerLayer[i], nodesPerLayer[i-1]) * 0.1 for i in range(1, len(nodesPerLayer))]
    bias = [np.random.randn(nodesPerLayer[i], 1) * 0.1 for i in range(1, len(nodesPerLayer))]

    num_batches = len(x_flatten_train) // batch_size

    for epoch in range(epochs):
        print("Epoch:", epoch)
        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size

            batch_x = x_flatten_train[start:end]
            batch_y = y_encoded[start:end]

            # Running sums of the per-sample gradients for this batch.
            batch_Wdelta = [np.zeros_like(w) for w in weights]
            batch_Bdelta = [np.zeros_like(b) for b in bias]

            for j in range(len(batch_x)):
                A, B, C = forwardProp(batch_x[j], "sigmoid", "crossEntropy", weights, bias)
                Wdelta, Bdelta = backProp(batch_y[j], C, B, weights, "sigmoid", A)

                for k in range(len(batch_Wdelta)):
                    batch_Wdelta[k] += Wdelta[k]
                    batch_Bdelta[k] += Bdelta[k]

            # One descent step along the batch-averaged gradient.
            for k in range(len(weights)):
                weights[k] -= lr * (batch_Wdelta[k] / batch_size)
                bias[k] -= lr * (batch_Bdelta[k] / batch_size)

    return weights, bias


In [104]:
def momentum_gradient_descent(nodesPerLayer, x_flatten_train, y_encoded, gamma, batch_size):
    """Train the network with mini-batch gradient descent plus momentum.

    For every full batch the per-sample gradients are summed, and each
    layer's velocity is updated as
    ``v = gamma * v + lr * grad_sum / batch_size`` before being subtracted
    from the parameters.  Hidden layers use the sigmoid activation and the
    output goes through the "crossEntropy" decision.

    Args:
        nodesPerLayer: layer widths, input layer first, output layer last.
        x_flatten_train: training inputs, indexed by sample.
        y_encoded: training targets aligned with ``x_flatten_train``.
        gamma: momentum coefficient applied to the previous velocity.
        batch_size: number of samples per update; samples beyond the last
            full batch are not used.

    Returns:
        ``(weights, bias)``: lists with one entry per layer.

    Notes:
        Reads the module-level globals ``epochs`` and ``lr``.
    """
    # (fan_out, fan_in) of every layer, input side first.
    layer_shapes = list(zip(nodesPerLayer[1:], nodesPerLayer[:-1]))

    # Small random parameters; all weights are drawn before any bias.
    weights = [np.random.randn(fan_out, fan_in) * 0.1 for fan_out, fan_in in layer_shapes]
    bias = [np.random.randn(fan_out, 1) * 0.1 for fan_out, _ in layer_shapes]

    # Velocities start at rest.
    velocity_w = [np.zeros((fan_out, fan_in)) for fan_out, fan_in in layer_shapes]
    velocity_b = [np.zeros((fan_out, 1)) for fan_out, _ in layer_shapes]

    num_batches = len(x_flatten_train) // batch_size

    for epoch in range(epochs):
        print("Epoch:", epoch)
        for batch_index in range(num_batches):
            lo = batch_index * batch_size
            hi = (batch_index + 1) * batch_size
            batch_x = x_flatten_train[lo:hi]
            batch_y = y_encoded[lo:hi]

            grad_sum_w = [np.zeros_like(w) for w in weights]
            grad_sum_b = [np.zeros_like(b) for b in bias]

            for j, sample in enumerate(batch_x):
                pre, post, predicted = forwardProp(sample, "sigmoid", "crossEntropy", weights, bias)
                grad_w, grad_b = backProp(batch_y[j], predicted, post, weights, "sigmoid", pre)

                for k in range(len(grad_sum_w)):
                    grad_sum_w[k] += grad_w[k]
                    grad_sum_b[k] += grad_b[k]

            for k in range(len(weights)):
                # Blend the previous velocity with the scaled batch gradient.
                velocity_w[k] = gamma * velocity_w[k] + lr * grad_sum_w[k] / batch_size
                velocity_b[k] = gamma * velocity_b[k] + lr * grad_sum_b[k] / batch_size

                weights[k] -= velocity_w[k]
                bias[k] -= velocity_b[k]

    return weights, bias


In [105]:
def nesterov_gradient_descent(nodesPerLayer,x_flatten_train,y_encoded,gamma, batch_size):
    """Train the network with Nesterov accelerated gradient descent.

    For every full batch the gradient is evaluated at the look-ahead point
    ``theta - gamma * v`` (``v`` being the velocity carried over from the
    previous update), averaged over the batch, and then used in
    ``v = gamma * v + lr * grad`` followed by ``theta -= v``.

    The earlier version built the look-ahead from the weights themselves
    (``w - gamma * w``), kept no velocity, and applied a plain per-sample
    gradient step, so no momentum was involved at all.

    Hidden layers use the sigmoid activation and the output goes through the
    "crossEntropy" decision.

    Args:
        nodesPerLayer: layer widths, input layer first, output layer last.
        x_flatten_train: training inputs, indexed by sample.
        y_encoded: training targets aligned with ``x_flatten_train``.
        gamma: momentum coefficient applied to the previous velocity.
        batch_size: number of samples per update; samples beyond the last
            full batch are not used.

    Returns:
        ``(weights, bias)``: lists with one entry per layer.

    Notes:
        Reads the module-level globals ``epochs`` and ``lr``.
    """
    # Initialize weights and biases
    weights = [np.random.randn(nodesPerLayer[i], nodesPerLayer[i-1]) * 0.1 for i in range(1, len(nodesPerLayer))]
    bias = [np.random.randn(nodesPerLayer[i], 1) * 0.1 for i in range(1, len(nodesPerLayer))]

    # Velocities (previous updates), initially zero.
    Wdelta = [np.zeros_like(w) for w in weights]
    Bdelta = [np.zeros_like(b) for b in bias]

    num_batches = len(x_flatten_train) // batch_size

    for epoch in range(epochs):
        print("Epoch:", epoch)
        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size

            batch_x = x_flatten_train[start:end]
            batch_y = y_encoded[start:end]

            # Look ahead along the current velocity before measuring the gradient.
            lookahead_weights = [w - gamma * v for w, v in zip(weights, Wdelta)]
            lookahead_bias = [b - gamma * v for b, v in zip(bias, Bdelta)]

            batch_Wdelta = [np.zeros_like(w) for w in weights]
            batch_Bdelta = [np.zeros_like(b) for b in bias]

            for j in range(len(batch_x)):
                A, B, C = forwardProp(batch_x[j], "sigmoid", "crossEntropy", lookahead_weights, lookahead_bias)
                CurrWdelta, CurrBdelta = backProp(batch_y[j], C, B, lookahead_weights, "sigmoid", A)

                for k in range(len(batch_Wdelta)):
                    batch_Wdelta[k] += CurrWdelta[k]
                    batch_Bdelta[k] += CurrBdelta[k]

            for k in range(len(weights)):
                Wdelta[k] = gamma * Wdelta[k] + lr * batch_Wdelta[k] / batch_size
                Bdelta[k] = gamma * Bdelta[k] + lr * batch_Bdelta[k] / batch_size

                weights[k] -= Wdelta[k]
                bias[k] -= Bdelta[k]

    return weights, bias


In [106]:
def rmsprop(nodesPerLayer, x_flatten_train, y_encoded, beta, eps, epochs, batch_size= 200, lr=0.001):
    """Train the network with RMSprop.

    For every full batch the per-sample gradients are averaged; a running
    average of the squared gradient, ``s = beta * s + (1 - beta) * g**2``,
    then scales the step ``lr * g / (sqrt(s) + eps)``.  Hidden layers use the
    tanh activation and the output goes through the "crossEntropy" decision.

    Every full batch of the data is visited in each epoch (the loop
    previously stopped after a fixed 100 batches), and the batch gradient is
    averaged like in the SGD and momentum optimizers of this file.

    Args:
        nodesPerLayer: layer widths, input layer first, output layer last.
        x_flatten_train: training inputs, indexed by sample.
        y_encoded: training targets aligned with ``x_flatten_train``.
        beta: decay rate of the squared-gradient running average.
        eps: small constant added to the denominator.
        epochs: number of passes over the data.
        batch_size: number of samples per update; samples beyond the last
            full batch are not used.
        lr: learning rate.

    Returns:
        ``(weights, bias)``: lists with one entry per layer.
    """
    # Initialize weights and biases
    weights = [np.random.randn(nodesPerLayer[i], nodesPerLayer[i-1]) * 0.1 for i in range(1, len(nodesPerLayer))]
    bias = [np.random.randn(nodesPerLayer[i], 1) * 0.1 for i in range(1, len(nodesPerLayer))]

    # Running averages of the squared gradients.
    rmsweights = [np.zeros_like(w) for w in weights]
    rmsbias = [np.zeros_like(b) for b in bias]

    num_batches = len(x_flatten_train) // batch_size

    for epoch in range(epochs):
        print("Epoch:", epoch)
        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size

            batch_x = x_flatten_train[start:end]
            batch_y = y_encoded[start:end]

            batch_w_delta = [np.zeros_like(w) for w in weights]
            batch_b_delta = [np.zeros_like(b) for b in bias]

            for j in range(len(batch_x)):
                A, B, C = forwardProp(batch_x[j], "tanh", "crossEntropy", weights, bias)
                CurrWdelta, CurrBdelta = backProp(batch_y[j], C, B, weights, "tanh", A)

                for k in range(len(CurrWdelta)):
                    batch_w_delta[k] += CurrWdelta[k]
                    batch_b_delta[k] += CurrBdelta[k]

            for k in range(len(weights)):
                grad_w = batch_w_delta[k] / batch_size
                grad_b = batch_b_delta[k] / batch_size

                rmsweights[k] = beta * rmsweights[k] + (1 - beta) * (grad_w ** 2)
                rmsbias[k] = beta * rmsbias[k] + (1 - beta) * (grad_b ** 2)

                weights[k] -= (lr * grad_w) / (np.sqrt(rmsweights[k]) + eps)
                bias[k] -= (lr * grad_b) / (np.sqrt(rmsbias[k]) + eps)

    return weights, bias

In [107]:
def adam(nodesPerLayer, x_flatten_train, y_encoded, beta1, beta2, eps, batch_size, lr=0.001):
    """Train the network with Adam.

    For every full batch the per-sample gradients are averaged and fed into
    the first- and second-moment running averages.  Both moments are
    bias-corrected with ``1 - beta ** t``, where ``t`` counts parameter
    updates across all epochs.  (Correcting with the epoch number keeps the
    factor constant for a whole epoch and over-scales the steps.)

    Hidden layers use the tanh activation and the output goes through the
    "crossEntropy" decision.  Every full batch of the data is visited in each
    epoch (the loop previously stopped after a fixed 50 batches).

    Args:
        nodesPerLayer: layer widths, input layer first, output layer last.
        x_flatten_train: training inputs, indexed by sample.
        y_encoded: training targets aligned with ``x_flatten_train``.
        beta1: decay rate of the first-moment (mean) estimate.
        beta2: decay rate of the second-moment (squared gradient) estimate.
        eps: small constant added to the denominator.
        batch_size: number of samples per update; samples beyond the last
            full batch are not used.
        lr: learning rate.

    Returns:
        ``(weights, bias)``: lists with one entry per layer.

    Notes:
        Reads the module-level global ``epochs``.
    """
    # Initialize weights and biases
    weights = [np.random.randn(nodesPerLayer[i], nodesPerLayer[i-1]) * 0.1 for i in range(1, len(nodesPerLayer))]
    bias = [np.random.randn(nodesPerLayer[i], 1) * 0.1 for i in range(1, len(nodesPerLayer))]

    # First (m) and second (v) moment estimates, initially zero.
    m_weights = [np.zeros_like(w) for w in weights]
    v_weights = [np.zeros_like(w) for w in weights]
    m_bias = [np.zeros_like(b) for b in bias]
    v_bias = [np.zeros_like(b) for b in bias]

    num_batches = len(x_flatten_train) // batch_size
    step = 0  # number of parameter updates performed so far

    for epoch in range(epochs):
        print("Epoch:", epoch)
        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size

            batch_x = x_flatten_train[start:end]
            batch_y = y_encoded[start:end]

            batch_w_delta = [np.zeros_like(w) for w in weights]
            batch_b_delta = [np.zeros_like(b) for b in bias]

            for j in range(len(batch_x)):
                A, B, C = forwardProp(batch_x[j], "tanh", "crossEntropy", weights, bias)
                CurrWdelta, CurrBdelta = backProp(batch_y[j], C, B, weights, "tanh", A)

                for k in range(len(CurrWdelta)):
                    batch_w_delta[k] += CurrWdelta[k]
                    batch_b_delta[k] += CurrBdelta[k]

            step += 1
            m_correction = 1 - beta1 ** step
            v_correction = 1 - beta2 ** step

            for k in range(len(weights)):
                grad_w = batch_w_delta[k] / batch_size
                grad_b = batch_b_delta[k] / batch_size

                m_weights[k] = beta1 * m_weights[k] + (1 - beta1) * grad_w
                v_weights[k] = beta2 * v_weights[k] + (1 - beta2) * (grad_w ** 2)
                m_bias[k] = beta1 * m_bias[k] + (1 - beta1) * grad_b
                v_bias[k] = beta2 * v_bias[k] + (1 - beta2) * (grad_b ** 2)

                m_weights_hat = m_weights[k] / m_correction
                v_weights_hat = v_weights[k] / v_correction
                m_bias_hat = m_bias[k] / m_correction
                v_bias_hat = v_bias[k] / v_correction

                weights[k] -= (lr * m_weights_hat) / (np.sqrt(v_weights_hat) + eps)
                bias[k] -= (lr * m_bias_hat) / (np.sqrt(v_bias_hat) + eps)

    return weights, bias


In [108]:
def nadam(nodesPerLayer, x_flatten_train, y_encoded, beta1, beta2, eps, batch_size, lr=0.001):
    """Train the network with Nadam (Adam with a Nesterov-style step).

    For every full batch the per-sample gradients are averaged and fed into
    the first- and second-moment running averages.  The step is
    ``lr * (beta1 * m_hat + (1 - beta1) * g / (1 - beta1**t)) / (sqrt(v_hat) + eps)``
    where ``t`` counts parameter updates across all epochs and ``m_hat`` /
    ``v_hat`` are the bias-corrected moments.  Bias correction previously
    used the epoch number, and the raw-gradient term was left uncorrected.

    Hidden layers use the tanh activation and the output goes through the
    "crossEntropy" decision.  Every full batch of the data is visited in each
    epoch (the loop previously stopped after a fixed 20 batches).

    Args:
        nodesPerLayer: layer widths, input layer first, output layer last.
        x_flatten_train: training inputs, indexed by sample.
        y_encoded: training targets aligned with ``x_flatten_train``.
        beta1: decay rate of the first-moment (mean) estimate.
        beta2: decay rate of the second-moment (squared gradient) estimate.
        eps: small constant added to the denominator.
        batch_size: number of samples per update; samples beyond the last
            full batch are not used.
        lr: learning rate.

    Returns:
        ``(weights, bias)``: lists with one entry per layer.

    Notes:
        Reads the module-level global ``epochs``.
    """
    # Initialize weights and biases
    weights = [np.random.randn(nodesPerLayer[i], nodesPerLayer[i-1]) * 0.1 for i in range(1, len(nodesPerLayer))]
    bias = [np.random.randn(nodesPerLayer[i], 1) * 0.1 for i in range(1, len(nodesPerLayer))]

    # First (m) and second (v) moment estimates, initially zero.
    m_weights = [np.zeros_like(w) for w in weights]
    v_weights = [np.zeros_like(w) for w in weights]
    m_bias = [np.zeros_like(b) for b in bias]
    v_bias = [np.zeros_like(b) for b in bias]

    num_batches = len(x_flatten_train) // batch_size
    step = 0  # number of parameter updates performed so far

    for epoch in range(epochs):
        print("Epoch:", epoch)
        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size
            batch_x = x_flatten_train[start:end]
            batch_y = y_encoded[start:end]

            batch_w_delta = [np.zeros_like(w) for w in weights]
            batch_b_delta = [np.zeros_like(b) for b in bias]

            for j in range(len(batch_x)):
                A, B, C = forwardProp(batch_x[j], "tanh", "crossEntropy", weights, bias)
                CurrWdelta, CurrBdelta = backProp(batch_y[j], C, B, weights, "tanh", A)

                for k in range(len(CurrWdelta)):
                    batch_w_delta[k] += CurrWdelta[k]
                    batch_b_delta[k] += CurrBdelta[k]

            step += 1
            m_correction = 1 - beta1 ** step
            v_correction = 1 - beta2 ** step

            for k in range(len(weights)):
                grad_w = batch_w_delta[k] / batch_size
                grad_b = batch_b_delta[k] / batch_size

                m_weights[k] = beta1 * m_weights[k] + (1 - beta1) * grad_w
                v_weights[k] = beta2 * v_weights[k] + (1 - beta2) * (grad_w ** 2)
                m_bias[k] = beta1 * m_bias[k] + (1 - beta1) * grad_b
                v_bias[k] = beta2 * v_bias[k] + (1 - beta2) * (grad_b ** 2)

                m_weights_hat = m_weights[k] / m_correction
                v_weights_hat = v_weights[k] / v_correction
                m_bias_hat = m_bias[k] / m_correction
                v_bias_hat = v_bias[k] / v_correction

                # Nesterov look-ahead: mix the corrected momentum with the
                # (equally corrected) current gradient.
                nesterov_w = beta1 * m_weights_hat + (1 - beta1) * grad_w / m_correction
                nesterov_b = beta1 * m_bias_hat + (1 - beta1) * grad_b / m_correction

                weights[k] -= lr * nesterov_w / (np.sqrt(v_weights_hat) + eps)
                bias[k] -= lr * nesterov_b / (np.sqrt(v_bias_hat) + eps)

    return weights, bias

In [109]:
def gradient_descent(nodesPerLayer,x_flatten_train,y_encoded):
  """Train the network with full-batch gradient descent.

  Every epoch accumulates the gradient over all samples of
  ``x_flatten_train`` and then performs a single update with the averaged
  gradient.  The sample count comes from the data passed in (it used to be
  read from the global ``y_train``), and the gradient is averaged so the step
  size does not grow with the size of the dataset, as in the mini-batch
  optimizers of this file.  Hidden layers use the sigmoid activation and the
  output goes through the "crossEntropy" decision.

  Args:
    nodesPerLayer: layer widths, input layer first, output layer last.
    x_flatten_train: training inputs, indexed by sample.
    y_encoded: training targets aligned with ``x_flatten_train``.

  Returns:
    ``(weights, bias)``: lists with one entry per layer.

  Notes:
    Reads the module-level globals ``epochs`` and ``lr``.
  """
  weights = list()
  bias = list()
  for i in range(1,len(nodesPerLayer)):
    weights.append(np.random.randn(nodesPerLayer[i],nodesPerLayer[i-1])*0.1)
    # NOTE(review): biases are drawn without the 0.1 scale that the other
    # optimizers in this file apply - confirm whether that is intentional.
    bias.append(np.random.randn(nodesPerLayer[i],1))

  num_samples = len(x_flatten_train)
  for epoch in range(0,epochs):
    print("Epoch:" + str(epoch))
    # Fresh gradient accumulators for this pass over the data.
    Wdelta = [np.zeros_like(w) for w in weights]
    Bdelta = [np.zeros_like(b) for b in bias]
    for j in range(0,num_samples):
      A,B,C = forwardProp(x_flatten_train[j],"sigmoid","crossEntropy",weights,bias)
      CurrWdelta,CurrBdelta = backProp(y_encoded[j],C,B,weights,"sigmoid",A)
      for k in range(0,len(Wdelta)):
        Wdelta[k] += CurrWdelta[k]
        Bdelta[k] += CurrBdelta[k]
      # Progress indicator: one line per 1000 processed samples.
      if(j%1000 == 0):
        print(j/1000)
    if num_samples:
      for k in range(0,len(weights)):
        weights[k] = weights[k] - lr*(Wdelta[k]/num_samples)
        bias[k] = bias[k] - lr*(Bdelta[k]/num_samples)
  return weights,bias

In [102]:
def trainModel(optimizer,x_train,y_train,nodesPerLayer):
  """Train the network with the optimizer selected by name.

  Args:
    optimizer: one of "gradient_descent", "SGD", "momentumGD", "nesterovGD",
      "rmsprop", "adam" or "nadam".
    x_train: training inputs handed to the optimizer.
    y_train: training targets handed to the optimizer.
    nodesPerLayer: layer widths, input layer first, output layer last.

  Returns:
    ``(weights, bias)`` as returned by the chosen optimizer.

  Raises:
    ValueError: if ``optimizer`` is not one of the supported names (this used
      to return two empty lists silently).
  """
  beta1 = 0.89
  beta2 = 0.95
  eps = 1e-6
  batch_size = 200
  gamma = 0.6      # momentum coefficient for the momentum / Nesterov optimizers
  rms_beta = 0.6   # squared-gradient decay rate for RMSprop
  adaptive_lr = 0.001
  if(optimizer == "gradient_descent"):
    return gradient_descent(nodesPerLayer,x_train,y_train)
  elif(optimizer == "SGD"):
    return stochastic_gradient_descent(nodesPerLayer,x_train,y_train,batch_size)
  elif(optimizer == "momentumGD"):
    return momentum_gradient_descent(nodesPerLayer,x_train,y_train,gamma,batch_size)
  elif(optimizer == "nesterovGD"):
    return nesterov_gradient_descent(nodesPerLayer,x_train,y_train,gamma,batch_size)
  elif(optimizer == "rmsprop"):
    # rmsprop takes the epoch count explicitly; pass the notebook-wide setting
    # that every other optimizer reads.
    return rmsprop(nodesPerLayer,x_train,y_train,rms_beta,eps,epochs,batch_size,adaptive_lr)
  elif(optimizer == "adam"):
    # Train on the data passed to this function, not on the notebook globals.
    return adam(nodesPerLayer,x_train,y_train,beta1,beta2,eps,batch_size,adaptive_lr)
  elif(optimizer == "nadam"):
    return nadam(nodesPerLayer,x_train,y_train,beta1,beta2,eps,batch_size,adaptive_lr)
  raise ValueError("Unknown optimizer: " + repr(optimizer))

In [None]:
# Train with the momentum optimizer; `a` receives the learned weight matrices
# and `b` the bias vectors returned by trainModel.
a, b = trainModel(
    optimizer="momentumGD",
    x_train=x_flatten_train,
    y_train=y_encoded,
    nodesPerLayer=nodesPerLayer,
)

In [62]:
def testModel(weights,bias,x_test,y_test,activation="tanh"):
  """Print the classification accuracy of trained parameters on a test set.

  Args:
    weights: list of weight matrices, output layer last.
    bias: list of bias column vectors aligned with ``weights``.
    x_test: test inputs, indexed by sample along the first axis.
    y_test: integer class label of every test sample.
    activation: hidden-layer activation used for the forward pass.  It must
      match the activation the parameters were trained with; the optimizers
      in this file train with either "sigmoid" or "tanh".  Defaults to "tanh",
      the value that used to be hard-coded here.

  Returns:
    None; the accuracy is printed as a percentage.
  """
  count = 0
  for i in range(0,x_test.shape[0]):
    _, _, probabilities = forwardProp(x_test[i],activation,"crossEntropy",weights,bias)
    # The predicted class is the index of the largest output probability.
    if( y_test[i] == np.argmax(probabilities)):
      count+=1
  print("Accuracy :" + str((count/y_test.shape[0])*100) + "%")

In [63]:
# Prepare the test images the same way as the training images: flatten each
# one into a column vector, (N, rows, cols) -> (N, rows*cols, 1), then
# normalize.
x_flatten_test = normalize_data(x_test.reshape(x_test.shape[0], -1, 1))

In [None]:
# Report test-set accuracy for the parameters trained above.
# NOTE(review): `a`/`b` come from the "momentumGD" run, which trains with
# sigmoid activations, while testModel evaluates with "tanh" - confirm the
# activations match before trusting the reported accuracy.
testModel(weights=a, bias=b, x_test=x_flatten_test, y_test=y_test)