In [1]:
import pandas as pd
import numpy as np
import math as m

Importing the data. We see that we have to classify 10 images. Hence it is a multiclass classification problem

In [3]:
data = pd.read_csv('fashion-mnist_train.csv')
data.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Converting the dataframe into a numpy array and getting the number of rows and columns for the matrix

In [4]:
data = np.array(data)
m,n = data.shape
np.random.shuffle(data)

Dividing data into development and training data. Dividing data into input and labels too

In [5]:
data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev/255

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train/255
a,b = X_train.shape

We are creating a neural network with one input layer of 784 neurons , one hidden layer of 10 neurons and the output layer has 10 neurons.

Creating a function that initialises the parameters w1,b1,w2,b2

In [6]:
def init_params():
    
    w1 = np.random.uniform(-0.5,0.5,size = (10,784))
    b1 = np.random.uniform(-0.5,0.5, size = (10,1))
    w2 = np.random.uniform(-0.5,0.5,size = (10,10))
    b2 = np.random.uniform(-0.5,0.5, size = (10,1))
    return w1,b1,w2,b2

In [7]:
w1,b1,w2,b2 = init_params()

Definng the activatin functions ReLU and Softmax. I used ReLU and Softmax as ReLU is the most generally used activation function for hidden layers, and softmax is best used for multiclassification problems like this one

In [8]:
def ReLU(Z):
    return np.maximum(0,Z)

In [9]:
def softmax(Z):
    e = np.exp(Z)
    return e/np.sum(e,axis = 0)

Defining the forward propagation function. Z1 is the output of input layer, Z2 is the output of the hidden layer and A2 is the expected output

In [10]:
def forward_prop(w1,b1,w2,b2,X):
    Z1 = w1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = w2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1,A1,Z2,A2

In [11]:
Z1,A1,Z2,A2 = forward_prop(w1,b1,w2,b2,X_train)

We defined a onehotencoder function as the labels are numbers ranging from 0-9. We need to convert each number into an array of zeros and ones to match the actual output

In [12]:
def onehotencoder(Y):
    one_hot_Y = np.zeros((Y.size,Y.max() + 1))
    one_hot_Y[np.arange(Y.size),Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

In [13]:
def deriv_ReLU(Z):
    return Z >0

For the backpropagation function, the Loss function is defined as (y_predicted - y-train) for simple differentiation
L2 regularisation is used as the model is simple enough and the aim is to reduce the value of weights to solve the problem of vanishing gradients

In [14]:
def backprop(Z1,A1,Z2,A2,X,Y,w2,reg_par):
    m = Y.size
    one_hot_Y = onehotencoder(Y)
    l2_reg_exp = (1/(2*m))*reg_par*(np.sum(np.square(w2)) + np.sum(np.square(w1)))
    dZ2 = (A2 - one_hot_Y) + l2_reg_exp
    loss = np.sum((A2-one_hot_Y)) + l2_reg_exp
    dw2 = 1/m*(dZ2.dot(A1.T)) + 1/m*reg_par*np.sum(w2)
    
    db2 = 1/m*(np.sum(dZ2)) 
    
    dZ1 = w2.T.dot(dZ2)*deriv_ReLU(Z1) 
    
    dw1 = 1/m*(dZ2.dot(X.T))+ 1/m*reg_par*np.sum(w1)
    
    db1 = 1/m*(np.sum(dZ1))
    return dw1,dw2,db1,db2,loss

In [15]:
dw1,dw2,db1,db2,loss = backprop(Z1,A1,Z2,A2,X_train,Y_train,w2,0.01)

Updating the parameters with the derivatives and alpha

In [16]:
def update_params(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha):
    w1 = w1 - alpha*dw1
    b1 = b1 - alpha*db1
    w2 = w2 - alpha*dw2
    b2 = b2 - alpha*db2
    return w1,b1,w2,b2

In [17]:
def get_predictions(A2):
    return np.argmax(A2,0)

In [18]:
def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y)/Y.size

In [19]:
from sklearn.metrics import accuracy_score

In [20]:
def gradient_descent(X,Y,iterations,alpha):
    w1,b1,w2,b2 = init_params()
    reg_par = 0.01
    for i in range(iterations):
        Z1,A1,Z2,A2 = forward_prop(w1,b1,w2,b2,X)
        dw1,dw2,db1,db2,loss = backprop(Z1,A1,Z2,A2,X,Y,w2,reg_par)
        w1,b1,w2,b2 = update_params(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha)
        arg_A2 = get_predictions(A2)
        # printing the accuracy and loss of model after every 100 iterations
        if i%100 == 0 :
            print('Iteration', i)
            print('Accuracy',accuracy_score(arg_A2, Y))
            print('The loss is:',loss)
    return w1,b1,w2,b2

In [21]:
w1,b1,w2,b2 = gradient_descent(X_train,Y_train,2000,0.1)

Iteration 0
Accuracy 0.0796271186440678
The loss is: 5.6027497985526355e-05
Iteration 100
Accuracy 0.627542372881356
The loss is: 5.606306684400931e-05
Iteration 200
Accuracy 0.681728813559322
The loss is: 5.62351190041673e-05
Iteration 300
Accuracy 0.6810508474576271
The loss is: 5.630942920142334e-05
Iteration 400
Accuracy 0.659728813559322
The loss is: 5.6380753862059365e-05
Iteration 500
Accuracy 0.7187627118644068
The loss is: 5.64378771379986e-05
Iteration 600
Accuracy 0.7379152542372881
The loss is: 5.645268937805839e-05
Iteration 700
Accuracy 0.7737118644067796
The loss is: 5.645698033708245e-05
Iteration 800
Accuracy 0.7842542372881356
The loss is: 5.646564461861436e-05
Iteration 900
Accuracy 0.7932203389830509
The loss is: 5.646671673643861e-05
Iteration 1000
Accuracy 0.8006101694915254
The loss is: 5.6467593535862996e-05
Iteration 1100
Accuracy 0.8062372881355933
The loss is: 5.647095454996228e-05
Iteration 1200
Accuracy 0.8102372881355933
The loss is: 5.6476245010145655e-05

From what we can observe, the model works fine on using the training data.
However, too many iterations are needed to increase the accuracy of the model, so optimisation is recquired.
The model does not get stuck at one local minima on running it multiple times