In [104]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt



In [105]:
data_train = pd.read_csv('/kaggle/input/fashionmnist/fashion-mnist_train.csv')
data_test = pd.read_csv('/kaggle/input/fashionmnist/fashion-mnist_test.csv')

In [106]:
data_train.head(10)

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,0,0,0,5,4,5,5,3,5,...,7,8,7,4,3,7,5,0,0,0
6,4,0,0,0,0,0,0,0,0,0,...,14,0,0,0,0,0,0,0,0,0
7,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4,0,0,0,0,0,0,3,2,0,...,1,0,0,0,0,0,0,0,0,0
9,8,0,0,0,0,0,0,0,0,0,...,203,214,166,0,0,0,0,0,0,0


In [107]:
data_train = np.array(data_train)
data_test = np.array(data_test)
m1, n1 = data_train.shape
m2, n2 = data_test.shape

np.random.shuffle(data_train)

data_test = data_test.T # Done to make each column an example instead of each row 
Y_test = data_test[0]
X_test = data_test[1:]

data_train = data_train.T # Done to make each column an example instead of each row 
Y_train = data_train[0]
X_train = data_train[1:]

X_train = X_train / 255.0
X_test = X_test / 255.0


In [108]:
def init_params():
    hidden_dim = 128 
    W1 = np.random.randn(hidden_dim, 784) * np.sqrt(2.0 / 784)
    b1 = np.zeros((hidden_dim, 1))
    W2 = np.random.randn(10, hidden_dim) * np.sqrt(2.0 / hidden_dim)
    b2 = np.zeros((10, 1))

    return W1, b1, W2, b2

    
def ReLU(Z):
    return np.maximum(0, Z)

def softmax(Z):
    # subtract max per column for numerical stability
    Z_shift = Z - np.max(Z, axis=0, keepdims=True)
    expZ = np.exp(Z_shift)
    return expZ / np.sum(expZ, axis=0, keepdims=True)

def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2


def one_hot(Y): 
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1 
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def derive_ReLU(Z):
    return Z > 0 

def back_prop(Z1, A1, Z2, A2, W2, X, Y):
    m = Y.size
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)  # (10, 1)
    dZ1 = W2.T.dot(dZ2) * derive_ReLU(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)  # (10, 1)
    return dW1, db1, dW2, db2 

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1 
    b1 = b1 - alpha * db1 
    W2 = W2 - alpha * dW2 
    b2 = b2 - alpha * db2 
    return W1, b1, W2, b2 



In [109]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, interations, alpha):
    W1, b1, W2, b2 = init_params()
    for e in range(interations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if e % 50 == 0: 
            print(f"Iteration: {e}")
            print(f"Accuracy: {get_accuracy(get_predictions(A2), Y)}")
    return W1, b1, W2, b2 

In [110]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 500, 0.05)

Iteration: 0
[9 9 9 ... 2 2 2] [1 2 1 ... 5 9 8]
Accuracy: 0.06311666666666667
Iteration: 50
[1 8 1 ... 7 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.70055
Iteration: 100
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.7479666666666667
Iteration: 150
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.775
Iteration: 200
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.792
Iteration: 250
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.80275
Iteration: 300
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.8106833333333333
Iteration: 350
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.8165333333333333
Iteration: 400
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.82185
Iteration: 450
[1 8 1 ... 5 9 8] [1 2 1 ... 5 9 8]
Accuracy: 0.8255


In [103]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 500, 0.05)

# Evaluate on test
_, A1_test, _, A2_test = forward_prop(W1, b1, W2, b2, X_test)
test_preds = get_predictions(A2_test)
test_acc = get_accuracy(test_preds, Y_test)
print("Fashion-MNIST Test Accuracy:", test_acc)


Iteration: 0
[9 6 6 ... 4 5 9] [5 5 5 ... 5 4 9]
Accuracy: 0.0519
Iteration: 50
[7 7 7 ... 9 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.302
Iteration: 100
[7 7 7 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.4182
Iteration: 150
[7 8 7 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.5832
Iteration: 200
[7 7 7 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.6866
Iteration: 250
[5 7 7 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.7169
Iteration: 300
[5 5 7 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.7508
Iteration: 350
[5 5 7 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.7705
Iteration: 400
[5 5 5 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.7845
Iteration: 450
[5 5 5 ... 5 2 9] [5 5 5 ... 5 4 9]
Accuracy: 0.7916
[8 7 6 ... 8 8 7] [2 9 6 ... 8 8 7]
Fashion-MNIST Test Accuracy: 0.7928
