In [389]:
import numpy as np
import pandas as pd

In [390]:
dataset = pd.read_csv('fashion-mnist_train.csv')

In [391]:
unique_values = dataset['label'].unique()
print(unique_values)

[2 9 6 0 3 4 5 8 7 1]


In [392]:
X = dataset.iloc[:,1:785].values
Y = dataset.iloc[:,0].values

In [393]:
Y.shape

(60000,)

In [394]:
y = Y

In [395]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [396]:
print(y)

[2 9 6 ... 8 8 7]


In [397]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [398]:
X[:,0:784] = sc.fit_transform(X[:,0:784])

In [399]:
# Dense layer
class Layer_Dense:

    # Layer initialization
    def __init__(self, n_inputs, n_neurons):
        # Initialize weights and biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

In [400]:
# ReLU activation
class Activation_ReLU:

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs
        # Calculate output values from inputs
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()

        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

In [401]:
class Activation_Sigmoid:

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs

        # Apply sigmoid function
        self.output = 1 / (1 + np.exp(-inputs))

    # Backward pass
    def backward(self, dvalues):
        # Calculate gradient
        self.dinputs = dvalues * self.output * (1 - self.output)

In [402]:
# Adam optimizer
class Optimizer_Adam:

    # Initialize optimizer - set settings
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7,
                 beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update momentum  with current gradients
        layer.weight_momentums = self.beta_1 * \
                                 layer.weight_momentums + \
                                 (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * \
                               layer.bias_momentums + \
                               (1 - self.beta_1) * layer.dbiases
        # Get corrected momentum
        # self.iteration is 0 at first pass
        weight_momentums_corrected = layer.weight_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        # Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            (1 - self.beta_2) * layer.dweights**2

        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            (1 - self.beta_2) * layer.dbiases**2
        # Get corrected cache
        weight_cache_corrected = layer.weight_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))

        # Vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                         weight_momentums_corrected / \
                         (np.sqrt(weight_cache_corrected) +
                             self.epsilon)
        layer.biases += -self.current_learning_rate * \
                         bias_momentums_corrected / \
                         (np.sqrt(bias_cache_corrected) +
                             self.epsilon)

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


In [403]:
class Loss:

    # Calculates the data and regularization losses
    def calculate(self, output, y):

        # Calculate sample losses
        sample_losses = self.forward(output, y)

        # Calculate mean loss
        data_loss = np.mean(sample_losses)

        # Return loss
        return data_loss

In [404]:
# Cross-entropy loss
class Loss_SparseCategoricalCrossentropy(Loss):

    # Forward pass
    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = len(y_pred)
        # Clip data to prevent division by 0
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values -
        # only if categorical labels
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]

        # Mask values - only for one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    # Backward pass
    def backward(self, dvalues, y_true):

        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vector
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate gradient
        self.dinputs = -y_true / dvalues
        # Normalize gradient
        self.dinputs = self.dinputs / samples



In [405]:
# Softmax classifier - combined Softmax activation and cross-entropy loss for faster backward step
class Activation_Sigmoid_Loss_SparseCategoricalCrossentropy():

    # Creates activation and loss function objects
    def __init__(self):
        self.activation = Activation_Sigmoid()
        self.loss = Loss_SparseCategoricalCrossentropy()

    # Forward pass
    def forward(self, inputs, y_true):
        # Output layer's activation function
        self.activation.forward(inputs)
        # Set the output
        self.output = self.activation.output
        # Calculate and return loss value
        return self.loss.calculate(self.output, y_true)

    # Backward pass
    def backward(self, dvalues, y_true):

        # Number of samples
        samples = len(dvalues)

        # If labels are one-hot encoded, turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        self.dinputs = dvalues.copy()
        # Calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        # Normalize gradient
        self.dinputs = self.dinputs / samples

In [406]:
# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(784, 64)

# Create ReLU activation
activation1 = Activation_ReLU()
activation2 = Activation_ReLU()

# Create second Dense layer with 64 input features and 128 outputs
dense2 = Layer_Dense(64, 128)

# Create third Dense layer with 128 input features and 10 outputs
dense3 = Layer_Dense(128, 10)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Sigmoid_Loss_SparseCategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

In [407]:
for epoch in range(101):

    # Perform a forward pass of the training data through this layer
    dense1.forward(X)

    # Perform a forward pass through activation function 1 and this takes the output of first dense layer here
    activation1.forward(dense1.output)

    # Perform a forward pass through second Dense layer and this takes outputs of first activation layer here
    dense2.forward(activation1.output)

    # Perform a forward pass through activation function 2 and this takes the output of second dense layer here
    activation2.forward(dense2.output)
    
    # Perform a forward pass through third Dense layer and this takes outputs of second activation layer here
    dense3.forward(activation2.output)
    
    # Perform a forward pass through the activation-loss function and this takes the output of third dense layer here and returns loss
    loss = loss_activation.forward(dense3.output, y)

    # Calculate accuracy from output of third layer and targets
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 10:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense3.backward(loss_activation.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.post_update_params()

print('\n')
print(f'Hurray! I got an accuracy of {accuracy*100:.3f}%')   

epoch: 0, acc: 0.086, loss: 0.693, lr: 0.05
epoch: 10, acc: 0.277, loss: 1.385, lr: 0.0499997750010125
epoch: 20, acc: 0.499, loss: 1.289, lr: 0.04999952500451246
epoch: 30, acc: 0.645, loss: 0.954, lr: 0.04999927501051235
epoch: 40, acc: 0.736, loss: 0.730, lr: 0.04999902501901213
epoch: 50, acc: 0.765, loss: 0.645, lr: 0.049998775030011766
epoch: 60, acc: 0.785, loss: 0.566, lr: 0.049998525043511224
epoch: 70, acc: 0.799, loss: 0.535, lr: 0.049998275059510454
epoch: 80, acc: 0.810, loss: 0.506, lr: 0.04999802507800943
epoch: 90, acc: 0.824, loss: 0.479, lr: 0.0499977750990081
epoch: 100, acc: 0.840, loss: 0.450, lr: 0.04999752512250644


Hurray! I got an accuracy of 84.020%
