<a href="https://colab.research.google.com/github/HaiwenGuan/MLP-MNIST/blob/main/MLP_FMNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import time
import tensorflow as tf
from keras.datasets import fashion_mnist

np.random.seed(4208)
tf.random.set_seed(4208)

tf.executing_eagerly()
tf.__version__

tf.config.list_physical_devices('GPU')

[]

In [11]:
size_input = 784
size_hidden_1 = 256
size_hidden_2 = 128
size_output = 10

In [3]:
(X_train, y_train),(X_test, y_test) = fashion_mnist.load_data()
X_train = tf.cast(tf.reshape(X_train, (-1, 784)), dtype=tf.float32)
X_test = tf.cast(tf.reshape(X_test, (-1, 784)), dtype=tf.float32)
X_train, X_test = X_train / 255.0, X_test / 255.0
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(100)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(20)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [4]:
tf.shape(y_train)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([60000], dtype=int32)>

In [21]:
class MLP(object):
  def __init__(self, size_input, size_hidden_1, size_hidden_2, size_output, device=None):
    """
    size_input: int, size of input layer
    size_hidden: int, size of hidden layer
    size_output: int, size of output layer
    device: etiher 'cpu' or 'gpu' or None. If None, decided automatically during eager.
    """
    self.size_input, self.size_hidden_1, self.size_hidden_2, self.size_output, self.device =\
    size_input, size_hidden_1, size_hidden_2, size_output, device

    #Initialize weights between input layer and hidden layer
    self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden_1]))
    self.b1 = tf.Variable(tf.random.normal([1,self.size_hidden_1]))
    self.W2 = tf.Variable(tf.random.normal([self.size_hidden_1, self.size_hidden_2]))
    self.b2 = tf.Variable(tf.random.normal([1,self.size_hidden_2]))
    self.Wop = tf.Variable(tf.random.normal([self.size_hidden_2, self.size_output]))
    self.bop = tf.Variable(tf.random.normal([1,self.size_output]))

    self.variables = [self.W1, self.W2, self.b1, self.b2, self.Wop, self.bop]  # define variables(parameters) that will be updated 

  def forward(self, X): # (X is the input matrix)
    """
    forward pass
    X: Tensor, inputs
    """
    if self.device is not None:
      with tf.device('gpu:0' if self.device=='gpu' else 'cpu'):
        self.y = self.compute_output(X)
    else:
      self.y = self.compute_output(X)
      
    return self.y
    
  def loss(self, y_pred, y_true):
    '''
    y_pred - Tensor of shape(batch_size, size_output)
    y_true - Tensor of shape(batch_size, size_output)
    '''
    #y_true_tf = tf.cast(tf.reshape(y_true, (-1, self.size_output)), dtype=tf.float32)
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return scce(y_true, y_pred)

  def loss_l1(self, y_pred, y_true):
    '''
    y_pred - Tensor of shape(batch_size, size_output)
    y_true - Tensor of shape(batch_size, size_output)
    '''
    #y_true_tf = tf.cast(tf.reshape(y_true, (-1, self.size_output)), dtype=tf.float32)
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    penalty = (tf.reduce_sum(tf.math.abs(self.W1)).numpy() + tf.reduce_sum(tf.math.abs(self.W2)).numpy()\
               + tf.reduce_sum(tf.math.abs(self.Wop).numpy())) \
               / ( tf.size(self.W1).numpy() + tf.size(self.W2).numpy() + tf.size(self.Wop).numpy() )
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return scce(y_true, y_pred) + penalty

  def loss_l2(self, y_pred, y_true):
    '''
    y_pred - Tensor of shape(batch_size, size_output)
    y_true - Tensor of shape(batch_size, size_output)
    '''
    #y_true_tf = tf.cast(tf.reshape(y_true, (-1, self.size_output)), dtype=tf.float32)
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    penalty = (tf.reduce_sum(tf.math.square(self.W1)).numpy() + tf.reduce_sum(tf.math.square(self.W2)).numpy() \
               + tf.reduce_sum(tf.math.square(self.Wop)).numpy())\
               / ( tf.size(self.W1).numpy() + tf.size(self.W2).numpy() + tf.size(self.Wop).numpy() )
    #penalty = self.l2 * (tf.reduce_sum(tf.math.square(self.W1)).numpy() + tf.reduce_sum(tf.math.square(self.W2)).numpy() + tf.reduce_sum(tf.math.square(self.Wop).numpy()))
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return scce(y_true, y_pred) + penalty

  def backward(self, X_train, y_train):
    '''
    backward pass
    '''
    optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    optimizer.apply_gradients( zip( grads, self.variables))

  def backward_1(self, X_train, y_train):
    '''
    backward pass
    '''
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss_l1(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    optimizer.apply_gradients( zip( grads, self.variables))

  def backward_2(self, X_train, y_train):
    '''
    backward pass
    '''
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss_l2(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    optimizer.apply_gradients( zip( grads, self.variables))
    
  def compute_output(self, X):
#    X_tf = tf.cast(X, dytpe=tf.float32)
    what1 = tf.matmul(X, self.W1) + self.b1
    hhat1 = tf.keras.activations.relu(what1)
    what2 = tf.matmul(hhat1, self.W2) + self.b2
    hhat2 = tf.keras.activations.relu(what2)
#    output = tf.nn.softmax(tf.matmul(hhat2, self.Wop) + self.bop)
    output = tf.matmul(hhat2, self.Wop) + self.bop

    return output


In [7]:
NUM_EPOCHS = 10

In [24]:
# Initialize model using CPU
mlp_on_gpu = MLP(size_input, size_hidden_1, size_hidden_2, size_output, device='gpu')
m = tf.keras.metrics.SparseCategoricalAccuracy()
time_start = time.time()
for epoch in range(NUM_EPOCHS):
  loss_total = tf.zeros([1,1],dtype=tf.float32)
  lt = 0
  accuracy = []
  train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(100, seed=epoch*(4208)).batch(100)
  for inputs, outputs in train_ds:
    preds = mlp_on_gpu.forward(inputs)
    loss_total = loss_total + mlp_on_gpu.loss_1(preds, outputs)
    #lt = lt + mlp_on_gpu.loss(outputs, preds)
    m.update_state(outputs,preds)
    accuracy.append(m.result().numpy())
    mlp_on_gpu.backward_1(inputs, outputs)
  #print('Number of Epoch = {} - Average Loss:= {}'.format(epoch + 1, np.sum(loss_total) / 600))
  m.reset_state()
  print('Number of Epoch = {} - Accuracy:= {} - Average Loss:= {}'.format(epoch + 1, np.sum(accuracy)/len(accuracy), np.sum(loss_total)/X_train.shape[0]))
time_taken = time.time() - time_start
print('\nTotal time taken (in seconds): {:.2f}'.format(time_taken))

Number of Epoch = 1 - Accuracy:= 0.5991901652018229 - Average Loss:= 1.455801953125
Number of Epoch = 2 - Accuracy:= 0.7549476623535156 - Average Loss:= 0.6576636067708334
Number of Epoch = 3 - Accuracy:= 0.7735118611653646 - Average Loss:= 0.490291796875
Number of Epoch = 4 - Accuracy:= 0.7838095092773437 - Average Loss:= 0.3973400065104167
Number of Epoch = 5 - Accuracy:= 0.7853821309407553 - Average Loss:= 0.33701832682291666
Number of Epoch = 6 - Accuracy:= 0.7925965881347656 - Average Loss:= 0.29116360677083336
Number of Epoch = 7 - Accuracy:= 0.7984589640299479 - Average Loss:= 0.2563765625
Number of Epoch = 8 - Accuracy:= 0.7991684977213541 - Average Loss:= 0.22992298177083334
Number of Epoch = 9 - Accuracy:= 0.8007133992513021 - Average Loss:= 0.206908984375
Number of Epoch = 10 - Accuracy:= 0.8031553141276042 - Average Loss:= 0.18860901692708334

Total time taken (in seconds): 180.34


In [25]:
test_loss_total = tf.Variable(0,dtype=tf.float32)
accuracy = []
m = tf.keras.metrics.SparseCategoricalAccuracy()
for inputs, outputs in test_ds:
  preds = mlp_on_gpu.forward(inputs)
  test_loss_total = test_loss_total + mlp_on_gpu.loss(preds, outputs)
  m.update_state(outputs,preds)
  accuracy.append(m.result().numpy())
#print('Test MSE: {:.4f}'.format(np.sum(test_loss_total.numpy()) / X_train.shape[0]))
print('Accuracy: {:.4f} - Loss: {:.4f}'.format(np.sum(accuracy) / len(accuracy), np.sum(test_loss_total.numpy()) / X_train.shape[0]))

Accuracy: 0.7848 - Loss: 0.1832
