In [1]:
%matplotlib inline

import warnings
import gzip, pickle
import numpy as np
import matplotlib.pyplot as plt


# NOTE(review): with no `category`/`message`/`module` filter this installs a
# blanket "ignore" rule, so every warning is hidden -- including NumPy
# RuntimeWarnings (overflow, divide-by-zero, invalid value) that would
# otherwise flag numerical problems while training. Consider narrowing it,
# e.g. by passing `category=DeprecationWarning`.
warnings.filterwarnings(action='ignore')  


In [2]:
DATA_PATH = 'Data/mnist.pkl.gz'

# The archive holds three (features, labels) pairs: train, validation, test.
# NOTE: unpickling can execute arbitrary code -- only load a trusted copy of this file.
with gzip.open(DATA_PATH, 'rb') as f:
    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = pickle.load(f, encoding='latin')

# Print the shape of every array, one line per array, in a fixed order.
for _label, _array in (
    ('Training data shape:    ', X_train),
    ('Training labels shape:  ', y_train),
    ('Validation data shape:  ', X_valid),
    ('Validation labels shape:', y_valid),
    ('Test data shape:        ', X_test),
    ('Test labels shape:      ', y_test),
):
    print(_label, _array.shape)

Training data shape:     (50000, 784)
Training labels shape:   (50000,)
Validation data shape:   (10000, 784)
Validation labels shape: (10000,)
Test data shape:         (10000, 784)
Test labels shape:       (10000,)


In [3]:
def predict(W, b, X):
    """Return, for every row of ``X``, the index of its highest-scoring class.

    The class scores are the affine map ``X @ W + b``; the arg-max is taken
    along axis 1 (one prediction per row).
    """
    class_scores = np.add(X @ W, b)
    return class_scores.argmax(axis=1)

def accuracy(y_pred, y_true):
    """Return the percentage (0-100) of predictions that equal the true labels.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and reference labels. Both are converted with
        ``np.asarray`` so plain Python lists are compared element-wise
        (``list == list`` would otherwise collapse to a single bool).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. Without this check
        shapes such as ``(n,)`` and ``(n, 1)`` would silently broadcast to an
        ``(n, n)`` comparison and produce a meaningless score.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            "y_pred and y_true must have the same shape, got {} and {}".format(
                y_pred.shape, y_true.shape))
    return 100. * np.mean(y_pred == y_true)

In [6]:
# Leaky ReLU activation function
def leaky_relu(x, alpha=0.01):
    """Apply Leaky ReLU element-wise: ``x`` where ``x > 0``, ``alpha * x`` elsewhere.

    Parameters
    ----------
    x : array-like
        Pre-activation values (lists and scalars are accepted).
    alpha : float, default 0.01
        Slope applied to non-positive inputs.

    Returns
    -------
    Array of the same shape as ``x`` (a NumPy scalar for scalar input).
    """
    x = np.asarray(x)
    # Select on the sign of x instead of taking max(alpha*x, x): the max form
    # is only correct for alpha <= 1 (for alpha > 1 it would scale the positive
    # side and leave the negative side unscaled).
    activated = np.where(x > 0, x, alpha * x)
    # `[()]` turns a 0-d result back into a scalar and is a no-op for arrays.
    return activated[()]

# Derivative of Leaky ReLU activation function
def leaky_relu_derivative(x, alpha=0.01):
    """Element-wise derivative of Leaky ReLU: ``alpha`` where ``x < 0``, else ``1``.

    Parameters
    ----------
    x : array-like
        Pre-activation values at which the derivative is evaluated.
    alpha : float, default 0.01
        Slope of the negative part.

    Returns
    -------
    numpy.ndarray
        Same shape as ``x``. Floating inputs keep their dtype; any other
        dtype yields ``float64`` so that ``alpha`` is not truncated (writing
        0.01 into an integer array would store 0).
    """
    x = np.asarray(x)
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    return np.where(x < 0, alpha, 1.0).astype(out_dtype, copy=False)

def softmax(x):
    """Softmax along axis 1 (each row is turned into a probability vector).

    The row maximum is subtracted before exponentiating. This does not change
    the result mathematically, but it keeps ``np.exp`` from overflowing on
    large scores -- the usual way to make softmax numerically stable when it
    is used as the output layer of a multi-class classifier.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    normaliser = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / normaliser

def softmax_loss_2(scores, y, mode='train'):
    """Mean softmax cross-entropy loss and, in training mode, its gradient.

    Parameters
    ----------
    scores : numpy.ndarray, shape (m, num_classes)
        Un-normalised class scores, one row per sample.
    y : integer array-like, length m
        Index of the correct class for every row of ``scores``.
    mode : str, default 'train'
        With ``'train'`` the gradient is returned as well; any other value
        returns the loss only.

    Returns
    -------
    loss : float
        Average negative log-probability of the correct classes.
    dscores : numpy.ndarray, shape (m, num_classes)
        Gradient of ``loss`` with respect to ``scores`` (``'train'`` mode only).
    """
    m = scores.shape[0]  # number of samples
    rows = np.arange(m)

    # Log-softmax computed directly (log-sum-exp with the row maximum removed).
    # Taking log(softmax(scores)) instead yields log(0) = -inf whenever the
    # correct-class probability underflows, which makes the loss infinite.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    loss = -np.sum(log_probs[rows, y]) / m

    if mode != 'train':
        return loss

    # backward: d(loss)/d(scores) = (softmax - one_hot(y)) / m
    dscores = np.exp(log_probs)
    dscores[rows, y] -= 1.0
    dscores /= m

    return loss, dscores

def softmax_loss_3(scores, y, mode='train'):
    """Mean softmax cross-entropy loss and, in training mode, its gradient.

    NOTE(review): this has the same signature and contract as
    ``softmax_loss_2``; the two could be consolidated into a single function.

    Parameters
    ----------
    scores : numpy.ndarray, shape (m, num_classes)
        Un-normalised class scores, one row per sample.
    y : integer array-like, length m
        Index of the correct class for every row of ``scores``.
    mode : str, default 'train'
        With ``'train'`` the gradient is returned as well; any other value
        returns the loss only.

    Returns
    -------
    loss : float
        Average negative log-probability of the correct classes.
    dscores : numpy.ndarray, shape (m, num_classes)
        Gradient of ``loss`` with respect to ``scores`` (``'train'`` mode only).
    """
    m = scores.shape[0]  # number of samples
    sample_idx = np.arange(m)

    # Work in log space (log-sum-exp with the row maximum removed) so that an
    # underflowing correct-class probability cannot turn into log(0) = -inf.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_probs = shifted - log_norm
    loss = -np.sum(log_probs[sample_idx, y]) / m

    if mode != 'train':
        return loss

    # backward: d(loss)/d(scores) = (softmax - one_hot(y)) / m
    dscores = np.exp(log_probs)
    dscores[sample_idx, y] -= 1.0
    dscores /= m

    return loss, dscores

class ThreeLayerNeuralNetwork:
    """Fully connected classifier: two Leaky-ReLU hidden layers and a linear output.

    Data flow: ``X -> affine(W1, b1) -> leaky_relu -> affine(W2, b2)
    -> leaky_relu -> affine(W3, b3) -> scores``. All parameters are stored in
    ``self.params`` under the keys ``'W1', 'b1', 'W2', 'b2', 'W3', 'b3'``.
    """

    def __init__(self, num_features=784, num_hiddens=20, num_classes=10,
                 weight_scale=0.001, seed=None):
        """Create the network with small random weights and zero biases.

        Parameters
        ----------
        num_features : int
            Size of one input vector.
        num_hiddens : int
            Width of each of the two hidden layers.
        num_classes : int
            Number of output scores.
        weight_scale : float, default 0.001
            Factor applied to the standard-normal draws used for the weights.
        seed : int or None, default None
            If given, weights and mini-batch sampling use a private
            ``np.random.RandomState(seed)`` so runs are repeatable. With
            ``None`` NumPy's global random state is used.
        """
        self.num_hiddens = num_hiddens
        self.num_classes = num_classes
        self._rng = None if seed is None else np.random.RandomState(seed)
        rng = self._random_source()

        # random initialization: create random weights, set all biases to zero
        # (draw order W1, W2, W3 is kept fixed so seeded runs are stable)
        self.params = {}
        self.params['W1'] = rng.randn(num_features, num_hiddens) * weight_scale
        self.params['W2'] = rng.randn(num_hiddens, num_hiddens) * weight_scale
        self.params['W3'] = rng.randn(num_hiddens, num_classes) * weight_scale
        self.params['b1'] = np.zeros((num_hiddens,))
        self.params['b2'] = np.zeros((num_hiddens,))
        self.params['b3'] = np.zeros((num_classes,))

    def _random_source(self):
        """Return the private RNG when a seed was given, else the ``np.random`` module."""
        private_rng = getattr(self, '_rng', None)
        return np.random if private_rng is None else private_rng

    def _forward_pass(self, X):
        """Run the network and return ``(z_1, a_1, z_2, a_2, scores)``.

        ``z_k`` is the input to hidden layer k and ``a_k`` its Leaky-ReLU
        output; the intermediates are what back-propagation needs.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']

        z_1 = X @ W1 + b1          # hidden layer 1 input
        a_1 = leaky_relu(z_1)      # hidden layer 1 output
        z_2 = a_1 @ W2 + b2        # hidden layer 2 input
        a_2 = leaky_relu(z_2)      # hidden layer 2 output
        scores = a_2 @ W3 + b3     # neural net output
        return z_1, a_1, z_2, a_2, scores

    def forward(self, X):
        """Return the class scores for every row of ``X``."""
        return self._forward_pass(X)[-1]

    def train_step(self, X, y):
        """Compute the data loss and parameter gradients for one mini-batch.

        No parameter is modified and no regularisation term is included.

        Returns
        -------
        loss : float
            Softmax cross-entropy loss of the batch.
        gradient : dict
            Gradient of ``loss`` for each key of ``self.params``.
        """
        W2, W3 = self.params['W2'], self.params['W3']

        # forward step
        z_1, a_1, z_2, a_2, scores = self._forward_pass(X)

        # compute loss
        loss, dscores = softmax_loss_3(scores, y)

        # backward step: output layer
        db3 = dscores.sum(axis=0)
        dW3 = a_2.T @ dscores

        # second hidden layer (chain rule through Leaky ReLU)
        dz_2 = (dscores @ W3.T) * leaky_relu_derivative(z_2)
        db2 = dz_2.sum(axis=0)
        dW2 = a_1.T @ dz_2

        # first hidden layer
        dz_1 = (dz_2 @ W2.T) * leaky_relu_derivative(z_1)
        db1 = dz_1.sum(axis=0)
        dW1 = X.T @ dz_1

        gradient = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3}

        return loss, gradient

    def train(self, X_train, y_train, X_valid, y_valid, batch_size=50,
              alpha=0.001, lmbda=0.0001, num_epochs=20):
        """Train with mini-batch gradient descent and L2 weight regularisation.

        Every epoch performs ``m // batch_size`` updates, each on a mini-batch
        drawn at random (without replacement inside the batch) from the
        training set. After each epoch one line is printed with the mean
        regularised training loss and the (un-regularised) validation loss.

        Parameters
        ----------
        X_train, y_train : numpy.ndarray
            Training inputs (one row per sample) and integer labels.
        X_valid, y_valid : numpy.ndarray
            Data used only for the per-epoch validation loss.
        batch_size : int
            Mini-batch size; values larger than the training set are clamped
            to the training-set size.
        alpha : float
            Learning rate.
        lmbda : float
            L2 regularisation strength, applied to the weights only.
        num_epochs : int
            Number of epochs.

        Returns
        -------
        list
            Regularised loss of every mini-batch, in training order.

        Raises
        ------
        ValueError
            If the training set is empty or ``batch_size`` is smaller than 1.
        """
        m = X_train.shape[0]
        if m == 0:
            raise ValueError("X_train must contain at least one sample")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        # A batch larger than the data set would give zero batches per epoch
        # and a division by zero when averaging the epoch loss.
        batch_size = min(batch_size, m)
        num_batches = m // batch_size

        report = "{:3d}: training loss = {:.2f} | validation loss = {:.2f}"
        rng = self._random_source()
        weight_keys = ('W1', 'W2', 'W3')

        losses = []
        for epoch in range(num_epochs):
            train_loss = 0.0

            for _ in range(num_batches):
                # select a random mini-batch
                batch_idx = rng.choice(m, batch_size, replace=False)
                X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]

                # train on mini-batch
                data_loss, gradient = self.train_step(X_batch, y_batch)
                reg_loss = 0.5 * sum(np.sum(self.params[k] ** 2) for k in weight_keys)
                batch_loss = data_loss + lmbda * reg_loss
                train_loss += batch_loss
                losses.append(batch_loss)

                # regularization (weights only, biases are not penalised)
                for k in weight_keys:
                    gradient[k] += lmbda * self.params[k]

                # update parameters
                for p in self.params:
                    self.params[p] = self.params[p] - alpha * gradient[p]

            # report training loss and validation loss
            train_loss /= num_batches
            valid_loss = softmax_loss_3(self.forward(X_valid), y_valid, mode='test')
            print(report.format(epoch + 1, train_loss, valid_loss))

        return losses

    def predict(self, X):
        """ Predict labels for input data.
        """
        scores = self.forward(X)
        return np.argmax(scores, axis=1)

    def predict_proba(self, X):
        """ Predict probabilities of classes for each input data.
        """
        scores = self.forward(X)
        return softmax(scores)

# Seed NumPy's global RNG before building and training the model: both the
# weight initialisation and the mini-batch sampling draw from np.random, so
# without a seed every Restart & Run All produces different losses/accuracy.
SEED = 42
np.random.seed(SEED)

# 40 hidden units per layer, learning rate 0.08, L2 strength 0.001, 30 epochs.
mlp3 = ThreeLayerNeuralNetwork(num_hiddens=40)
losses = mlp3.train(X_train, y_train, X_valid, y_valid,
                    alpha=0.08, lmbda=0.001, num_epochs=30)

  1: training loss = 2.30 | validation loss = 2.30
  2: training loss = 2.30 | validation loss = 2.30
  3: training loss = 2.30 | validation loss = 2.30
  4: training loss = 2.30 | validation loss = 2.30
  5: training loss = 2.30 | validation loss = 2.30
  6: training loss = 2.30 | validation loss = 2.30
  7: training loss = 2.11 | validation loss = 1.52
  8: training loss = 1.23 | validation loss = 0.80
  9: training loss = 0.58 | validation loss = 0.38
 10: training loss = 0.35 | validation loss = 0.25
 11: training loss = 0.27 | validation loss = 0.20
 12: training loss = 0.25 | validation loss = 0.18
 13: training loss = 0.23 | validation loss = 0.17
 14: training loss = 0.21 | validation loss = 0.16
 15: training loss = 0.20 | validation loss = 0.15
 16: training loss = 0.19 | validation loss = 0.14
 17: training loss = 0.19 | validation loss = 0.14
 18: training loss = 0.18 | validation loss = 0.13
 19: training loss = 0.17 | validation loss = 0.13
 20: training loss = 0.17 | val

In [7]:
# Accuracy on the data the network was trained on.
train_predictions = mlp3.predict(X_train)
train_acc = accuracy(train_predictions, y_train)
print("Train accuracy   = {:.2f}%".format(train_acc))

# Accuracy on the held-out test split.
test_predictions = mlp3.predict(X_test)
test_acc = accuracy(test_predictions, y_test)
print("Test accuracy = {:.2f}%".format(test_acc))

Train accuracy   = 98.16%
Test accuracy = 97.02%
