# Assignment *1* - CS 598 - Deep Learning

####  *Backpropagation Neural Network* ####

## Libraries ## 

This notebook relies mainly on NumPy; Keras is used only for loading the dataset.

In [1]:
from keras.datasets import mnist
import numpy as np 
import random

## Dataset with Keras ##

The following code block uses Keras to load the MNIST dataset, reshapes each image matrix into a single row vector, and normalizes the pixel values to the range 0 to 1.

In [2]:
def load_dataset_mnist():
    '''
    Load MNIST through Keras and prepare the images for the network.

    Every image is flattened into a single row (height * width values),
    converted to float32 and divided by 255 so that pixel values fall
    in the range 0 - 1. The label arrays are passed through unchanged.

    returns: (x_train, x_test, y_train, y_test)
    '''
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

    def _flatten_and_scale(images):
        # one row per image: collapse the two image axes into a single axis
        n_samples = images.shape[0]
        n_pixels = images.shape[1] * images.shape[2]
        flat = images.reshape(n_samples, n_pixels).astype('float32')
        # 255 is the maximum pixel value, so this maps the data onto 0 - 1
        return flat / 255

    return (
        _flatten_and_scale(train_images),
        _flatten_and_scale(test_images),
        train_labels,
        test_labels,
    )

In [3]:
# fetch the prepared train/test splits from load_dataset_mnist()
# and print the shape of each array as a sanity check
x_train, x_test, y_train, y_test = load_dataset_mnist()
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(60000, 784)
(10000, 784)
(60000,)
(10000,)


## Neural Network Class ## 

Self-defined class for a neural network that combines a feed-forward pass with backpropagation.

When the class is instantiated, the weights and biases of each layer are initialized with small random values close to 0. Hidden layers use the sigmoid activation function, and the output layer uses softmax.

##### The following sequence is covered:
Weight/Bias random initialization -> feed forward -> calculate the actual output -> calculate the loss function -> loss/error -> derivative of error -> gradient -> backpropagate -> updates weight/bias based on delta rule -> feed forward ... loop

In [4]:
class MyNeuralNetwork():
    '''
    Feed-forward neural network trained with mini-batch gradient descent and
    back-propagation, implemented with NumPy only.

    Hidden layers use the sigmoid activation. The output layer produces logits
    that are converted to class probabilities with softmax and scored with the
    cross-entropy loss. Any number of layers is supported: `sizes` lists the
    neuron count of every layer, input layer first and output layer last.
    '''
    def __init__(self, sizes, learning_rate, batch_size, seed=None):
        '''
        Build the network and randomly initialize its parameters.

        sizes: list with the number of neurons of each layer, e.g. [784, 10]
        learning_rate: step size used when the weights/biases are updated
        batch_size: number of samples per mini-batch during training
        seed: optional integer. When given, the initial weights/biases are drawn
              from a dedicated generator so the initialization is reproducible.
              When None (default), NumPy's global random state is used.
        '''
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # np.random (global state) and RandomState both expose randn()
        rng = np.random if seed is None else np.random.RandomState(seed)

        # initialize the weights/biases with random numbers close to zero;
        # weights[k] has shape (sizes[k], sizes[k+1]), bias[k] has shape (1, sizes[k+1])
        self.weights = [rng.randn(i, j)*0.01 for i, j in zip(self.sizes[:-1], self.sizes[1:])]
        self.bias = [rng.randn(1, j)*0.01 for j in self.sizes[1:]]

    def sigmoid(self, x):
        '''
        Element-wise logistic activation function.
        The returned values lie between 0 and 1.
        '''
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        '''
        Derivative of the sigmoid evaluated at x: s(x) * (1 - s(x))
        '''
        s = self.sigmoid(x)
        return s * (1-s)

    def softmax(self, z):
        '''
        Convert logits into class probabilities along the last axis.
        z: output (logits) of the last layer
        '''
        # subtract the row-wise maximum before exponentiating so that
        # np.exp never receives a large positive value (avoids overflow)
        max_row = np.max(z, axis=-1, keepdims=True)
        exp_shifted = np.exp(z - max_row)
        return exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)

    def cross_entropy_loss(self, y, a):
        '''
        Mean cross-entropy loss over a batch.
        y: expected output, integer class label per sample
        a: actual output, one row of class probabilities per sample
        '''
        m = y.shape[0]
        # probability assigned to the correct class of every sample
        s = a[np.arange(m), y]
        # a probability can underflow to exactly 0; clip so log() stays finite
        s = np.clip(s, 1e-12, 1.0)
        l_sum = np.sum(np.log(s))
        l = -(1./m) * l_sum
        return l

    def cost_derivative(self, y, a):
        '''
        Gradient of the (summed) softmax cross-entropy loss with respect to the
        logits: the probabilities with 1 subtracted at the correct class.
        y: expected output, integer class label per sample
        a: actual output, one row of class probabilities per sample
        returns a new array; `a` itself is left untouched
        '''
        m = y.shape[0]
        # work on a copy so the caller's probabilities are not modified
        grad = a.copy()
        grad[np.arange(m), y] -= 1
        return grad

    def feed_forward(self, x):
        '''
        Get the output (logits of the last layer) based on current weight/bias
        x: a batch of samples, one sample per row
        '''
        a = x
        # every layer except the last one applies the sigmoid activation
        for w, b in zip(self.weights[:-1], self.bias[:-1]):
            z = np.dot(a, w) + b
            a = self.sigmoid(z)
        # the output layer stays linear; softmax is only needed for the loss
        l = np.dot(a, self.weights[-1]) + self.bias[-1]
        return l

    def backward_backpropagate(self, x, y):
        '''
        Run one forward and one backward pass for a batch.
        x, y: training data (samples as rows, integer class labels)
        returns loss, dws, dbs: the mean loss of the batch and the lists of
        weight/bias gradients for each layer, summed over the batch
        '''
        # gradients start at 0 for each layer; the shape varies between layers
        dbs = [np.zeros(b.shape) for b in self.bias]
        dws = [np.zeros(w.shape) for w in self.weights]

        # zs: list of pre-activation outputs of each layer
        # activations: list of the inputs/activations of each layer
        zs = []
        activations = []

        a = x
        activations.append(a)
        # feed forward through the sigmoid layers
        for weight, bias in zip(self.weights[:-1], self.bias[:-1]):
            z = np.dot(a, weight) + bias
            zs.append(z)
            a = self.sigmoid(z)
            activations.append(a)

        # output layer
        logits = np.dot(a, self.weights[-1]) + self.bias[-1]
        zs.append(logits)
        a = self.softmax(logits)
        loss = self.cross_entropy_loss(y, a)
        activations.append(a)

        # back propagate: delta between actual output and expected output
        dl = self.cost_derivative(y, a)
        # gradients of the output layer
        dws[-1] = np.dot(activations[-2].T, dl)
        dbs[-1] = np.sum(dl, axis=0, keepdims=True)

        # back propagate the delta if there are more than 2 layers (input and output)
        for i in range(2, self.num_layers):
            dl = np.dot(dl, self.weights[-i+1].T) * self.sigmoid_derivative(zs[-i])
            dws[-i] = np.dot(activations[-i-1].T, dl)
            dbs[-i] = np.sum(dl, axis=0, keepdims=True)

        return loss, dws, dbs

    def get_accuracy(self, x_test, y_test):
        '''
        Calculate the accuracy for test data: the fraction of samples whose
        highest-scoring class equals the expected label.
        x_test, y_test: samples (one per row) and their integer class labels
        returns 0.0 when no samples are given
        '''
        n = len(x_test)
        if n == 0:
            return 0.0

        # score every sample in one pass instead of looping sample by sample
        output = self.feed_forward(x_test)
        predictions = np.argmax(output, axis=-1)  # get highest possibility

        cnt = np.sum(predictions == np.asarray(y_test))
        acc = cnt / n
        return acc

    def adjust_neural_network(self, lr, sz, dws, dbs):
        '''
        Update the weight/bias based on learning rate
        Use the average since we process by batch
        lr: learning rate
        sz: batch size (number of samples the gradients were summed over)
        dws: [weight gradients]
        dbs: [bias gradients]
        '''
        self.weights = [w - (lr/sz) * dw for w, dw in zip(self.weights, dws)]
        self.bias = [b - (lr/sz) * db for b, db in zip(self.bias, dbs)]

    def train_neural_network(self, x_train, y_train, x_test, y_test, epoches, target_accuracy=0.915):
        '''
        Train the network in mini-batches and print the test accuracy after
        every epoch.
        x_train, y_train: training data
        x_test, y_test: test data
        epoches: maximum number of passes over the training data
        target_accuracy: training stops early once the test accuracy reaches
                         this value (default 0.915, i.e. 91.5%)
        '''
        bs = self.batch_size
        lr = self.learning_rate
        sz = len(x_train)

        for epoch in range(epoches):

            # walk over the training data in consecutive batches of size bs
            for start in range(0, sz, bs):
                x = x_train[start:start + bs]
                y = y_train[start:start + bs]

                # backward propagation by batches
                loss, dws, dbs = self.backward_backpropagate(x, y)
                # adjust weights/biases; the gradients are summed over the batch,
                # so average over the real batch length (the last batch can be
                # smaller than bs when sz is not a multiple of bs)
                self.adjust_neural_network(lr, len(x), dws, dbs)

            # calculate the accuracy for test data
            acc = self.get_accuracy(x_test, y_test)
            print("Epoch: {}, Average Accuracy = {}".format(epoch, acc))

            # break the training once the target accuracy is reached
            if acc >= target_accuracy:
                break

## Execution ##

Initialize a self-defined Neural Network object with the following parameters:
- A list of number of neurons for each layer
- Batch Size
- Learning Rate
- Max Epoches 

In [5]:
# training configuration
layers = [784, 10]   # number of neurons for each layer, input layer first
batch_size = 20
learning_rate = 0.001
max_epoches = 200    # upper bound only: training stops earlier once the expected accuracy is met

# fetch the prepared train/test splits
x_train, x_test, y_train, y_test = load_dataset_mnist()

# build the network from the configuration above and train it
nn = MyNeuralNetwork(sizes=layers, learning_rate=learning_rate, batch_size=batch_size)
nn.train_neural_network(x_train, y_train, x_test, y_test, epoches=max_epoches)

Epoch: 0, Average Accuracy = 0.8324
Epoch: 1, Average Accuracy = 0.8563
Epoch: 2, Average Accuracy = 0.8678
Epoch: 3, Average Accuracy = 0.875
Epoch: 4, Average Accuracy = 0.8792
Epoch: 5, Average Accuracy = 0.8828
Epoch: 6, Average Accuracy = 0.8852
Epoch: 7, Average Accuracy = 0.8884
Epoch: 8, Average Accuracy = 0.8906
Epoch: 9, Average Accuracy = 0.8924
Epoch: 10, Average Accuracy = 0.8938
Epoch: 11, Average Accuracy = 0.8956
Epoch: 12, Average Accuracy = 0.897
Epoch: 13, Average Accuracy = 0.8988
Epoch: 14, Average Accuracy = 0.9002
Epoch: 15, Average Accuracy = 0.901
Epoch: 16, Average Accuracy = 0.9016
Epoch: 17, Average Accuracy = 0.9027
Epoch: 18, Average Accuracy = 0.9038
Epoch: 19, Average Accuracy = 0.9048
Epoch: 20, Average Accuracy = 0.9053
Epoch: 21, Average Accuracy = 0.9063
Epoch: 22, Average Accuracy = 0.9064
Epoch: 23, Average Accuracy = 0.9065
Epoch: 24, Average Accuracy = 0.907
Epoch: 25, Average Accuracy = 0.9068
Epoch: 26, Average Accuracy = 0.9075
Epoch: 27, Aver

## References

- *Building a Neural Network from Scratch: Part 1*: https://jonathanweisberg.org/post/A%20Neural%20Network%20from%20Scratch%20-%20Part%201/

- *Neural Networks: Feedforward and Backpropagation Explained & Optimization*: https://mlfromscratch.com/neural-networks-explained/#/

- *Neural networks and back-propagation explained in a simple way*: https://medium.com/datathings/neural-networks-and-backpropagation-explained-in-a-simple-way-f540a3611f5e

- *深度学习－－手写数字识别*: https://blog.csdn.net/akadiao/article/details/78175737