# MLP Algorithm in Scratch

## 1. cost func. = MSE, activation func. = sigmoid

In [12]:
import numpy as np

class Network:
    
    def __init__(self, n_neurons=[784,100,10], learning_rate=3.0, batch_size=10):
        np.random.seed(0)
        self.weights = [np.random.randn(x,y) for x, y in zip(n_neurons[1:], n_neurons[:-1])] 
        self.bias = [np.random.randn(x) for x in n_neurons[1:]]
        self.n_layer = len(n_neurons)
        self.lr = learning_rate
        self.batch_size = batch_size
        
        
    def fit(self, X, y, epochs, x_test=None, y_test=None):
        
        x_batch_list, y_batch_list = self._mini_batch(X, y, self.batch_size)
        
        for epoch in range(epochs):
            
            for i in range(len(x_batch_list)):
                
                dW = [np.zeros(w.shape) for w in self.weights]
                db = [np.zeros(b.shape) for b in self.bias]
            
                for x_i, y_i in zip(x_batch_list[i], y_batch_list[i]):
                    self._feedforward(x_i)

                    mini_dW, mini_db = self._backpropagation(x_i, y_i)
                    dW = [w + nw for w, nw in zip(dW, mini_dW)]
                    db = [b + nb for b, nb in zip(db, mini_db)]

                # update
                self.weights = [w - (self.lr / self.batch_size) * nw for w, nw in zip(self.weights, dW)]
                self.bias = [b - (self.lr / self.batch_size) * nb for b, nb in zip(self.bias, db)]
                
            score = self._evaluate(x_test, y_test)
            print(f"epoch:{epoch} --> {score} / {len(x_test)}")
                
                
    def _evaluate(self, x_test, y_test):
        i = 0
        for x, y in zip(x_test, y_test):
            pred = np.argmax(self._feedforward(x))
            if pred == y:
                i += 1
        return i
    
    def _mini_batch(self, X, y, batch_size):
        n_data = len(X)
        n_batch = int(n_data / batch_size)
        idxSet = np.random.permutation(n_data)
        x_batch_list = []
        y_batch_list = []
        for i in range(n_batch):
            x_batch = X[idxSet[i*batch_size:(i+1)*batch_size], :]
            x_batch_list.append(x_batch)
            y_batch = y[idxSet[i*batch_size:(i+1)*batch_size], :]
            y_batch_list.append(y_batch)
        return x_batch_list, y_batch_list
            
        
    def _feedforward(self, a):
        # return last layer's output
        for w, b in zip(self.weights, self.bias):
            a = np.dot(w, a) + b
        return self._sigmoid(a)
    
    def _backpropagation(self, x, y_i):
        
        mini_dW = [np.zeros(w.shape) for w in self.weights]
        mini_db = [np.zeros(b.shape) for b in self.bias]
        
        # feedforward
        activation = x
        activations = [x]
        zs = []
        for w, b in zip(self.weights, self.bias):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = self._sigmoid(z)
            activations.append(activation)
            
        # backward
        delta = self._cost_derivative(activations[-1], y_i) * self._sigmoid_prime(zs[-1])
        mini_db[-1] = delta
        mini_dW[-1] = np.outer(delta,activations[-2])
        
        for i in range(2, self.n_layer):
            delta = np.multiply(np.dot(self.weights[-i+1].T, delta), self._sigmoid_prime(zs[-i]))
            mini_db[-i] = delta
            mini_dW[-i] = np.outer(delta,activations[-i-1])
    
        return mini_dW, mini_db
        
        
    def _sigmoid(self, x):
        return 1 / (1+np.exp(-x))
    
    def _sigmoid_prime(self, x):
        return self._sigmoid(x) * (1 - self._sigmoid(x))
    
    def _cost_derivative(self, y_pred, y_true):
        return y_pred - y_true

In [13]:
import tensorflow as tf
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train/255.0, x_test/255.0

x_train = x_train.reshape(-1,784)
x_test = x_test.reshape(-1,784)
y_train_onehat = tf.keras.utils.to_categorical(y_train, 10)
y_test_onehat = tf.keras.utils.to_categorical(y_test, 10)

# test
nw = Network()
nw.fit(x_train[:20000], y_train_onehat[:20000], epochs=30, x_test=x_test[:100], y_test=y_test[:100])

epoch:0 --> 33 / 100


  return 1 / (1+np.exp(-x))


epoch:1 --> 30 / 100
epoch:2 --> 29 / 100
epoch:3 --> 29 / 100
epoch:4 --> 28 / 100
epoch:5 --> 32 / 100
epoch:6 --> 30 / 100
epoch:7 --> 27 / 100
epoch:8 --> 26 / 100
epoch:9 --> 26 / 100
epoch:10 --> 39 / 100
epoch:11 --> 36 / 100
epoch:12 --> 37 / 100
epoch:13 --> 38 / 100
epoch:14 --> 38 / 100
epoch:15 --> 38 / 100
epoch:16 --> 38 / 100
epoch:17 --> 38 / 100
epoch:18 --> 39 / 100
epoch:19 --> 39 / 100
epoch:20 --> 39 / 100
epoch:21 --> 38 / 100
epoch:22 --> 38 / 100
epoch:23 --> 38 / 100
epoch:24 --> 39 / 100
epoch:25 --> 38 / 100
epoch:26 --> 38 / 100
epoch:27 --> 39 / 100
epoch:28 --> 39 / 100
epoch:29 --> 38 / 100


## 2. cost func. = cross-entropy, activation func. = sigmoid
- by using **cross-entropy** as a cost function, we can eliminate sigmoid prime form(can avoid saturation) while calculating last layer's delta

In [54]:
import numpy as np

class Network_2:
    
    def __init__(self, n_neurons=[784,100,10], learning_rate=0.01, batch_size=10):
        np.random.seed(0)
        self.weights = [np.random.randn(x,y) for x, y in zip(n_neurons[1:], n_neurons[:-1])] 
        self.bias = [np.random.randn(x) for x in n_neurons[1:]]
        self.n_layer = len(n_neurons)
        self.lr = learning_rate
        self.batch_size = batch_size
        
        
    def fit(self, X, y, epochs, x_test=None, y_test=None):
        
        x_batch_list, y_batch_list = self._mini_batch(X, y, self.batch_size)
        
        for epoch in range(epochs):
            
            for i in range(len(x_batch_list)):
            
                dW = [np.zeros(w.shape) for w in self.weights]
                db = [np.zeros(b.shape) for b in self.bias]
                
                for x_i, y_i in zip(x_batch_list[i], y_batch_list[i]):
                    self._feedforward(x_i)

                    mini_dW, mini_db = self._backpropagation(x_i, y_i)
                    dW = [w + nw for w, nw in zip(dW, mini_dW)]
                    db = [b + nb for b, nb in zip(db, mini_db)]

                # update
                self.weights = [w - (self.lr / self.batch_size) * nw for w, nw in zip(self.weights, dW)]
                self.bias = [b - (self.lr / self.batch_size) * nb for b, nb in zip(self.bias, db)]
                
            score = self._evaluate(x_test, y_test)
            print(f"epoch:{epoch} --> {score} / {len(x_test)}")
                
                
    def _evaluate(self, x_test, y_test):
        i = 0
        for x, y in zip(x_test, y_test):
            pred = np.argmax(self._feedforward(x))
            if pred == y:
                i += 1
        return i
    
    def _mini_batch(self, X, y, batch_size):
        n_data = len(X)
        n_batch = int(n_data / batch_size)
        idxSet = np.random.permutation(n_data)
        x_batch_list = []
        y_batch_list = []
        for i in range(n_batch):
            x_batch = X[idxSet[i*batch_size:(i+1)*batch_size], :]
            x_batch_list.append(x_batch)
            y_batch = y[idxSet[i*batch_size:(i+1)*batch_size], :]
            y_batch_list.append(y_batch)
        return x_batch_list, y_batch_list
            
        
    def _feedforward(self, a):
        # return last layer's output
        for w, b in zip(self.weights, self.bias):
            a = np.dot(w, a) + b
        return self._sigmoid(a)
    
    def _backpropagation(self, x, y_i):
        
        mini_dW = [np.zeros(w.shape) for w in self.weights]
        mini_db = [np.zeros(b.shape) for b in self.bias]
        
        # feedforward
        activation = x
        activations = [x]
        zs = []
        for w, b in zip(self.weights, self.bias):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = self._sigmoid(z)
            activations.append(activation)
            
        # backward
        delta = self._cost_derivative(activations[-1], y_i)
        mini_db[-1] = delta
        mini_dW[-1] = np.outer(delta,activations[-2])
        
        for i in range(2, self.n_layer):
            delta = np.multiply(np.dot(self.weights[-i+1].T, delta), self._sigmoid_prime(zs[-i]))
            mini_db[-i] = delta
            mini_dW[-i] = np.outer(delta,activations[-i-1])
    
        return mini_dW, mini_db
        
        
    def _sigmoid(self, x):
        return 1 / (1+np.exp(-x))
    
    def _sigmoid_prime(self, x):
        return self._sigmoid(x) * (1 - self._sigmoid(x))
    
    def _cost_derivative(self, y_pred, y_true):
        return y_pred - y_true

In [64]:
import tensorflow as tf
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train/255.0, x_test/255.0

x_train = x_train.reshape(-1,784)
x_test = x_test.reshape(-1,784)
y_train_onehat = tf.keras.utils.to_categorical(y_train, 10)
y_test_onehat = tf.keras.utils.to_categorical(y_test, 10)


nw2 = Network_2()
nw2.fit(x_train[:20000], y_train_onehat[:20000], epochs=30, x_test=x_test[:100], y_test=y_test[:100])

epoch:0 --> 35 / 100
epoch:1 --> 44 / 100
epoch:2 --> 52 / 100
epoch:3 --> 51 / 100
epoch:4 --> 52 / 100
epoch:5 --> 54 / 100
epoch:6 --> 53 / 100
epoch:7 --> 54 / 100
epoch:8 --> 55 / 100
epoch:9 --> 56 / 100
epoch:10 --> 55 / 100
epoch:11 --> 55 / 100
epoch:12 --> 57 / 100
epoch:13 --> 58 / 100
epoch:14 --> 58 / 100
epoch:15 --> 58 / 100
epoch:16 --> 58 / 100
epoch:17 --> 58 / 100
epoch:18 --> 57 / 100
epoch:19 --> 57 / 100
epoch:20 --> 57 / 100
epoch:21 --> 57 / 100
epoch:22 --> 57 / 100
epoch:23 --> 56 / 100
epoch:24 --> 56 / 100
epoch:25 --> 55 / 100
epoch:26 --> 54 / 100
epoch:27 --> 54 / 100
epoch:28 --> 53 / 100
epoch:29 --> 53 / 100


## 3. cost func. = cross-entropy, activation func. = sigmoid, +L2 regularization

- **Regularization** not only reduces overfitting and increases classification accuracies but also provides much more easily replicable results regardless of initial weights.

In [71]:
import numpy as np

class Network_3:
    
    def __init__(self, n_neurons=[784,100,10], learning_rate=0.01, batch_size=10, lmbda=10.0):
        np.random.seed(0)
        self.weights = [np.random.randn(x,y) for x, y in zip(n_neurons[1:], n_neurons[:-1])] 
        self.bias = [np.random.randn(x) for x in n_neurons[1:]]
        self.n_layer = len(n_neurons)
        self.lr = learning_rate
        self.batch_size = batch_size
        self.lmbda  = lmbda
        
        
    def fit(self, X, y, epochs, x_test=None, y_test=None):
        
        n = len(X)
        x_batch_list, y_batch_list = self._mini_batch(X, y, self.batch_size)
        
        for epoch in range(epochs):
            
            for i in range(len(x_batch_list)):
            
                dW = [np.zeros(w.shape) for w in self.weights]
                db = [np.zeros(b.shape) for b in self.bias]
                
                for x_i, y_i in zip(x_batch_list[i], y_batch_list[i]):
                    self._feedforward(x_i)

                    mini_dW, mini_db = self._backpropagation(x_i, y_i)
                    dW = [w + nw for w, nw in zip(dW, mini_dW)]
                    db = [b + nb for b, nb in zip(db, mini_db)]

                # update
                self.weights = [(1-self.lr*self.lmbda/n)*w - (self.lr / self.batch_size) * nw for w, nw in zip(self.weights, dW)]
                self.bias = [b - (self.lr / self.batch_size) * nb for b, nb in zip(self.bias, db)]
                
            score = self._evaluate(x_test, y_test)
            print(f"epoch:{epoch} --> {score} / {len(x_test)}")
                
                
    def _evaluate(self, x_test, y_test):
        i = 0
        for x, y in zip(x_test, y_test):
            pred = np.argmax(self._feedforward(x))
            if pred == y:
                i += 1
        return i
    
    def _mini_batch(self, X, y, batch_size):
        n_data = len(X)
        n_batch = int(n_data / batch_size)
        idxSet = np.random.permutation(n_data)
        x_batch_list = []
        y_batch_list = []
        for i in range(n_batch):
            x_batch = X[idxSet[i*batch_size:(i+1)*batch_size], :]
            x_batch_list.append(x_batch)
            y_batch = y[idxSet[i*batch_size:(i+1)*batch_size], :]
            y_batch_list.append(y_batch)
        return x_batch_list, y_batch_list
            
        
    def _feedforward(self, a):
        # return last layer's output
        for w, b in zip(self.weights, self.bias):
            a = np.dot(w, a) + b
        return self._sigmoid(a)
    
    def _backpropagation(self, x, y_i):
        
        mini_dW = [np.zeros(w.shape) for w in self.weights]
        mini_db = [np.zeros(b.shape) for b in self.bias]
        
        # feedforward
        activation = x
        activations = [x]
        zs = []
        for w, b in zip(self.weights, self.bias):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = self._sigmoid(z)
            activations.append(activation)
            
        # backward
        delta = self._cost_derivative(activations[-1], y_i)
        mini_db[-1] = delta
        mini_dW[-1] = np.outer(delta,activations[-2])
        
        for i in range(2, self.n_layer):
            delta = np.multiply(np.dot(self.weights[-i+1].T, delta), self._sigmoid_prime(zs[-i]))
            mini_db[-i] = delta
            mini_dW[-i] = np.outer(delta,activations[-i-1])
    
        return mini_dW, mini_db
        
        
    def _sigmoid(self, x):
        return 1 / (1+np.exp(-x))
    
    def _sigmoid_prime(self, x):
        return self._sigmoid(x) * (1 - self._sigmoid(x))
    
    def _cost_derivative(self, y_pred, y_true):
        return y_pred - y_true

In [73]:
import tensorflow as tf
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train/255.0, x_test/255.0

x_train = x_train.reshape(-1,784)
x_test = x_test.reshape(-1,784)
y_train_onehat = tf.keras.utils.to_categorical(y_train, 10)
y_test_onehat = tf.keras.utils.to_categorical(y_test, 10)


nw3 = Network_3()
nw3.fit(x_train[:20000], y_train_onehat[:20000], epochs=30, x_test=x_test[:100], y_test=y_test[:100])

epoch:0 --> 35 / 100
epoch:1 --> 45 / 100
epoch:2 --> 52 / 100
epoch:3 --> 52 / 100
epoch:4 --> 56 / 100
epoch:5 --> 57 / 100
epoch:6 --> 56 / 100
epoch:7 --> 57 / 100
epoch:8 --> 58 / 100
epoch:9 --> 57 / 100
epoch:10 --> 58 / 100
epoch:11 --> 58 / 100
epoch:12 --> 57 / 100
epoch:13 --> 57 / 100
epoch:14 --> 59 / 100
epoch:15 --> 60 / 100
epoch:16 --> 60 / 100
epoch:17 --> 60 / 100
epoch:18 --> 61 / 100
epoch:19 --> 61 / 100
epoch:20 --> 62 / 100
epoch:21 --> 62 / 100
epoch:22 --> 62 / 100
epoch:23 --> 63 / 100
epoch:24 --> 63 / 100
epoch:25 --> 63 / 100
epoch:26 --> 62 / 100
epoch:27 --> 64 / 100
epoch:28 --> 65 / 100
epoch:29 --> 65 / 100


## 4. cost func. = cross-entropy, activation func. = sigmoid, +L2 regularization, +initialize
- initialize weights not by normal Gaussian random variables but by **Gaussian with mean: 0, standard deviation: (1/sqrt(input_size))** which can avoid sigmoid func. saturation in hidden layers.

In [2]:
import numpy as np

class QuadraticCost:
    
    @staticmethod
    def fn(a, y): # a, y are vectors
        return 0.5*np.linalg.norm(a-y)**2
    
    @staticmethod
    def delta(z, a, y): # z, a, y are vectors
        return (a-y)*_sigmoid_prime(z)
    
    
class CrosssEntropyCost:
    
    @staticmethod
    def fn(a, y): 
        return np.sum(np.nan_to_num(-y*np.log(a) - (1-y)*np.log(1-a)))
    
    def delta(z, a, y):
        return (a-y)

    
class Network_4:
    
    def __init__(self, n_neurons=[784,100,10], learning_rate=0.01, batch_size=10, lmbda=10.0, cost=CrosssEntropyCost):
        self.n_neurons = n_neurons
        self.default_weight_initializer()
        self.n_layer = len(n_neurons)
        self.lr = learning_rate
        self.batch_size = batch_size
        self.lmbda  = lmbda
        self.cost = cost
    
    def default_weight_initializer(self):
        np.random.seed(0)
        self.weights = [np.random.randn(x,y) / np.sqrt(y) for x, y in zip(self.n_neurons[1:], self.n_neurons[:-1])] 
        self.bias = [np.random.randn(x) for x in self.n_neurons[1:]]
        
    def large_weight_initializer(self):
        np.random.seed(0)
        self.weights = [np.random.randn(x,y) for x, y in zip(self.n_neurons[1:], self.n_neurons[:-1])] 
        self.bias = [np.random.randn(x) for x in self.n_neurons[1:]]
        
    def fit(self, X, y, epochs, x_test=None, y_test=None, monitor_test_accuracy=True):
        
        n = len(X)
        x_batch_list, y_batch_list = self._mini_batch(X, y, self.batch_size)
        
        for epoch in range(epochs):
            
            for i in range(len(x_batch_list)):
            
                dW = [np.zeros(w.shape) for w in self.weights]
                db = [np.zeros(b.shape) for b in self.bias]
                
                for x_i, y_i in zip(x_batch_list[i], y_batch_list[i]):
                    self._feedforward(x_i)

                    mini_dW, mini_db = self._backpropagation(x_i, y_i)
                    dW = [w + nw for w, nw in zip(dW, mini_dW)]
                    db = [b + nb for b, nb in zip(db, mini_db)]

                # update
                self.weights = [(1-self.lr*self.lmbda/n)*w - (self.lr / self.batch_size) * nw for w, nw in zip(self.weights, dW)]
                self.bias = [b - (self.lr / self.batch_size) * nb for b, nb in zip(self.bias, db)]
            
            if monitor_test_accuracy:
                score = self._evaluate(x_test, y_test)
                print(f"epoch:{epoch} --> {score} / {len(x_test)}")
                
                
    def _evaluate(self, x_test, y_test):
        i = 0
        for x, y in zip(x_test, y_test):
            pred = np.argmax(self._feedforward(x))
            if pred == y:
                i += 1
        return i
    
    def _mini_batch(self, X, y, batch_size):
        n_data = len(X)
        n_batch = int(n_data / batch_size)
        idxSet = np.random.permutation(n_data)
        x_batch_list = []
        y_batch_list = []
        for i in range(n_batch):
            x_batch = X[idxSet[i*batch_size:(i+1)*batch_size], :]
            x_batch_list.append(x_batch)
            y_batch = y[idxSet[i*batch_size:(i+1)*batch_size], :]
            y_batch_list.append(y_batch)
        return x_batch_list, y_batch_list
            
        
    def _feedforward(self, a):
        # return last layer's output
        for w, b in zip(self.weights, self.bias):
            a = np.dot(w, a) + b
        return self._sigmoid(a)
    
    def _backpropagation(self, x, y_i):
        
        mini_dW = [np.zeros(w.shape) for w in self.weights]
        mini_db = [np.zeros(b.shape) for b in self.bias]
        
        # feedforward
        activation = x
        activations = [x]
        zs = []
        for w, b in zip(self.weights, self.bias):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = self._sigmoid(z)
            activations.append(activation)
            
        # backward
        delta = self.cost.delta(zs[-1], activations[-1], y_i)
        mini_db[-1] = delta
        mini_dW[-1] = np.outer(delta,activations[-2])
        
        for i in range(2, self.n_layer):
            delta = np.multiply(np.dot(self.weights[-i+1].T, delta), self._sigmoid_prime(zs[-i]))
            mini_db[-i] = delta
            mini_dW[-i] = np.outer(delta,activations[-i-1])
    
        return mini_dW, mini_db
        
        
    def _sigmoid(self, x):
        return 1 / (1+np.exp(-x))
    
    def _sigmoid_prime(self, x):
        return self._sigmoid(x) * (1 - self._sigmoid(x))

In [3]:
import tensorflow as tf
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train/255.0, x_test/255.0

x_train = x_train.reshape(-1,784)
x_test = x_test.reshape(-1,784)
y_train_onehat = tf.keras.utils.to_categorical(y_train, 10)
y_test_onehat = tf.keras.utils.to_categorical(y_test, 10)

nw4 = Network_4()
nw4.fit(x_train[:20000], y_train_onehat[:20000], epochs=30, x_test=x_test[:100], y_test=y_test[:100], monitor_test_accuracy=True)

2023-03-03 11:46:36.181435: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


epoch:0 --> 74 / 100
epoch:1 --> 80 / 100
epoch:2 --> 85 / 100
epoch:3 --> 87 / 100
epoch:4 --> 90 / 100
epoch:5 --> 90 / 100
epoch:6 --> 90 / 100
epoch:7 --> 91 / 100
epoch:8 --> 91 / 100
epoch:9 --> 92 / 100
epoch:10 --> 92 / 100
epoch:11 --> 92 / 100
epoch:12 --> 92 / 100
epoch:13 --> 92 / 100
epoch:14 --> 92 / 100
epoch:15 --> 92 / 100
epoch:16 --> 92 / 100
epoch:17 --> 91 / 100
epoch:18 --> 91 / 100
epoch:19 --> 91 / 100
epoch:20 --> 90 / 100
epoch:21 --> 91 / 100
epoch:22 --> 90 / 100
epoch:23 --> 90 / 100
epoch:24 --> 89 / 100
epoch:25 --> 88 / 100
epoch:26 --> 88 / 100
epoch:27 --> 88 / 100
epoch:28 --> 87 / 100
epoch:29 --> 86 / 100
