In [1]:
import numpy as np
import random

class Network():
    """Fully connected neural network trained with mini-batch gradient steps.

    Hidden layers use ``tanh``. ``backprob`` and ``evaluate`` apply a sigmoid to
    the output layer, while ``feedforward`` (and therefore ``predict``) applies a
    softmax. Both are monotonic within a column, so the arg-max class agrees;
    only the reported scores differ.

    The optimizer keeps an exponential moving average of every gradient
    (``v_delta_w`` / ``v_delta_b``) and of every squared gradient
    (``s_delta_w`` / ``s_delta_b``) and steps along ``v / sqrt(s + epsilon)``.
    No bias correction is applied to these averages.
    """

    def __init__(self, layers, weights = None, biases = None):
        """Build the network, optionally from existing parameters.

        Args:
            layers: Sequence of layer widths, input layer first.
            weights: Optional sequence of weight matrices, one per non-input
                layer, shaped ``(layers[k+1], layers[k])``.
            biases: Optional sequence of bias columns shaped ``(layers[k+1], 1)``.

        ``weights`` and ``biases`` are only used when both are supplied and
        non-empty; otherwise parameters are drawn uniformly from ``[-1, 1)``
        (weights additionally divided by ``sqrt(fan_in)``).
        """
        self.num_layers = len(layers)
        self.layers = layers
        # Explicit None/length checks instead of `weights and biases`: truthiness
        # of an ndarray raises ValueError, while lists behave exactly as before.
        supplied = (weights is not None and biases is not None
                    and len(weights) > 0 and len(biases) > 0)
        if supplied:
            self.weights = weights
            self.biases = biases
        else:
            self.weights = [np.random.uniform(-1, 1, (r, c))/np.sqrt(c)
                            for r, c in zip(self.layers[1:], self.layers[:-1])]
            self.biases = [np.random.uniform(-1, 1, (r, 1)) for r in self.layers[1:]]

        # Moving averages of the gradients (first moment).
        self.v_delta_w = [np.zeros(w.shape) for w in self.weights]
        self.v_delta_b = [np.zeros(b.shape) for b in self.biases]

        # Moving averages of the squared gradients (second moment).
        self.s_delta_w = [np.zeros(w.shape) for w in self.weights]
        self.s_delta_b = [np.zeros(b.shape) for b in self.biases]

    def feedforward(self, a):
        """Return the softmax output for input column(s) ``a``.

        ``a`` must have ``layers[0]`` rows; each column is one sample.
        """
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            a = np.tanh(np.dot(w, a)+b)
        # NOTE: training and evaluate() use a sigmoid on this layer instead.
        return self.softmax(np.dot(self.weights[-1], a)+self.biases[-1])

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        return 1.0/(1+np.exp(-z))

    def softmax(self, z):
        """Column-wise softmax of ``z`` (normalised along axis 0).

        The per-column maximum is subtracted before exponentiating. This does
        not change the result mathematically but prevents ``exp`` overflowing
        to ``inf`` (and the output becoming ``nan``) for large logits.
        """
        shifted = z-np.max(z, axis = 0, keepdims = True)
        exps = np.exp(shifted)
        return exps/np.sum(exps, axis = 0, keepdims = True)

    def SGD(self, X_train, y_train, epochs, learning_rate, mini_batch_size, X_test, y_test, lmbda = 0, beta1 = 0, beta2 = 0, epsilon = 1e-8):
        """Train for ``epochs`` passes, printing the test score after each one.

        Args:
            X_train: 2-D array with one sample per row.
            y_train: 2-D array of targets, row-aligned with ``X_train``.
            epochs: Number of passes over the training data.
            learning_rate: Step size.
            mini_batch_size: Rows per mini-batch (the last batch may be smaller).
            X_test, y_test: Passed to ``evaluate`` after every epoch.
            lmbda: Weight-decay coefficient applied to the weights only.
            beta1, beta2: Decay rates of the first/second moment averages.
            epsilon: Added under the square root to avoid division by zero.
        """
        n_samples = len(X_train)
        for epoch in range(epochs):
            # Reshuffle every epoch so the mini-batches differ between passes.
            order = list(np.random.permutation(X_train.shape[0]))
            X_shuffled = X_train[order, :]
            y_shuffled = y_train[order, :]
            for start in range(0, n_samples, mini_batch_size):
                stop = start+mini_batch_size
                self.update_mini_batch(X_shuffled[start:stop], y_shuffled[start:stop],
                                       n_samples, mini_batch_size, learning_rate,
                                       lmbda, beta1, beta2, epsilon)
            print("Epoch {}/{} done: {}/{}".format(epoch+1, epochs, self.evaluate(X_test, y_test), len(X_test)))

    def update_mini_batch(self, mini_batch_X, mini_batch_y, training_data_size, mini_batch_size, learning_rate, lmbda, beta1, beta2, epsilon):
        """Apply one optimizer step computed from a single mini-batch.

        ``mini_batch_X`` / ``mini_batch_y`` hold one sample per row. The moment
        averages are refreshed by ``backprob`` and then used to update the
        biases and (with weight decay) the weights.
        """
        # Average over the rows actually present, so a shorter final batch is
        # not under-weighted by dividing by the nominal mini_batch_size.
        actual_size = mini_batch_X.shape[0]
        self.backprob(mini_batch_X.T, mini_batch_y.T, actual_size, beta1, beta2)

        # The gradients are already batch averages; the extra division by
        # mini_batch_size acts as a learning-rate scale and is kept so that
        # existing hyper-parameters behave the same.
        step = learning_rate/mini_batch_size
        decay = 1-(learning_rate*lmbda/training_data_size)

        self.biases = [b-step*(vdb/np.sqrt(sdb+epsilon))
                       for b, vdb, sdb in zip(self.biases, self.v_delta_b, self.s_delta_b)]
        self.weights = [decay*w-step*(vdw/np.sqrt(sdw+epsilon))
                        for w, vdw, sdw in zip(self.weights, self.v_delta_w, self.s_delta_w)]

    def _accumulate_gradients(self, idx, delta, prev_activation, mini_batch_size, beta1, beta2):
        """Compute layer ``idx`` gradients and fold them into the moment averages."""
        grad_b = np.sum(delta, axis = 1, keepdims = True)/mini_batch_size
        self.v_delta_b[idx] = beta1*self.v_delta_b[idx]+(1-beta1)*grad_b
        self.s_delta_b[idx] = beta2*self.s_delta_b[idx]+(1-beta2)*(grad_b**2)

        grad_w = np.dot(delta, prev_activation.T)/mini_batch_size
        self.v_delta_w[idx] = beta1*self.v_delta_w[idx]+(1-beta1)*grad_w
        self.s_delta_w[idx] = beta2*self.s_delta_w[idx]+(1-beta2)*(grad_w**2)
        return grad_b, grad_w

    def backprob(self, X, y, mini_batch_size, beta1, beta2):
        """Back-propagate one mini-batch and update the moment averages.

        Args:
            X: Inputs with one sample per column.
            y: Targets with one sample per column.
            mini_batch_size: Divisor used to average the summed gradients.
            beta1, beta2: Decay rates of the first/second moment averages.

        Returns:
            ``(nabla_b, nabla_w)``: per-layer averaged gradients.
        """
        # Forward pass, keeping every layer's activation.
        activations = [X]
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            activations.append(np.tanh(np.dot(w, activations[-1])+b))
        # The last layer uses a sigmoid, not tanh.
        z_out = np.dot(self.weights[-1], activations[-1])+self.biases[-1]
        activations.append(self.sigmoid(z_out))

        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Output error is taken directly as (activation - target).
        delta = activations[-1]-y
        nabla_b[-1], nabla_w[-1] = self._accumulate_gradients(
            -1, delta, activations[-2], mini_batch_size, beta1, beta2)

        for l in range(2, self.num_layers):
            # activations[-l] is tanh(z) of this hidden layer, so the tanh
            # derivative 1 - tanh(z)^2 can reuse it instead of recomputing.
            delta = np.dot(self.weights[-l+1].T, delta)*(1-np.power(activations[-l], 2))
            nabla_b[-l], nabla_w[-l] = self._accumulate_gradients(
                -l, delta, activations[-l-1], mini_batch_size, beta1, beta2)

        return (nabla_b, nabla_w)

    def evaluate(self, X_test, y_test):
        """Count the rows of ``X_test`` whose arg-max output equals the label.

        ``y_test`` holds one integer class index per sample (it is compared
        with the arg-max index, so it must not be one-hot encoded).
        """
        activation = X_test.T
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            activation = np.tanh(np.dot(w, activation)+b)
        activation = self.sigmoid(np.dot(self.weights[-1], activation)+self.biases[-1])

        res = np.argmax(activation, axis = 0)
        return sum(int(y_hat==y) for y_hat, y in zip(res, y_test))

    def predict(self, X):
        """Print the softmax scores for one sample and return its arg-max class.

        ``X`` is reshaped to a single ``(layers[0], 1)`` column; the result is a
        length-1 array holding the predicted class index.
        """
        res = self.feedforward(X.reshape((self.layers[0], 1)))
        print(res)
        return np.argmax(res, axis = 0)
         
        

In [2]:
# 28*28 = 784 inputs, two hidden layers of 32 units each, 10 outputs.
dnn = Network(layers=[28 * 28, 32, 32, 10])

In [10]:
# Train for 30 epochs with mini-batches of 10 samples.
# NOTE: X_train / y_train / X_test / y_test are created by the data cells that
# appear further down in this notebook; those cells must be executed first.
dnn.SGD(
    X_train,
    y_train,
    epochs=30,
    learning_rate=0.01,
    mini_batch_size=10,
    X_test=X_test,
    y_test=y_test,
    lmbda=6,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-8,
)

Epoch 1/30 done: 9353/10000
Epoch 2/30 done: 9472/10000
Epoch 3/30 done: 9523/10000
Epoch 4/30 done: 9559/10000
Epoch 5/30 done: 9566/10000
Epoch 6/30 done: 9566/10000
Epoch 7/30 done: 9611/10000
Epoch 8/30 done: 9605/10000
Epoch 9/30 done: 9592/10000
Epoch 10/30 done: 9614/10000
Epoch 11/30 done: 9600/10000
Epoch 12/30 done: 9606/10000
Epoch 13/30 done: 9573/10000
Epoch 14/30 done: 9599/10000
Epoch 15/30 done: 9586/10000
Epoch 16/30 done: 9592/10000
Epoch 17/30 done: 9617/10000
Epoch 18/30 done: 9605/10000
Epoch 19/30 done: 9608/10000
Epoch 20/30 done: 9617/10000
Epoch 21/30 done: 9577/10000
Epoch 22/30 done: 9576/10000
Epoch 23/30 done: 9626/10000
Epoch 24/30 done: 9618/10000
Epoch 25/30 done: 9591/10000
Epoch 26/30 done: 9597/10000
Epoch 27/30 done: 9588/10000
Epoch 28/30 done: 9615/10000
Epoch 29/30 done: 9601/10000
Epoch 30/30 done: 9564/10000


In [4]:
import tensorflow.keras as keras

In [5]:
# Load the MNIST train and test splits, each unpacked as (inputs, labels).
# NOTE(review): presumably the inputs are (n_samples, 28, 28) integer pixel arrays
# and the labels are integer digits 0-9, as described in the Keras docs — confirm.
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

In [6]:
# Scale both image sets by 1/255; true division yields new float arrays.
# NOTE: this rebinds the same names, so running the cell twice scales twice.
X_train, X_test = X_train/255, X_test/255

In [7]:
# Flatten every training image into one row: (n_samples, h, w) -> (n_samples, h*w).
# reshape(..., -1) replaces the in-place ndarray.resize: it does not fail when the
# array is referenced elsewhere, and re-running the cell on already flattened data
# is a no-op instead of an IndexError on shape[2].
X_train = X_train.reshape(X_train.shape[0], -1)

In [8]:
# Flatten every test image into one row: (n_samples, h, w) -> (n_samples, h*w).
# reshape(..., -1) replaces the in-place ndarray.resize: it does not fail when the
# array is referenced elsewhere, and re-running the cell on already flattened data
# is a no-op instead of an IndexError on shape[2].
X_test = X_test.reshape(X_test.shape[0], -1)

In [9]:
# One-hot encode the integer training labels into an (n_samples, 10) integer array:
# row i of the 10x10 identity matrix is the encoding of class i.
# The ndim guard makes the cell idempotent; re-running it on labels that are
# already encoded is a no-op instead of raising a TypeError.
if y_train.ndim == 1:
    y_train = np.eye(10, dtype = int)[y_train]