In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Colab-only step: mount Google Drive at /content/gdrive so the dataset files
# loaded further down (under '/content/gdrive/My Drive/...') are readable.
# The call is interactive: it prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def one_hot(old_y, m):
    """Encode integer class labels as one-hot rows.

    Parameters
    ----------
    old_y : array-like of integer class labels. Any shape is accepted and
        flattened, so a column vector of shape (n, 1) is treated as n labels.
        Float or bool arrays are accepted as long as every value is integral.
    m : int, number of classes (width of the encoding).

    Returns
    -------
    numpy.ndarray of shape (n, m), dtype float64, with a single 1 per row.

    Raises
    ------
    ValueError : if a label is not an integral value.
    IndexError : if a label lies outside [0, m).
    """
    labels = np.asarray(old_y)

    # Labels stored as floats (e.g. 3.0) cannot be used as indices directly;
    # cast them, but only if nothing is lost by doing so.
    if not np.issubdtype(labels.dtype, np.integer):
        as_int = labels.astype(np.int64)
        if not np.array_equal(as_int, labels):
            raise ValueError('one_hot expects integral class labels')
        labels = as_int

    # Flatten: with an (n, 1) input, the fancy-index assignment below would
    # broadcast against arange(n) and mark every label in every row.
    labels = labels.reshape(-1)

    # Negative indices would silently wrap around to the last classes.
    if labels.size and (labels.min() < 0 or labels.max() >= m):
        raise IndexError('class labels must lie in [0, %d)' % m)

    n = labels.shape[0]

    y = np.zeros((n, m))

    y[np.arange(n), labels] = 1

    return y

## MNIST

In [0]:
# ~~ MNIST dataset ~~

# Single place to edit if the dataset moves; requires Drive to be mounted.
MNIST_DIR = '/content/gdrive/My Drive/Datasets/MNIST'

# Width of the one-hot label encoding (one column per class).
NUM_CLASSES = 10

X_train = np.load(MNIST_DIR + '/x_train.npy')
y_train = one_hot(np.load(MNIST_DIR + '/y_train.npy'), NUM_CLASSES)

X_val = np.load(MNIST_DIR + '/x_val.npy')
y_val = one_hot(np.load(MNIST_DIR + '/y_val.npy'), NUM_CLASSES)

X_test = np.load(MNIST_DIR + '/x_test.npy')
y_test = one_hot(np.load(MNIST_DIR + '/y_test.npy'), NUM_CLASSES)

# Fail fast if a features file and its labels file are out of sync.
assert len(X_train) == len(y_train), 'train features/labels row counts differ'
assert len(X_val) == len(y_val), 'validation features/labels row counts differ'
assert len(X_test) == len(y_test), 'test features/labels row counts differ'

In [0]:
class NN:
    """Fully connected network: ReLU hidden layers, softmax output,
    cross-entropy loss, trained with mini-batch gradient descent.

    Attributes
    ----------
    W : list of weight matrices; W[l] has shape (units_out, units_in).
    b : list of bias vectors; b[l] has shape (units_out,).
    n_hidden : number of hidden layers.
    m : number of output classes.
    """

    # Added to probabilities before taking the log, to avoid log(0).
    _LOG_EPS = 0.00001

    # ----- Constructor ----- #

    def __init__(self, hidden_dims, n_hidden, initialization_scheme,
                 input_dim=784, n_classes=10):
        """Build the layers and initialize their parameters.

        Parameters
        ----------
        hidden_dims : sequence of int, width of each hidden layer.
        n_hidden : int, number of hidden layers; must equal len(hidden_dims).
        initialization_scheme : 'Zero', 'Normal' or 'Glorot'.
        input_dim : int, number of input features (default 784).
        n_classes : int, number of output classes (default 10).

        Raises
        ------
        ValueError : if n_hidden and hidden_dims disagree.
        """
        hidden_dims = list(hidden_dims)

        # A mismatch would otherwise surface later as an IndexError or as an
        # output layer of the wrong width.
        if n_hidden != len(hidden_dims):
            raise ValueError(
                'n_hidden (%d) must equal len(hidden_dims) (%d)'
                % (n_hidden, len(hidden_dims))
            )

        dims = [input_dim] + hidden_dims + [n_classes]

        self.W = []
        self.b = []

        self.n_hidden = n_hidden

        self.m = n_classes  # Number of classes

        self.initialize_weights(n_hidden, dims, initialization_scheme)

    # ----- Initializations ----- #

    def initialize_weights(self, n_hidden, dims, initialization_scheme):
        """(Re)create self.W and self.b for layers of widths `dims`.

        'Zero'   : weights are all 0.
        'Normal' : weights sampled from N(0, 1).
        'Glorot' : weights sampled from U(-d, d), d = sqrt(6 / (fan_in + fan_out)).
        Biases are 0 in every scheme. Any other scheme name falls back to
        Glorot (as before) but now emits a warning, so a typo such as 'zero'
        is no longer silent.
        """
        if initialization_scheme not in ('Zero', 'Normal', 'Glorot'):
            import warnings
            warnings.warn(
                "Unknown initialization_scheme %r; falling back to 'Glorot'"
                % (initialization_scheme,)
            )

        # Start from empty lists so calling this twice does not stack layers.
        self.W = []
        self.b = []

        for i in range(n_hidden + 1):

            fan_in, fan_out = dims[i], dims[i + 1]

            if initialization_scheme == 'Zero':
                weights = np.zeros((fan_out, fan_in))

            elif initialization_scheme == 'Normal':
                weights = np.random.randn(fan_out, fan_in)

            else:  # 'Glorot' (and the warned fallback)
                d = np.sqrt(6 / (fan_in + fan_out))
                weights = np.random.uniform(-d, d, (fan_out, fan_in))

            self.W.append(weights)
            self.b.append(np.zeros(fan_out))

    # ----- ReLU activation ----- #

    def activation(self, inputs):
        """Element-wise ReLU: max(0, inputs)."""
        return np.maximum(inputs, 0)

    # ----- Softmax ----- #

    def softmax(self, inputs):
        """Row-wise softmax of a 2-D array.

        The row maximum is subtracted first so np.exp cannot overflow.
        """
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)

        exps = np.exp(shifted)

        return exps / np.sum(exps, axis=1, keepdims=True)

    # ----- Cross Entropy Loss ----- #

    def loss(self, predictions, labels):
        """Average cross-entropy between predicted probabilities and one-hot labels."""
        n, _ = predictions.shape

        losses = np.sum(-np.log(predictions + self._LOG_EPS) * labels, axis=1)

        return np.sum(losses) / n  # Average loss (Empirical risk)

    # ----- Forward Propagation ----- #

    def forward(self, X, y):
        """Run the network on X and score it against the one-hot labels y.

        Stores the pre-activations (self.h, self.h_output) and activations
        (self.a, self.a_output) that backward() reads.

        Returns
        -------
        (y_hat, loss) : y_hat is the one-hot encoding of the arg-max class per
        row, loss is the average cross-entropy.
        """
        self.a = [X]

        self.h = []

        for i in range(self.n_hidden):

            self.h.append(np.matmul(self.a[i], self.W[i].T) + self.b[i])

            self.a.append(self.activation(self.h[i]))

        self.h_output = np.matmul(self.a[-1], self.W[-1].T) + self.b[-1]

        self.a_output = self.softmax(self.h_output)

        # One-hot encode the predicted class by picking rows of the identity,
        # which keeps the class independent of any module-level helper.
        y_hat = np.eye(self.m)[np.argmax(self.a_output, axis=1)]

        return y_hat, self.loss(self.a_output, y)

    # ----- Backward Propagation ----- #

    def backward(self, X, y):
        """Gradients of the average loss for the batch last passed to forward().

        Must be called right after forward(X, y) on the same batch, because it
        reads the activations cached there.

        Returns
        -------
        (grad_W, grad_b) : lists aligned with self.W and self.b.
        """
        n = X.shape[0]

        grad_W = [None] * (self.n_hidden + 1)
        grad_b = [None] * (self.n_hidden + 1)

        grad_a = None

        for i in range(self.n_hidden, -1, -1):

            # Softmax + cross-entropy: gradient w.r.t. the logits, averaged over the batch
            if i == self.n_hidden:
                grad_h = (self.a_output - y) / n

            # ReLU: pass the gradient only where the unit was active
            else:
                grad_h = grad_a * (self.h[i] > 0)

            grad_W[i] = np.matmul(grad_h.T, self.a[i])

            grad_b[i] = np.sum(grad_h, axis=0)

            # The gradient w.r.t. the network input is never used, so skip it.
            if i > 0:
                grad_a = np.matmul(grad_h, self.W[i])

        return (grad_W, grad_b)

    # ----- Update weights ----- #

    def update(self, grads, eta):
        """Take one gradient-descent step of size eta, in place."""
        grad_W, grad_b = grads

        for l in range(self.n_hidden + 1):

            self.W[l] -= eta * grad_W[l]
            self.b[l] -= eta * grad_b[l]

    # ----- Train ----- #

    def train(self, X, y, epochs, eta, K):
        """Mini-batch gradient descent over (X, y).

        Batches are consecutive slices of K rows taken in the given order (no
        shuffling). When K does not divide the number of rows, the remaining
        rows form a final, smaller batch instead of being dropped.

        Prints (accuracy, loss) on (X, y) before training and after every
        epoch, and returns those pairs as a list of length epochs + 1.

        Raises
        ------
        ValueError : if X and y have different row counts or K < 1.
        """
        n = X.shape[0]

        if y.shape[0] != n:
            raise ValueError('X and y must have the same number of rows')

        if K < 1:
            raise ValueError('batch size K must be at least 1')

        batch_starts = range(0, n, K)

        # Evaluate on the data passed in, not on notebook-level globals.
        history = [self.test(X, y)]
        print(history[-1])

        for _ in range(epochs):

            for start in batch_starts:

                # Slices are views: no up-front copy of the whole dataset.
                X_batch = X[start:start + K]
                y_batch = y[start:start + K]

                # Forward pass
                self.forward(X_batch, y_batch)

                # Backward pass
                gradients = self.backward(X_batch, y_batch)

                # Update parameters
                self.update(gradients, eta)

            history.append(self.test(X, y))
            print(history[-1])

        return history

    # ----- Test ----- #

    def test(self, X, y):
        """Return (accuracy in percent, average loss) of the network on (X, y)."""
        n, _ = X.shape

        y_hat, loss = self.forward(X, y)

        accuracy = np.sum(np.argmax(y, axis=1) == np.argmax(y_hat, axis=1)) / n * 100

        return accuracy, loss

In [38]:
# Seed NumPy's global RNG so the run is repeatable.
np.random.seed(0)

# Two hidden layers of 666 units, every parameter starting at zero;
# 10 epochs, learning rate 0.001, mini-batches of 100 rows.
model = NN(hidden_dims=[666, 666], n_hidden=2, initialization_scheme='Zero')
model.train(X_train, y_train, epochs=10, eta=0.001, K=100)

(9.864, 2.302485097993712)
(11.356, 2.3023334815422447)
(11.356, 2.302196316623517)
(11.356, 2.302072252911978)
(11.356, 2.301960062306546)
(11.356, 2.301858628374674)
(11.356, 2.301766936653709)
(11.356, 2.3016840657433324)
(11.356, 2.301609179128462)
(11.356, 2.301541517676853)
(11.356, 2.3014803927599443)


In [0]:
# One (initially empty) list per initialization scheme; the plotting cell
# below draws whatever these lists hold.
zero, glorot, normal = [], [], []

In [0]:
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10

# (legend label, losses) pairs, drawn in this order.
loss_curves = [('Normal', normal), ('Glorot', glorot), ('Zero', zero)]

# Keep the 0..10 tick range, widening it only if a curve holds more points.
last_epoch = max([10] + [len(curve) for _, curve in loss_curves])

plt.xticks(np.arange(0, last_epoch + 1, step=1))
plt.xlabel('Epoch', weight='bold')
plt.ylabel('Average Loss on Training Set', weight='bold')
plt.title('Empirical Risk - Training Set - MNIST - 3 Different Initialization Methods', weight='bold')

for curve_label, curve in loss_curves:
    # Each curve is plotted against its own length (epochs 1..len), so a list
    # that does not hold exactly 10 values no longer raises a shape mismatch.
    if len(curve) == 0:
        print('No losses recorded for ' + curve_label + ' initialization; skipping its curve.')
        continue
    plt.plot(np.arange(1, len(curve) + 1, step=1), curve, label=curve_label)

plt.legend()
plt.show()