# Setup

In [1]:
# For Google Collab

# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

tcmalloc: large alloc 1073750016 bytes == 0x58812000 @  0x7f3bad2c22a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import numpy as np
import matplotlib.pyplot as plt

In [0]:
def one_hot(old_y, m):
    
    n = len(old_y)
    
    y = np.zeros((n, m))
    
    y[np.arange(n), old_y] = 1
    
    return y

In [0]:
# ~~ MNIST dataset ~~

X_train = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_train.npy')
y_train = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_train.npy'), 10)

X_val = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_val.npy')
y_val = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_val.npy'), 10)

X_test = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_test.npy')
y_test = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_test.npy'), 10)

# Model

In [0]:
class NN:
    
    # ----- Initializations ----- #
    
    def initialize_weights(self, n_hidden, dims, initialization_scheme):
    
        # Zero    
        if initialization_scheme == 'Zero':
            
            for i in range(n_hidden + 1):
            
                # Weights set to 0
            
                self.W.append( np.zeros(( dims[i+1], dims[i] )) )
        
                # Biases set to 0
            
                self.b.append( np.zeros( dims[i+1] ) )
        
        # Normal        
        elif initialization_scheme == 'Normal':
            
            for i in range(n_hidden + 1):
            
                # Weights sampled from N(0,1)
            
                self.W.append( np.random.randn( dims[i+1], dims[i] ) )
        
                # Biases set to 0
            
                self.b.append( np.zeros( dims[i+1] ) )
        
        # Glorot        
        else: #initialization_scheme == 'Glorot'
            
            for i in range(n_hidden + 1):            
            
                # Weights sampled from U(-d^l, d^l), d^l = sqrt( 6 / h^(l-1) + h^l )
            
                d = np.sqrt( 6 / (dims[i] + dims[i+1]) )
                self.W.append( np.random.uniform( -d, d, (dims[i+1], dims[i]) ) )
        
                # Biases ~ 0
            
                self.b.append( np.zeros(dims[i+1]) )
    
    
    # ----- ReLU activation ----- #
    
    def activation(self, inputs):
        
        zeros = np.zeros_like(inputs)
        
        return np.maximum(zeros, inputs)
    
    
    # ----- Softmax ----- #
    
    def softmax(self, inputs):
        
        # Numerically stable softmax
        
        b = inputs.max()
        
        return np.exp(inputs - b) / np.exp(inputs - b).sum()
    
        
    # ----- Update weights ----- #
    
    def update(self, grads, eta):
        
        grad_W, grad_b = grads
        
        for l in range(self.n_hidden + 1):
                    
            self.W[l] -= eta * grad_W[l]
            self.b[l] -= eta * grad_b[l]
        
    # ----- Constructor ----- #
        
    def __init__(self, hidden_dims, n_hidden, initialization_scheme):
        
        dims = [784] + hidden_dims + [10]
        
        self.W = []
        self.b = []
        
        self.n_hidden = n_hidden
        
        self.initialize_weights(n_hidden, dims, initialization_scheme)
    
    # ----- Train ----- #
    
    def train(self, X, y, epochs, eta):
            
        n = len(X)
            
        # Stochastic Gradient Descent
            
        for epoch in range(epochs):
                
            empirical_risk = 0
            accuracy = 0
                
            for i in range(n):
                
                # Forward pass
                self.forward(X[i], y[i])
                
                # Backward pass
                    
                grads = self.backward(X[i], y[i])
                
                self.update(grads, eta)
                
                if i % 5000 == 0:
                    print(i / 500, '%')
                    print( self.test(X, y) )
            
            print('Epoch', epoch+1)
            print('Empirical risk', empirical_risk)
            print('Accuracy', accuracy, '%')
                       
    # ----- Cross Entropy Loss ----- #
    
    def loss(self, prediction, label):
        
        L_x = - np.log(prediction + 0.00001) # To avoid log(0)
        
        return np.sum(L_x * label)
        
    # ----- Test ----- #
    
    def test(self, X, y):
        
        n = len(X)
        
        correct_predictions = 0
        total_loss = 0
        
        for i in range(n):
            
            y_hat, loss = self.forward(X[i], y[i])
            
            correct_predictions += int(np.argmax(y[i]) == y_hat)
            total_loss += loss
            
        accuracy = (correct_predictions / n) * 100
        empirical_risk = total_loss / n
        
        return accuracy, empirical_risk
        
    # ----- Forward Propagation ----- #
    
    def forward(self, X, y):
    
        n = len(X)
    
        self.a = [X]
    
        self.h = []
    
        for i in range(self.n_hidden):
            
            self.h.append( np.matmul(self.a[i], self.W[i].T) + self.b[i] )
            
            self.a.append( self.activation(self.h[i])  )
            
        self.h_output = np.matmul(self.a[-1], self.W[-1].T) + self.b[-1]
        
        self.a_output = self.softmax(self.h_output)
        
        y_hat = np.argmax(self.a_output)
        
        return y_hat, self.loss(self.a_output, y)
    
    
    # ----- Backward Propagation ----- #
    
    def backward(self, X, y):
        
        # Set up list storing gradients

        grad_W = []
        grad_b = []

        for i in range(self.n_hidden + 1):
            
            grad_W.append(None)
            grad_b.append(None)
            
        for i in range(self.n_hidden, -1, -1):
            
            # Softmax
            if i == self.n_hidden:
                grad_h = np.copy(self.a_output)
                grad_h -= y
            
            # ReLU
            else:
                grad_h = grad_a * ( (self.h[i] > 0) * 1 )
            
            grad_W[i] = np.outer(grad_h, self.a[i])
            
            grad_b[i] = np.sum(grad_h, axis=0)
            
            grad_a = np.matmul(grad_h, self.W[i])
        
        return (grad_W, grad_b)

In [128]:
model = NN(hidden_dims=[666, 666], n_hidden=2, initialization_scheme='Glorot')
#print(model.forward(X_train[0], y_train[0]))
model.train(X_train, y_train, 10, 0.01)


0.0 %
(9.017999999999999, 2.3756848987413455)
10.0 %
(90.056, 0.3253343502915492)
20.0 %
(93.074, 0.22763789574673704)
30.0 %
(93.65, 0.20543586034756267)
40.0 %
(95.018, 0.16604558423965657)
50.0 %
(94.192, 0.1864998380342239)
60.0 %
(95.062, 0.15894003348533656)
70.0 %
(96.908, 0.103701639289533)
80.0 %
(96.136, 0.12289095760663359)
90.0 %
(96.426, 0.1115924255981566)
Epoch 1
Empirical risk 0
Accuracy 0 %
0.0 %
(96.452, 0.11303513065647999)
10.0 %
(96.648, 0.10906568751836086)


KeyboardInterrupt: ignored