# Setup

In [0]:
# For Google Collab

# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

In [16]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import numpy as np
import matplotlib.pyplot as plt

In [0]:
def one_hot(old_y, m):
    
    n = len(old_y)
    
    y = np.zeros((n, m))
    
    y[np.arange(n), old_y] = 1
    
    return y

In [0]:
# ~~ MNIST dataset ~~

X_train = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_train.npy')
y_train = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_train.npy'), 10)

X_val = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_val.npy')
y_val = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_val.npy'), 10)

X_test = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_test.npy')
y_test = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_test.npy'), 10)

# Model

In [0]:
class NN:
    
    # ----- Initializations ----- #
    
    def initialize_weights(self, n_hidden, dims, initialization_scheme):
    
        # Zero    
        if initialization_scheme == 'Zero':
            
            for i in range(n_hidden + 1):
            
                # Weights set to 0
            
                self.W.append( np.zeros(( dims[i+1], dims[i] )) )
        
                # Biases set to 0
            
                self.b.append( np.zeros( dims[i+1] ) )
        
        # Normal        
        elif initialization_scheme == 'Normal':
            
            for i in range(n_hidden + 1):
            
                # Weights sampled from N(0,1)
            
                self.W.append( np.random.randn( dims[i+1], dims[i] ) )
        
                # Biases set to 0
            
                self.b.append( np.zeros( dims[i+1] ) )
        
        # Glorot        
        else: #initialization_scheme == 'Glorot'
            
            for i in range(n_hidden + 1):            
            
                # Weights sampled from U(-d^l, d^l), d^l = sqrt( 6 / h^(l-1) + h^l )
            
                d = np.sqrt( 6 / dims[i] + dims[i+1] )
            
                self.W.append( np.random.uniform( -d, d, (dims[i+1], dims[i]) ) )
        
                # Biases ~ 0
            
                self.b.append( np.zeros(dimensions[i+1]) )
    
    
    # ----- ReLU activation ----- #
    
    def activation(self, inputs):
        
        zeros = np.zeros_like(inputs)
        
        return np.maximum(zeros, inputs)
    
    
    # ----- Softmax ----- #
    
    def softmax(self, inputs):
        
        # Numerically stable softmax
        
        b = np.max(inputs, axis=1)
        
        return np.exp(inputs - b) / np.sum( np.exp(inputs - b), axis=1 )
    
        
    # ----- Update weights ----- #
    
    def update(self, grads, eta):
        
        grad_W, grad_b = grads
        
        for l in range(self.n_hidden + 1):
                    
            self.W[l] -= eta * grad_W[l]
            self.b[l] -= eta * grad_b[l]
        
    # ----- Constructor ----- #
        
    def __init__(self, hidden_dims, n_hidden, initialization_scheme):
        
        dims = [784] + hidden_dims + [10]
        
        self.W = []
        self.b = []
        
        self.n_hidden = n_hidden
        
        self.initialize_weights(n_hidden, dims, initialization_scheme)
    
    # ----- Train ----- #
    
    def train(self, X, y, epochs, eta):
            
        n = len(X)
            
        # Stochastic Gradient Descent
            
        for epoch in range(epochs):
                
            empirical_risk = 0
            accuracy = 0
                
            for i in range(n):
                
                # Forward pass
                [y_hat], loss, _ = self.forward(X[i].reshape((1,784)), y[i])                 
                
                empirical_risk += loss / n
                
                accuracy += int(np.argmax(y[i]) == y_hat)
                
                # Backward pass
                    
                grads = self.backward(X[i], y[i])
                
                self.update(grads, eta)
                
                if i % 500 == 0:
                    print(i / 500, '%')
            
            print('Epoch', i+1)
            print('Empirical risk', empirical_risk)
            print('Accuracy', accuracy, '%')
                       
    # ----- Cross Entropy Loss ----- #
    
    def loss(self, prediction, label):
        
        L_x = - np.log(prediction + 0.00001) # To avoid log(0)
        
        return np.sum(L_x * label, axis=1)
        
        
    # ----- Forward Propagation ----- #
    
    def forward(self, X, y):
    
        n = len(X)
    
        self.a = [X]
    
        self.h = []
    
        for i in range(self.n_hidden):
            
            self.h.append( np.matmul(self.a[i], self.W[i].T) + self.b[i] )
            
            self.a.append( self.activation(self.h[i])  )
            
        self.h_output = np.matmul(self.a[-1], self.W[-1].T) + self.b[-1]
        
        self.a_output = self.softmax(self.h_output)
        
        y_hat = np.argmax(self.a_output, axis=1)
        
        accuracy = np.sum(y == y_hat) / n
        
        return y_hat, self.loss(self.a_output, y), accuracy
    
    
    # ----- Backward Propagation ----- #
    
    def backward(self, X, y):
        
        # Set up list storing gradients

        grad_W = []
        grad_b = []

        for i in range(self.n_hidden + 1):
            
            grad_W.append(None)
            grad_b.append(None)
            
        for i in range(self.n_hidden, -1, -1):
            
            # Softmax
            if i == self.n_hidden:
                grad_h = self.a_output
                grad_h -= y
            
            # ReLU
            else: 
                grad_h = grad_a * ( (self.h[i] > 0) * 1 )
            
            grad_W[i] = np.matmul(grad_h.T, self.a[i])
            
            grad_b[i] = np.sum(grad_h, axis=0)
            
            grad_a = np.matmul(grad_h, self.W[i])
        
        return (grad_W, grad_b)

In [79]:
model = NN(hidden_dims=[666, 666], n_hidden=2, initialization_scheme='Normal')

model.train(X_train, y_train, 10, 0.05)



0.0 %


  return umr_maximum(a, axis, None, out, keepdims)


1.0 %
2.0 %
3.0 %
4.0 %
5.0 %
6.0 %
7.0 %
8.0 %
9.0 %
10.0 %
11.0 %
12.0 %
13.0 %
14.0 %
15.0 %
16.0 %
17.0 %
18.0 %
19.0 %
20.0 %
21.0 %
22.0 %
23.0 %
24.0 %
25.0 %
26.0 %
27.0 %
28.0 %
29.0 %
30.0 %
31.0 %
32.0 %
33.0 %
34.0 %
35.0 %
36.0 %
37.0 %
38.0 %
39.0 %
40.0 %
41.0 %
42.0 %
43.0 %
44.0 %
45.0 %
46.0 %
47.0 %
48.0 %
49.0 %
50.0 %
51.0 %
52.0 %
53.0 %
54.0 %
55.0 %
56.0 %
57.0 %
58.0 %
59.0 %
60.0 %
61.0 %
62.0 %
63.0 %
64.0 %
65.0 %
66.0 %
67.0 %
68.0 %
69.0 %
70.0 %
71.0 %
72.0 %
73.0 %
74.0 %
75.0 %
76.0 %
77.0 %
78.0 %
79.0 %
80.0 %
81.0 %
82.0 %
83.0 %
84.0 %
85.0 %
86.0 %
87.0 %
88.0 %
89.0 %
90.0 %
91.0 %
92.0 %
93.0 %
94.0 %
95.0 %
96.0 %
97.0 %
98.0 %
99.0 %
Epoch 50000
Empirical risk [nan]
Accuracy 4922 %
0.0 %
1.0 %


KeyboardInterrupt: ignored