# Setup

In [0]:
# For Google Collab

# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

In [16]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import numpy as np
import matplotlib.pyplot as plt

In [0]:
def one_hot(old_y, m):
    
    n = len(old_y)
    
    y = np.zeros((n, m))
    
    y[np.arange(n), old_y] = 1
    
    return y

In [0]:
# ~~ MNIST dataset ~~

X_train = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_train.npy')
y_train = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_train.npy'), 10)

X_val = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_val.npy')
y_val = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_val.npy'), 10)

X_test = np.load('/content/gdrive/My Drive/Datasets/MNIST/x_test.npy')
y_test = one_hot(np.load('/content/gdrive/My Drive/Datasets/MNIST/y_test.npy'), 10)

# Model

In [0]:
class NN:
    
    # ReLU activation
    
    def activation(self, inputs):
        
        zeros = np.zeros_like(inputs)
        
        return np.maximum(zeros, inputs)
    
    # Softmax
    
    def softmax(self, inputs):
        
        # Numerically stable softmax
        
        b = np.max(inputs, axis=1).reshape((n, 1))
        
        return np.exp(inputs - b) / np.sum( np.exp(inputs - b), axis=1 ).reshape((n, 1))
        
    # Different types of initialization
    
    def initialize_weights(self, n_hidden, dims):
    
        # Zero
    
        if self.initialization = 'Zero':
            
            for i in range(n_hidden + 1):
            
                # Weights set to 0
            
                self.W.append( np.zeros(( dims[i+1], dims[i] )) )
        
                # Biases set to 0
            
                self.b.append( np.zeros( dims[i+1] ) )
        
        # Normal
        
        elif self.initialization = 'Normal':
            
            for i in range(n_hidden + 1):
            
                # Weights sampled from N(0,1)
            
                self.W.append( np.random.randn( dims[i+1], dims[i] ) )
        
                # Biases set to 0
            
                self.b.append( np.zeros( dims[i+1] ) )
        
        # Glorot
        
        else: #self.initialization = 'Glorot'
            
            for i in range(L + 1):            
            
                # Weights sampled from U(-d^l, d^l), d^l = sqrt( 6 / h^(l-1) + h^l )
            
                d = np.sqrt( 6 / dims[i] + dims[i+1] )
            
                self.W.append( np.random.uniform( -d, d, (dims[i+1], dims[i]) ) )
        
                # Biases ~ 0
            
                self.b.append( np.zeros(dimensions[i+1]) )
        
        
    def __init__(self, hidden_dims, initialization):
        
        
    
    def __init__(self, L, d, d_h, m, lambdas):
        
        self.d = d
        self.d_h = d_h
        self.m = m

        self.W = []
        self.b = []
        
        self.L = L
        
        dims = [d] + d_h + [m]
        
        self.dimensions = dimensions
        
        for i in range(L + 1):
            
            boundary = 1 / np.sqrt(dimensions[i])
            
            self.W.append( np.random.uniform(-boundary, boundary, (dimensions[i+1], dimensions[i])) )
        
            # Biases ~ 0
            
            self.b.append( np.zeros(dimensions[i+1]) )
            
        # Lambdas - Elastic net (L1 + L2 regularization)
            
        self.lambdasL1, self.lambdasL2 = lambdas
        
    
    def update(self, grads)
    
    def train(self, X, y, epochs, eta, K):
        
        n, d = X.shape
        
        n, m = y.shape
            
        # Stochastic Gradient Descent
            
        for i in range(epochs):
                
            # Forward pass
                
            self.forward_propagation(X_batch, y_batch)                 
                    
            # Backward pass
                    
            grad_W, grad_b = self.backward_propagation(X_batch, y_batch)
                       
            # Gradient updates
                
            for l in range(self.L + 1):
                    
                self.W[l] -= eta * grad_W[l]
                self.b[l] -= eta * grad_b[l]
    
    def loss(self, predictions, labels):
        
        
    
    def forward(self, X, y):
    
        n, d = X.shape
    
        self.a = [X]
    
        self.h = []
    
        for i in range(self.L):
            
            self.h.append(np.matmul(self.a[i], self.W[i].T) + self.b[i])
            
            self.a.append( self.activation(self.h[i])  )
            
        self.h_output = np.matmul(self.a[-1], self.W[-1].T) + self.b[-1]
        
        self.a_output = self.softmax(self.h_output)
        
        y_hat = one_hot(np.argmax(self.a_output, axis=1), self.m)
        
        L_x = - np.log(self.a_output + 0.00001) # To avoid log(0)
        
        L_x_y = np.sum(L_x * y, axis=1)
        
        R = np.sum(L_x_y) / n
            
        for i in range(self.L + 1):
            
            R += self.lambdasL1[i] * np.sum(np.abs(self.W[i])) + self.lambdasL2[i] * np.sum(self.W[i] ** 2)
    
        return y_hat, R
    
    
    def backward(self, X, y):
    
        n, d = X.shape
        
        # Set up list storing gradients

        grad_W = []
        grad_b = []

        for i in range(self.L + 1):
            
            grad_W.append(None)
            grad_b.append(None)
            
        for i in range(self.L, -1, -1):
            
            # ∇ h_output ~ Softmax
            if i == self.L:    
                grad_h = self.a_output
                grad_h -= y
                grad_h /= n
            
            # ∇ h_hidden ~ ReLU
            else: 
                grad_h = grad_a * ( (self.h[i] > 0) * 1 )
            
            # ∇ W ~ ∇ W_L -> ∇ W_1
            grad_W[i] = np.matmul(grad_h.T, self.a[i]) + self.lambdasL1[i] * np.sign(self.W[i]) + \
                                                            2 * self.lambdasL2[i] * self.W[i]
            # ∇ b_i ~ ∇ b_L -> ∇ b_1
            grad_b[i] = np.sum(grad_h, axis=0)
            
            # ∇ a ~ ∇ a_L -> ∇ X 
            grad_a = np.matmul(grad_h, self.W[i])
        
        return grad_W, grad_b

In [0]:
np.random.seed(0)

n, d = X_train_fashion.shape
n, m = y_train_fashion.shape

d_h = []

L = len(d_h)

lambdasL1 = [0.00002]*(L+1)
lambdasL2 = [0.00002]*(L+1)

lambdas = [lambdasL1, lambdasL2]

model = Feedforward_Neural_Network(L, d, d_h, m, lambdas)

model.train(X_train_fashion, y_train_fashion, 30, 0.05, 10)

# Train

n, _ = X_train_fashion.shape

y_hat_train, _ = model.forward_propagation(X_train_fashion, y_train_fashion)
     
missclassification = np.sum( np.argmax(y_train_fashion, axis=1) != np.argmax(y_hat_train, axis=1)) / n * 100
    
print("Final missclassification on training set: ", missclassification, "%")

# Validation

n, _ = X_val_fashion.shape

y_hat_val, _ = model.forward_propagation(X_val_fashion, y_val_fashion)
            
missclassification = np.sum( np.argmax(y_val_fashion, axis=1) != np.argmax(y_hat_val, axis=1)) / n * 100
    
print("Final missclassification on validation set: ", missclassification, "%")

Epoch 0 , Missclassification: 87.146 %
Epoch 1 , Missclassification: 29.396 %
Epoch 2 , Missclassification: 30.98 %
Epoch 3 , Missclassification: 24.468 %
Epoch 4 , Missclassification: 31.044 %
Epoch 5 , Missclassification: 21.07 %
Epoch 6 , Missclassification: 22.79 %
Epoch 7 , Missclassification: 25.15 %
Epoch 8 , Missclassification: 29.836000000000002 %


KeyboardInterrupt: 