In [1]:
import numpy as np
import matplotlib.pyplot as plt

import layers
from layers import affine_relu_forward, affine_backward
from data_utils import get_CIFAR10_data
from fc_net import FullyConnectedNet
from solver import Solver
%matplotlib inline

In [2]:
np.set_printoptions(formatter={'all': lambda x:"%.2f" %x})

In [3]:
cifar10_dir = "/Users/mahrokh/Desktop/cifar-10-batches-py"
data = get_CIFAR10_data(cifar10_dir)
for k, v in data.items():
    print('%-7s:' %k, v.shape)

X_train: (49000, 3, 32, 32)
y_train: (49000,)
X_val  : (1000, 3, 32, 32)
y_val  : (1000,)
X_test : (1000, 3, 32, 32)
y_test : (1000,)


In [4]:
x_train, y_train = data['X_train'], data['y_train']
x_val,   y_val   = data['X_val']  , data['y_val']
x_test,  y_test  = data['X_test'] , data['y_test']

# implementing two layer NN

In [5]:
from layers import *

def two_layer_network_loss(x, y, params):
    w1, b1 = params['w1'], params['b1']
    w2, b2 = params['w2'], params['b2']
    
    ################
    # forward step #
    ################
    h, cache_h = affine_relu_forward(x, w1, b1)
    scores, cache_o = affine_forward(h, w2, b2)
    
    #######################
    # compute data a loss #
    #######################
    loss, dscore = softmax_loss(scores, y)
    
    ################
    # backward step #
    ################
    dh, dw2, db2 = affine_backward(dscore, cache_o)
    _, dw1, db1 = affine_relu_backward(dh, cache_h)
    
    grads = {'w1': dw1, 'b1': db1, 'w2': dw2, 'b2': db2}
    return loss, grads

In [6]:
def predict(x, params):
    w1, b1 = params['w1'], params['b1']
    w2, b2 = params['w2'], params['b2']
    
    ################
    # forward step #
    ################
    h, _ = affine_relu_forward(x, w1, b1)
    scores, _ = affine_forward(h, w2, b2)
    
    return np.argmax(scores, axis=1)

In [7]:
def accuracy(y_pred, y_true):
    return 100.0* np.mean(y_pred == y_true)

# Multi Layers NN

In [8]:
def multi_layers_network_loss(x, y, params):
    num_layers = len(params) // 2
    
    # forward step
    loss = 0.0
    cache, h = {}, x
    
    for i in range (1, num_layers+1):
        w, b = params['w%d'%i], params['b%d'%i]
        
        if i < num_layers: #hidden layers
            h, cache[i] = affine_relu_forward(h, w, b)
        else:
            scores, cache[i] = affine_forward(h, w, b)
            
        loss += 0.5* reg* np.sum(w*w)
            
    # compute data loss
    data_loss, dscores = softmax_loss(scores, y)
    loss += data_loss
    
    # backward step
    grads ={}
    for i in reversed(range(1, num_layers+1)):
        
        if i == num_layers: #output layer
            dout, dw, db = affine_backward(dscores, cache[i])
                         
        else:
            dout, dw, db = affine_relu_backward(dout, cache[i])
            
        w = params['w%d' % i]
        grads['w%d' %i] = dw + reg * w
        grads['b%d' %i] = db
        
    return loss, grads      

### predict

In [9]:
def predict(x, params):
    h = x
    for i in range (1, num_layers+1):
        w, b = params['w%d'%i], params['b%d'%i]
        
        if i < num_layers: #hidden layers
            h, _ = affine_relu_forward(h, w, b)
        else:
            scores, _ = affine_forward(h, w, b)
        
    return np.argmax(scores, axis=1)

# Minimize Loss Function

In [10]:
num_input = 3072
num_output = 10
hidden_dims = [200, 200, 200, 200]

#### gradient descent

In [11]:
def affine_relu_forward(x, w, b):
    out, cache_a = affine_forward(x, w, b)
    out, cache_r = relu_forward(out)
    return out, (cache_a, cache_r)

def affine_relu_backward(dout, cache):
    cache_a, cache_r = cache
    dout = relu_backward(dout, cache_r)
    dx, dw, db = affine_backward(dout, cache_a)
    return dx, dw, db

In [12]:
# initialize parameters
params = {}
dims = [num_input] + hidden_dims + [num_output]
num_layers = len(dims) - 1

for i in range(1, num_layers+1):
    params['w%d'%i] = 0.3 * np.random.randn(dims[i-1], dims[i])
    params['b%d'%i] = np.zeros(dims[i])
    
#N = x_train.shape[0] 
N = 3072

# hyper parameters
n_iterations = 2000
batch_size = 256

lr = 0.05
reg = 1e-5

loss_history = []
best_params = params.copy()
best_val = 0

# gradient descent
for i in range(n_iterations+1):
    # create a batch of training data
    idx = np.random.choice(N, batch_size, replace=False)
    x_batch, y_batch = x_train[idx], y_train[idx]
    
    loss, grads = multi_layers_network_loss(x_batch, y_batch, params)
    loss_history.append(loss)
    
    # report every 100 iterations
    if i %100 == 0:
        
        y_train_pred = predict(x_batch, params)
        train_acc = accuracy(y_train_pred, y_batch)
        
        y_val_pred = predict(x_val, params)
        val_acc = accuracy(y_val_pred, y_val)
        
        if val_acc > best_val:
            best_val = val_acc
            best_params = params.copy()
            
        print('%4d: loss = %.2f |train accuracy = %5.2f |validation accuracy = %5.2f'
              %(i, loss, train_acc, val_acc))
        
    # update parameters
    for p in params:
        params[p] -= lr * grads[p]

   0: loss = 168636.97 |train accuracy = 10.55 |validation accuracy = 14.20
 100: loss = 4626929983462448963765185977560501775299195035153113940419307110400.00 |train accuracy = 12.50 |validation accuracy =  7.90
 200: loss = 4626467313482320655410563829977636489998049319713539023798181822464.00 |train accuracy = 10.94 |validation accuracy =  7.90
 300: loss = 4626004689766888799043515409793027919569443934240407553818247561216.00 |train accuracy = 12.50 |validation accuracy =  7.90
 400: loss = 4625542112311525228199072200117542091179919570969714695777011367936.00 |train accuracy =  7.81 |validation accuracy =  7.90
 500: loss = 4625079581111607762722972191440397994289091728032704125671676313600.00 |train accuracy = 11.72 |validation accuracy =  7.90
 600: loss = 4624617096162508984439085180294755776350135448402277072637211443200.00 |train accuracy =  9.38 |validation accuracy =  7.90
 700: loss = 4624154657459602971748957590058363825389494476525146893483509809152.00 |train accuracy = 