
# Two Layer Neural Network
# Hyperparameter tuning
This notebook is meant to:
1. clean up the official cs231n notebook
2. isolate the hyperparameter-tuning part of the work.

In [None]:
# send wechat message at the end of training
# Logs in to WeChat through itchat once, up front, so that the tuning cells
# further down can call itchat.send() when a long run finishes.
import itchat

# NOTE(review): enableCmdQR=-2 presumably renders the login QR code as text in
# the cell output (negative value = inverted colours) — confirm against itchat docs.
itchat.auto_login(enableCmdQR=-2)
# NOTE(review): train_flag is not read anywhere in the visible notebook — confirm it is still needed.
train_flag = False

In [1]:
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
from __future__ import print_function
import copy
import time

from cs231n.classifiers.neural_net import TwoLayerNet
from cs231n.classifiers.neural_net_tunning import hyper_params_comb, net_tunning

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# some of the functions that might come in handy later
def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    Each entry is computed as |x - y| / max(1e-8, |x| + |y|); the 1e-8 floor
    keeps the division defined when both entries are zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)


# load CIFAR-10 data
# NOTE(review): load_CIFAR10 is imported here but not called in the visible cells.
from cs231n.data_utils import load_CIFAR10, get_CIFAR10_data

# data_set is indexed by split name below ('X_train', 'y_train', ...).
data_set = get_CIFAR10_data()


# unpack the data
# re-run this to reload X_train and y_train
# (the dev-mode cell further down may rebind them to a 500-sample subset)
X_train = data_set['X_train']
y_train = data_set['y_train']
X_val = data_set['X_val']
y_val = data_set['y_val']
X_test = data_set['X_test']
y_test = data_set['y_test']

# check dimensions
# Sanity check: each data array should have the same number of rows as its label array.
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Train data shape:  (49000, 3072)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3072)
Validation labels shape:  (1000,)
Test data shape:  (1000, 3072)
Test labels shape:  (1000,)


In [None]:
# build the development dataset
# Draws num_dev distinct row indices from the training set (replace=False).
# No random seed is set, so the subset differs on every run.
num_dev = 500
mask_dev = np.random.choice(X_train.shape[0], num_dev, replace=False)
X_dev = X_train[mask_dev]
y_dev = y_train[mask_dev]
print('Dev data shape: ', X_dev.shape)
print('Dev labels shape: ', y_dev.shape)

# use development data if dev_mode is on
# In dev mode X_train / y_train are REBOUND to the small subset, so every tuning
# cell below that passes X_train / y_train trains on num_dev samples only.
# This cell is not idempotent in dev mode: re-run the data-loading cell first
# to restore the full training set.
dev_mode = False  # development mode flag: True(on) or False(off)
if dev_mode:
    X_train = X_dev
    y_train = y_dev
    print('Notice: using dev dataset!')
else:
    # Full-data run: drop the unused dev names.
    del X_dev, y_dev
    print('Caution: using full training dataset!')

**Note that all lists/arrays of hyperparameters are arranged in the order:  
[hidden_size, learning_rate, num_epochs, reg]**

### 0. Computation speed comparison: gcloud vs macbook
The quotas on this gcloud VM instance are:
- 8 vCPU(s): Intel Xeon @ 2.2 GHz
- RAM: 30GB
- ROM: 10GB persistent disk

On Macbook:  
- 1 CPU: Intel(R) Core(TM) i5-5250U CPU @ 1.60GHz (boost 2.7GHz)
- 4GB 1600MHz DDR3

The following section measures the actual running time of training for 5 epochs, averaged over 5 runs.

In [None]:
# hidden_size_range = [204]
# learning_rate_range = [1e-3]
# num_epochs_range = [7]
# reg_range = [1.20e-06]

# hyper_params_range = [hidden_size_range, learning_rate_range, num_epochs_range, reg_range]
# hyper_params_list = hyper_params_comb(hyper_params_range)

# tic = time.time()
# best_net, results = net_tunning(X_train, y_train, 
#                                X_val, y_val, 
#                                hyper_params_list, verbose=False)
# toc = time.time()

# # time
# total_time = float(toc-tic)
# avg_time = total_time / len(hyper_params_list)
# print('Time elapsed: %f' % (toc-tic))
# print('Average per training: %f' % (avg_time))
# print()

Results (train + val time per 5 epochs):  
- on macbook: 94.24 seconds
- on gcloud VM: 31.35 seconds

### 1. **Hidden size tuning:**  
Coarse tuning:  
- tried 10 evenly spaced values in [100~300]
- best validation accuracy: 0.513000 @ H = 210
- best training accuracy: 0.533612

Fine tuning:  
- tried every integer in [200 - 220]
- best validation accuracy:   @ H = 204
- best training accuracy:


In [None]:
# # 1.1 hidden size: coarse search
# # set range of tunning for hyperparameters
# hidden_size_range = np.arange(201, 221, 1)
# learning_rate_range = [1e-3]
# num_epochs_range = [5]
# reg_range = [0.25]
# # drop = 1  # single on/off flag
# # drop_prob = 0.5


# hyper_params_range = [hidden_size_range, 
#                       learning_rate_range, 
#                       num_epochs_range, 
#                       reg_range]

# # net model tunning
# hyper_params_list = hyper_params_comb(hyper_params_range)
# print('Number of hyperparms to tune: %d' % len(hyper_params_list))
# tic = time.time()
# best_net, results = net_tunning(X_train, y_train, 
#                                X_val, y_val, 
#                                hyper_params_list, verbose=False)
# toc = time.time()

# # time
# print()
# print('Number of hyperparams to tune: %d' % len(hyper_params_list))
# print('Total time used: %f (seconds)' % (toc-tic))
# print('Total time per hyperparam: %f (seconds)' % (float(toc-tic)/float(len(hyper_params_list))))
# print()


# # visualise of results: hidden_size vs train/val accuracy
# hidden_size_axis = [hyper_params[0] for hyper_params in sorted(results)]
# train_acc_history = [results[hyper_params][0] for hyper_params in sorted(results)]
# val_acc_history = [results[hyper_params][1] for hyper_params in sorted(results)]

# plt.plot(hidden_size_axis, val_acc_history, label='val')
# plt.plot(hidden_size_axis, train_acc_history, label='train')
# plt.title('Coarse tunning of hidden size')
# plt.legend()
# plt.show()

### 2. Regularisation tuning:
Notice: fixed hidden size = 210  
Coarse tuning:  
- 1st tuning: 10^[-2~2] for 20 evenly spaced numbers
    - results: best validation accuracy 0.498 @ reg = 0.042813
    - best results at the lower edge
- 2nd tunning: 10^[-5~1] for 20 evenly spaced numbers    
    - results: best validation accuracy 0.521000 @ **reg = 0.003360**
    
Fine tuning:
- 10^[-6 ~ -5], random search over 20 numbers (exponent drawn from a uniform distribution)
    - results: best validation accuracy 0.512 @ **reg = 1.20e-06**

In [None]:
# # 2.1 regularisation: fine search
# # coarse search of reg with hidden_size = 204 (best from above)
# # set range of tunning for hyperparameters
# # reg_range = 10 ** np.linspace(-5, 1, 20)  # 1st coarse tunning
# # reg_range = 10 ** (np.random.uniform(-6, -5, 20)) # 2nd coarse tunning
# reg_range = 10 ** (np.random.uniform(-6, -5.8, 10))  # 1st fine tunning

# hidden_size_range = [204]
# learning_rate_range = [1e-3]
# num_epochs_range = [5]
# # drop = 1  # single on/off flag
# # drop_prob = 0.5


# hyper_params_range = [hidden_size_range, 
#                       learning_rate_range, 
#                       num_epochs_range, 
#                       reg_range]

# # net model tunning
# hyper_params_list = hyper_params_comb(hyper_params_range)
# tic = time.time()
# best_net, results = net_tunning(X_train, y_train, 
#                                X_val, y_val, 
#                                hyper_params_list, verbose=False)
# toc = time.time()

# # time
# print()
# print('Number of hyperparams to tune: %d' % len(hyper_params_list))
# print('Total time used: %f (seconds)' % (toc-tic))
# print('Total time per hyperparam: %f (seconds)' % (float(toc-tic)/float(len(hyper_params_list))))
# print()


# # visualise of results: hidden_size vs train/val accuracy
# reg_axis = [hyper_params[3] for hyper_params in sorted(results)]
# train_acc_history = [results[hyper_params][0] for hyper_params in sorted(results)]
# val_acc_history = [results[hyper_params][1] for hyper_params in sorted(results)]

# plt.subplot(2,1,1)
# plt.plot(reg_axis, val_acc_history, label='val')
# plt.plot(reg_axis, train_acc_history, label='train')
# plt.title('Coarse tunning of regularisation strength')
# plt.legend()
             
# plt.subplot(2,1,2)
# plt.plot(np.log10(reg_axis), val_acc_history, label='val')
# plt.plot(np.log10(reg_axis), train_acc_history, label='train')
# plt.title('Coarse tunning of regularisation strength, logirithm axis')
# plt.legend()
# plt.show()


In [2]:
# temp testing: 2017-08-23 22:15
# when learning rate at 10 ** -2 and higher, overflow happens.
# let's find out what the weight matrix W looks like for learning_rate = 10**-1
# NOTE(review): the value actually set below is 10 ** (-3), not 10 ** -1 —
# confirm which one this experiment is meant to use.
learning_rate_range = [10 ** (-3)]

# remaining hyperparameters are pinned to a single value each
hidden_size_range = [204]
num_epochs_range = [5]
reg_range = [1.20e-06]


# pipe ranges into a list
# (order: [hidden_size, learning_rate, num_epochs, reg])
hyper_params_range = [hidden_size_range, 
                      learning_rate_range, 
                      num_epochs_range, 
                      reg_range]

# presumably expands the ranges into every combination to try — verify in neural_net_tunning
hyper_params_list = hyper_params_comb(hyper_params_range)
# tic/toc bracket the run, but the elapsed time is not printed in this cell
tic = time.time()
best_net, results, best_val_acc = net_tunning(X_train, y_train, 
                               X_val, y_val, 
                               hyper_params_list, verbose=True)
toc = time.time()

# dump the weight matrices of the returned network for inspection
print('Paramters in W1 are:')
print(best_net.params['W1'])
print('Paramters in W2 are:')
print(best_net.params['W2'])
print()


iteration 10 / 1225: loss 2.300941
W2 norm: 0.000035
W1 norm: 0.006286
iteration 20 / 1225: loss 2.286067
W2 norm: 0.000173
W1 norm: 0.006422
iteration 30 / 1225: loss 2.195848
W2 norm: 0.000977
W1 norm: 0.007222
iteration 40 / 1225: loss 2.113286
W2 norm: 0.002412
W1 norm: 0.008644
iteration 50 / 1225: loss 2.057474
W2 norm: 0.004035
W1 norm: 0.010253
iteration 60 / 1225: loss 2.048025
W2 norm: 0.005432
W1 norm: 0.011657
iteration 70 / 1225: loss 2.034057
W2 norm: 0.007150
W1 norm: 0.013387
iteration 80 / 1225: loss 2.003117
W2 norm: 0.008834
W1 norm: 0.015099
iteration 90 / 1225: loss 1.958606
W2 norm: 0.010357
W1 norm: 0.016739
iteration 100 / 1225: loss 1.901248
W2 norm: 0.011991
W1 norm: 0.018480
iteration 110 / 1225: loss 1.887584
W2 norm: 0.013908
W1 norm: 0.020572
iteration 120 / 1225: loss 1.890167
W2 norm: 0.015330
W1 norm: 0.022170
iteration 130 / 1225: loss 1.837199
W2 norm: 0.017206
W1 norm: 0.024204
iteration 140 / 1225: loss 1.734436
W2 norm: 0.018640
W1 norm: 0.025857
i

In [None]:
# 3. learning rate tuning
# Sweeps the learning rate while the other three hyperparameters stay fixed,
# plots train/val accuracy against the learning rate (linear and log10 axis),
# and finally reports the outcome over WeChat.
# Requires the itchat login cell at the top of the notebook to have been run.

###### setting hyperparameters ######
# tuned parameter:
# learning_rate_range = [1e-3]
learning_rate_range = 10 ** np.linspace(-6, 1, 20)  # 1st coarse tuning: 20 log-spaced values, 1e-6 .. 1e1

# locked parameters:
hidden_size_range = [204]
num_epochs_range = [5]
reg_range = [1.20e-06]

# pipe ranges into a list, ordered [hidden_size, learning_rate, num_epochs, reg]
hyper_params_range = [hidden_size_range,
                      learning_rate_range,
                      num_epochs_range,
                      reg_range]

hyper_params_list = hyper_params_comb(hyper_params_range)
tic = time.time()
best_net, results, best_val_acc = net_tunning(X_train, y_train,
                                              X_val, y_val,
                                              hyper_params_list, verbose=True)
toc = time.time()


####### time consumed ######
print()
print('Number of hyperparams to tune: %d' % len(hyper_params_list))
print('Total time used: %f (seconds)' % (toc-tic))
print('Total time per hyperparam: %f (seconds)' % (float(toc-tic)/float(len(hyper_params_list))))
print()


###### Visualisation of results ######
# visualise: [learning rate] vs [train/val accuracy]
# Sort the result keys once so the x axis and both accuracy curves are
# guaranteed to share the same ordering.
sorted_params = sorted(results)
lr_axis = [hyper_params[1] for hyper_params in sorted_params]  # index 1 = learning rate
train_acc_history = [results[hyper_params][0] for hyper_params in sorted_params]
val_acc_history = [results[hyper_params][1] for hyper_params in sorted_params]

# top panel: linear learning-rate axis
plt.subplot(2,1,1)
plt.plot(lr_axis, val_acc_history, label='val')
plt.plot(lr_axis, train_acc_history, label='train')
plt.title('Tunning of learning rate')
plt.legend()

# bottom panel: same curves against log10(learning rate)
# (title previously said "regularisation strength", copied from the reg cell)
plt.subplot(2,1,2)
plt.plot(np.log10(lr_axis), val_acc_history, label='val')
plt.plot(np.log10(lr_axis), train_acc_history, label='train')
plt.title('Tunning of learning rate, logarithm axis')
plt.legend()
plt.show()

##### send a message to wechat #####
itchat.send('Learning rate tunning completed! \n '
            '\nTime used: \n%f (secs)\nBest validation accuracy: \n%f' % (toc-tic, best_val_acc),
            toUserName='filehelper')

In [None]:
# 4. Number of epochs tuning
# Sweeps the number of training epochs while the other three hyperparameters
# stay fixed, plots train/val accuracy against the epoch count, and finally
# reports the outcome over WeChat.
# Requires the itchat login cell at the top of the notebook to have been run.

###### setting hyperparameters ######
# tuned parameter:
# num_epochs_range = np.arange(5, 15, 1)  # 1st coarse tuning, result: best 0.54 @ 14 epochs
num_epochs_range = np.arange(14, 21, 1) # 2nd coarse tuning, result: best 0.54 @ 17 epochs

# locked parameters:
hidden_size_range = [204]
reg_range = [1.20e-06]
learning_rate_range = [1e-3]

# pipe ranges into a list, ordered [hidden_size, learning_rate, num_epochs, reg]
hyper_params_range = [hidden_size_range,
                      learning_rate_range,
                      num_epochs_range,
                      reg_range]

hyper_params_list = hyper_params_comb(hyper_params_range)
tic = time.time()
best_net, results, best_val_acc = net_tunning(X_train, y_train,
                                              X_val, y_val,
                                              hyper_params_list, verbose=False)
toc = time.time()


####### time consumed ######
print()
print('Number of hyperparams to tune: %d' % len(hyper_params_list))
print('Total time used: %f (seconds)' % (toc-tic))
print('Total time per hyperparam: %f (seconds)' % (float(toc-tic)/float(len(hyper_params_list))))
print()


###### Visualisation of results ######
# visualise: [number of epochs] vs [train/val accuracy]
# Sort the result keys once so the x axis and both accuracy curves are
# guaranteed to share the same ordering.
sorted_params = sorted(results)
num_ep_axis = [hyper_params[2] for hyper_params in sorted_params]  # index 2 = num_epochs
train_acc_history = [results[hyper_params][0] for hyper_params in sorted_params]
val_acc_history = [results[hyper_params][1] for hyper_params in sorted_params]

plt.plot(num_ep_axis, val_acc_history, label='val')
plt.plot(num_ep_axis, train_acc_history, label='train')
plt.title('Tunning of number of epochs')
plt.xlabel('number of epochs')
plt.ylabel('accuracy')
plt.legend()
# render the figure before the notification is sent, as the learning-rate cell does
plt.show()


##### send a message to wechat #####
itchat.send('Number of epochs tunning completed!'
            '\nTime used: \n%f (secs)\nBest validation accuracy: \n%f'
            '\nBest number of epochs: %d' 
            % (toc-tic, best_val_acc, best_net.hyper_params['num_epochs']),
            toUserName='filehelper')

In [None]:
# combined fine tuning

In [None]:
# dropout

In [None]:
# Adam