In [14]:
# As usual, a bit of setup.
# Load IPython's autoreload extension in mode 2 so that external modules
# are auto-reloaded; see
# http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

import time
import numpy as np
import matplotlib.pyplot as plt
from nnet.data.data_utils import get_CIFAR10_data
from nnet.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from nnet.solver import Solver
from nnet.data.data_utils import load_CIFAR10
from nnet.res_net import ResNet

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

def rel_error(x, y, eps=1e-2):
  """Return the largest element-wise relative error between x and y.

  Each element's error is |x - y| / max(eps, |x| + |y|); the maximum over
  all elements is returned.

  Args:
    x, y: arrays (or array-likes such as lists) with matching or
      broadcastable shapes.
    eps: lower bound applied to the denominator so that entries where both
      x and y are (near) zero do not divide by zero. The default of 1e-2
      reproduces the previous hard-coded value. Note that wherever
      |x| + |y| < eps the result is the absolute error divided by eps, so a
      large eps makes small-magnitude mismatches look smaller; pass a
      smaller eps (e.g. 1e-8) for a stricter check.

  Returns:
    The maximum relative error as a scalar.
  """
  # asarray is a no-op for ndarrays and lets plain sequences be compared too.
  x = np.asarray(x)
  y = np.asarray(y)
  return np.max(np.abs(x - y) / np.maximum(eps, np.abs(x) + np.abs(y)))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the (preprocessed) CIFAR10 data.

data = get_CIFAR10_data()
for k, v in data.iteritems():
  print '%s: ' % k, v.shape

X_val:  (1000, 3, 32, 32)
X_train:  (49000, 3, 32, 32)
X_test:  (1000, 3, 32, 32)
y_val:  (1000,)
y_train:  (49000,)
y_test:  (1000,)


In [9]:
# Check the training-time forward pass by checking means and variances
# of features both before and after batch normalization

# Simulate the forward pass for a two-layer network
N, D1, D2, D3 = 50, 3, 10, 10
X = np.random.randn(N, D1, D2, D3)
#W1 = np.random.randn(D1, D2)
#W2 = np.random.randn(D2, D3)
#a = np.maximum(0, X.dot(W1)).dot(W2)

print 'Before batch normalization:'
print '  means: ', X.mean(axis=(0, 2, 3))
print '  stds: ', X.std(axis=(0, 2, 3))

# Means should be close to zero and stds close to one
print 'After batch normalization (gamma=1, beta=0)'
a_norm, _ = spatial_batchnorm_forward(X, np.ones(D1), np.zeros(D1), {'mode': 'train'})
print '  mean: ', a_norm.mean(axis=(0, 2, 3))
print '  std: ', a_norm.std(axis=(0, 2, 3))

# Now means should be close to beta and stds close to gamma
gamma = np.asarray([1.0, 2.0, 3.0])
beta = np.asarray([11.0, 12.0, 13.0])
a_norm, _ = spatial_batchnorm_forward(X, gamma, beta, {'mode': 'train'})
print 'After batch normalization (nontrivial gamma, beta)'
print '  means: ', a_norm.mean(axis=(0, 2, 3))
print '  stds: ', a_norm.std(axis=(0, 2, 3))

Before batch normalization:
  means:  [ 0.00894057  0.02568426 -0.00970689]
  stds:  [ 0.99063808  1.00463239  0.98970638]
After batch normalization (gamma=1, beta=0)
  mean:  [ -2.33146835e-17  -8.10129741e-17  -2.00228722e-17]
  std:  [ 1.  1.  1.]
After batch normalization (nontrivial gamma, beta)
  means:  [ 11.  12.  13.]
  stds:  [ 1.  2.  3.]


In [2]:
# Check the test-time forward pass by running the training-time
# forward pass many times to warm up the running averages, and then
# checking the means and variances of activations after a test-time
# forward pass.

N, D1, D2, D3 = 20, 3, 1, 1
#W1 = np.random.randn(D1, D2)
#W2 = np.random.randn(D2, D3)

bn_param = {'mode': 'train'}
gamma = np.ones(D1)
beta = np.zeros(D1)
for t in xrange(100):
  X = np.random.randn(N, D1, D2, D3)
  
  spatial_batchnorm_forward(X, gamma, beta, bn_param)
bn_param['mode'] = 'test'
X = np.random.randn(N, D1, D2, D3)
a_norm, _ = spatial_batchnorm_forward(X, gamma, beta, bn_param)

# Means should be close to zero and stds close to one, but will be
# noisier than training-time forward passes.
print 'After batch normalization (test-time):'
print '  means: ', a_norm.mean(axis=(0, 2, 3))
print '  stds: ', a_norm.std(axis=(0, 2, 3))

After batch normalization (test-time):
  means:  [ 0.21431437  0.26381058 -0.11471238]
  stds:  [ 1.11203999  1.04931796  0.97440943]


In [3]:
# Gradient check for the spatial batchnorm backward pass: compare the
# analytic gradients returned by spatial_batchnorm_backward with numerical
# gradients of the forward pass with respect to x, gamma and beta.
# NOTE(review): spatial_batchnorm_forward / spatial_batchnorm_backward are not
# imported in the visible setup cell -- confirm where they come from.

N, D, H, W = 20, 3, 10, 10
x = 5 * np.random.randn(N, D, H, W) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dout = np.random.randn(N, D, H, W)

bn_param = {'mode': 'train'}
# Each closure evaluates the forward pass at the array it is handed, so the
# check does not depend on the numerical-gradient helper perturbing the
# outer gamma / beta arrays in place.
fx = lambda x: spatial_batchnorm_forward(x, gamma, beta, bn_param)[0]
fg = lambda g: spatial_batchnorm_forward(x, g, beta, bn_param)[0]
fb = lambda b: spatial_batchnorm_forward(x, gamma, b, bn_param)[0]

dx_num = eval_numerical_gradient_array(fx, x, dout)
da_num = eval_numerical_gradient_array(fg, gamma, dout)
db_num = eval_numerical_gradient_array(fb, beta, dout)

# Analytic gradients from a single forward/backward pass
_, cache = spatial_batchnorm_forward(x, gamma, beta, bn_param)
dx, dgamma, dbeta = spatial_batchnorm_backward(dout, cache)

print 'dx error: ', rel_error(dx_num, dx)
print 'dgamma error: ', rel_error(da_num, dgamma)
print 'dbeta error: ', rel_error(db_num, dbeta)

dx error:  1.15957511654e-05
dgamma error:  2.91997839797e-12
dbeta error:  3.60408390141e-12


## Fully Connected Nets with Batch Normalization
Now that you have a working implementation for batch normalization, go back to your `FullyConnectedNet` in the file `cs231n/classifiers/fc_net.py`. Modify your implementation to add batch normalization.

Concretely, when the flag `use_batchnorm` is `True` in the constructor, you should insert a batch normalization layer before each ReLU nonlinearity. The outputs from the last layer of the network should not be normalized. Once you are done, run the following to gradient-check your implementation.

HINT: You might find it useful to define an additional helper layer similar to those in the file `cs231n/layer_utils.py`. If you decide to do so, do it in the file `cs231n/classifiers/fc_net.py`.

In [21]:
# Numerical gradient check of the ResNet loss on a single random input of
# shape (N, 3, 32, 32) with a random label in [0, C).
# NOTE(review): the recorded output of this cell shows relative errors of
# about 1e+00 for almost every parameter, i.e. the check is not passing as
# recorded.
# NOTE(review): D, H1 and H2 are assigned here but never read in this cell.
N, D, H1, H2, C = 1, 3, 20, 30, 10
# (Earlier 2-D input variant, kept for reference.)
#X = np.random.randn(N, D)
#y = np.random.randint(C, size=(N,))
X = np.random.randn(N, 3, 32, 32)
y = np.random.randint(C, size=(N,))

# NOTE(review): `reg` is only printed and compared below; it is never passed
# to ResNet(...) or model.loss(...), so both iterations build the model with
# the same arguments. Presumably it was meant to be forwarded to the model --
# confirm against ResNet's constructor signature.
for reg in [0, 3.14]:
  print 'Running check with reg = ', reg

  model = ResNet(n_size=1, num_starting_filters=2)
  loss, grads = model.loss(X, y)
  print 'Initial loss: ', loss

  for name in sorted(grads):
    # f ignores its argument and re-evaluates the loss from model.params;
    # this assumes eval_numerical_gradient perturbs model.params[name] in
    # place -- TODO confirm.
    f = lambda _: model.loss(X, y)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-6)
    print '%s relative error: %.2e' % (name, rel_error(grad_num, grads[name]))
  # Blank line between the output of the two runs
  if reg == 0: print

Running check with reg =  0
[2, 2, 2, 4, 4, 8, 8]
Initial loss:  2.3309981823
W1 relative error: 1.00e+00
W2 relative error: 1.00e+00
W3 relative error: 1.00e+00
W4 relative error: 1.00e+00
W5 relative error: 1.00e+00
W6 relative error: 1.00e+00
W7 relative error: 1.00e+00
W9 relative error: 1.00e+00
b1 relative error: 1.00e+00
b2 relative error: 1.00e+00
b3 relative error: 1.00e+00
b4 relative error: 1.00e+00
b5 relative error: 1.00e+00
b6 relative error: 1.00e+00
b7 relative error: 1.00e+00
b9 relative error: 1.00e+00
beta1 relative error: 5.16e-01
beta2 relative error: 9.74e-01
beta3 relative error: 6.72e-01
beta4 relative error: 1.00e+00
beta5 relative error: 6.49e-01
beta6 relative error: 1.00e+00
beta7 relative error: 1.00e+00
gamma1 relative error: 1.00e+00
gamma2 relative error: 9.78e-01
gamma3 relative error: 1.00e+00
gamma4 relative error: 9.80e-01
gamma5 relative error: 1.00e+00
gamma6 relative error: 1.00e+00
gamma7 relative error: 1.00e+00

Running check with reg =  3.14
[

In [29]:
# Train a ResNet on the first num_train training examples, using the full
# validation arrays for validation.
num_train = 100
train_subset = slice(None, num_train)
small_data = {
  'X_train': data['X_train'][train_subset],
  'y_train': data['y_train'][train_subset],
  'X_val': data['X_val'],
  'y_val': data['y_val'],
}

model = ResNet(n_size=1)

# Settings handed to the 'sgd_th' update rule
sgd_settings = {
  'learning_rate': 0.1,
  'momentum': 0.9,
  'nesterov': True,
}
solver = Solver(
  model,
  small_data,
  update_rule='sgd_th',
  optim_config=sgd_settings,
  num_epochs=20,
  batch_size=20,
  print_every=1,
  verbose=True,
)
solver.train()

[16, 16, 16, 32, 32, 64, 64]
Training for 20 epochs, or 100 iterations.
2016-03-05 22:07:19.119856: Step 1, loss: 2.898 train acc: 0.160; val_acc: 0.104 (0.32 sec/batch)
2016-03-05 22:07:26.312264: Step 2, loss: 2.388 train acc: 0.140; val_acc: 0.116 (0.32 sec/batch)
2016-03-05 22:07:33.543391: Step 3, loss: 2.382 train acc: 0.150; val_acc: 0.112 (0.32 sec/batch)
2016-03-05 22:07:40.774980: Step 4, loss: 2.943 train acc: 0.180; val_acc: 0.127 (0.32 sec/batch)
2016-03-05 22:07:47.943591: Step 5, loss: 3.051 train acc: 0.210; val_acc: 0.143 (0.31 sec/batch)
*Epoch 0 / 20 Ended: best_val_acc: 0.143000
2016-03-05 22:07:55.160426: Step 6, loss: 2.723 train acc: 0.240; val_acc: 0.146 (0.32 sec/batch)
2016-03-05 22:08:02.449930: Step 7, loss: 3.289 train acc: 0.130; val_acc: 0.108 (0.31 sec/batch)
2016-03-05 22:08:09.794936: Step 8, loss: 2.294 train acc: 0.180; val_acc: 0.130 (0.31 sec/batch)
2016-03-05 22:08:17.048611: Step 9, loss: 3.307 train acc: 0.240; val_acc: 0.167 (0.31 sec/batch)
20