## Graident check

In [14]:
import numpy as np
from nn.layers import Conv, MaxPool, Linear, Relu
from nn.cnn import CNN
from check_gradient import *
from time import time

In [11]:
def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

check_conv = True
check_pool_back = True
check_conv_back = True
check_cnn_back = True
check_time = True

In [93]:
if check_conv:

    x_shape = (2, 3, 4, 4)
    w_shape = (3, 3, 4, 4)
    x = np.linspace(-0.1, 0.5, num=np.prod(x_shape)).reshape(x_shape)
    w = np.linspace(-0.2, 0.3, num=np.prod(w_shape)).reshape(w_shape)
    b = np.linspace(-0.1, 0.2, num=3)

    conv = Conv(3, 3, 4, 4, 2, 1)
    conv.params['w']['param'] = w
    conv.params['b']['param'] = b
    out = conv(x)
    # The correct output
    correct_out = np.array([[[[-0.08759809, -0.10987781],
                               [-0.18387192, -0.2109216 ]],
                              [[ 0.21027089,  0.21661097],
                               [ 0.22847626,  0.23004637]],
                              [[ 0.50813986,  0.54309974],
                               [ 0.64082444,  0.67101435]]],
                             [[[-0.98053589, -1.03143541],
                               [-1.19128892, -1.24695841]],
                              [[ 0.69108355,  0.66880383],
                               [ 0.59480972,  0.56776003]],
                              [[ 2.36270298,  2.36904306],
                               [ 2.38090835,  2.38247847]]]])

    print('Testing conv_forward')
    print('difference: ', rel_error(out, correct_out))

Testing conv_forward
difference:  2.2121476417505994e-08


In [94]:
if check_pool_back:
    np.random.seed(231)
    x = np.random.randn(3, 2, 8, 8)
    dout = np.random.randn(3, 2, 4, 4)

    pool = MaxPool(kernel_size=2, stride=2, padding=0)
    out = pool(x)

    dx = pool.backward(dout, x)

    dx_num = eval_numerical_gradient_array(pool, x, dout)

    # Your error should be around 1e-12
    print('Testing pooling backward:')
    print('dx error: ', rel_error(dx, dx_num))

Testing pooling backward:
dx error:  3.27562514223145e-12


In [98]:
# check convolutional backpropogation is correct:
if check_conv_back:
    x = np.random.randn(2, 3, 16, 16)
    w = np.random.randn(3, 3, 3, 3)
    b = np.random.randn(3, 1)
    dout = np.random.randn(2, 3, 14, 14)
    conv = Conv(in_channels=3, out_channels=3, height=3, width=3, stride=1, padding=0)
    conv.params['b']['param'] = b
    conv.params['w']['param'] = w # 此行为后添加的
    out = conv(x)
    dx = conv.backward(dout, x)

    dx_num = eval_numerical_gradient_array(conv, x, dout)

    params = conv.params


    def fw(v):
        tmp = params['w']['param']
        params['w']['param'] = v
        f_w = conv(x)
        params['w']['param'] = tmp
        return f_w


    dw = params['w']['grad']
    dw_num = eval_numerical_gradient_array(fw, w, dout)

    db = params['b']['grad']


    def fb(v):
        tmp = params['b']['param']
        params['b']['param'] = v
        f_b = conv(x)
        params['b']['param'] = tmp
        return f_b


    db_num = eval_numerical_gradient_array(fb, b, dout)

    print('Testing conv')
    print('dx error: ', rel_error(dx_num, dx))
    print('dw error: ', rel_error(dw_num, dw))
    print('db error: ', rel_error(db_num, db))

Testing conv
dx error:  1.5295788875965517e-08
dw error:  8.309913739514208e-10
db error:  2.4300919730707304e-11


In [18]:
# TODO: write script to check the backpropagation on the whole CNN is correct
if check_cnn_back:
    inputs=2
    input_dim=(3,16,16)
    hidden_units=10
    num_classes=10
    num_filters=3
    filter_size=3
    pool_size=2

    np.random.seed(231)
    X = np.random.randn(inputs, *input_dim)
    y = np.random.randint(num_classes, size=inputs)

    model = CNN(input_dim, num_filters, filter_size, pool_size, hidden_units, num_classes)
    start_time=time()
    loss, score = model.oracle(X, y)
    end_time=time()
    print('Time consuming: %fs' % (end_time-start_time))
    print('loss:', loss)
    a=['w1','w2','w3']
    b=['b1','b2','b3']

    for param_name in sorted(a):
        f = lambda _: model.oracle(X, y)[0]
        param_grad_num = eval_numerical_gradient(f, model.param_groups['w'][param_name]['param'], verbose=False, h=0.00001)
        e = rel_error(param_grad_num, model.param_groups['w'][param_name]['grad'])
        print('%s relative error: %e' % (param_name, rel_error(param_grad_num, model.param_groups['w'][param_name]['grad'])))

    for param_name in sorted(b):
        f = lambda _: model.oracle(X, y)[0]
        param_grad_num = eval_numerical_gradient(f, model.param_groups['b'][param_name]['param'], verbose=False, h=0.00001)
        e = rel_error(param_grad_num, model.param_groups['b'][param_name]['grad'])
        print('%s relative error: %e' % (param_name, rel_error(param_grad_num, model.param_groups['b'][param_name]['grad'])))

Time consuming: 0.055562s
loss: 2.3024449434898395
w1 relative error: 4.104933e-07
w2 relative error: 1.043347e-05
w3 relative error: 7.761086e-06
b1 relative error: 7.614735e-09
b2 relative error: 3.194184e-08
b3 relative error: 4.500063e-06


## Overfit small data

In [15]:
from __future__ import division, print_function, absolute_import
import numpy as np
from nn.optimizer import SGD
from nn.utils import accuracy
from dataset import get_cifar10_data
from nn.cnn import CNN

In [16]:
def train(model, X_train, y_train, X_val, y_val, batch_size, n_epochs, lr=1e-2,
          lr_decay=0.8, verbose=True, print_level=100):
    n_train = X_train.shape[0]
    iterations_per_epoch = max(n_train // batch_size, 1)
    n_iterations = n_epochs * iterations_per_epoch
    epoch = 0
    
    loss_hist = []

    # Define optimizer and set parameters
    opt_params = {'lr': 1e-3}
    sgd = SGD(model.param_groups, **opt_params)

    for t in range(n_iterations):
        batch_mask = np.random.choice(n_train, batch_size)
        X_batch = X_val[batch_mask]
        y_batch = y_val[batch_mask]        
        # Evaluate function value and gradient
        loss, score = model.oracle(X_batch, y_batch)
        loss_hist.append(loss)

        # Perform stochastic gradient descent

        sgd.step()

        # Maybe print training loss
        if verbose and t % print_level == 0:
            train_acc = accuracy(score, y_batch)
            print('(Iteration %d / %d , epoch %d) loss: %f, train_accu: %f' % (
                t + 1, n_iterations, epoch, loss_hist[-1], train_acc))

        # At the end of every epoch, adjust the learning rate.
        epoch_end = (t + 1) % iterations_per_epoch == 0
        if epoch_end:
            epoch += 1
            opt_params['lr'] *= lr_decay


if __name__ == '__main__':
    model = CNN()
    data = get_cifar10_data()
    num_train = 100
    data = {
        'X_train': data['X_train'][:num_train],
        'y_train': data['y_train'][:num_train],
        'X_val': data['X_val'],
        'y_val': data['y_val'],
    }
    X_train, y_train, X_val, y_val = data['X_train'], data['y_train'], data['X_val'], data['y_val']

    train(model, X_train, y_train, X_val, y_val, batch_size=50, n_epochs=50, print_level=1)

(Iteration 1 / 100 , epoch 0) loss: 2.426176, train_accu: 0.000000
(Iteration 2 / 100 , epoch 0) loss: 2.327081, train_accu: 0.080000
(Iteration 3 / 100 , epoch 1) loss: 2.301251, train_accu: 0.080000
(Iteration 4 / 100 , epoch 1) loss: 2.263435, train_accu: 0.140000
(Iteration 5 / 100 , epoch 2) loss: 2.236490, train_accu: 0.240000
(Iteration 6 / 100 , epoch 2) loss: 2.167822, train_accu: 0.240000
(Iteration 7 / 100 , epoch 3) loss: 2.169647, train_accu: 0.280000
(Iteration 8 / 100 , epoch 3) loss: 2.277372, train_accu: 0.100000
(Iteration 9 / 100 , epoch 4) loss: 2.205201, train_accu: 0.180000
(Iteration 10 / 100 , epoch 4) loss: 2.088595, train_accu: 0.380000
(Iteration 11 / 100 , epoch 5) loss: 2.092491, train_accu: 0.260000
(Iteration 12 / 100 , epoch 5) loss: 2.089308, train_accu: 0.300000
(Iteration 13 / 100 , epoch 6) loss: 2.055046, train_accu: 0.300000
(Iteration 14 / 100 , epoch 6) loss: 2.119532, train_accu: 0.220000
(Iteration 15 / 100 , epoch 7) loss: 2.065627, train_accu

Yes, it overfits on small data.

## Accerleration

In [26]:
import numba
from time import time
from numba import jit
from nn.loss import SoftmaxCE

In [35]:
class CNN(object):
    """Convolutional neural network with the following structures:
        (conv + relu + pooling) + (linear + relu + linear) + softmax
    """
    def __init__(self, image_size=(3, 32, 32), channels=32, conv_kernel=7,
                 pool_kernel=2, hidden_units=100, n_classes=10):
        """
        :param image_size: an 3 * H * W image, for color image,

        :param channels: channels in the convolution layer
        :param conv_kernel: kernel size of convolutional layer
        :param pool_kernel: kernel size of pooling layer
        :param hidden_units: number of hidden units in linear transform
        """

        # TODO: initialize the neural network. Define the layers
        C,H,W=image_size

        # Convolutional layer
        self.conv = Conv(C,channels,conv_kernel,conv_kernel)
        self.relu = Relu()
        self.pool = MaxPool(pool_kernel)

        # Fc layer
        pad_out = 1 + (H - conv_kernel)
        linear_in = (pad_out // 2 ) * (pad_out // 2 ) * channels
        self.linear1 = Linear(linear_in, hidden_units)
        self.linear2 = Linear(hidden_units, n_classes)
        Scaling = self.linear1.init_scale

        # TODO: Add the layers' parameters to the network, which will be assigned to optimizers
        self.param_groups={
            'w':{
                'w1':{'param':Scaling * np.random.randn(channels, C, conv_kernel, conv_kernel),'grad':{}},
                'w2':{'param':Scaling * np.random.randn(linear_in, hidden_units),'grad':{}},
                'w3':{'param':Scaling * np.random.randn(hidden_units, n_classes),'grad':{}}
            },
            'b':{
                'b1':{'param':np.zeros(channels),'grad':{}},
                'b2':{'param':np.zeros(hidden_units),'grad':{}},
                'b3':{'param':np.zeros(n_classes),'grad':{}}
            }
        }

In [44]:
if check_time:
    inputs=2
    input_dim=(3,16,16)
    hidden_units=10
    num_classes=10
    num_filters=3
    filter_size=3
    pool_size=2

    np.random.seed(231)
    X = np.random.randn(inputs, *input_dim)
    y = np.random.randint(num_classes, size=inputs)

    model = CNN(input_dim, num_filters, filter_size, pool_size, hidden_units, num_classes)
    
    model.conv.params['w']['param']=model.param_groups['w']['w1']['param']
    model.conv.params['b']['param']=model.param_groups['b']['b1']['param']
    model.linear1.params['w']['param']=model.param_groups['w']['w2']['param']
    model.linear1.params['b']['param']=model.param_groups['b']['b2']['param']
    model.linear2.params['w']['param']=model.param_groups['w']['w3']['param']
    model.linear2.params['b']['param']=model.param_groups['b']['b3']['param']

    start=time()
    conv1=model.conv.forward(x)
    end=time()
    print('conv1 forward time: %fs' % (end-start))

    start=time()
    conv1_relu=model.relu.forward(conv1)
    end=time()
    print('conv1_relu forward time: %fs' % (end-start))

    start=time()
    conv1_relu_pool=model.pool.forward(conv1_relu)
    end=time()
    print('conv1_relu_pool forward time: %fs' % (end-start))

    # FC-layer2
    start=time()
    linear1=model.linear1.forward(conv1_relu_pool)
    end=time()
    print('linear1 forward time: %fs' %(end-start))

    start=time()
    linear1_relu=model.relu.forward(linear1)
    end=time()
    print('linear1_relu forward time: %fs' % (end-start))

    start=time()
    linear2=model.linear2.forward(linear1_relu)
    end=time()
    print('linear2 forward time: %fs' % (end-start)) 
    
    s = linear2
    fx,dloss=SoftmaxCE.__call__(s,y)
        
    start=time()
    dlinear2= model.linear2.backward(dloss,linear1_relu)
    end=time()
    print('linear2 backward time: %fs' % (end-start))

    start=time()
    dlinear2_relu=model.relu.backward(dlinear2,linear1)
    end=time()
    print('linear2_relu backward time: %fs' % (end-start))
        
    start=time()
    dlinear1=model.linear1.backward(dlinear2_relu,conv1_relu_pool)
    end=time()
    print('linear1 backward time: %fs' % (end-start))

    # FC-layer2
    start=time()
    dpool = model.pool.backward(dlinear1,conv1_relu)
    end=time()
    print('pooling backward time: %fs' % (end-start))

    start=time()
    dr = model.relu.backward(dpool,conv1)
    end=time()
    print('conv_relu backward time: %fs' % (end-start))

    start=time()
    dconv= model.conv.backward(dr,x)
    end=time()
    print('conv backward time: %fs' % (end-start)) 

conv1 forward time: 0.008931s
conv1_relu forward time: 0.000022s
conv1_relu_pool forward time: 0.002731s
linear1 forward time: 0.000046s
linear1_relu forward time: 0.000102s
linear2 forward time: 0.000019s
linear2 backward time: 0.000036s
linear2_relu backward time: 0.000013s
linear1 backward time: 0.000470s
pooling backward time: 0.011582s
conv_relu backward time: 0.000044s
conv backward time: 0.022517s


**挑选了时间量级较大的conv_forward,conv_backward,pool_forward,pool_backward适用jit**

经过验证发现jit在简单运算上反而变慢了，但是如果带入train进行迭代，速度就变快了很多，大概能节省将近一半的时间.