## Convolutional Neural Network (coded in Python/NumPy)

### overview

| layer   | description    | depth      | dimensions                          | hyperparameters (other than length)  | 
|:-------:|:--------------:|:----------:|:-----------------------------------:|:------------------------------------:|
| Layer 1 | Input          | 3          | 32 x 32                             |                                      |
| Layer 2 | Convolution    | \*10       | weights(3 x 5 x 5), output(32 x 32) | fieldsize=\*5, stride=\*1, pad=\*2   |
| Layer 3 | Pooling        | 10         | output(16 x 16)                     | fieldsize=\*2, stride=\*2, pad=\*0   |
| Layer 4 | Convolution    | \*20       | weights(10 x 5 x 5), output(16 x 16)| (same as layer 2)                    |
| Layer 5 | Pooling        | 20         | output(8 x 8)                       | (same as layer 3)                    |
| Layer 6 | Fully-Connected|            | input (1280)  output (\*100)        |                                      | 
| Layer 7 | Output         |            | input (100)  output(10)             |                                      |
|         |                |\* set by user                                    |                                      |

### instantiation

In [1]:
import numpy as np

class CNN:
    '''
    Holds the hyperparameters, weight filters and caches used by the two
    convolutional layers (layers 2 and 4).

    Convolution hyperparameters (shared by both layers):
       self.ksize         :  side length of the square weight filter / receptive field
       self.stride        :  number of pixels the filter moves at one time
       self.pad           :  width of the zero border placed around the input volume

    Weight filters, shaped (depth(L), depth(L-1), ksize, ksize):
       self.W2            :  layer 2 filters, (10, 3, ksize, ksize)
       self.W4            :  layer 4 filters, (20, 10, ksize, ksize)
       self.n_inpts       :  3 + 10 + 20 + 1280 + 100, the summed input sizes
       self.calfactor     :  sqrt(2 / n_inpts), scale applied to the random initial weights

    Caches (all start out as empty lists):
       self.Xcol2         :  'im2col' aligned data, associated with convolutional layer 2
       self.Xcol4         :  'im2col' aligned data, associated with convolutional layer 4
       self.images        :  cached input to convolutional layer 2
       self.Z3            :  cached input to convolutional layer 4
    '''
    def __init__(self):
        self.ksize, self.stride, self.pad = 5, 1, 2
        self.n_inpts = sum((3, 10, 20, 1280, 100))
        self.calfactor = np.sqrt(2.0 / self.n_inpts)

        # W2 is drawn before W4 so the random stream is consumed in a fixed order
        field = (self.ksize, self.ksize)
        self.W2 = np.random.randn(10, 3, *field) * self.calfactor
        self.W4 = np.random.randn(20, 10, *field) * self.calfactor

        # every cache gets its own empty list
        self.Xcol2, self.Xcol4 = [], []
        self.images, self.Z3 = [], []
        
# instantiate the network container; `m` is the module-level instance whose
# filters, hyperparameters and caches are read by the code further down
m = CNN()

### initialization of other variables

In [2]:
# batch bookkeeping
N = 1000                                    # number of training examples in batch
imap = list(range(N))                       # index of training examples

# per-layer output shapes, given as (depth, height, width)
imagesshape = (3, 32, 32)                   # network input
Z2shape = (10, 32, 32)                      # layer 2 (convolution) output
Z3shape = (10, 16, 16)                      # layer 3 (pooling) output
Z4shape = (20, 16, 16)                      # layer 4 (convolution) output
Z5shape = (20, 8, 8)                        # layer 5 (pooling) output

# the two dense layers are described by a plain length (an int, not a tuple)
hiddenshape = 100                           # layer 6 output (length of hidden layer)
outputshape = 10                            # layer 7 output (number of output classes)

# dense weights; W6 is drawn before W7 to keep the random stream order fixed
W6 = m.calfactor * np.random.randn(1280, 100)   # layer 6 weights (input size, hidden size)
W7 = m.calfactor * np.random.randn(100, 10)     # layer 7 weights (hidden size, output size)

# bias parameters, all starting at zero
b2 = np.zeros((Z2shape[0], 1))              # layer 2, one bias per filter
b4 = np.zeros((Z4shape[0], 1))              # layer 4, one bias per filter
b6 = np.zeros((1, hiddenshape))             # layer 6, one bias per hidden unit
b7 = np.zeros((1, outputshape))             # layer 7, one bias per output class

### im2col 

In [3]:
def get_im2col_indices(x_shape, fld_hgt, fld_wdth, padding, stride):
    '''
    Build the fancy-index arrays that gather every receptive field of a padded
    (N, C, H, W) volume into columns.

    Input:   x_shape   -  (N, C, H, W) shape of the unpadded input
             fld_hgt   -  receptive field height
             fld_wdth  -  receptive field width
             padding   -  zero padding applied to each spatial border
             stride    -  step between successive field positions
    Returns: (k, i, j) integer arrays
             k  -  channel index, shape (C*fld_hgt*fld_wdth, 1)
             i  -  row index,     shape (C*fld_hgt*fld_wdth, out_height*out_width)
             j  -  column index,  shape (C*fld_hgt*fld_wdth, out_height*out_width)
    Raises:  AssertionError if the field does not tile the padded input
             exactly along either spatial axis.
    '''
    _, C, H, W = x_shape
    # each axis is validated against its own field dimension
    assert (H + 2 * padding - fld_hgt) % stride == 0
    assert (W + 2 * padding - fld_wdth) % stride == 0
    out_height = int((H + 2 * padding - fld_hgt) / stride + 1)
    out_width = int((W + 2 * padding - fld_wdth) / stride + 1)

    # offsets inside one field, repeated for every channel
    i0 = np.repeat(np.arange(fld_hgt), fld_wdth)
    i0 = np.tile(i0, C)
    j0 = np.tile(np.arange(fld_wdth), fld_hgt * C)

    # top-left corner of every field position, scanned row by row
    i1 = stride * np.repeat(np.arange(out_height), out_width)
    j1 = stride * np.tile(np.arange(out_width), out_height)

    # absolute coordinates = offset inside the field + corner of the field
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
    j = j0.reshape(-1, 1) + j1.reshape(1, -1)
    k = np.repeat(np.arange(C), fld_hgt * fld_wdth).reshape(-1, 1)
    return (k.astype(int), i.astype(int), j.astype(int))

def im2col_indices(x, fld_hgt, fld_wdth, padding, stride):
    '''
    Rearrange the receptive fields of x into the columns of a 2D matrix.

    Input:   x         -  input volume, shape (N, C, H, W)
             fld_hgt   -  receptive field height
             fld_wdth  -  receptive field width
             padding   -  zero padding added to both spatial axes
             stride    -  step between successive field positions
    Returns: matrix with fld_hgt*fld_wdth*C rows; each column is one field
             of one example.
    '''
    border = (padding, padding)
    # only the two spatial axes are padded
    padded = np.pad(x, ((0, 0), (0, 0), border, border), mode='constant')
    chan_ix, row_ix, col_ix = get_im2col_indices(x.shape, fld_hgt, fld_wdth,
                                                 padding, stride)
    fields = padded[:, chan_ix, row_ix, col_ix]
    n_rows = fld_hgt * fld_wdth * x.shape[1]
    # move the example axis last, so it varies fastest along each row
    return fields.transpose(1, 2, 0).reshape(n_rows, -1)

def col2im_indices(cols, x_shape, fld_hgt, fld_wdth, padding, stride):
    '''
    Scatter a column matrix back onto an (N, C, H, W) volume, summing the
    contributions of overlapping receptive fields.

    Input:   cols      -  column matrix with C*fld_hgt*fld_wdth rows
             x_shape   -  (N, C, H, W) shape of the volume to rebuild
             fld_hgt   -  receptive field height
             fld_wdth  -  receptive field width
             padding   -  zero padding that was applied to each spatial border
             stride    -  step between successive field positions
    Returns: array of shape x_shape, with the padded border stripped.
    '''
    N, C, H, W = x_shape
    padded = np.zeros((N, C, H + 2 * padding, W + 2 * padding), dtype=cols.dtype)
    chan_ix, row_ix, col_ix = get_im2col_indices(x_shape, fld_hgt, fld_wdth,
                                                 padding, stride)
    # put the example axis first, to line up with the leading axis of `padded`
    per_example = cols.reshape(C * fld_hgt * fld_wdth, -1, N).transpose(2, 0, 1)
    # add.at accumulates where the same pixel is indexed by several fields
    np.add.at(padded, (slice(None), chan_ix, row_ix, col_ix), per_example)
    if padding != 0:
        return padded[:, :, padding:-padding, padding:-padding]
    return padded

### convolution functions

In [4]:
def convolve_forward(m, inpt, W, b, layer, padding=m.pad, stride=m.stride):
    '''
    Forward pass of a convolutional layer, done as one matrix product between
    the flattened filters and the 'im2col' aligned receptive fields.

    Input:   m        -  network object; receives the im2col matrix as a cache
             inpt     -  input volume (examples, depth, height, width)
             W        -  filters (depth(L), depth(L-1), field height, field width)
             b        -  biases, added to the flattened product
             layer    -  2 or 4: which cache (m.Xcol2 / m.Xcol4) to fill;
                         any other value caches nothing
             padding  -  zero padding; the default is read from the module-level
                         `m` once, when this function is defined
             stride   -  filter step; default read the same way as padding
    Returns: output volume (examples, depth(L), out height, out width)
             (the trailing comments show example shapes)
    '''
    n_filters, _, fld_hgt, fld_wdth = W.shape                                   # 10, 3, 5, 5
    n_examples, _, in_hgt, in_wdth = inpt.shape                             # 1000, 3, 32, 32
    out_hgt = int( (in_hgt - fld_hgt + 2 * padding) / stride + 1 )                       # 32
    out_wdth = int( (in_wdth - fld_wdth + 2 * padding) / stride + 1 )                    # 32

    Xcol = im2col_indices(inpt, fld_hgt, fld_wdth, padding, stride)       # 3*5*5, 32*32*1000
    flat_filters = W.reshape(n_filters, -1)                                       # 10, 3*5*5
    Z = (flat_filters @ Xcol + b).reshape(n_filters, out_hgt, out_wdth,
                                          n_examples)                      # 10, 32, 32, 1000

    # keep the aligned input for the matching backward pass
    if layer == 2:
        m.Xcol2 = Xcol
    if layer == 4:
        m.Xcol4 = Xcol
    return Z.transpose(3, 0, 1, 2)                                         # 1000, 10, 32, 32

def conv_backward(m, inpt, layer, padding=m.pad, stride=m.stride):
    '''
    Backward pass of a convolutional layer, using the values cached on `m`
    by the forward pass.

    Input:          m       - network object holding filters and caches
                    inpt    - backprop_error(L), shaped (examples, depth(L), height, width)
                    layer   - 2 or 4, selects (m.Xcol2, m.W2, m.images) or
                              (m.Xcol4, m.W4, m.Z3)
                    padding - zero padding used in the forward pass; the default is
                              read from the module-level `m` when this function is defined
                    stride  - filter step used in the forward pass; default as above
    Returns:        dW(L)   - backprop_error(L) * X(L).T, reshaped to W(L).shape
                    db(L)   - sum(backprop_error(L)), shaped (depth(L), 1)
              outp_err(L-1) - W(L).T * backprop_error(L), folded back to the
                              shape of the cached layer input
    Raises:         ValueError if layer is neither 2 nor 4.
    '''
    if layer == 2:
        Xcol, W, cached_inpt = m.Xcol2, m.W2, m.images
    elif layer == 4:
        Xcol, W, cached_inpt = m.Xcol4, m.W4, m.Z3
    else:
        # fail with a clear message instead of an unbound-variable error below
        raise ValueError('conv_backward supports layer 2 or 4, got %r' % (layer,))

    n_filters, _, fld_hgt, fld_wdth = W.shape                                   # 10, 3, 5, 5
    # same (filter, position*example) layout as the forward product
    in_reshaped = inpt.transpose(1, 2, 3, 0).reshape(n_filters, -1)             # 10, 1024000

    dW = (in_reshaped @ Xcol.T).reshape(W.shape)
    db = np.sum(inpt, axis=(0, 2, 3)).reshape(n_filters, -1)

    W_reshape = W.reshape(n_filters, -1)                                             # 10, 75
    outcol = W_reshape.T @ in_reshaped                                          # 75, 1024000
    outp = col2im_indices(outcol, cached_inpt.shape, fld_hgt, fld_wdth,
                          padding=padding, stride=stride)                   # 1000, 3, 32, 32
    return dW, db, outp

### pooling functions

In [5]:
def maxpool_forward(inpt, size=2, stride=2):
    '''Max pooling over size x size windows.
       Input:    inpt      -  output from convolution layer (examples, depth, height, width)
                 size      -  side length of the pooling window
                 stride    -  step between successive windows
       Returns:  outp      -  pooled output (examples, depth, out height, out width)
                 argmaxes  -  for each output value, the position of the max inside
                              its flattened size x size window; same shape as outp
       (the trailing comments show example shapes)'''
    n_examples, depth, in_hgt, in_wdth = inpt.shape                #example: 1000,10,32,32
    out_hgt = int( (in_hgt - size) / stride + 1 )                                      #16
    out_wdth = int( (in_wdth - size) / stride + 1 )                                    #16

    # treat every (example, channel) plane as its own single-channel image
    planes = inpt.reshape(n_examples * depth, 1, in_hgt, in_wdth)           #10000,1,32,32
    windows = im2col_indices(planes, size, size, padding=0, stride=stride)      #4,2560000

    winners = np.argmax(windows, axis=0)                                          #2560000
    maxima = windows[winners, np.arange(winners.size)]                            #2560000

    grid = (out_hgt, out_wdth, n_examples, depth)                           #16,16,1000,10
    outp = maxima.reshape(grid).transpose(2, 3, 0, 1)                       #1000,10,16,16
    argmaxes = winners.reshape(grid).transpose(2, 3, 0, 1)                  #1000,10,16,16
    return outp, argmaxes

def maxpool_backward(argmaxes, inpt, outshape):
    '''This function is used in place of 'im2col' related functions.
        Input:   argmaxes  -  max indices with respect to each 2x2 output block
                 inpt      -  output error backpropagated from network,
                              shaped (examples, depth, height, width)
                 outshape  -  shape of output error to apply to preceding layer
        Returns: array of zeros of shape outshape in which each input value has
                 been written to one position of its 2x2 output block, per argmax:
                     0 -> top-left, 1 -> top-right, 2 -> bottom-left,
                     anything else -> bottom-right
        The routing is done with a single indexed assignment instead of a
        Python loop over every element.'''
    outp = np.zeros(outshape)
    n_examples, depth, hgt, wdth = inpt.shape
    # only the part of argmaxes that lines up with inpt is used
    winners = np.asarray(argmaxes)[:n_examples, :depth, :hgt, :wdth]

    # offset of the max inside its 2x2 block
    lower = (winners != 0) & (winners != 1)      # True -> second row of the block
    right = (winners != 0) & (winners != 2)      # True -> second column of the block

    # block (k, l) has its top-left corner at (2k, 2l)
    rows = 2 * np.arange(hgt)[:, None] + lower
    cols = 2 * np.arange(wdth)[None, :] + right
    example_ix = np.arange(n_examples)[:, None, None, None]
    channel_ix = np.arange(depth)[None, :, None, None]

    # every block receives exactly one value, so targets never collide
    outp[example_ix, channel_ix, rows, cols] = inpt
    return outp

### activation functions

In [6]:
def ReLU(z):
    '''Rectified linear unit: z multiplied by the 0/1 mask of its positive entries.'''
    positive = z > 0
    return z * positive
    
def Softmax(z):
    ''' e^z1 / (e^z1 + e^z2 + e^z3), evaluated independently for each row of z.

        Input:   z  -  scores, one row per example (at least 2 dimensions)
        Returns: array of the same shape whose rows each sum to 1 '''
    # shift every row by its own max: the result is unchanged, exp() cannot
    # overflow, and no row can underflow to an all-zero (0/0) denominator
    shifted = z - np.max(z, axis=1, keepdims=True)
    expz = np.exp(shifted)
    return expz / np.sum(expz, axis=1, keepdims=True)

def derivative_Softmax(z):
    ''' ( e^z1 ⋅ (e^z2 + e^z3) ) / (e^z1 + e^z2 + e^z3)^2

        The max and the sum are taken over every element of z, so z is treated
        as a single score vector. '''
    shifted = z - np.max(z)          # shift so the largest exponent is 0
    expz = np.exp(shifted)
    total = np.sum(expz)
    others = total - expz            # sum of the exponentials of the other entries
    return expz * others / total**2

def derivative_ReLU(z):
    '''Slope of ReLU: 1.0 where z is positive, 0.0 elsewhere (including at 0).'''
    active = z > 0
    return 1. * active

### input data & preprocessing

In [7]:
# load data
def load(N):
    '''Return the first N training images and their labels from the CIFAR data manager.
       original images shape: length:N, height:32, width:32, depth:3'''
    import cifar_data as cf
    manager = cf.CifarDataManager()
    images = manager.train.images[:N]
    labels = manager.train.labels[:N]
    return images, labels
imgs, labels = load(N)
imgs = np.transpose(imgs, axes=(0, 3, 1, 2))  # depth to precede height & width

# preprocessing (mean subtraction): remove the per-pixel mean over the batch
batch_mean = imgs.mean(axis=0)
m.images = imgs - batch_mean

# produce numerical label values from binary vectors
Y = [np.argmax(labels[i]) for i in range(N)]

### training loop

In [8]:
# training parameters
numsteps = 1
learnrate = 1
penalty = 0.001  # strength of the L2 weight penalty
p = 0.5  # keep_probability, with respect to dropout

# training loop
for iteration in range(numsteps):

    # forward pass layers 2 through 5
    Z2 = ReLU( convolve_forward(m, m.images, m.W2, b2, layer=2) )
    m.Z3, argmaxes3 = maxpool_forward(Z2)
    Z4 = ReLU( convolve_forward(m, m.Z3, m.W4, b4, layer=4) )
    Z5, argmaxes5 = maxpool_forward(Z4)

    # forward pass fully-connected layer 6
    # (the dropout mask zeroes individual weights and rescales the kept ones by 1/p)
    mask6 = (np.random.rand(*W6.shape) < p) / p
    Z5flat = Z5.reshape(N, -1)  # flatten pooling output, one row per example
    Z6 = ReLU( np.dot( Z5flat, mask6*W6 ) + b6 )

    # forward pass output layer 7
    mask7 = (np.random.rand(*W7.shape) < p) / p
    Z7 = np.dot(Z6, mask7*W7) + b7
    Yhat = Softmax(Z7)

    # evaluate loss (mean cross-entropy; the weight penalty is not included)
    logloss = -np.log( Yhat[imap, Y] ) #wrt yhats, wrt correct classes
    loss = np.sum(logloss)/N
    print ('Iteration', iteration, 'loss', loss)

    # backward pass output layer 7
    # For softmax followed by cross-entropy, the gradient of the mean loss with
    # respect to the scores Z7 is (Yhat - labels) / N; no extra softmax
    # derivative factor is applied.
    delta7 = ( Yhat - labels ) / N
    # the forward pass used mask7*W7, so dropped weights receive no gradient
    dW7 = mask7 * np.dot( Z6.T, delta7 )
    db7 = np.sum( delta7, axis=0, keepdims=True )
    outerr6 = np.dot( delta7, (mask7*W7).T )

    # backward pass fully-connected layer 6
    delta6 = outerr6 * derivative_ReLU(Z6)
    dW6 = mask6 * np.dot( Z5flat.T, delta6 )
    db6 = np.sum( delta6, axis=0, keepdims=True )
    outerr5 = np.dot( delta6, (mask6*W6).T )

    # backward pass pooling layer 5
    outerr5 = outerr5.reshape( (N, *Z5shape) )  # unflatten
    outerr4 = maxpool_backward(argmaxes5, inpt=outerr5, outshape=((N, *Z4shape)))

    # backward pass convolutional layer 4
    delta4 = outerr4 * derivative_ReLU(Z4)
    dW4, db4, outerr3 = conv_backward(m, inpt=delta4, layer=4)

    # backward pass pooling layer 3
    outerr2 = maxpool_backward(argmaxes3, inpt=outerr3, outshape=((N, *Z2shape)))

    # backward pass convolutional layer 2
    delta2 = outerr2 * derivative_ReLU(Z2)
    dW2, db2, outerr1 = conv_backward(m, inpt=delta2, layer=2)

    # regularization penalty (gradient of 0.5 * penalty * sum(W**2))
    dW7 += penalty * W7
    dW6 += penalty * W6
    dW4 += penalty * m.W4
    dW2 += penalty * m.W2

    # parameter update (plain gradient descent)
    W7   += -learnrate * dW7
    W6   += -learnrate * dW6
    m.W4 += -learnrate * dW4
    m.W2 += -learnrate * dW2
    b7   += -learnrate * db7
    b6   += -learnrate * db6
    b4   += -learnrate * db4
    b2   += -learnrate * db2

Iteration 0 loss 2.30170511133


### credits

I would like to acknowledge Stanford CS231n (http://cs231n.github.io/convolutional-networks/) for providing excellent materials on this topic, and https://github.com/wiseodd/hipsternet for sharing Python code. Thanks also to https://github.com/stephencwelch/Neural-Networks-Demystified (https://www.youtube.com/watch?v=bxe2T-V8XRs) for an outstanding video series introducing ordinary neural networks, forward/back propagation, and gradient descent.