In [None]:
# Packages: Knet (provides conv4 and pool used below), Plots (curves), JLD (.jld result files)
using Knet, Plots, JLD #, NBInclude
# @nbinclude("02.mnist.ipynb")  # loads MNIST, defines dtrn,dtst,Atype,train,softmax,zeroone
# NOTE(review): mnist.jl presumably replaces the notebook include above and defines the same names — confirm
include("mnist.jl")
# NOTE(review): mlp.jld must already exist (presumably written by an earlier MLP notebook); load fails otherwise
mlpdata = load("mlp.jld") # loads MLP results for comparison
ENV["COLUMNS"]=80         # column width for array printing
plotlyjs()                # for interactive plots
Plots.scalefontsizes(1.5) # scale all plot fonts by a factor of 1.5

## Introduction to Convolution

In [None]:
# Convolution operator in Knet
# Display the built-in documentation for conv4 before experimenting with it.
@doc conv4

In [None]:
# Convolution in 1-D
# The length-3 filter w and the length-7 input x are stored as 4-D arrays of size (len,1,1,1).
@show w = reshape([1.0,2.0,3.0], (3,1,1,1))
@show x = reshape([1.0:7.0...], (7,1,1,1))
@show y = conv4(w, x);  # size Y = X - W + 1 = 5 by default

In [None]:
# Padding
# padding=(1,0): P=1 on the first (length-7) dimension, P=0 on the singleton second dimension.
@show y2 = conv4(w, x, padding=(1,0));  # size Y = X + 2P - W + 1 = 7 with padding=1
# To preserve input size (Y=X) for a given W, what padding P should we use?

In [None]:
# Stride
# With X=7, P=1, W=3, S=3 the formula below gives Y = 1 + floor(6/3) = 3.
@show y3 = conv4(w, x; padding=(1,0), stride=3);  # size Y = 1 + floor((X+2P-W)/S)

In [None]:
# Mode
# Same w and x, default padding and stride; the two calls differ only in whether w is inverted.
@show y4 = conv4(w, x, mode=0);  # Default mode (convolution) inverts w
@show y5 = conv4(w, x, mode=1);  # mode=1 (cross-correlation) does not invert w

In [None]:
# Convolution in more dimensions
# A 3x3 input with one channel and one instance, holding 1..9.
x = reshape([1.0:9.0...], (3,3,1,1))

In [None]:
# A 2x2 filter with one input channel and one output channel, holding 1..4.
w = reshape([1.0:4.0...], (2,2,1,1))

In [None]:
# By the default size rule Y = X - W + 1, each spatial dimension of the result is 3 - 2 + 1 = 2.
y = conv4(w, x)

In [None]:
# Convolution with multiple channels, filters, and instances
# size X = [X1,X2,...,Xd,Cx,N] where d is the number of dimensions, Cx is channels, N is instances
x = reshape([1.0:18.0...], (3,3,2,1))  # 3x3 input, Cx=2 channels, N=1 instance

In [None]:
# size W = [W1,W2,...,Wd,Cx,Cy] where d is the number of dimensions, Cx is input channels, Cy is output channels
w = reshape([1.0:24.0...], (2,2,2,3));  # 2x2 filters, Cx=2 input channels, Cy=3 output channels

In [None]:
# size Y = [Y1,Y2,...,Yd,Cy,N]  where Yi = 1 + floor((Xi+2Pi-Wi)/Si), Cy is channels, N is instances
# For the x and w above (P=0, S=1) this formula gives size Y = (2,2,3,1).
y = conv4(w,x)

See http://cs231n.github.io/assets/conv-demo/index.html for an animated example.

## Introduction to Pooling

In [None]:
# Pooling operator in Knet
# Display the built-in documentation for pool before experimenting with it.
@doc pool

In [None]:
# 1-D pooling example
# Length-6 input stored as a 4-D array of size (6,1,1,1); pool is called with all defaults.
@show x = reshape([1.0:6.0...], (6,1,1,1))
@show pool(x);

In [None]:
# Window size
@show pool(x; window=3);  # size Y = floor(X/W)

In [None]:
# Padding
# padding=(1,0): P=1 on the first (length-6) dimension, P=0 on the singleton second dimension.
@show pool(x; padding=(1,0));  # size Y = floor((X+2P)/W)

In [None]:
# Stride
# A longer length-10 input so that a stride of 4 still yields several outputs.
@show x = reshape([1.0:10.0...], (10,1,1,1));
@show pool(x; stride=4);  # size Y = 1 + floor((X+2P-W)/S)

In [None]:
# Mode
# NOTE(review): `ka` is not defined in this notebook; presumably it converts to a GPU array
# (mode=2 below is not implemented on CPU) — confirm where it comes from.
x = ka(reshape([1.0:6.0...], (6,1,1,1)))
@show Array(x)  # converted back to a regular Array for display
@show Array(pool(x; padding=(1,0), mode=0))  # max pooling
@show Array(pool(x; padding=(1,0), mode=1))  # avg pooling
@show Array(pool(x; padding=(1,0), mode=2)); # avg pooling excluding padded values (is not implemented on CPU)

In [None]:
# More dimensions
# A 4x4 input with one channel and one instance, holding 1..16.
x = reshape([1.0:16.0...], (4,4,1,1))

In [None]:
# Pool the 4x4 input with default settings.
pool(x)

In [None]:
# Multiple channels and instances
# A 4x4 input with Cx=2 channels and N=1 instance, holding 1..32.
x = reshape([1.0:32.0...], (4,4,2,1))

In [None]:
# each channel and each instance is pooled separately
pool(x)  # size Y = (Y1,...,Yd,Cx,N) where Yi are spatial dims, Cx and N are identical to input X

## A convolutional neural network model for MNIST

In [None]:
"""
    convnet(w, x; pdrop=(0,0,0))

Run input `x` through the layers described by `w` and return the last layer's output.

`w` is a flat list of weight/bias pairs `[w1, b1, w2, b2, ...]` (the layout produced by
`cinit`). The type of each layer is inferred from the rank of its weight array:

- `ndims(w[i]) == 4`: convolutional layer — dropout, `conv4` plus bias, `relu`, `pool`.
- `ndims(w[i]) == 2`: fully connected layer — dropout, `w[i]*mat(x)` plus bias, and
  `relu` on every such layer except the last pair in `w`.

# Keywords
- `pdrop`: dropout probabilities. `pdrop[1]` is applied to the input of the first
  layer (whatever its type), `pdrop[2]` to the input of every later convolutional
  layer, and `pdrop[3]` to the input of every later fully connected layer.

# Throws
Calls `error` if a weight array is neither 4-D nor 2-D.
"""
function convnet(w,x; pdrop=(0,0,0))    # pdrop[1]:input, pdrop[2]:conv, pdrop[3]:fc
    for i in 1:2:length(w)              # w[i] is a weight array, w[i+1] its bias
        if ndims(w[i]) == 4     # convolutional layer
            # The ternary needs spaces around `?` and `:`; `i==1?1:2` no longer parses on Julia >= 1.0.
            x = dropout(x, pdrop[i == 1 ? 1 : 2])
            x = conv4(w[i],x) .+ w[i+1]
            x = pool(relu.(x))
        elseif ndims(w[i]) == 2 # fully connected layer
            x = dropout(x, pdrop[i == 1 ? 1 : 3])
            x = w[i]*mat(x) .+ w[i+1]
            # No relu after the final weight/bias pair: its raw output is returned.
            if i < length(w)-1; x = relu.(x); end
        else
            error("Unknown layer type: $(size(w[i]))")
        end
    end
    return x
end;

In [None]:
# Build the initial parameter list for a multi-layer network.
# Call as cinit(x,h1,h2,...,hn,y): the first argument describes the input, every later
# argument describes one layer. An integer means a fully connected layer with that many
# units; a triple (w1,w2,cy) means a convolution layer with cy filters of size w1 x w2.
# Returns [w1,b1,w2,b2,...]: one weight array (xavier) and one zero bias per layer,
# each converted with Atype.
function cinit(h...)
    params = Any[]
    insize = h[1]                 # size of the current layer's input: an integer or (x1,x2,cx)
    for i in 2:length(h)
        layer = h[i]
        if layer isa Tuple        # convolution layer
            (ih, iw, nin) = insize
            (fh, fw, nout) = layer
            push!(params, xavier(fh, fw, nin, nout), zeros(1, 1, nout, 1))
            # output size assumes conv4 with p=0, s=1 followed by pool with p=0, w=s=2
            insize = ((ih - fh + 1) ÷ 2, (iw - fw + 1) ÷ 2, nout)
        elseif layer isa Integer  # fully connected layer
            push!(params, xavier(layer, prod(insize)), zeros(layer, 1))
            insize = layer
        else
            error("Unknown layer type: $(h[i])")
        end
    end
    return map(Atype, params)
end;

In [None]:
# LeNet-style weights: 28x28x1 input, two convolution layers with 5x5 filters (20 and 50 of them),
# a 500-unit fully connected layer and a 10-unit output layer.
lenet=cinit((28,28,1), (5,5,20), (5,5,50), 500, 10)

In [None]:
# Smoke test of the untrained model on the first item of dtst.
# NOTE(review): softmax and dtst come from mnist.jl; presumably this is the loss on one test minibatch — confirm
(x,y) = first(dtst)
softmax(lenet,x,y,convnet)

In [None]:
# Train only when no cached results exist; otherwise reload the saved curves from cnn.jld.
# Only the loss/error curves are saved, not the trained weights.
if !isfile("cnn.jld")
    setseed(1)  # seed set before initialization and training (presumably for repeatable runs)
    lenet=cinit((28,28,1), (5,5,20), (5,5,50), 500, 10)
    # pdrop=(0,0,0.3): per convnet's convention, dropout is applied only before fully connected layers.
    # NOTE(review): `weights` is iterated below as a collection of weight lists, presumably one per epoch — confirm in mnist.jl
    @time weights=train(lenet,dtrn,convnet,lr=0.1,pdrop=(0,0,0.3)) # 233.8s
    @time trnloss = [ softmax(w,dtrn,convnet) for w in weights ]   # 85.4s
    @time tstloss = [ softmax(w,dtst,convnet) for w in weights ]   # 14.3s
    @time trnerr = [ zeroone(w,dtrn,convnet) for w in weights ]    # 84.9s
    @time tsterr = [ zeroone(w,dtst,convnet) for w in weights ]    # 14.1s
    @save "cnn.jld" trnloss tstloss trnerr tsterr
else    
    # NOTE(review): presumably wrapped in @eval so that @load is only expanded when this branch runs,
    # i.e. when cnn.jld exists — confirm against the JLD documentation.
    @eval (@load "cnn.jld")
end
minimum(tstloss),minimum(tsterr)  # 0.0176, 0.0046

In [None]:
# Loss curves: MLP results loaded from mlp.jld next to the CNN results from the cell above.
# The four vectors are concatenated as columns and labelled in the same order.
plot([mlpdata["trnloss"] mlpdata["tstloss"] trnloss tstloss],ylim=(0.0,0.1),
    labels=[:trnMLP :tstMLP :trnCNN :tstCNN],xlabel="Epochs",ylabel="Loss")  

In [None]:
# Error curves for the same four series, with the y axis limited to 0..0.03.
plot([mlpdata["trnerr"] mlpdata["tsterr"] trnerr tsterr],ylim=(0.0,0.03),
    labels=[:trnMLP :tstMLP :trnCNN :tstCNN],xlabel="Epochs",ylabel="Error")  

## Convolution vs Matrix Multiplication

In [None]:
# Convolution and matrix multiplication can be implemented in terms of each other.
# Convolutional networks have no additional representational power, only statistical efficiency.
# Our original 1-D example
@show w = reshape([1.0,2.0,3.0], (3,1,1,1))
@show x = reshape([1.0:7.0...], (7,1,1,1))
@show y = conv4(w, x);  # size Y = X - W + 1 = 5 by default

In [None]:
# Convolution as matrix multiplication (1)
# Turn w into a (Y,X) sparse matrix
# Row i holds the inverted filter (3 2 1) starting at column i and zeros elsewhere,
# so row i times x gives output i of the default (inverting) convolution.
w2 = Float64[3 2 1 0 0 0 0; 0 3 2 1 0 0 0; 0 0 3 2 1 0 0; 0 0 0 3 2 1 0; 0 0 0 0 3 2 1]

In [None]:
# NOTE(review): mat(x) presumably turns the (7,1,1,1) input into a 7x1 matrix so the (5,7) product is defined — confirm
@show y2 = w2 * mat(x);

In [None]:
# Convolution as matrix multiplication (2)
# Turn x into a (W,Y) dense matrix (aka the im2col operation)
# This is used to speed up convolution with known efficient matmul algorithms
# Column j holds the three consecutive inputs j, j+1, j+2 covered by output j.
x3 = Float64[1 2 3 4 5; 2 3 4 5 6; 3 4 5 6 7]

In [None]:
# The inverted filter as a (1,W) row; w3 * x3 is then a (1,Y) row of outputs.
@show w3 = [3.0 2.0 1.0]
@show y3 = w3 * x3;

In [None]:
# Matrix multiplication as convolution
# This could be used to make a fully connected network accept variable sized inputs.
w = reshape([1.0:6.0...], (2,3))  # (Y,X) = (2,3) weight matrix

In [None]:
x = reshape([1.0:3.0...], (3,1))  # (X,1) = (3,1) column vector

In [None]:
y = w * x  # ordinary matrix product, size (2,1)

In [None]:
# Consider w with size (Y,X)
# Treat each of the Y rows of w as a convolution filter
# After transposing, column j holds row j of w; reshaping to (X,1,1,Y) = (3,1,1,2)
# turns that column into filter j (one input channel, Y output channels).
# On Julia >= 0.7 the transpose is a lazy wrapper and reshape keeps it lazy, so copy
# is needed to materialize a plain 4-D Array for conv4; on 0.6 it is a harmless copy.
w2 = copy(reshape(Array(w)', (3,1,1,2)))

In [None]:
# Reshape x for convolution
# The (3,1) column becomes a single instance of size (3,1,1,1), like the 1-D inputs used earlier.
x2 = reshape(x, (3,1,1,1))

In [None]:
# Use conv4 for matrix multiplication
# mode=1 (cross-correlation) does not invert the filters, so each output channel is the
# plain dot product of one row of w with x; compare the result with y = w * x.
y2 = conv4(w2, x2; mode=1)

In [None]:
# So there is no difference between the class of functions representable with an MLP vs CNN.
# Sparse connections and weight sharing give CNNs more generalization power with images.
# Number of parameters in MLP256: (256x784)+256+(10x256)+10 = 203530
# Number of parameters in LeNet: (5*5*1*20)+20+(5*5*20*50)+50+(500*800)+500+(10*500)+10 = 431080
# (800 = 4*4*50: the 28x28 input shrinks to 12x12 and then 4x4 after two conv5x5 + pool2 stages, with 50 channels.)