In [None]:
# Notebook setup: packages, MNIST helpers, cached linear-model results, display options.
using Knet, Plots, JLD #, NBInclude
# @nbinclude("02.mnist.ipynb")  # loads MNIST, defines dtrn,dtst,Atype,train,softmax,zeroone
# NOTE(review): mnist.jl presumably provides the same definitions as the notebook above — confirm
include("mnist.jl")
lin = load("lin.jld")     # loads linear model results for comparison
ENV["COLUMNS"]=80         # column width for array printing
plotlyjs()                # for interactive plots
Plots.scalefontsizes(1.5)

## Multiple linear layers do not improve over linear model

In [None]:
# Chain several linear (affine) layers with nothing in between.
#   w : alternating entries [W1, b1, W2, b2, ...]
#   x : input; mat(x) is applied before each matrix multiplication
# Each step computes W*mat(x) .+ b and feeds the result to the next layer;
# the output of the last layer is returned.
function multilinear(w, x)
    out = x
    for k in 1:2:length(w)
        weight, bias = w[k], w[k+1]
        out = weight * mat(out) .+ bias
    end
    return out
end;

In [None]:
# Weight initialization for a multi-layer model.
# Call as winit(x,h1,h2,...,hn,y): input size, n hidden layer sizes, output size.
# For every pair of consecutive sizes (nin, nout) two entries are produced:
# xavier(nout, nin) for the weights and zeros(nout, 1) for the bias.
# The result is the list [w0,b0,w1,b1,...,wn,bn] with every entry converted by Atype.
function winit(h...)
    params = Any[]
    for layer in 1:length(h)-1
        nin, nout = h[layer], h[layer+1]
        push!(params, xavier(nout, nin))
        push!(params, zeros(nout, 1))
    end
    return map(Atype, params)
end;

In [None]:
# Weights and biases for a multi layer model with a single hidden layer of size 64:
# winit(784,64,10) yields four entries, [w0,b0,w1,b1], for the 784->64 and 64->10 layers.
w64=winit(784,64,10) # gives weights and biases for a multi layer model with a single hidden layer of size 64

In [None]:
# Take the first item of the test data (presumably one minibatch of inputs and labels — confirm)
(x,y) = first(dtst)
# Loss of the freshly initialized multilinear model on that item
softmax(w64,x,y,multilinear)

In [None]:
# Multilinear experiment (784-64-10): train and evaluate unless "mlp1.jld" already exists.
# On a cache miss the loss/error curves are computed and saved; the weights are not saved.
# NOTE(review): train appears to return one weight set per epoch (plots use an "Epochs" axis) — confirm
if !isfile("mlp1.jld")
    setseed(1)  # seed the random number generator before initializing weights
    @time weightsML=train(winit(784,64,10),dtrn,multilinear,lr=0.1)       # 33.9s
    @time trnlossML = [ softmax(w,dtrn,multilinear) for w in weightsML ]  # 22.2s
    @time tstlossML = [ softmax(w,dtst,multilinear) for w in weightsML ]  # 3.73s
    @time trnerrML =  [ zeroone(w,dtrn,multilinear) for w in weightsML ]  # 22.8s
    @time tsterrML =  [ zeroone(w,dtst,multilinear) for w in weightsML ]  # 3.84s
    @save "mlp1.jld" trnlossML tstlossML trnerrML tsterrML
else
    # load the four saved curves back as variables
    @eval (@load "mlp1.jld")
end
# best test loss and best test error over the recorded weight sets
minimum(tstlossML),minimum(tsterrML)  # 0.2856, 0.0795

In [None]:
# Loss curves: linear model vs. multilinear model, training and test.
plot(
    hcat(lin["trnloss"], lin["tstloss"], trnlossML, tstlossML);
    ylim = (0.2, 0.4),
    labels = [:trnLin :tstLin :trnMulti :tstMulti],
    xlabel = "Epochs",
    ylabel = "Loss"
)
# The multilinear model converges to a similar solution; it is not identical
# because the problem is non-convex.

In [None]:
# Error curves: linear model vs. multilinear model, training and test.
plot(
    hcat(lin["trnerr"], lin["tsterr"], trnerrML, tsterrML);
    ylim = (0.06, 0.12),
    labels = [:trnLin :tstLin :trnMulti :tstMulti],
    xlabel = "Epochs",
    ylabel = "Error"
)
# The error results are also close to those of the linear model.

## Multiple linear layers are useless because they are equivalent to a single linear layer
If we write out the computation and do some algebra, we can show that the result is still an affine function of the input, i.e. stacking multiple linear layers does not increase the representational capacity of the model:

\begin{align*}
\hat{p} &= \mbox{soft}(W_2 (W_1 x + b_1) + b_2) \\
&= \mbox{soft}((W_2 W_1)\, x + W_2 b_1 + b_2) \\
&= \mbox{soft}(W x + b)
\end{align*}

In [None]:
# Drop the reference to the multilinear weights and call knetgc() to save GPU memory.
weightsML = nothing
knetgc()

## Multi Layer Perceptron (MLP) adds non-linearities between layers

In [None]:
# Multi layer perceptron: affine layers with relu between them.
# Adding the nonlinearity gives a model with higher capacity, which helps with underfitting.
#   w : alternating entries [W1, b1, W2, b2, ...]
#   x : input; mat(x) is applied before each matrix multiplication
# relu is applied after every layer except the last one, whose raw output is returned.
function mlp(w, x)
    lastweight = length(w) - 1   # position of the final layer's weight matrix
    for k in 1:2:length(w)
        x = w[k] * mat(x) .+ w[k+1]
        k < lastweight && (x = relu.(x))
    end
    return x
end;

In [None]:
# Fresh random parameters [w0,b0,w1,b1] for the 784->64->10 network, replacing the earlier w64.
w64=winit(784,64,10) # gives weights and biases for an MLP with a single hidden layer of size 64

In [None]:
# Loss of the freshly initialized MLP on the (x,y) pair taken from dtst earlier
softmax(w64,x,y,mlp)

In [None]:
# MLP experiment (784-64-10 with relu): train and evaluate unless "mlp2.jld" already exists.
# On a cache miss the loss/error curves are computed and saved; the weights are not saved.
if !isfile("mlp2.jld")
    setseed(1)  # seed the random number generator before initializing weights
    @time weightsMLP=train(winit(784,64,10),dtrn,mlp,lr=0.1)        # 35.4s
    @time trnlossMLP = [ softmax(w,dtrn,mlp) for w in weightsMLP ]  # 23.7s
    @time tstlossMLP = [ softmax(w,dtst,mlp) for w in weightsMLP ]  # 3.99s
    @time trnerrMLP =  [ zeroone(w,dtrn,mlp) for w in weightsMLP ]  # 23.3s
    @time tsterrMLP =  [ zeroone(w,dtst,mlp) for w in weightsMLP ]  # 3.91s
    @save "mlp2.jld" trnlossMLP tstlossMLP trnerrMLP tsterrMLP
else
    # load the four saved curves back as variables
    @eval (@load "mlp2.jld")
end
# best test loss and best test error over the recorded weight sets
minimum(tstlossMLP),minimum(tsterrMLP)  # 0.0808, 0.0235

## MLP solves underfitting but still has an overfitting problem

In [None]:
# Loss curves: linear model vs. MLP, training and test.
plot(
    hcat(lin["trnloss"], lin["tstloss"], trnlossMLP, tstlossMLP);
    ylim = (0.0, 0.4),
    labels = [:trnLin :tstLin :trnMLP :tstMLP],
    xlabel = "Epochs",
    ylabel = "Loss"
)
# The MLP solves the underfitting problem!
# A more serious overfitting problem remains.

In [None]:
# Error curves: linear model vs. MLP, training and test.
plot(
    hcat(lin["trnerr"], lin["tsterr"], trnerrMLP, tsterrMLP);
    ylim = (0, 0.1),
    labels = [:trnLin :tstLin :trnMLP :tstMLP],
    xlabel = "Epochs",
    ylabel = "Error"
)
# test error improves from 7.5% to 2.3%!

In [None]:
# Drop the reference to the MLP weights and call knetgc() to save GPU memory.
weightsMLP = nothing
knetgc()

## MLP with L1 regularization

In [None]:
# Softmax loss with optional L1 / L2 regularization.
#   w        : alternating weights and biases [W1,b1,W2,b2,...]
#   x, y     : inputs and targets, passed on to predict and nll
#   predict  : model function called as predict(w,x;o...)
#   l1, l2   : regularization coefficients; a term is added only when its coefficient is non-zero
#   o...     : any remaining keyword arguments are forwarded to predict
# The penalties cover only the odd-indexed entries of w (the matrices), never the biases.
function softmax(w, x, y, predict; l1=0, l2=0, o...)
    loss = nll(predict(w, x; o...), y)
    if l1 != 0
        abstotal = sum(sum(abs, m) for m in w[1:2:end])
        loss += Float32(l1) * abstotal
    end
    if l2 != 0
        sqtotal = sum(sum(abs2, m) for m in w[1:2:end])
        loss += Float32(l2) * sqtotal
    end
    return loss
end;

In [None]:
# We still have overfitting, let's try L1 regularization.
# MLP experiment (784-64-10) trained with l1=0.00004; skipped when "mlp3.jld" already exists.
# The recorded curves are computed without the regularization term, so they are
# directly comparable with the unregularized MLP curves.
if !isfile("mlp3.jld")
    # Seed with setseed like every other experiment in this notebook
    # (the previous srand call does not exist on Julia >= 1.0).
    setseed(1)
    @time weightsL1=train(winit(784,64,10),dtrn,mlp;lr=0.1,l1=0.00004)  # 47.3s
    @time trnlossL1= [ softmax(w,dtrn,mlp) for w in weightsL1 ]  # 24.8s
    @time tstlossL1= [ softmax(w,dtst,mlp) for w in weightsL1 ]  # 4.17s
    @time trnerrL1=  [ zeroone(w,dtrn,mlp) for w in weightsL1 ]  # 23.7s
    @time tsterrL1=  [ zeroone(w,dtst,mlp) for w in weightsL1 ]  # 3.95s
    @save "mlp3.jld" trnlossL1 tstlossL1 trnerrL1 tsterrL1
else
    # load the four saved curves back as variables
    @eval (@load "mlp3.jld")
end
# best test loss and best test error over the recorded weight sets
minimum(tstlossL1),minimum(tsterrL1)  # 0.0759, 0.0220

In [None]:
# Loss curves: plain MLP vs. MLP trained with L1 regularization.
plot(
    hcat(trnlossMLP, tstlossMLP, trnlossL1, tstlossL1);
    ylim = (0, 0.15),
    labels = [:trnMLP :tstMLP :trnL1 :tstL1],
    xlabel = "Epochs",
    ylabel = "Loss"
)
# Less overfitting: test loss improves from 0.0808 to 0.0759.

In [None]:
# Error curves: plain MLP vs. MLP trained with L1 regularization.
plot(
    hcat(trnerrMLP, tsterrMLP, trnerrL1, tsterrL1);
    ylim = (0, 0.04),
    labels = [:trnMLP :tstMLP :trnL1 :tstL1],
    xlabel = "Epochs",
    ylabel = "Error"
)
# However, test error does not change significantly: 0.0235 -> 0.0220.

In [None]:
# Drop the reference to the L1-regularized weights and call knetgc() to save GPU memory.
weightsL1 = nothing
knetgc()

## MLP with dropout

In [None]:
# Dropout is another way to address overfitting.
# mlpdrop is an MLP with relu hidden layers in which dropout is applied to the
# input of every layer before the affine transform.
#   w     : alternating weights and biases [W1,b1,W2,b2,...]
#   x     : input; mat(x) is applied before each matrix multiplication
#   pdrop : pair of dropout rates; pdrop[1] is used on the input of the first layer,
#           pdrop[2] on the input of every later layer (default (0,0))
# Returns the output of the last layer; relu is not applied to it.
function mlpdrop(w,x; pdrop=(0,0))
    for i=1:2:length(w)
        # The ternary needs spaces around ? and : to parse on Julia >= 1.0
        x = dropout(x, pdrop[i == 1 ? 1 : 2])  # apply one of two dropout rates
        x = w[i]*mat(x) .+ w[i+1]
        if i < length(w)-1; x = relu.(x); end
    end
    return x
end;

In [None]:
# Display the documentation attached to dropout
@doc dropout

In [None]:
# Dropout experiment (784-64-10, pdrop=(0.2,0) during training); skipped when "mlp4.jld" exists.
# The evaluation calls pass no pdrop, so mlpdrop runs with its default rates (0,0) there.
# On a cache miss the loss/error curves are computed and saved; the weights are not saved.
if !isfile("mlp4.jld")
    setseed(1)  # seed the random number generator before initializing weights
    @time weightsDR=train(winit(784,64,10),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 38.9s
    @time trnlossDR = [ softmax(w,dtrn,mlpdrop) for w in weightsDR ]     # 25.7s
    @time tstlossDR = [ softmax(w,dtst,mlpdrop) for w in weightsDR ]     # 4.25s
    @time trnerrDR =  [ zeroone(w,dtrn,mlpdrop) for w in weightsDR ]     # 24.3s
    @time tsterrDR =  [ zeroone(w,dtst,mlpdrop) for w in weightsDR ]     # 4.11s
    @save "mlp4.jld" trnlossDR tstlossDR trnerrDR tsterrDR
else
    # load the four saved curves back as variables
    @eval (@load "mlp4.jld")
end
# best test loss and best test error over the recorded weight sets
minimum(tstlossDR),minimum(tsterrDR)  # 0.0639, 0.0188

In [None]:
# Loss curves: plain MLP vs. MLP trained with dropout.
plot(
    hcat(trnlossMLP, tstlossMLP, trnlossDR, tstlossDR);
    ylim = (0, 0.15),
    labels = [:trnMLP :tstMLP :trnDropout :tstDropout],
    xlabel = "Epochs",
    ylabel = "Loss"
)
# Less overfitting: test loss improves 0.0808 -> 0.0639.

In [None]:
# Error curves: plain MLP vs. MLP trained with dropout.
plot(
    hcat(trnerrMLP, tsterrMLP, trnerrDR, tsterrDR);
    ylim = (0, 0.04),
    labels = [:trnMLP :tstMLP :trnDropout :tstDropout],
    xlabel = "Epochs",
    ylabel = "Error"
)
# This time the error also improves: 0.0235 -> 0.0188.

In [None]:
# Best test error of each 64-unit model, shown as a labelled tuple.
(
    :mlperr, minimum(tsterrMLP),
    :L1err, minimum(tsterrL1),
    :dropouterr, minimum(tsterrDR)
)

In [None]:
# Best test loss of each 64-unit model, shown as a labelled tuple.
(
    :mlploss, minimum(tstlossMLP),
    :L1loss, minimum(tstlossL1),
    :dropoutloss, minimum(tstlossDR)
)

In [None]:
# Drop the reference to the dropout-model weights and call knetgc() to save GPU memory.
weightsDR = nothing
knetgc()

## MLP with larger hidden layer and dropout

In [None]:
# The current trend is to use models with higher capacity tempered with dropout
# Larger experiment (784-256-10, pdrop=(0.2,0) during training); skipped when "mlp.jld" exists.
# Unlike the earlier experiments, the weights are saved together with the curves,
# so they are available after loading from the cache as well.
if !isfile("mlp.jld")
    setseed(1)  # seed the random number generator before initializing weights
    @time weights=train(winit(784,256,10),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 34.6s
    @time trnloss = [ softmax(w,dtrn,mlpdrop) for w in weights ] # 21.2s
    @time tstloss = [ softmax(w,dtst,mlpdrop) for w in weights ] # 3.61s
    @time trnerr =  [ zeroone(w,dtrn,mlpdrop) for w in weights ] # 21.7s
    @time tsterr =  [ zeroone(w,dtst,mlpdrop) for w in weights ] # 3.63s
    @save "mlp.jld" weights trnloss tstloss trnerr tsterr
else
    # load the saved weights and curves back as variables
    @eval (@load "mlp.jld")
end
# best test loss and best test error over the recorded weight sets
minimum(tstloss),minimum(tsterr)  # 0.0494, 0.0148

In [None]:
# Loss curves of the dropout models: 64 vs. 256 hidden units.
plot(
    hcat(trnlossDR, tstlossDR, trnloss, tstloss);
    ylim = (0, 0.15),
    labels = [:trn64 :tst64 :trn256 :tst256],
    xlabel = "Epochs",
    ylabel = "Loss"
)

In [None]:
# Error curves of the dropout models: 64 vs. 256 hidden units.
plot(
    hcat(trnerrDR, tsterrDR, trnerr, tsterr);
    ylim = (0, 0.04),
    labels = [:trn64 :tst64 :trn256 :tst256],
    xlabel = "Epochs",
    ylabel = "Error"
)
# We are down to 0.015 error.

## Visualizing hidden weights

In [None]:
# Show the first-layer weights of the 256-unit model as a 16x16 grid of images.
ENV["COLUMNS"]=120  # wider column setting for array printing
w = weights[end]    # last entry of the weight history
# w[1] is the first-layer weight matrix; transpose it and reshape so that each of the
# 256 hidden units becomes one 28x28 single-channel array of its incoming weights
w1 = reshape(Array(w[1])', (28,28,1,256))
# map weights through 2.5*w + 0.5 and clip the result into [0,1]
w2 = clamp.(2.5.*w1.+0.5,0,1)
IJulia.clear_output(true)
# tile 16 images per row; mnistview is defined outside this section and presumably
# turns slice i of w2 into a displayable image — confirm
display(hvcat(16, [mnistview(w2,i) for i=1:256]...))