## A complete MNIST training code from the codes at L8-Knet and L9

In [3]:
# Definition of pacakages and functions as well as the download of MNIST data go here.
using AutoGrad
using Knet, Plots, DataFrames
gr()

for p in ("GZip",)
    Pkg.installed(p) == nothing && Pkg.add(p)
end

using GZip

# atype definition:
atype=Array{Float32};

# Functions for MNIST data download next:
"Where to download mnist from"
mnisturl = "http://yann.lecun.com/exdb/mnist"

"Where to download mnist to"
mnistdir = "./"

"""
This utility loads the [MNIST](http://yann.lecun.com/exdb/mnist)
hand-written digits dataset.  There are 60000 training and 10000 test
examples. Each input x consists of 784 pixels representing a 28x28
grayscale image.  The pixel values are converted to Float32 and
normalized to [0,1].  Each output y is a UInt8 indicating the correct
class.  10 is used to represent the digit 0.
```
# Usage:
include(Pkg.dir("Knet/data/mnist.jl"))
xtrn, ytrn, xtst, ytst = mnist()
# xtrn: 28×28×1×60000 Array{Float32,4}
# ytrn: 60000-element Array{UInt8,1}
# xtst: 28×28×1×10000 Array{Float32,4}
# ytst: 10000-element Array{UInt8,1}
```
"""
function mnist()
    global _mnist_xtrn,_mnist_ytrn,_mnist_xtst,_mnist_ytst
    if !isdefined(:_mnist_xtrn)
        info("Loading MNIST...")
        _mnist_xtrn = _mnist_xdata("train-images-idx3-ubyte.gz")
        _mnist_xtst = _mnist_xdata("t10k-images-idx3-ubyte.gz")
        _mnist_ytrn = _mnist_ydata("train-labels-idx1-ubyte.gz")
        _mnist_ytst = _mnist_ydata("t10k-labels-idx1-ubyte.gz")
    end
    return _mnist_xtrn,_mnist_ytrn,_mnist_xtst,_mnist_ytst
end

"Utility to view a MNIST image, requires the Images package"
mnistview(x,i)=colorview(Gray,permutedims(x[:,:,1,i],(2,1)))

function _mnist_xdata(file)
    a = _mnist_gzload(file)[17:end]
    reshape(a ./ 255f0, (28,28,1,div(length(a),784)))
end

function _mnist_ydata(file)
    a = _mnist_gzload(file)[9:end]
    a[a.==0] = 10
    # full(sparse(a,1:length(a),1f0,10,length(a)))
    return a
end

function _mnist_gzload(file)
    if !isdir(mnistdir)
        mkpath(mnistdir)
    end
    path = joinpath(mnistdir,file)
    if !isfile(path)
        url = "$mnisturl/$file"
        download(url, path)
    end
    f = gzopen(path)
    a = read(f)
    close(f)
    return(a)
end

# The function to set the initial weights of Network:
function initweights(d, scale=0.01; hidden=[2], atype=Array{Float32})
    model = Vector{Any}(2 * length(hidden))
    X = d
    for k = 1:length(hidden)
        H = hidden[k]
        model[2k - 1] = scale * randn(H, X) 
        model[2k]     = scale * randn(H, 1)
        X = H
    end
    return map(atype, model)
end

# Function to initialize the model Neural Network:
#    num_inputs: Number of input values in input layer
#    num_hidden: Number of nodes at each hidden layers
#    num_outputs: Number of values at output layer
#    hidden: the list of layers that does not include input layer
function initmodel(atype;num_inputs=784,num_hidden=256,num_outputs=10)
    return initweights(num_inputs,hidden=[num_hidden,num_hidden,num_outputs]; atype=atype);
end

# The next function defines how we determine the prediction:
#  w: tensor of weights (See length(w) for first dimension, which is the number of layers in NN!)
#     But wait, the number of layers are doubled since there are also bias values. 
#     Check the initweights function!
#  x: input values
function predict(w, x)
    x = mat(x)
    for i=1:2:length(w) - 2
        x = relu.(w[i] * x .+ w[i+1]) # bias an weights are concatendated 
    end
    return w[end - 1]*x .+ w[end]
end

# Definition of optimizer (Here it is SGD, Stochastic Gradient Descent)
optim(w; lr=0.01) = optimizers(w, Sgd;  lr=lr);

# Definition of loss function and its gradient:
loss(w, x, ygold, predict) = nll(predict(w, x), ygold);
lossgradient = grad(loss); # AutoGrad means we don't need backpropagation

# Definition of training function:
function train(w, dtrn, optim, predict; epochs=10)
    for epoch = 1:epochs
        for (x, y) in dtrn
            g = lossgradient(w, x, y, predict)
            update!(w, g, optim) ## this a generic train loop the gradient update can be replaced as appropriate
        end
    end
end

# Fancy printing of how successful is the network predictions: accuracy of training (trn) and test (tst) datasets
function report(epoch, w, dtrn, dtst, predict)
    println((:epoch, epoch, :trn, accuracy(w, dtrn, predict), :tst, accuracy(w, dtst, predict)))
end

# MNIST download:
xtrn, ytrn, xtst, ytst = mnist()


[1m[36mINFO: [39m[22m[36mLoading MNIST...
[39m

(Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

...

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], UInt8[0x05, 0x0a, 0x04, 0x01, 0x09, 0x02, 0x01, 0x03, 0x01, 0x04  …  0x09, 0x02, 0x09, 0x05, 0x01, 0x08, 0x03, 0x05, 0x06, 0x08], Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

...

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 

In [103]:
# Initialization of the model and training go here.
 
# Minibatch definitions:
size_of_batch = 100
dtrn = minibatch(xtrn, ytrn, size_of_batch; xtype=atype);
dtst = minibatch(xtst, ytst, size_of_batch; xtype=atype);

# Activation function definitions:
#softmax(z) = exp.(z) ./ sum(exp.(z), 1)
#cross_entropy(yhat, y) = - sum(y .* log.(yhat), 1)

# And finally training the network!
w   = initmodel(atype);
opt = optim(w, lr=1e-1);
fast=true
nepochs=2
if fast
    train(w, dtrn, opt, predict; epochs=nepochs)
    report(1, w, dtrn, dtst, predict)
else
    for epoch = 1:nepochs
        train(w, dtrn, opt, predict, epochs=1)
        report(epoch, w, dtrn, dtst, predict)
    end
end

(:epoch, 1, :trn, 0.9135666666666666, :tst, 0.9123)


In [101]:
# The weights of the trained NN so far
w

6-element Array{Array{Float32,2},1}:
 Float32[9.44334f-5 0.00981919 … -0.0123621 -0.0124799; 0.0120075 0.0051364 … -0.0145879 -0.00522884; … ; 0.0111125 -0.00912967 … 0.00251989 -0.00502382; -0.0110743 -0.00690804 … -0.00635217 -0.000927986]
 Float32[0.0215018; 0.00813538; … ; -0.0160255; -0.0335421]                                                                                                                                                 
 Float32[-0.011663 -0.00700956 … -0.019803 0.0092691; -0.0115634 -0.00333796 … -0.016449 0.0503878; … ; -0.0168314 0.0159894 … 0.0102819 0.0152978; -0.0065773 -0.00888762 … -0.017012 -0.0184079]          
 Float32[0.0288304; 0.00993935; … ; 0.0075435; 0.0251265]                                                                                                                                                   
 Float32[-0.0327712 -0.161604 … -0.00946362 0.064118; 0.0884028 0.00108924 … -0.00425101 -0.0382556; … ; -0.035406 -0.0317789 … -0.0146121 -0.0

In [102]:
# Accuracy of the trained NN for training (trn) and test (tst) datasets (minibatches)
println((:trn, accuracy(w, dtrn, predict), :tst, accuracy(w, dtst, predict)))

(:trn, 0.9142666666666667, :tst, 0.9117)


## Finding BEST classified MNIST digits using a pretrained NN

In [1]:
using JLD
# Loading weights of pretrained NN.
w = load("weights.jld","w")

8-element Array{Array{Float32,N} where N,1}:
 Float32[-0.205926 -0.0904408 … 0.164117 -0.0761175; 0.161502 0.0331632 … -0.030826 0.0367094; … ; -0.117497 -0.105343 … -0.0505276 0.152032; -0.122442 0.035822 … 0.143265 -0.0221839]

Float32[-0.196986 0.0898157 … 0.219502 0.29099; -0.228842 -0.242728 … 0.121453 0.167709; … ; -0.262132 -0.113112 … 0.0419672 0.213271; 0.0574061 -0.217951 … -0.00433231 -0.0806912]

Float32[-0.0390116 -0.0933435 … -0.0483773 -0.161176; -0.1537 0.170491 … -0.0569636 0.00612008; … ; -0.102554 -0.167769 … -0.114964 0.137601; 0.191619 -0.10391 … 0.197754 0.225219]

...

Float32[0.0675215 0.186577 … -0.206832 -0.154209; -0.0110244 0.223018 … -0.196005 0.0518611; … ; 0.00557064 0.184096 … 0.0273884 -0.129886; -0.160689 0.272189 … -0.0360744 -0.184851]

Float32[-0.0108836 -0.0140184 … -0.0652887 0.0456719; 0.139839 0.356439 … 0.0073903 -0.0205404; … ; 0.182038 0.255407 … 0.00759476 -0.156602; 0.131637 -0.143448 … 0.135704 -0.202735]

Float32[0.232463 0.0505198 … -0.0

In [4]:
# Input images of test dataset
xtst

28×28×1×10000 Array{Float32,4}:
[:, :, 1, 1] =
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0       0.0       0.0       0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.239216  0.47451   0.47451   0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.94902   0.996078  0.996078  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.996078  0.996078  0.811765  0.0
 ⋮  

In [5]:
# Target digits for test dataset
# Labels are from 0 to 9
1ytst-1

10000-element Array{Int64,1}:
 6
 1
 0
 9
 3
 0
 3
 8
 4
 8
 9
 5
 8
 ⋮
 4
 5
 6
 7
 8
 9
 0
 1
 2
 3
 4
 5

In [6]:
# Select all test data for one big batch. mini batch step = 1 <==> every image in dataset
dtrn = minibatch(xtrn, ytrn, 1; xtype=atype);
dtst = minibatch(xtst, ytst, 1; xtype=atype);

In [7]:
dtst

Knet.MB(Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], UInt8[0x07 0x02 … 0x05 0x06], 1, 10000, false, 1:10000, [28, 28, 1, 10000], [10000], Array{Float32,N} where N, Array{UInt8,1})

In [14]:
# Let's check test dataset of 10000 images
for (x, y) in dtst
    prediction = predict(w, x)
    correct = accuracy(prediction, y)
    acc = nll(prediction, y)
    if correct > 0 && acc < 1.0e-4
        println((:correct, correct, :accuracy, acc, :digit, 1y[1]))
    end
end

LoadError: [91mMethodError: no method matching *(::Array{Float32,4}, ::Array{Float32,2})[0m
Closest candidates are:
  *(::Any, ::Any, [91m::Any[39m, [91m::Any...[39m) at operators.jl:424
  *([91m::Type{AutoGrad.Grad{1}}[39m, ::Any, [91m::Any[39m, [91m::AutoGrad.Rec{##1109<:Number}[39m, [91m::AutoGrad.Rec{##1110<:Number}[39m) where {##1109<:Number, ##1110<:Number} at :0
  *([91m::Type{AutoGrad.Grad{1}}[39m, ::Any, [91m::Any[39m, [91m::##1109<:Number[39m, [91m::AutoGrad.Rec{##1110<:Number}[39m) where {##1109<:Number, ##1110<:Number} at :0
  ...[39m