# Multilayer Perceptron

The code below is borrowed from the Flux Model-Zoo [example](https://github.com/FluxML/model-zoo/blob/master/vision/mlp_mnist/mlp_mnist.jl). I'm adding annotations and explanations in order to better understand what is happening.

In [15]:
using Flux, Statistics
using Flux.Data: DataLoader
using Flux: onehotbatch, onecold, @epochs
using Flux.Losses: logitcrossentropy
using Base: @kwdef
using CUDA
using MLDatasets

In [16]:
"""
    getdata(args, device)

Load MNIST and return `(train_loader, test_loader)`, two `DataLoader`s that
yield `(features, one-hot labels)` mini-batches of size `args.batchsize`.
Only the training loader shuffles.

`device` is accepted but not used in the body; batches are not moved to a
device here.
"""
function getdata(args, device)
    # NOTE(review): presumably lets the dataset download proceed without an
    # interactive prompt — confirm against the DataDeps documentation.
    ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

    # Raw images and integer labels, images as Float32
    train_images, train_labels = MLDatasets.MNIST.traindata(Float32)
    test_images, test_labels = MLDatasets.MNIST.testdata(Float32)

    # Flatten each image into a linear array
    train_features = Flux.flatten(train_images)
    test_features = Flux.flatten(test_images)

    # One-hot encode the labels over the ten digit classes
    classes = 0:9
    train_targets = onehotbatch(train_labels, classes)
    test_targets = onehotbatch(test_labels, classes)

    # Mini-batch iterators
    train_loader = DataLoader(
        (train_features, train_targets); batchsize=args.batchsize, shuffle=true
    )
    test_loader = DataLoader((test_features, test_targets); batchsize=args.batchsize)

    return train_loader, test_loader
end

getdata (generic function with 1 method)

A dense neural network with:
1. Input layer: size 28x28x1 (each image is flattened into a vector of 784 values)
2. Hidden layer: `Dense(784, 32, relu)` — 32 units with ReLU activation function
3. Output layer: `Dense(32, 10)` — 10 units with no activation function (linear transformation)



Questions:
1. Why doesn't the second dense layer have an activation function?
2. Why doesn't the output layer have an activation function?

In [17]:
"""
    build_model(; imgsize=(28,28,1), nclasses=10, nhidden=32)

Build a two-layer perceptron as a `Chain` of `Dense` layers.

# Keywords
- `imgsize`: dimensions of one input image; the input width is `prod(imgsize)`.
- `nclasses`: number of outputs of the final layer.
- `nhidden`: width of the hidden layer (defaults to 32, the previously
  hard-coded value).

The first layer uses `relu`; the final layer is given no activation function.
"""
function build_model(; imgsize=(28,28,1), nclasses=10, nhidden=32)
    return Chain(
        Dense(prod(imgsize), nhidden, relu),
        Dense(nhidden, nclasses))
end

build_model (generic function with 1 method)

The softmax function maps a vector $y$ of length $n$ to a vector of $n$ probabilities, whose $i$-th entry is
$$\mathrm{softmax}(y)_i = \frac{e^{y_i}}{\sum_{j=1}^{n}e^{y_j}}$$

The loss function is logit cross-entropy, averaged over the $N$ samples of a batch:

$$-\frac{1}{N}\sum_{k=1}^{N}{\mathbf{y}_k \cdot \log{\mathrm{softmax}(\hat{\mathbf{y}}_k)}}$$

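Accuracy is the fraction of samples whose predicted class (the `onecold` of the model output) matches the true label.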



In [18]:
"""
    loss_and_accuracy(data_loader, model, device)

Evaluate `model` on every `(x, y)` mini-batch produced by `data_loader`.

Each batch is passed through `device` before the forward pass. Samples are
counted along the second dimension of `x`.

Returns `(loss, accuracy)`:
- `loss`: the summed `logitcrossentropy` over all batches divided by the
  number of samples;
- `accuracy`: the fraction of samples for which `onecold` of the model output
  equals `onecold` of the label.

If `data_loader` yields no batches, both values are `NaN` (0 / 0).
"""
function loss_and_accuracy(data_loader, model, device)
    acc = 0
    ls = 0.0f0
    num = 0
    for (x, y) in data_loader
        x, y = device(x), device(y)
        # One forward pass per batch, shared by the loss and the accuracy so
        # both are computed from the same predictions.
        ŷ = model(x)
        ls += logitcrossentropy(ŷ, y, agg=sum)
        # Bring predictions and labels back through `cpu` before decoding.
        acc += sum(onecold(cpu(ŷ)) .== onecold(cpu(y)))
        num += size(x, 2)
    end
    return ls / num, acc / num
end

loss_and_accuracy (generic function with 1 method)

Arguments accepted by the train function

In [19]:
"""
    Args(; η=3e-4, batchsize=256, epochs=10, use_cuda=true)

Training options, constructed by keyword with the defaults shown.

# Fields
- `η`: learning rate.
- `batchsize`: batch size.
- `epochs`: number of epochs.
- `use_cuda`: use the GPU (if CUDA is available).
"""
@kwdef mutable struct Args
    η::Float64     = 3e-4
    batchsize::Int = 256
    epochs::Int    = 10
    use_cuda::Bool = true
end

Args

In [20]:
"""
    train(; kws...)

Train the model returned by `build_model()` and return it.

Keyword arguments are forwarded to `Args`. The GPU is used only when
`CUDA.functional()` is true and `args.use_cuda` is set; otherwise training runs
on the CPU. After every epoch the loss and accuracy on the train and test
loaders are printed.
"""
function train(; kws...)
    # Gather the options into a single struct
    args = Args(; kws...)

    # Pick the device
    on_gpu = CUDA.functional() && args.use_cuda
    if on_gpu
        @info "Training on CUDA GPU"
        CUDA.allowscalar(false)
    else
        @info "Training on CPU"
    end
    device = on_gpu ? gpu : cpu

    # Data iterators
    train_loader, test_loader = getdata(args, device)

    # Model, its trainable parameters, and the optimiser
    model = device(build_model())
    ps = Flux.params(model)
    opt = ADAM(args.η)

    for epoch in 1:args.epochs
        for (xbatch, ybatch) in train_loader
            # Transfer the batch to the selected device
            x = device(xbatch)
            y = device(ybatch)

            # Gradient of the batch loss with respect to the parameters
            gs = gradient(ps) do
                logitcrossentropy(model(x), y)
            end

            # Apply the optimiser step
            Flux.Optimise.update!(opt, ps, gs)
        end

        # Per-epoch report on both data sets
        train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
        test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
        println("Epoch=$epoch")
        println("  train_loss = $train_loss, train_accuracy = $train_acc")
        println("  test_loss = $test_loss, test_accuracy = $test_acc")
    end

    return model
end

train (generic function with 1 method)

In [21]:
# Train with the default `Args` and keep the returned model.
model=train()

┌ Info: Training on CPU
└ @ Main In[20]:9


Epoch=1
  train_loss = 0.58702844, train_accuracy = 0.8582166666666666
  test_loss = 0.5663061, test_accuracy = 0.8667
Epoch=2
  train_loss = 0.39506525, train_accuracy = 0.8982166666666667
  test_loss = 0.38087246, test_accuracy = 0.9048
Epoch=3
  train_loss = 0.33085406, train_accuracy = 0.9101
  test_loss = 0.3210697, test_accuracy = 0.9145
Epoch=4
  train_loss = 0.295957, train_accuracy = 0.9188
  test_loss = 0.28966498, test_accuracy = 0.9211
Epoch=5
  train_loss = 0.2740223, train_accuracy = 0.9240666666666667
  test_loss = 0.27111995, test_accuracy = 0.9254
Epoch=6
  train_loss = 0.25454944, train_accuracy = 0.9292
  test_loss = 0.25350928, test_accuracy = 0.9301
Epoch=7
  train_loss = 0.24209398, train_accuracy = 0.9321666666666667
  test_loss = 0.24446595, test_accuracy = 0.9332
Epoch=8
  train_loss = 0.23079434, train_accuracy = 0.9363833333333333
  test_loss = 0.23403232, test_accuracy = 0.9335
Epoch=9
  train_loss = 0.22088745, train_accuracy = 0.9381
  test_loss = 0.225104

Chain(
  Dense(784, 32, relu),                 # 25_120 parameters
  Dense(32, 10),                        # 330 parameters
)                   # Total: 4 arrays, 25_450 parameters, 99.664 KiB.

In [31]:
# Inspect the trained model's trainable parameters.
Flux.params(model)

Params([Float32[0.049993202 -0.06393898 … -0.06987077 0.04271984; -0.061495777 -0.014502867 … 0.0018826268 0.051979214; … ; 0.042647757 -0.049728796 … -0.08447711 0.07487877; 0.054329116 -0.062309947 … 0.03573599 0.047698807], Float32[0.058615033, 0.07206579, 0.10467072, 0.09704774, -0.03483924, 0.04263985, 0.14307258, -0.06871432, -0.066273786, 0.035063792  …  0.10930989, 0.088000655, 0.12883368, 0.021055035, 0.119266994, -0.057405416, 0.14141738, 0.119340084, 0.08043698, -0.06795475], Float32[-0.26071203 -0.42110112 … 0.15960893 -0.3156442; 0.46470702 0.070996575 … 0.21388298 0.26626477; … ; 0.30768564 0.26401868 … -0.45963842 0.41523233; -0.4036806 0.44423026 … 0.12616383 -0.44311807], Float32[-0.07924302, 0.08012125, -0.047721677, -0.026696526, 0.03917007, 0.087582976, 0.027365515, 0.062117185, -0.13889477, -0.04218835]])

(Dense(784, 32, relu), Dense(32, 10))