In [1]:
using BackwordsDiffLibrary
using Random
using Statistics 

# Common supertype for network layers. `Dense` is the only subtype defined in
# this file; `forward` and `get_params` read `W`, `b` and `activation` from each
# layer they are given.
abstract type Layer end

"""
    Dense <: Layer

Fully connected layer: a `Float32` weight matrix `W`, a `Float32` bias vector `b`,
and an `activation` function that is broadcast over `W * x .+ b`.
"""
struct Dense <: Layer
    W::Matrix{Float32}
    b::Vector{Float32}
    activation::Function
end

"""
    Dense(in_features::Int, out_features::Int, activation::Function)

Build a `Dense` layer that maps `in_features` inputs to `out_features` outputs.

The weight matrix has size `(out_features, in_features)` and is produced by
`glorot_uniform`; the bias is a zero vector of length `out_features`.
"""
function Dense(in_features::Int, out_features::Int, activation::Function)
    weights = glorot_uniform(out_features, in_features)
    # Allocate the bias as Float32 up front so it already matches the field type
    # instead of creating a Float64 vector that the struct constructor must convert.
    bias = zeros(Float32, out_features)
    return Dense(weights, bias, activation)
end

"""
    glorot_uniform(fan_out::Int, fan_in::Int)

Return a `fan_out × fan_in` `Float32` matrix of uniform random values scaled to the
Glorot range `[-bound, bound)`, where `bound = sqrt(6 / (fan_in + fan_out))`.
"""
function glorot_uniform(fan_out::Int, fan_in::Int)
    bound = sqrt(6f0 / (fan_in + fan_out))
    width = 2f0 * bound
    # `rand` samples from [0, 1); stretch to the full width, then shift down by `bound`.
    samples = rand(Float32, fan_out, fan_in)
    return samples .* width .- bound
end

# Calling a Dense layer applies the affine map `W * x .+ b` and then broadcasts
# the layer's activation over the result.
(layer::Dense)(x::AbstractArray) = layer.activation.(layer.W * x .+ layer.b)

# Multi-layer perceptron: an ordered list of layers. `forward` iterates them
# first to last; `backword` walks them last to first.
struct MLP
    layers::Vector{Layer}
end

"""
    MLP(layers::AbstractVector{<:Layer})

Build an `MLP` from any vector of layers, e.g. a `Vector{Dense}`.

The vector is converted to `Vector{Layer}` and handed to the struct's default
constructor. The signature is deliberately wider than `Vector{Layer}`: a method
with exactly that signature would replace the auto-generated constructor and then
call itself without end.
"""
function MLP(layers::AbstractVector{<:Layer})
    # A `Vector{Layer}` argument dispatches to the more specific default constructor,
    # so this call cannot come back to this method.
    return MLP(convert(Vector{Layer}, layers))
end

# --- Optimizers ---

# Common supertype for the optimisers defined below (`SGD`, `Adam`); each one has
# its own `update!(opt, ps, grads)` method.
abstract type Optimiser end

# Plain stochastic gradient descent; `lr` is the step size multiplied with each
# gradient in `update!`.
struct SGD <: Optimiser
    lr::Float64
end

"""
    update!(opt::SGD, ps, grads)

Apply one gradient-descent step: every array in `ps` is decremented in place by
`opt.lr` times the entry of `grads` at the same index.
"""
function update!(opt::SGD, ps, grads)
    rate = opt.lr
    for idx in eachindex(ps)
        param = ps[idx]
        param .-= rate .* grads[idx]
    end
end

# Adam optimiser: hyper-parameters plus the running state that `update!` mutates.
struct Adam <: Optimiser
    # Step size.
    lr::Float32
    # Decay rate of the first-moment (mean of gradients) estimate.
    beta1::Float32
    # Decay rate of the second-moment (mean of squared gradients) estimate.
    beta2::Float32
    # Small constant added to the denominator of the update.
    eps::Float32
    # First-moment buffers, one per parameter array.
    m::Vector{Array{Float32}}
    # Second-moment buffers, one per parameter array.
    v::Vector{Array{Float32}}
    # Step counter kept in a one-element vector so it can be incremented even
    # though the struct itself is immutable.
    t::Vector{Int}
end

"""
    Adam(lr::Real, ps; beta1=0.9f0, beta2=0.999f0, eps=1e-8)

Create an `Adam` optimiser for the parameter arrays in `ps`.

One zero-filled `Float32` buffer with the shape of each parameter is allocated for the
first moment and another for the second moment; the step counter starts at 0.
`lr` and `eps` are converted to `Float32`.
"""
function Adam(lr::Real, ps; beta1=0.9f0, beta2=0.999f0, eps=1e-8)
    # Both moment buffers are created as Float32 so they already have the field type
    # and the struct constructor does not have to convert (and copy) them.
    m = Array{Float32}[zeros(Float32, size(p)) for p in ps]
    v = Array{Float32}[zeros(Float32, size(p)) for p in ps]
    return Adam(convert(Float32, lr), beta1, beta2, convert(Float32, eps), m, v, [0])
end

"""
    update!(opt::Adam, ps, grads)

Apply one Adam step. The step counter is advanced, the first- and second-moment
buffers are updated in place from `grads`, and each array in `ps` is decremented
in place using the bias-corrected moments.
"""
function update!(opt::Adam, ps, grads)
    step = (opt.t[1] += 1)
    b1 = opt.beta1
    b2 = opt.beta2
    # The bias-correction denominators depend only on the step count.
    corr1 = 1 - b1^step
    corr2 = 1 - b2^step
    for idx in eachindex(ps)
        g = grads[idx]
        m = opt.m[idx]
        v = opt.v[idx]
        m .= b1 .* m .+ (1 - b1) .* g
        v .= b2 .* v .+ (1 - b2) .* (g .^ 2)
        m_hat = m ./ corr1
        v_hat = v ./ corr2
        ps[idx] .-= opt.lr .* m_hat ./ (sqrt.(v_hat) .+ opt.eps)
    end
end

# --- Passes ---

"""
    forward(model::MLP, x::Array)

Run `x` through every layer of `model` and return `(output, A, Z)`.

`A` starts with the input and then holds each layer's activated output, so it has one
more entry than there are layers. `Z` holds each layer's pre-activation `W * input .+ b`.
"""
function forward(model::MLP, x::Array)
    activations = AbstractMatrix{Float32}[x]
    preactivations = Vector{AbstractMatrix{Float32}}()

    out = x
    for layer in model.layers
        pre = layer.W * out .+ layer.b
        out = layer.activation.(pre)

        push!(activations, out)
        push!(preactivations, pre)
    end
    return out, activations, preactivations
end

"""
    backword(loss_grad, model::MLP, A, Z, epoch, n)

Back-propagate `loss_grad` (the gradient of the loss with respect to the network
output) through `model` and return the parameter gradients.

# Arguments
- `loss_grad`: gradient with respect to the output of the last layer.
- `model`: the network whose layers are traversed from last to first.
- `A`, `Z`: activations and pre-activations as returned by `forward`; `A[i]` is the
  input that was fed to layer `i`.
- `epoch`, `n`: accepted for call compatibility; not used here.

# Returns
A vector ordered `[dW_1, db_1, dW_2, db_2, ...]`, the same order in which `get_params`
lists the parameters. Each `db` comes from `sum(dZ, dims=2)` and is therefore a
one-column matrix.
"""
function backword(loss_grad, model::MLP, A, Z, epoch, n)
    dA_prev = loss_grad
    gradients = []

    for i in length(model.layers):-1:1
        layer = model.layers[i]
        W = layer.W
        @diffunction d_activation(x) = layer.activation.(x)

        z = Z[i]
        a = A[i]

        # Chain rule through the activation: upstream gradient times the activation
        # derivative evaluated at the pre-activation.
        dZ = dA_prev .* grad(d_activation, [z])[1]

        dW = (dZ * a')
        db = sum(dZ, dims=2)

        # Average over the batch (columns of `a`).
        # NOTE(review): if `loss_grad` already comes from a mean-reduced loss, this is a
        # second division by the batch size; confirm that is intended.
        B = size(a, 2)
        dW ./= B
        db ./= B

        # Pushed in reverse (bias first) because the whole list is reversed below.
        push!(gradients, db)
        push!(gradients, dW)

        # Propagate to the previous layer exactly once; the first layer has nothing
        # upstream, so the product is skipped there.
        if i > 1
            dA_prev = W' * dZ
        end
    end

    reverse!(gradients)
    return gradients
end

# --- Helpers ---

"""
    batch_accuracy(y_pred::Array, y_true::Array)

Return the fraction of columns whose predicted class equals the true class.

When `y_pred` has a single row the task is treated as binary: a prediction counts as
class 1 when it is at least 0.5, and a label counts as class 1 when it equals 1.
Otherwise the class of each column is the position of its largest entry.
"""
function batch_accuracy(y_pred::Array, y_true::Array)
    is_binary = size(y_pred, 1) == 1
    predicted, expected = if is_binary
        # Binary: threshold the probabilities, compare with the 0/1 labels.
        (vec(y_pred .>= 0.5f0), vec(y_true .== 1f0))
    else
        # Multi-class: column-wise argmax on both sides.
        (vec(argmax(y_pred, dims=1)), vec(argmax(y_true, dims=1)))
    end
    return mean(predicted .== expected)
end

"""
    get_params(model::MLP)

Collect the parameter arrays of `model` as `[W_1, b_1, W_2, b_2, ...]`.

The returned vector holds the layers' own arrays, not copies, so in-place updates
to its entries change the model.
"""
function get_params(model::MLP)
    params = []
    foreach(model.layers) do layer
        push!(params, layer.W, layer.b)
    end
    return params
end

# --- Training ---

"""
    train!(model, loss_fun, X_train, y_train, X_test, y_test;
           epochs=5, lr=0.001, batchsize=64, optimizer=:SGD)

Train `model` in place with mini-batch updates and print one summary line per epoch.

# Arguments
- `model::MLP`: network whose parameter arrays are updated in place.
- `loss_fun`: called as `loss_fun(y_pred, y_true)`; also differentiated with `grad`.
- `X_train`, `y_train`: training data with one sample per column.
- `X_test`, `y_test`: evaluation data, scored once at the end of every epoch.

# Keywords
- `epochs`: number of passes over the training data.
- `lr`: learning rate handed to the optimiser.
- `batchsize`: number of columns per mini-batch; the last batch may be smaller.
- `optimizer`: `:SGD` or `:Adam`; any other value raises an error.

The reported training loss and accuracy are averages of the per-batch values.
"""
function train!(model::MLP, loss_fun, X_train, y_train, X_test, y_test;
    epochs=5, lr=0.001, batchsize=64, optimizer=:SGD)

    # These are the layers' own arrays, so optimiser updates change the model.
    ps = get_params(model)
    opt = if optimizer == :SGD
        SGD(lr)
    elseif optimizer == :Adam
        Adam(convert(Float32, lr), ps)
    else
        error("Unsupported optimizer type. Choose :SGD or :Adam.")
    end

    n_samples = size(X_train, 2)  # samples are arranged in columns

    # Materialise the evaluation inputs once instead of once per epoch.
    X_eval = Matrix(X_test)

    for epoch in 1:epochs
        total_loss = 0.0
        total_acc = 0.0
        num_batches = 0

        # Process mini-batches.
        for first_col in 1:batchsize:n_samples
            # Named `last_col` so that `Base.last` is not shadowed.
            last_col = min(first_col + batchsize - 1, n_samples)
            x_batch = X_train[:, first_col:last_col]
            y_batch = y_train[:, first_col:last_col]

            # FORWARD PASS
            y_pred, A, Z = forward(model, x_batch)

            # Current loss and accuracy, for reporting only.
            batch_loss = loss_fun(y_pred, y_batch)
            batch_acc = batch_accuracy(y_pred, y_batch)

            # Gradient of the loss with respect to the network output.
            @diffunction loss_wrapper(a, y) = loss_fun(a, y)
            delta = grad(loss_wrapper, [y_pred, y_batch])[1]

            # BACKWARD PASS
            grads = backword(delta, model, A, Z, epoch, num_batches)

            update!(opt, ps, grads)

            # accumulate
            total_loss += batch_loss
            total_acc  += batch_acc
            num_batches += 1
        end

        # average over all batches
        avg_train_loss = total_loss / num_batches
        avg_train_acc  = total_acc  / num_batches

        # evaluation on the test set
        y_test_pred, _, _ = forward(model, X_eval)
        test_loss = loss_fun(y_test_pred, y_test)
        test_acc  = batch_accuracy(y_test_pred, y_test)

        println(
            "Epoch $epoch ▶ ",
            "Train Loss=$(round(avg_train_loss, digits=4)), ",
            "Train Acc=$(round(100*avg_train_acc, digits=2))%  │ ",
            "Test Loss=$(round(test_loss, digits=4)), ",
            "Test Acc=$(round(100*test_acc, digits=2))%"
        )
    end
end

# The cell ends on `nothing`, presumably so the notebook shows no value for it.
nothing

In [2]:
using JLD2
# All four splits live in the same JLD2 file. Requesting them in one `load` call
# opens the file once instead of four times; the values come back as a tuple in
# the order the names are given.
dataset_path = "data/imdb_dataset_prepared.jld2"
X_train, y_train, X_test, y_test = load(
    dataset_path, "X_train", "y_train", "X_test", "y_test"
)

# Labels are converted element-wise to Float32.
y_train = Float32.(y_train)
y_test = Float32.(y_test)

# --- Model Definition ---

# Binary cross-entropy, averaged over all elements:
#   -mean(y * log(p + 1e-7) + (1 - y) * log(1 - p + 1e-7))
# The 1e-7 offset keeps each `log` argument above zero when a prediction is
# exactly 0 or 1.
# NOTE(review): `1e-7` is a Float64 literal, so Float32 inputs would be promoted
# to Float64 inside this expression; confirm that is intended.
loss_fun(y_pred, y_true) = -mean(y_true .* log.(y_pred .+ 1e-7) .+ (1 .- y_true) .* log.(1 .- y_pred .+ 1e-7))

# Layer widths: the input width is the number of rows of X_train, followed by one
# hidden layer of 32 units and a single output unit.
in_features, hidden, out_features = size(X_train, 1), 32, 1

layers = [
    Dense(in_features, hidden, ReLU),
    Dense(hidden, out_features, Sigmoid),
]
model = MLP(layers)

nothing

In [3]:
# Train for five epochs with Adam on mini-batches of 64 columns.
train!(
    model, loss_fun, X_train, y_train, X_test, y_test;
    epochs=5, lr=0.001, batchsize=64, optimizer=:Adam,
)

Epoch 1 ▶ Train Loss=0.6463, Train Acc=77.81%  │ Test Loss=0.5795, Test Acc=82.65%
Epoch 2 ▶ Train Loss=0.46, Train Acc=90.72%  │ Test Loss=0.4401, Test Acc=85.85%
Epoch 3 ▶ Train Loss=0.2991, Train Acc=94.11%  │ Test Loss=0.3658, Test Acc=87.3%
Epoch 4 ▶ Train Loss=0.2045, Train Acc=96.26%  │ Test Loss=0.3323, Test Acc=87.0%
Epoch 5 ▶ Train Loss=0.1464, Train Acc=97.65%  │ Test Loss=0.3185, Test Acc=87.15%
