In [13]:
"""
    batch(xTrain, yTrain, batches, batch_size, i)

Return the `i`-th mini-batch `(input, label)` of the training data.

`xTrain` is always sliced along its first dimension (`xTrain[a:b, :]`).
The layout of `yTrain` decides how the labels are sliced:

- a vector: `yTrain[a:b]`; when `i == batches` the slice runs to the end so the
  last batch also receives any leftover samples;
- a matrix with 1 or 10 rows: treated as one sample per column, `yTrain[:, a:b]`
  (the value 10 is a hard-coded heuristic inherited from the original code);
- any other matrix: treated as one sample per row, `yTrain[a:b, :]`.

# Arguments
- `batches`: total number of batches; only consulted for vector labels.
- `batch_size`: number of samples per batch (must be integer-valued).
- `i`: 1-based batch index.
"""
function batch(xTrain, yTrain, batches, batch_size, i)
    # First sample of this batch; the upper bound depends on the label layout.
    a = Int(batch_size * (i - 1) + 1)
    upper = Int(batch_size * i)

    if ndims(yTrain) == 1
        if i == batches
            return xTrain[a:end, :], yTrain[a:end]
        end
        b = min(upper, length(yTrain))
        return xTrain[a:b, :], yTrain[a:b]
    end

    nrows = size(yTrain, 1)
    if nrows == 1 || nrows == 10
        # Samples are stored as columns, so clamp against the column count.
        # (The original single-row branch clamped against the row count, i.e. 1,
        # which truncated every batch to at most one sample.)
        b = min(upper, size(yTrain, 2))
        return xTrain[a:b, :], yTrain[:, a:b]
    end

    b = min(upper, nrows)
    return xTrain[a:b, :], yTrain[a:b, :]
end

"""
    accuracy_score(y_true::Matrix{Int}, y_pred::Matrix{Float64})

Return the fraction (between 0 and 1, not a percentage) of columns for which
the row position of the maximum in `y_pred` matches that in `y_true`.

Each column is compared independently via `argmax(…, dims=1)`.
"""
function accuracy_score(y_true::Matrix{Int}, y_pred::Matrix{Float64})
    # Per-column location of the largest entry, flattened from 1×N to a vector.
    winners(scores) = vec(argmax(scores, dims=1))

    expected = winners(y_true)
    predicted = winners(y_pred)

    hits = count(expected .== predicted)
    return hits / length(expected)
end
"""
    initialize(method, layers, learning_rate, num_batches, batch_size;
               γ=0.9, ρ=0.9, ρ_1=0.9, ρ_2=0.999)

Build the initial `params` tuple for the optimizer function `method`.

`layers` is iterated as `(W, b)` pairs; every accumulator is a `zeros` array
with the same size as the corresponding `W` or `b`. The tuple layout per
optimizer is:

- `SGD!`:      `(learning_rate, num_batches, batch_size)`
- `Momentum!`: `(vW, vb, learning_rate, γ, num_batches, batch_size)`
- `AdaGrad!`:  `(rW, rb, learning_rate, num_batches, batch_size)`
- `RMSProp!`:  `(rW, rb, ρ, learning_rate, num_batches, batch_size)`
- `ADAM!`:     `(ρ_1, ρ_2, rW, sW, rb, sb, 0, learning_rate, num_batches, batch_size)`

Returns `nothing` when `method` is none of the above.
"""
function initialize(method, layers, learning_rate, num_batches, batch_size; γ=0.9, ρ=0.9, ρ_1=0.9, ρ_2=0.999)
    # Fresh zero accumulators shaped like the weights / biases of each layer.
    weight_zeros() = [zeros(size(W)) for (W, b) in layers]
    bias_zeros() = [zeros(size(b)) for (W, b) in layers]

    if method == SGD!
        return (learning_rate, num_batches, batch_size)
    end

    if method == Momentum!
        velocity_W = weight_zeros()
        velocity_b = bias_zeros()
        return (velocity_W, velocity_b, learning_rate, γ, num_batches, batch_size)
    end

    if method == AdaGrad!
        accum_W = weight_zeros()
        accum_b = bias_zeros()
        return (accum_W, accum_b, learning_rate, num_batches, batch_size)
    end

    if method == RMSProp!
        accum_W = weight_zeros()
        accum_b = bias_zeros()
        return (accum_W, accum_b, ρ, learning_rate, num_batches, batch_size)
    end

    if method == ADAM!
        second_W = weight_zeros()
        second_b = bias_zeros()
        first_W = weight_zeros()
        first_b = bias_zeros()
        step_count = 0
        return (ρ_1, ρ_2, second_W, first_W, second_b, first_b, step_count,
                learning_rate, num_batches, batch_size)
    end

    return nothing
end

initialize (generic function with 2 methods)

In [10]:
"""
    Train(xdata, ydata, layers, activation_funcs, learning_rate=0.001, epochs=100;
          batch_size, optimizer, loss, evaluate)

Split the data 70/30 with `SplitData`, then run `optimizer` once per epoch over
the training part and return the resulting `layers`.

# Arguments
- `layers`, `activation_funcs`: passed through to `optimizer` and
  `feed_forward_batched`.
- `learning_rate`, `epochs`: step size handed to `initialize` and number of
  passes over the training data.

# Keywords
- `batch_size`: samples per mini-batch.
- `optimizer`: one of the optimizer functions understood by `initialize`,
  called as `optimizer(xTrain, yTrain, layers, activation_funcs, loss, params)`
  and expected to return `(layers, params)`.
- `loss`: loss function forwarded to `optimizer`.
- `evaluate`: called as `evaluate(yTest, prediction)` every 10th epoch; its
  result is printed.
"""
function Train(xdata, ydata, layers, activation_funcs, learning_rate=0.001, epochs=100; batch_size, optimizer, loss, evaluate)
    xTrain, xTest, yTrain, yTest = SplitData(xdata, ydata, 0.7)

    # `batch` always slices xTrain by rows, so the sample count is its first
    # dimension regardless of whether the labels are stored per row or per column.
    num_batches = round(Int, size(xTrain, 1) / batch_size)

    # The optimizers unpack `batch_size` from `params`, so it must be supplied here.
    params = initialize(optimizer, layers, learning_rate, num_batches, batch_size)

    for epoch in 1:epochs
        layers, params = optimizer(xTrain, yTrain, layers, activation_funcs, loss, params)
        if epoch % 10 == 0
            println("Epoch:  ", epoch, "  Loss: ", evaluate(yTest, feed_forward_batched(layers, xTest, activation_funcs; backprop=false)))
        end
    end
    return layers
end

"""
    SGD!(xTrain, yTrain, layers, activation_funcs, loss_func, params)

Run one pass of mini-batch gradient descent over the training data.

`params` is `(η, num_batches, batch_size)`. For every batch the gradient of
`loss_func(input, layers, activation_funcs, target)` with respect to `layers`
is taken with Zygote, and each layer's `W` and `b` arrays are updated in place
(`.-=`) by `η` times their gradient.

Returns `(layers, params)` with `params` unchanged.
"""
function SGD!(xTrain, yTrain, layers, activation_funcs, loss_func, params)
    η, num_batches, batch_size = params

    for step in 1:num_batches
        input, target = batch(xTrain, yTrain, num_batches, batch_size, step)

        # Loss as a function of the layer parameters only.
        objective = candidate -> loss_func(input, candidate, activation_funcs, target)
        grads = first(Zygote.gradient(objective, layers))

        for (layer, layer_grad) in zip(layers, grads)
            W, b = layer
            W_g, b_g = layer_grad
            W .-= η .* W_g
            b .-= η .* b_g
        end
    end

    return (layers, (η, num_batches, batch_size))
end
                                                            
"""
    Momentum!(xTrain, yTrain, layers, activation_funcs, loss_func, params)

Run one pass of mini-batch gradient descent with momentum.

`params` is `(vW, vb, η, γ, num_batches, batch_size)`, where `vW[k]` / `vb[k]`
hold the velocity for layer `k`. Per batch and layer:

    v = γ * v + η * gradient
    θ = θ - v

Entries of `layers`, `vW` and `vb` are replaced with newly allocated arrays
(the containers are mutated, the old arrays are not).

Returns `(layers, params)` with the updated velocities.
"""
function Momentum!(xTrain, yTrain, layers, activation_funcs, loss_func, params)
    vW, vb, η, γ, num_batches, batch_size = params

    for step in 1:num_batches
        input, target = batch(xTrain, yTrain, num_batches, batch_size, step)

        # Loss as a function of the layer parameters only.
        objective = candidate -> loss_func(input, candidate, activation_funcs, target)
        grads = first(Zygote.gradient(objective, layers))

        for k in 1:length(layers)
            weights, bias = layers[k]
            weight_grad, bias_grad = grads[k]

            # Decay the previous velocity and add the scaled gradient.
            vW[k] = γ .* vW[k] .+ η .* weight_grad
            vb[k] = γ .* vb[k] .+ η .* bias_grad

            # Store a new (W, b) tuple for this layer.
            layers[k] = (weights .- vW[k], bias .- vb[k])
        end
    end

    return (layers, (vW, vb, η, γ, num_batches, batch_size))
end

"""
    AdaGrad!(xTrain, yTrain, layers, activation_funcs, loss_func, params)

Run one pass of mini-batch AdaGrad.

`params` is `(rW, rb, η, num_batches, batch_size)`, where `rW[k]` / `rb[k]`
accumulate the element-wise squared gradients of layer `k`. Per batch and layer:

    r = r + g * g
    θ = θ - η / (δ + sqrt(r)) * g      with δ = 1e-7

Entries of `layers`, `rW` and `rb` are replaced with newly allocated arrays.

Returns `(layers, params)` with the updated accumulators.
"""
function AdaGrad!(xTrain, yTrain, layers, activation_funcs, loss_func, params)
    rW, rb, η, num_batches, batch_size = params
    δ = 1e-7  # keeps the denominator away from zero

    for step in 1:num_batches
        input, target = batch(xTrain, yTrain, num_batches, batch_size, step)

        # Loss as a function of the layer parameters only.
        objective = candidate -> loss_func(input, candidate, activation_funcs, target)
        grads = first(Zygote.gradient(objective, layers))

        for k in 1:length(layers)
            weights, bias = layers[k]
            weight_grad, bias_grad = grads[k]

            # Accumulate squared gradients (never decayed).
            rW[k] = rW[k] .+ (weight_grad .* weight_grad)
            rb[k] = rb[k] .+ (bias_grad .* bias_grad)

            # Per-element step scaled by the accumulated gradient magnitude.
            weight_step = η ./ (δ .+ sqrt.(rW[k])) .* weight_grad
            bias_step = η ./ (δ .+ sqrt.(rb[k])) .* bias_grad

            # Store a new (W, b) tuple for this layer.
            layers[k] = (weights .- weight_step, bias .- bias_step)
        end
    end

    return (layers, (rW, rb, η, num_batches, batch_size))
end

"""
    RMSProp!(xTrain, yTrain, layers, activation_funcs, loss_func, params)

Run one pass of mini-batch RMSProp.

`params` is `(rW, rb, ρ, η, num_batches, batch_size)`, where `rW[k]` / `rb[k]`
hold an exponentially decayed average of the squared gradients of layer `k`.
Per batch and layer:

    r = ρ * r + (1 - ρ) * g * g
    θ = θ - η / (δ + sqrt(r)) * g      with δ = 1e-7

Entries of `layers`, `rW` and `rb` are replaced with newly allocated arrays.

Returns `(layers, params)` with the updated averages.
"""
function RMSProp!(xTrain, yTrain, layers, activation_funcs, loss_func, params)
    rW, rb, ρ, η, num_batches, batch_size = params
    δ = 1e-7  # keeps the denominator away from zero

    for step in 1:num_batches
        input, target = batch(xTrain, yTrain, num_batches, batch_size, step)

        # Loss as a function of the layer parameters only.
        objective = candidate -> loss_func(input, candidate, activation_funcs, target)
        grads = first(Zygote.gradient(objective, layers))

        for k in 1:length(layers)
            weights, bias = layers[k]
            weight_grad, bias_grad = grads[k]

            # Decayed running average of squared gradients.
            rW[k] = ρ .* rW[k] .+ (1 - ρ) .* (weight_grad .* weight_grad)
            rb[k] = ρ .* rb[k] .+ (1 - ρ) .* (bias_grad .* bias_grad)

            # Per-element step scaled by the running gradient magnitude.
            weight_step = η ./ (δ .+ sqrt.(rW[k])) .* weight_grad
            bias_step = η ./ (δ .+ sqrt.(rb[k])) .* bias_grad

            # Store a new (W, b) tuple for this layer.
            layers[k] = (weights .- weight_step, bias .- bias_step)
        end
    end

    return (layers, (rW, rb, ρ, η, num_batches, batch_size))
end

"""
    ADAM!(xTrain, yTrain, layers, activation_funcs, loss_func, params)

Run one pass of mini-batch Adam.

`params` is `(ρ_1, ρ_2, rW, sW, rb, sb, t, η, num_batches, batch_size)`:
`sW`/`sb` are the first-moment estimates (decay `ρ_1`), `rW`/`rb` the
second-moment estimates (decay `ρ_2`), and `t` counts the batches processed so
far. Per batch, `t` is incremented and for each layer:

    s = ρ_1 * s + (1 - ρ_1) * g
    r = ρ_2 * r + (1 - ρ_2) * g * g
    θ = θ - η * (s / (1 - ρ_1^t)) / (δ + sqrt(r / (1 - ρ_2^t)))   with δ = 1e-7

The moment arrays are updated in place (`.=`); entries of `layers` are replaced
with newly allocated `(W, b)` tuples.

Returns `(layers, params)` carrying the advanced step counter `t`.
"""
function ADAM!(xTrain, yTrain, layers, activation_funcs, loss_func, params)
    ρ_1, ρ_2, rW, sW, rb, sb, t, η, num_batches, batch_size = params
    δ = 1e-7  # keeps the denominator away from zero

    for step in 1:num_batches
        input, target = batch(xTrain, yTrain, num_batches, batch_size, step)

        # Loss as a function of the layer parameters only.
        objective = candidate -> loss_func(input, candidate, activation_funcs, target)
        grads = first(Zygote.gradient(objective, layers))

        t += 1
        # Bias-correction denominators; they depend only on the step count.
        first_moment_scale = 1 - ρ_1^t
        second_moment_scale = 1 - ρ_2^t

        for k in 1:length(layers)
            weights, bias = layers[k]
            weight_grad, bias_grad = grads[k]

            # Update the moment estimates in place.
            sW[k] .= ρ_1 .* sW[k] .+ (1 - ρ_1) * weight_grad
            rW[k] .= ρ_2 .* rW[k] .+ (1 - ρ_2) * (weight_grad .* weight_grad)

            sb[k] .= ρ_1 .* sb[k] .+ (1 - ρ_1) * bias_grad
            rb[k] .= ρ_2 .* rb[k] .+ (1 - ρ_2) * (bias_grad .* bias_grad)

            # Bias-corrected copies of the moments.
            sW_hat = sW[k] ./ first_moment_scale
            rW_hat = rW[k] ./ second_moment_scale
            sb_hat = sb[k] ./ first_moment_scale
            rb_hat = rb[k] ./ second_moment_scale

            weight_step = η .* sW_hat ./ (δ .+ sqrt.(rW_hat))
            bias_step = η .* sb_hat ./ (δ .+ sqrt.(rb_hat))

            # Store a new (W, b) tuple for this layer.
            layers[k] = (weights .- weight_step, bias .- bias_step)
        end
    end

    return (layers, (ρ_1, ρ_2, rW, sW, rb, sb, t, η, num_batches, batch_size))
end

ADAM! (generic function with 1 method)