In [None]:

# Create layers
"""
    create_layers_batch(network_input_size, layer_output_sizes)

Build the parameters of a stack of fully connected layers.

For every entry of `layer_output_sizes` a weight matrix of size
`(output_size, input_size)` and a bias vector of length `output_size` are drawn
with `randn`. The input size of each layer is the output size of the previous
one, starting from `network_input_size`.

Returns an untyped vector of `(W, b)` tuples, one per layer, in order.
"""
function create_layers_batch(network_input_size, layer_output_sizes)
    layers = []
    fan_in = network_input_size
    for fan_out in layer_output_sizes
        # Weights are drawn before biases; the tuple is built left to right.
        push!(layers, (randn(fan_out, fan_in), randn(fan_out)))
        fan_in = fan_out  # the next layer consumes this layer's output
    end
    return layers
end

"""
    feed_forward_batched(layers, input, activation_functions; backprop=false)

Run `input` through the network described by `layers`.

# Arguments
- `layers`: iterable of `(W, b)` tuples; `W` is `(output_size, input_size)`.
- `input`: matrix with one sample per row (`(batch, input_size)`), or a vector.
  A vector whose length equals the first layer's input size (and that size is
  not 1) is treated as a single sample and becomes a `1 × n` row. Any other
  vector is reshaped to an `n × 1` column, i.e. `n` samples of one feature.
- `activation_functions`: one callable per layer, applied to the whole
  pre-activation matrix. Layers and activations are paired with `zip`, so the
  shorter of the two decides how many layers are evaluated.

# Keywords
- `backprop`: when `true`, also return the per-layer cache.

# Returns
- `backprop == false`: the final activation matrix.
- `backprop == true`: `(a, Back)` where `Back[1] == (input, nothing)` and
  `Back[l + 1] == (activation, pre_activation)` of layer `l`.
"""
function feed_forward_batched(layers, input, activation_functions; backprop=false)
    if ndims(input) == 1
        # The forward pass computes `a * W'`, so samples must lie along rows.
        first_fan_in = isempty(layers) ? nothing : size(first(layers)[1], 2)
        is_single_sample = first_fan_in !== nothing && first_fan_in != 1 &&
                           length(input) == first_fan_in
        input = is_single_sample ? reshape(input, 1, :) : reshape(input, :, 1)
    end

    a = input
    # The cache is only allocated when the caller asked for it.
    Back = backprop ? Any[(a, nothing)] : nothing
    for ((W, b), func) in zip(layers, activation_functions)
        z = a * W' .+ b'  # pre-activation, shape (batch, output_size)
        a = func(z)       # activation
        backprop && push!(Back, (a, z))
    end

    return backprop ? (a, Back) : a
end
"""
    ReLU(z)

Rectified linear unit applied element-wise: each entry of `z` is replaced by
`max(0, entry)`.
"""
function ReLU(z)
    return broadcast(max, 0, z)
end

"""
    Sigmoid(z)

Logistic function `1 / (1 + exp(-z))`, applied element-wise to `z`.
"""
Sigmoid(z) = @. 1 / (1 + exp(-z))
"""
    Softmax(z)

Softmax over the second dimension of `z`, so each row of the result sums to
one. The row maximum is subtracted before exponentiating to keep `exp` from
overflowing.
"""
function Softmax(z)
    shifted = z .- maximum(z; dims=2)  # largest entry of every row becomes 0
    exps = exp.(shifted)
    return exps ./ sum(exps; dims=2)   # normalise each row
end

In [5]:
"""
    BackwardsPropagation(layers, y, backs, activation_functions; η=0.001)

Perform one gradient-descent step on `layers` and return `layers`.

The list `layers` is updated in place (each entry is replaced by a new
`(W, b)` tuple); the weight and bias arrays themselves are not mutated.

# Arguments
- `layers`: vector of `(W, b)` tuples, `W` of size `(output_size, input_size)`.
- `y`: targets with the same shape as the network output `(batch, outputs)`.
  A vector is accepted when it has as many elements as the network output and
  is reshaped to that shape.
- `backs`: cache produced by `feed_forward_batched(...; backprop=true)`:
  `backs[1]` holds the input, `backs[l + 1]` the `(activation, pre_activation)`
  of layer `l`.
- `activation_functions`: one activation per layer; entries `1:L-1` are passed
  to `activation_derivative` for the hidden layers.

# Keywords
- `η`: learning rate (default `0.001`).

# Notes
The output-layer error is `a_L - y`. That is the gradient with respect to the
pre-activation for a sigmoid or softmax output combined with cross-entropy
loss, so the output activation's own derivative is not applied.

# Throws
- `DimensionMismatch` if `y` cannot be matched to the shape of the output.
"""
function BackwardsPropagation(layers, y, backs, activation_functions; η=0.001)
    L = length(layers)
    L == 0 && return layers  # nothing to train

    a_L, _ = backs[end]       # network output, (batch, outputs)
    batch_size = size(a_L, 1)

    # Align the targets with the output so the subtraction below cannot
    # silently broadcast to a (batch, batch) matrix.
    targets = ndims(y) == 1 ? reshape(y, size(a_L)) : y
    size(targets) == size(a_L) || throw(DimensionMismatch(
        "targets have size $(size(targets)) but the network output has size $(size(a_L))"))

    δ = Vector{Any}(undef, L)  # δ[l] is the error signal of layer l
    δ[L] = a_L .- targets

    # Propagate the error backwards through the hidden layers.
    for l in (L - 1):-1:1
        W_next, _ = layers[l + 1]
        a_l, _ = backs[l + 1]  # activation of layer l
        σ_prime = activation_derivative(a_l, activation_functions[l])
        δ[l] = (δ[l + 1] * W_next) .* σ_prime
    end

    # Gradient-descent update, averaged over the batch.
    for l in 1:L
        W_l, b_l = layers[l]
        a_prev, _ = backs[l]  # input that layer l received
        grad_W = (δ[l]' * a_prev) ./ batch_size
        # `vec` keeps the bias gradient a vector so `b` does not turn into an
        # n×1 matrix after the first update.
        grad_b = vec(sum(δ[l]; dims=1)) ./ batch_size
        layers[l] = (W_l - η * grad_W, b_l - η * grad_b)
    end

    return layers
end
"""
    activation_derivative(a_l, activation_func)

Return the element-wise derivative of `activation_func`, expressed in terms of
the layer's *activation* `a_l` (not its pre-activation).

- `Sigmoid`: `a_l .* (1 .- a_l)`.
- `ReLU`: `1.0` where `a_l > 0`, `0.0` elsewhere.
- `Softmax`: `a_l` is returned unchanged; no Jacobian is computed here.
  NOTE(review): this is only meaningful if the caller handles softmax together
  with its loss — confirm before using `Softmax` on a hidden layer.

# Throws
- `ErrorException` for any other activation function.
"""
function activation_derivative(a_l, activation_func)
    if activation_func == Sigmoid
        return a_l .* (1 .- a_l)
    elseif activation_func == ReLU
        # The conversion must be broadcast: `Float64(::BitArray)` has no method.
        return Float64.(a_l .> 0)
    elseif activation_func == Softmax
        return a_l
    else
        error("Unknown activation function: $activation_func")
    end
end

# Reduce a score matrix to one class index per sample.
# The sample axis is the one whose length equals `n_samples`; when that test is
# ambiguous (both axes match or neither does) `rows_by_default` decides.
function _argmax_per_sample(scores, n_samples, rows_by_default)
    rows_match = size(scores, 1) == n_samples
    cols_match = size(scores, 2) == n_samples
    use_rows = rows_match == cols_match ? rows_by_default : rows_match
    slices = use_rows ? eachrow(scores) : eachcol(scores)
    return vec(map(argmax, slices))
end

"""
    ConfusionMatrix(model, input, target, act_func=nothing)

Compute a row-normalised confusion matrix for `model` on `input`.

# Arguments
- `model`: a callable (`model(input)` is used) when `act_func` is `nothing`;
  otherwise the layer list passed to `feed_forward_batched`.
- `input`: data forwarded to the model.
- `target`: either a one-hot matrix with one sample per column
  (`(classes, samples)`), or a vector / single-row / single-column array of
  labels.
- `act_func`: activation functions for `feed_forward_batched`, or `nothing`.

# Returns
`(cm, unique_labels)`:
- `unique_labels`: sorted labels that occur in the targets or the predictions.
- `cm`: `num_classes × num_classes` matrix of fractions in `[0, 1]` (not
  multiplied by 100). Row `i` describes true class `unique_labels[i]`; rows
  with no samples stay zero. The column order is reversed: column `j` holds
  the share predicted as `unique_labels[num_classes + 1 - j]`.

# Notes
Predictions with a single row or column are thresholded at `0.5`. Otherwise
the predicted class is the `argmax` over the class axis of every sample.
"""
function ConfusionMatrix(model, input, target, act_func=nothing)
    predictions = if act_func === nothing
        model(input)
    else
        feed_forward_batched(model, input, act_func; backprop=false)
    end

    if ndims(target) == 2 && size(target, 1) > 1 && size(target, 2) != 1
        # One-hot encoded targets, one sample per column.
        target_labels = vec(map(argmax, eachcol(target)))
        predicted_labels = _argmax_per_sample(
            predictions,
            length(target_labels),
            size(predictions, 1) >= size(predictions, 2),
        )
    else
        # Targets are already labels.
        target_labels = vec(target)
        if size(predictions, 1) == 1 || size(predictions, 2) == 1
            predicted_labels = vec(predictions .>= 0.5)
        else
            predicted_labels = _argmax_per_sample(predictions, length(target_labels), false)
        end
    end

    # Include predicted labels so a class the model predicts but that never
    # occurs in the targets still has an index instead of raising a KeyError.
    unique_labels = sort(unique(vcat(target_labels, predicted_labels)))
    label_to_index = Dict(label => idx for (idx, label) in enumerate(unique_labels))
    num_classes = length(unique_labels)

    cm = zeros(Int, num_classes, num_classes)
    for (t, p) in zip(target_labels, predicted_labels)
        # Columns are filled right-to-left on purpose (reversed class order).
        cm[label_to_index[t], num_classes + 1 - label_to_index[p]] += 1
    end

    # Divide every row by its total; empty rows divide by 1 and stay zero.
    cm_percentage = cm ./ max.(sum(cm; dims=2), 1)

    return cm_percentage, unique_labels
end


ConfusionMatrix (generic function with 2 methods)