In [None]:
# Combined loss function
def compute_loss(y_true, y_pred, loss_type="cross_entropy"):
    """
    Return a scalar loss between targets and predictions.

    Parameters:
    -----------
    y_true : np.ndarray
        Target values; must broadcast against y_pred.
    y_pred : np.ndarray
        Predicted values.
    loss_type : str, optional (default="cross_entropy")
        "mse"           -> mean of the squared differences over all elements.
        "cross_entropy" -> negative mean of y_true * log(y_pred + 1e-8) over
                           all elements (i.e. also averaged over the class axis).

    Returns:
    --------
    loss : numpy scalar

    Raises:
    -------
    ValueError
        If loss_type is neither "mse" nor "cross_entropy".
    """
    if loss_type not in ("mse", "cross_entropy"):
        raise ValueError("Invalid loss function. Choose 'mse' or 'cross_entropy'")

    if loss_type == "cross_entropy":
        # The 1e-8 offset keeps log() finite when a prediction is exactly zero.
        log_pred = np.log(y_pred + 1e-8)
        return -np.mean(y_true * log_pred)

    residual = y_true - y_pred
    return np.mean(np.square(residual))
def backpropagation_no_update(X, y, weights, biases, activation_func, loss_type="cross_entropy"):
    """
    Backpropagation that ONLY computes gradients w.r.t. weights and biases,
    without performing any parameter updates.

    Parameters:
    -----------
    X : np.ndarray
        Input batch handed to forward_propagation.
    y : np.ndarray
        Targets; must have the same shape as the network output h[-1].
    weights : list of np.ndarray
        List of weight matrices for each layer.
    biases : list of np.ndarray
        List of bias vectors for each layer.
    activation_func : str
        Hidden-layer activation: "sigmoid", "tanh", "relu" or "identity".
    loss_type : str, optional (default="cross_entropy")
        "cross_entropy" or "mse".

    Returns:
    --------
    grad_weights : list of np.ndarray
        Gradient for each weight matrix, summed over the batch (not averaged).
    grad_biases : list of np.ndarray
        Gradient for each bias, summed over the batch (not averaged).

    Raises:
    -------
    NotImplementedError
        If loss_type is not "cross_entropy" or "mse".
    ValueError
        If a hidden layer is reached and activation_func is unsupported.
    """
    # Forward pass: obtain activations (h) and pre-activations (a)
    h, a = forward_propagation(X, weights, biases, activation_func)

    # Gradient of the loss w.r.t. the output layer's pre-activation.
    if loss_type == "cross_entropy":
        # NOTE(review): this shortcut presumably relies on a softmax output
        # layer -- confirm against forward_propagation.
        grad_loss = h[-1] - y
    elif loss_type == "mse":
        # For MSE with sigmoid output
        grad_loss = (h[-1] - y) * (h[-1] * (1. - h[-1]))
    else:
        raise NotImplementedError(f"Loss type '{loss_type}' not implemented.")

    # Initialize lists to hold gradients for weights and biases.
    grad_weights = [None] * len(weights)
    grad_biases = [None] * len(biases)

    # Start backpropagation from the output layer.
    last_layer = len(weights) - 1
    grad_h = grad_loss
    for i in reversed(range(len(weights))):
        if i != last_layer:
            # Hidden layer: chain through the activation derivative at a[i].
            if activation_func == "sigmoid":
                grad_a = grad_h * d_sigmoid(a[i])
            elif activation_func == "tanh":
                grad_a = grad_h * d_tanh(a[i])
            elif activation_func == "relu":
                grad_a = grad_h * d_relu(a[i])
            elif activation_func == "identity":
                grad_a = grad_h * d_identity(a[i])
            else:
                raise ValueError(f"Unsupported activation function: {activation_func}")
        else:
            # Output layer: grad_loss is already w.r.t. the pre-activation.
            grad_a = grad_h

        # Compute gradients (note: no averaging is performed).
        grad_weights[i] = np.dot(h[i].T, grad_a)
        grad_biases[i] = np.sum(grad_a, axis=0, keepdims=True)

        # Propagate the gradient to the previous layer. The gradient w.r.t.
        # the network input is never used, so skip the product for layer 0.
        if i > 0:
            grad_h = np.dot(grad_a, weights[i].T)

    return grad_weights, grad_biases
        
def momentum(x_train, y_train, weights, biases, learning_rate=0.1, momentum_coef=0.5, num_epochs=1, batch_size=64):
    """
    Mini-batch gradient descent with momentum.

    The training set is walked in order in slices of `batch_size` samples and
    the parameters are updated once per mini-batch (not once per epoch).

    Parameters:
    -----------
    x_train : np.ndarray
        Training data (features).
    y_train : np.ndarray
        Training labels (integer class ids); flattened and one-hot encoded
        into 10 classes internally, like the other optimizers in this file.
    weights : list of np.ndarray
        List of weight matrices for each layer (updated in place).
    biases : list of np.ndarray
        List of bias vectors for each layer (updated in place).
    learning_rate : float, optional (default=0.1)
        The learning rate for the update.
    momentum_coef : float, optional (default=0.5)
        The momentum coefficient.
    num_epochs : int, optional (default=1)
        Number of epochs to run.
    batch_size : int, optional (default=64)
        Number of samples per mini-batch.

    Returns:
    --------
    weights : list of np.ndarray
        Updated weight matrices.
    biases : list of np.ndarray
        Updated bias vectors.

    Notes:
    ------
    Reads the activation and loss names from the global `config`.
    """
    # Initialize momentum vectors for weights and biases (velocity)
    velocity_w = [np.zeros_like(w) for w in weights]
    velocity_b = [np.zeros_like(b) for b in biases]
    y_train = y_train.flatten()

    for _ in range(num_epochs):
        for start in range(0, len(x_train), batch_size):
            stop = start + batch_size
            batch_x = x_train[start:stop]
            # One-hot encode the labels so they match the network output shape;
            # raw integer labels cannot be subtracted from h[-1].
            batch_y = np.eye(10)[y_train[start:stop].astype(int)]

            # Gradients for this mini-batch (backprop runs its own forward pass).
            grad_w, grad_b = backpropagation_no_update(
                batch_x, batch_y,
                weights, biases,
                activation_func=config["activation"],
                loss_type=config["loss"],
            )

            # Update parameters using momentum
            for j in range(len(weights)):
                # v = momentum_coef * v + learning_rate * gradient
                velocity_w[j] = momentum_coef * velocity_w[j] + learning_rate * grad_w[j]
                velocity_b[j] = momentum_coef * velocity_b[j] + learning_rate * grad_b[j]

                # parameter = parameter - v
                weights[j] -= velocity_w[j]
                biases[j] -= velocity_b[j]

    return weights, biases
def nag(x_train, y_train, weights, biases, learning_rate=0.1, momentum_coef=0.5, num_epochs=1, loss_type="cross_entropy"):
    """
    Nesterov Accelerated Gradient (NAG) using the whole training set as a
    single batch, i.e. one parameter update per epoch.

    Parameters:
    -----------
    x_train : np.ndarray
        Training data (features).
    y_train : np.ndarray
        Training labels (integer class ids); flattened and one-hot encoded
        into 10 classes internally.
    weights : list of np.ndarray
        List of weight matrices for each layer (updated in place).
    biases : list of np.ndarray
        List of bias vectors for each layer (updated in place).
    learning_rate : float, optional (default=0.1)
        The learning rate for the update.
    momentum_coef : float, optional (default=0.5)
        The momentum coefficient.
    num_epochs : int, optional (default=1)
        Number of epochs to run.
    loss_type : str, optional (default="cross_entropy")
        Loss used for BOTH the reported loss and the gradients, so that the
        printed value is the quantity actually being optimised.

    Returns:
    --------
    test_accuracy : float
        Result of compute_accuracy on the globally defined x_test / y_test.

    Notes:
    ------
    Reads the activation name from the global `config` and prints the
    per-epoch loss and the final test accuracy.
    """
    # Initialize velocity vectors for weights and biases (momentum)
    velocity_w = [np.zeros_like(w) for w in weights]
    velocity_b = [np.zeros_like(b) for b in biases]
    y_train = y_train.flatten()

    for epoch in range(num_epochs):
        # Use the entire training set as one batch
        batch_x = x_train
        batch_y = np.eye(10)[y_train.astype(int)]

        # Lookahead step: evaluate the gradient where momentum alone would
        # carry the parameters.
        temp_weights = [weights[j] - momentum_coef * velocity_w[j] for j in range(len(weights))]
        temp_biases = [biases[j] - momentum_coef * velocity_b[j] for j in range(len(biases))]

        # Forward pass using lookahead parameters (only to report the loss)
        h, _ = forward_propagation(batch_x, temp_weights, temp_biases, config["activation"])
        total_loss = compute_loss(batch_y, h[-1], loss_type)

        # Compute gradients using lookahead parameters and the same loss
        grad_w, grad_b = backpropagation_no_update(
            batch_x, batch_y,
            temp_weights, temp_biases,
            activation_func=config["activation"],
            loss_type=loss_type,
        )

        # Update velocities and parameters using NAG update rule
        for j in range(len(weights)):
            velocity_w[j] = momentum_coef * velocity_w[j] + learning_rate * grad_w[j]
            velocity_b[j] = momentum_coef * velocity_b[j] + learning_rate * grad_b[j]

            weights[j] -= velocity_w[j]
            biases[j] -= velocity_b[j]

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

    # Evaluate on test set (assuming x_test and y_test are defined globally)
    h_test, _ = forward_propagation(x_test, weights, biases, config["activation"])
    y_pred_test = np.argmax(h_test[-1], axis=1)
    test_accuracy = compute_accuracy(y_test, y_pred_test)
    print(f"Test Accuracy: {test_accuracy:.2f}%")

    return test_accuracy


# def nag(x_train, y_train, weights, biases, learning_rate=0.1, momentum_coef=0.5, num_epochs=1):
#     """
#     Nesterov Accelerated Gradient (NAG) optimizer that uses the entire training set as a single batch per epoch.
#     It applies a lookahead step before computing the gradients and updates parameters once per epoch.
    
#     Parameters:
#     -----------
#     x_train : np.ndarray
#         Training data (features).
#     y_train : np.ndarray
#         Training data (labels).
#     weights : list of np.ndarray
#         List of weight matrices for each layer.
#     biases : list of np.ndarray
#         List of bias vectors for each layer.
#     learning_rate : float, optional (default=0.1)
#         The learning rate for the update.
#     momentum_coef : float, optional (default=0.5)
#         The momentum coefficient.
#     num_epochs : int, optional (default=1)
#         Number of epochs to run.
    
#     Returns:
#     --------
#     test_accuracy : float
#         The test accuracy computed after training (assumes x_test and y_test are defined globally).
#     """
    
#     # Initialize velocity vectors for weights and biases (momentum)
#     velocity_w = [np.zeros_like(w) for w in weights]
#     velocity_b = [np.zeros_like(b) for b in biases]
    
#     for epoch in range(num_epochs):
#         # Use the entire training set as one batch
#         batch_x = x_train
# #         batch_y = np.eye(10)[y_train]  # One-hot encoding
#         batch_y = np.eye(10)[y_train.astype(int)]  # One-hot encoding with integer indices
     
#         # Lookahead step: temporarily adjust parameters using momentum
#         temp_weights = [weights[j] - momentum_coef * velocity_w[j] for j in range(len(weights))]
#         temp_biases  = [biases[j] - momentum_coef * velocity_b[j] for j in range(len(biases))]
        
#         # Forward pass using lookahead parameters
#         h, _ = forward_propagation(batch_x, temp_weights, temp_biases)
#         total_loss = compute_loss(batch_y, h[-1], config.loss)
        
#         # Compute gradients using lookahead parameters
#         grad_w, grad_b = backpropagation_no_update(
#             batch_x, batch_y,
#             temp_weights.copy(), temp_biases.copy(),
#             activation_func=config.activation,
#             loss_type=config.loss
#         )
        
#         # Update velocities and parameters using NAG update rule
#         for j in range(len(weights)):
#             # Update velocity: v = momentum_coef * v + learning_rate * grad
#             velocity_w[j] = momentum_coef * velocity_w[j] + learning_rate * grad_w[j]
#             velocity_b[j] = momentum_coef * velocity_b[j] + learning_rate * grad_b[j]
            
#             # Update parameters: parameter = parameter - velocity
#             weights[j] -= velocity_w[j]
#             biases[j]  -= velocity_b[j]
        
#         wandb.log({"epoch": epoch + 1, "loss": total_loss})
    
#     # Evaluate test accuracy (assumes x_test and y_test are defined globally)
#     h_test, _ = forward_propagation(x_test, weights.copy(), biases.copy(), activation_func=config.activation)
#     y_pred_test = np.argmax(h_test[-1], axis=1)
#     test_accuracy = compute_accuracy(y_test, y_pred_test)
#     wandb.log({"test_accuracy": test_accuracy})
    
#     return test_accuracy

def sgd(x_train, y_train, weights, biases, learning_rate=0.1, num_epochs=1):
    """
    Plain stochastic gradient descent: one parameter update per training sample.

    Parameters:
    -----------
    x_train : np.ndarray
        Training data (features), shape (num_samples, num_features).
    y_train : np.ndarray
        Training labels (integers); flattened and one-hot encoded into
        10 classes internally.
    weights : list of np.ndarray
        List of weight matrices for each layer (updated in place).
    biases : list of np.ndarray
        List of bias vectors for each layer (updated in place).
    learning_rate : float, optional (default=0.1)
        Step size applied to every per-sample gradient.
    num_epochs : int, optional (default=1)
        Number of passes over the training data.

    Returns:
    --------
    test_accuracy : float
        Result of compute_accuracy on the globally defined x_test / y_test.

    Notes:
    ------
    Reads the activation and loss names from the global `config` and logs the
    summed per-sample loss of every epoch to wandb.
    """
    sample_count = len(x_train)
    y_train = y_train.flatten()
    identity = np.eye(10)  # row k is the one-hot vector for class k

    for epoch_number in range(1, num_epochs + 1):
        # Re-shuffle the (already shuffled) data at the start of each epoch.
        order = np.arange(sample_count)
        np.random.shuffle(order)
        x_train, y_train = x_train[order], y_train[order]

        epoch_loss = 0.0
        for features, label in zip(x_train, y_train):
            sample_x = features.reshape(1, -1)  # shape: (1, num_features)
            sample_y = identity[int(label.item())].reshape(1, -1)

            # Loss of this sample under the current parameters.
            activations, _ = forward_propagation(sample_x, weights, biases, config["activation"])
            epoch_loss += compute_loss(sample_y, activations[-1], config["loss"])

            # Per-sample gradients.
            grad_w, grad_b = backpropagation_no_update(
                sample_x,
                sample_y,
                weights[:],
                biases[:],
                activation_func=config["activation"],
                loss_type=config["loss"],
            )

            # Immediate update (pure SGD).
            for layer, (dw, db) in enumerate(zip(grad_w, grad_b)):
                weights[layer] -= learning_rate * dw
                biases[layer] -= learning_rate * db

        wandb.log({"epoch": epoch_number, "loss": epoch_loss})

    # Evaluate on test data (assuming x_test and y_test are defined globally)
    test_activations, _ = forward_propagation(x_test, weights, biases, config["activation"])
    predicted_labels = np.argmax(test_activations[-1], axis=1)
    return compute_accuracy(y_test, predicted_labels)

def rmsprop(x_train, y_train, weights, biases, 
                 learning_rate=0.1, beta=0.5, eps=0.000001, num_epochs=1):
    """
    RMSProp optimizer in a pure stochastic setting, updating parameters after each sample.

    Parameters:
    -----------
    x_train : np.ndarray
        Training data (features), shape (num_samples, num_features).
    y_train : np.ndarray
        Training labels (integers); flattened and one-hot encoded into
        10 classes internally.
    weights : list of np.ndarray
        List of weight matrices for each layer (updated in place).
    biases : list of np.ndarray
        List of bias vectors for each layer (updated in place).
    learning_rate : float, optional (default=0.1)
        The learning rate for parameter updates.
    beta : float, optional (default=0.5)
        Decay rate for the moving average of squared gradients.
    eps : float, optional (default=1e-6)
        Small constant added to the denominator to avoid division by zero.
    num_epochs : int, optional (default=1)
        Number of epochs to run.

    Returns:
    --------
    test_accuracy : float
        Result of compute_accuracy on the globally defined x_test / y_test.

    Notes:
    ------
    Reads the activation and loss names from the global `config`. The training
    loss is not tracked: it was never reported, and computing it cost an extra
    forward pass per sample.
    """
    num_samples = len(x_train)

    # Initialize RMSProp accumulators for squared gradients
    v_w = [np.zeros_like(w) for w in weights]
    v_b = [np.zeros_like(b) for b in biases]
    y_train = y_train.flatten()

    for _ in range(num_epochs):
        # Shuffle features and labels together at the beginning of each epoch
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        x_train = x_train[indices]
        y_train = y_train[indices]

        # Process each training sample one-by-one
        for i in range(num_samples):
            sample_x = x_train[i].reshape(1, -1)  # shape: (1, num_features)
            # Convert the scalar label to a one-hot vector
            sample_y = np.eye(10)[int(y_train[i].item())].reshape(1, -1)

            # Gradients for the single sample (backprop runs its own forward pass)
            grad_w, grad_b = backpropagation_no_update(
                sample_x, sample_y,
                weights, biases,
                activation_func=config["activation"],
                loss_type=config["loss"],
            )

            # === RMSProp Update for Each Layer ===
            for j in range(len(weights)):
                # Update the moving average of the squared gradients
                v_w[j] = beta * v_w[j] + (1 - beta) * (grad_w[j] ** 2)
                v_b[j] = beta * v_b[j] + (1 - beta) * (grad_b[j] ** 2)

                # Update parameters: W -= lr * grad / (sqrt(v) + eps)
                weights[j] -= learning_rate * grad_w[j] / (np.sqrt(v_w[j]) + eps)
                biases[j] -= learning_rate * grad_b[j] / (np.sqrt(v_b[j]) + eps)

    # Evaluate on test data (assuming x_test and y_test are defined globally)
    h_test, _ = forward_propagation(x_test, weights, biases, config["activation"])
    y_pred_test = np.argmax(h_test[-1], axis=1)
    test_accuracy = compute_accuracy(y_test, y_pred_test)

    return test_accuracy

def adam(x_train, y_train, weights, biases,learning_rate=0.1, beta1=0.5, beta2=0.5, eps=0.000001, num_epochs=1):
    """
    Adam optimizer in a pure stochastic setting, updating parameters after each sample.
    (Simplified: no bias correction for the first and second moments.)

    Parameters:
    -----------
    x_train : np.ndarray
        Training data (features), shape (num_samples, num_features).
    y_train : np.ndarray
        Training labels (integers); flattened and one-hot encoded into
        10 classes internally.
    weights : list of np.ndarray
        List of weight matrices for each layer (updated in place).
    biases : list of np.ndarray
        List of bias vectors for each layer (updated in place).
    learning_rate : float, optional (default=0.1)
        The learning rate for parameter updates.
    beta1 : float, optional (default=0.5)
        Exponential decay rate for the first moment (moving average of gradients).
    beta2 : float, optional (default=0.5)
        Exponential decay rate for the second moment (moving average of squared gradients).
    eps : float, optional (default=1e-6)
        Small constant added to the denominator to avoid division by zero.
    num_epochs : int, optional (default=1)
        Number of epochs to run.

    Returns:
    --------
    test_accuracy : float
        Result of compute_accuracy on the globally defined x_test / y_test.

    Notes:
    ------
    Reads the activation and loss names from the global `config`. The training
    loss is not tracked: it was never reported, and computing it cost an extra
    forward pass per sample.
    """
    num_samples = len(x_train)

    # Initialize first (m) and second (v) moments for each layer
    m_w = [np.zeros_like(w) for w in weights]
    m_b = [np.zeros_like(b) for b in biases]
    v_w = [np.zeros_like(w) for w in weights]
    v_b = [np.zeros_like(b) for b in biases]
    y_train = y_train.flatten()

    for _ in range(num_epochs):
        # Shuffle the training data at the beginning of each epoch. Features
        # and labels must be permuted with the SAME indices, otherwise every
        # sample is trained against another sample's label.
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        x_train = x_train[indices]
        y_train = y_train[indices]

        # Process each training sample individually
        for i in range(num_samples):
            sample_x = x_train[i].reshape(1, -1)  # shape: (1, num_features)
            # Convert the scalar label to a one-hot vector
            sample_y = np.eye(10)[int(y_train[i].item())].reshape(1, -1)

            # Gradients for the single sample (backprop runs its own forward pass)
            grad_w, grad_b = backpropagation_no_update(
                sample_x, sample_y,
                weights, biases,
                activation_func=config["activation"],
                loss_type=config["loss"],
            )

            # === Adam Update for Each Layer (no bias correction) ===
            for j in range(len(weights)):
                # Update first moment
                m_w[j] = beta1 * m_w[j] + (1 - beta1) * grad_w[j]
                m_b[j] = beta1 * m_b[j] + (1 - beta1) * grad_b[j]

                # Update second moment
                v_w[j] = beta2 * v_w[j] + (1 - beta2) * (grad_w[j] ** 2)
                v_b[j] = beta2 * v_b[j] + (1 - beta2) * (grad_b[j] ** 2)

                # Parameter update: param -= lr * m / (sqrt(v) + eps)
                weights[j] -= learning_rate * (m_w[j] / (np.sqrt(v_w[j]) + eps))
                biases[j] -= learning_rate * (m_b[j] / (np.sqrt(v_b[j]) + eps))

    # Evaluate on test data (assuming x_test and y_test are defined globally)
    h_test, _ = forward_propagation(x_test, weights, biases, config["activation"])
    y_pred_test = np.argmax(h_test[-1], axis=1)
    test_accuracy = compute_accuracy(y_test, y_pred_test)

    return test_accuracy