In [1]:
import numpy as np

class Optimizer:
    """Abstract base for parameter-update rules.

    Stores the learning rate shared by all optimizers; concrete subclasses
    provide the actual rule by overriding ``update``.
    """

    def __init__(self, learning_rate=0.01):
        # Step size (η) read by the subclasses' update rules.
        self.learning_rate = learning_rate

    def update(self, params, grads):
        """
        Apply one optimization step (must be overridden by subclasses).

        params: dictionary of parameters (W1, b1, W2, b2, etc.)
        grads: dictionary of gradients (dW1, db1, dW2, db2, etc.)

        Raises:
            NotImplementedError: always, when called on this base class.
        """
        message = "Optimizer subclasses must implement update method"
        raise NotImplementedError(message)


In [2]:
class SGD(Optimizer):
    """
    Plain (stochastic) gradient descent.

    Update rule:
    θ ← θ - η * ∇J(θ)

    Symbols:
    θ: parameters (weights, biases)
    η: learning rate
    ∇J(θ): gradient of the cost function with respect to the parameters
    """
    def update(self, params, grads):
        """
        Take one gradient step on every entry of ``params`` and return it.

        params: dictionary of parameters, keyed by name (e.g. "W1")
        grads: dictionary of gradients; the gradient for parameter ``name``
               is looked up as ``grads["d" + name]``

        The update is applied with ``-=``, so NumPy arrays stored in
        ``params`` are modified in place; the same dictionary is returned.
        """
        step_size = self.learning_rate
        for name in params:
            gradient = grads["d" + name]
            params[name] -= step_size * gradient
        return params


In [3]:
class SGDMomentum(Optimizer):
    """
    Gradient descent with classical momentum.

    Update rule:
    v ← γ * v + η * ∇J(θ)
    θ ← θ - v

    Symbols:
    v: velocity, one buffer per parameter (starts at zero)
    γ: momentum coefficient (default 0.9)
    η: learning rate
    ∇J(θ): gradient of the cost function
    """
    def __init__(self, learning_rate=0.01, momentum=0.9):
        super().__init__(learning_rate)
        self.momentum = momentum
        # name -> velocity buffer; populated on the first call to update().
        self.velocity = {}

    def update(self, params, grads):
        """
        Apply one momentum step to every entry of ``params`` and return it.

        params: dictionary of parameters, keyed by name
        grads: dictionary of gradients, keyed by "d" + parameter name

        Parameters are updated with ``-=`` (in place for NumPy arrays).
        """
        # First call: allocate a zero velocity shaped like each parameter.
        if not self.velocity:
            for name, value in params.items():
                self.velocity[name] = np.zeros_like(value)

        gamma = self.momentum
        eta = self.learning_rate
        for name in params:
            # Blend the previous velocity with the scaled current gradient.
            new_velocity = gamma * self.velocity[name] + eta * grads["d" + name]
            self.velocity[name] = new_velocity
            # Move the parameter against the accumulated velocity.
            params[name] -= new_velocity

        return params


In [4]:
class RMSprop(Optimizer):
    """
    RMSprop (Root Mean Square Propagation).

    Update rule:
    s ← β * s + (1 - β) * (∇J(θ))^2
    θ ← θ - η * ∇J(θ) / (√s + ε)

    Symbols:
    s: moving average of squared gradients (starts at zero)
    β: decay rate (default 0.9)
    η: learning rate
    ε: small constant added to the denominator to avoid division by zero
    ∇J(θ): gradient of the cost function
    """
    def __init__(self, learning_rate=0.01, beta=0.9, epsilon=1e-8):
        super().__init__(learning_rate)
        self.beta = beta
        self.epsilon = epsilon
        # name -> moving average of squared gradients; filled on first update().
        self.squared_gradients = {}

    def update(self, params, grads):
        """
        Apply one RMSprop step to every entry of ``params`` and return it.

        params: dictionary of parameters, keyed by name
        grads: dictionary of gradients, keyed by "d" + parameter name

        Parameters are updated with ``-=`` (in place for NumPy arrays).
        """
        # First call: allocate a zero accumulator shaped like each parameter.
        if not self.squared_gradients:
            for name, value in params.items():
                self.squared_gradients[name] = np.zeros_like(value)

        decay = self.beta
        for name in params:
            # Refresh the exponentially decayed mean of squared gradients.
            running_avg = decay * self.squared_gradients[name] + \
                          (1 - decay) * np.square(grads["d" + name])
            self.squared_gradients[name] = running_avg
            # Scale the step element-wise by the root of that running mean.
            denominator = np.sqrt(running_avg) + self.epsilon
            params[name] -= self.learning_rate * grads["d" + name] / denominator

        return params

In [5]:
class Adam(Optimizer):
    """
    Adam (Adaptive Moment Estimation).

    Update rule at time step t:
    m ← β1 * m + (1 - β1) * ∇J(θ)          # first moment (mean of gradients)
    v ← β2 * v + (1 - β2) * (∇J(θ))^2      # second moment (uncentered variance)
    m̂ = m / (1 - β1^t)                     # bias-corrected first moment
    v̂ = v / (1 - β2^t)                     # bias-corrected second moment
    θ ← θ - η * m̂ / (√v̂ + ε)

    Symbols:
    m, v: moment estimates, one buffer per parameter (start at zero)
    β1, β2: decay rates for the moment estimates (defaults 0.9 and 0.999)
    t: time step, incremented once per call to update()
    η: learning rate
    ε: small constant added to the denominator to avoid division by zero
    """
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__(learning_rate)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {}  # name -> first moment; filled on first update()
        self.v = {}  # name -> second moment; filled on first update()
        self.t = 0   # number of update() calls so far

    def update(self, params, grads):
        """
        Apply one Adam step to every entry of ``params`` and return it.

        params: dictionary of parameters, keyed by name
        grads: dictionary of gradients, keyed by "d" + parameter name

        Parameters are updated with ``-=`` (in place for NumPy arrays).
        """
        # First call: allocate zero moment buffers shaped like each parameter.
        if not self.m:
            for name, value in params.items():
                self.m[name] = np.zeros_like(value)
                self.v[name] = np.zeros_like(value)

        self.t += 1

        # The bias-correction denominators depend only on t, not on the
        # parameter, so compute them once per step.
        first_correction = 1 - self.beta1 ** self.t
        second_correction = 1 - self.beta2 ** self.t

        for name in params:
            gradient = grads["d" + name]

            # Biased moment estimates.
            self.m[name] = self.beta1 * self.m[name] + (1 - self.beta1) * gradient
            self.v[name] = self.beta2 * self.v[name] + (1 - self.beta2) * np.square(gradient)

            # Undo the bias toward zero caused by the zero initialization.
            m_hat = self.m[name] / first_correction
            v_hat = self.v[name] / second_correction

            params[name] -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

        return params

In [6]:
class AdaGrad(Optimizer):
    """
    AdaGrad (Adaptive Gradient Algorithm).

    Update rule:
    s ← s + (∇J(θ))^2
    θ ← θ - η * ∇J(θ) / (√s + ε)

    Symbols:
    s: running sum of squared gradients (starts at zero, never decays)
    η: learning rate
    ε: small constant added to the denominator to avoid division by zero
    ∇J(θ): gradient of the cost function
    """
    def __init__(self, learning_rate=0.01, epsilon=1e-8):
        super().__init__(learning_rate)
        self.epsilon = epsilon
        # name -> sum of squared gradients; filled on first update().
        self.squared_gradients_sum = {}

    def update(self, params, grads):
        """
        Apply one AdaGrad step to every entry of ``params`` and return it.

        params: dictionary of parameters, keyed by name
        grads: dictionary of gradients, keyed by "d" + parameter name

        Both the accumulator and the parameters are updated with augmented
        assignment (in place for NumPy arrays).
        """
        # First call: allocate a zero accumulator shaped like each parameter.
        if not self.squared_gradients_sum:
            for name, value in params.items():
                self.squared_gradients_sum[name] = np.zeros_like(value)

        for name in params:
            gradient = grads["d" + name]
            # Grow the per-element history of squared gradients.
            self.squared_gradients_sum[name] += np.square(gradient)
            # Elements with a large gradient history take smaller steps.
            denominator = np.sqrt(self.squared_gradients_sum[name]) + self.epsilon
            params[name] -= self.learning_rate * gradient / denominator

        return params


In [7]:
class Adadelta(Optimizer):
    """
    Adadelta.

    Update rule (as implemented below):
    E[g²]_t       = ρ * E[g²]_{t-1} + (1 - ρ) * (∇J(θ))^2
    RMS[g]_t      = √(E[g²]_t + ε)
    RMS[Δθ]_{t-1} = √(E[Δθ²]_{t-1} + ε)
    Δθ_t          = - η * (RMS[Δθ]_{t-1} / RMS[g]_t) * ∇J(θ)
    E[Δθ²]_t      = ρ * E[Δθ²]_{t-1} + (1 - ρ) * Δθ_t^2
    θ_t           = θ_{t-1} + Δθ_t

    Symbols:
    E[g²]: running average of squared gradients (starts at zero)
    E[Δθ²]: running average of squared parameter updates (starts at zero)
    ρ: decay constant (default 0.95)
    η: learning rate (default 1.0); it scales Δθ before Δθ is accumulated
    ε: small constant added under the square roots for numerical stability
    """
    def __init__(self, learning_rate=1.0, rho=0.95, epsilon=1e-6):
        super().__init__(learning_rate)
        self.rho = rho
        self.epsilon = epsilon
        self.avg_squared_grad = {}    # name -> E[g²]; filled on first update()
        self.avg_squared_delta = {}   # name -> E[Δθ²]; filled on first update()

    def update(self, params, grads):
        """
        Apply one Adadelta step to every entry of ``params`` and return it.

        params: dictionary of parameters, keyed by name
        grads: dictionary of gradients, keyed by "d" + parameter name

        Parameters are updated with ``+=`` (in place for NumPy arrays).
        """
        # First call: allocate zero accumulators shaped like each parameter.
        if not self.avg_squared_grad:
            for name, value in params.items():
                self.avg_squared_grad[name] = np.zeros_like(value)
                self.avg_squared_delta[name] = np.zeros_like(value)

        rho = self.rho
        for name in params:
            gradient = grads["d" + name]

            # E[g²]_t
            grad_avg = rho * self.avg_squared_grad[name] + \
                       (1 - rho) * np.square(gradient)
            self.avg_squared_grad[name] = grad_avg

            # RMS[g]_t uses the freshly updated gradient average ...
            rms_grad = np.sqrt(grad_avg + self.epsilon)
            # ... while RMS[Δθ]_{t-1} uses the update average from the
            # previous step (it has not been refreshed yet).
            rms_prev_delta = np.sqrt(self.avg_squared_delta[name] + self.epsilon)

            # Δθ_t (already carries the minus sign)
            delta = -self.learning_rate * (rms_prev_delta / rms_grad) * gradient

            # E[Δθ²]_t
            self.avg_squared_delta[name] = rho * self.avg_squared_delta[name] + \
                                           (1 - rho) * np.square(delta)

            params[name] += delta

        return params


In [8]:
class Nadam(Optimizer):
    """
    Nesterov-accelerated Adaptive Moment Estimation

    Formula (simplified, constant β1 instead of a momentum schedule):
    m = β1 * m + (1 - β1) * ∇J(θ)               # First moment estimate
    v = β2 * v + (1 - β2) * (∇J(θ))^2           # Second moment estimate
    v̂ = v / (1 - β2^t)                          # Bias-corrected second moment
    m̂_nesterov = (β1 * m) / (1 - β1^(t+1)) + ((1 - β1) * ∇J(θ)) / (1 - β1^t)  # Nesterov momentum
    θ = θ - η * m̂_nesterov / (√v̂ + ε)           # Parameter update

    Note that m̂_nesterov is built from the raw (biased) m: the look-ahead
    momentum part is bias-corrected for step t+1 and the current-gradient
    part for step t, so each part is corrected exactly once.

    Where:
    m: first moment vector (mean of gradients, initialized as zeros)
    v: second moment vector (uncentered variance, initialized as zeros)
    β1, β2: decay rates for moment estimates
    t: time step, incremented once per call to update()
    η: learning rate
    ε: small constant for numerical stability
    """
    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__(learning_rate)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {}  # First moment
        self.v = {}  # Second moment
        self.t = 0   # Time step

    def update(self, params, grads):
        """
        Apply one Nadam step to every entry of ``params`` and return it.

        params: dictionary of parameters, keyed by name
        grads: dictionary of gradients, keyed by "d" + parameter name

        Parameters are updated with ``-=`` (in place for NumPy arrays).
        """
        if not self.m:
            # Initialize moments if it's the first iteration
            for key in params:
                self.m[key] = np.zeros_like(params[key])
                self.v[key] = np.zeros_like(params[key])

        self.t += 1

        # Bias-correction denominators depend only on t, so compute them once.
        m_correction_now = 1 - self.beta1 ** self.t
        m_correction_next = 1 - self.beta1 ** (self.t + 1)
        v_correction = 1 - self.beta2 ** self.t

        for key in params:
            grad = grads["d" + key]

            # Update biased first moment estimate
            self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * grad
            # Update biased second moment estimate
            self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * np.square(grad)

            # Compute bias-corrected second moment estimate
            v_corrected = self.v[key] / v_correction

            # Compute Nesterov accelerated momentum term.
            # The raw m is used on purpose: dividing an already
            # bias-corrected m by (1 - β1^(t+1)) would apply the correction
            # twice and inflate the early steps several-fold.
            m_nesterov = (self.beta1 * self.m[key]) / m_correction_next + \
                         ((1 - self.beta1) * grad) / m_correction_now

            # Update parameters
            params[key] -= self.learning_rate * m_nesterov / (np.sqrt(v_corrected) + self.epsilon)

        return params

In [9]:
"""
Usage example with neural network

def train_neural_network(model, optimizer, X, y, epochs=1000):
    for epoch in range(epochs):
        # Forward pass
        output = model.forward(X)

        # Compute loss
        loss = np.mean(np.square(y - output))

        # Backward pass - calculate gradients
        gradients = model.backward(X, y, output)

        # Update parameters using optimizer
        model.params = optimizer.update(model.params, gradients)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss}")

    return model

# Example usage:
# model = NeuralNetwork(...)
# optimizer = Adam(learning_rate=0.001)
# trained_model = train_neural_network(model, optimizer, X, y, epochs=5000)
"""


'\nUsage example with neural network\n\ndef train_neural_network(model, optimizer, X, y, epochs=1000):\n    for epoch in range(epochs):\n        # Forward pass\n        output = model.forward(X)\n        \n        # Compute loss\n        loss = np.mean(np.square(y - output))\n        \n        # Backward pass - calculate gradients\n        gradients = model.backward(X, y, output)\n        \n        # Update parameters using optimizer\n        model.params = optimizer.update(model.params, gradients)\n        \n        if epoch % 100 == 0:\n            print(f"Epoch {epoch}, Loss: {loss}")\n    \n    return model\n\n# Example usage:\n# model = NeuralNetwork(...)\n# optimizer = Adam(learning_rate=0.001)\n# trained_model = train_neural_network(model, optimizer, X, y, epochs=5000)\n'

# When to Use Each Optimizer
    
## SGD (Stochastic Gradient Descent)
- **When to use**: Simple problems with convex loss surfaces
- **Advantages**: Simple to implement, well understood theoretically
- **Disadvantages**: Slow convergence, gets stuck in local minima, sensitive to feature scaling
- **Learning rate**: Typically 0.01 to 0.1

## SGD with Momentum
- **When to use**: When SGD is too slow or oscillates too much
- **Advantages**: Faster convergence than SGD, can escape some local minima
- **Disadvantages**: Needs momentum hyperparameter tuning, can overshoot minima
- **Learning rate**: Typically 0.01 to 0.1, with momentum 0.9 to 0.99

## RMSprop
- **When to use**: Non-stationary objectives, problems with sparse gradients
- **Advantages**: Adapts learning rate per parameter, good for RNNs and computer vision
- **Disadvantages**: Requires more computation than SGD
- **Learning rate**: Typically 0.001 to 0.01

## Adam
- **When to use**: Deep learning models, noisy gradients, large datasets
- **Advantages**: Combines benefits of momentum and RMSprop, robust to hyperparameters
- **Disadvantages**: Can converge to sub-optimal solutions in some cases
- **Learning rate**: Typically 0.0001 to 0.001

## AdaGrad
- **When to use**: Sparse data, NLP problems
- **Advantages**: Good for sparse features, different learning rates per parameter
- **Disadvantages**: Learning rate decreases over time, may stop learning too early
- **Learning rate**: Typically 0.01 to 0.1

## Adadelta
- **When to use**: When you want AdaGrad's per-parameter adaptivity without its effective learning rate continually shrinking over time
- **Advantages**: No need to set learning rate manually, robust to large gradients
- **Disadvantages**: More computationally intensive
- **Learning rate**: Not critical, often set to 1.0

## Nadam
- **When to use**: When you want the benefits of both Nesterov accelerated gradient (NAG) and Adam
- **Advantages**: Faster convergence than Adam in many cases
- **Disadvantages**: Slightly more computation than Adam
- **Learning rate**: Typically 0.0002 to 0.002

## General Recommendations

- **Start with Adam**: It's a good default optimizer for most problems
- **Use SGD with momentum**: For fine-tuning or if you suspect Adam is converging to poor solutions
- **RMSprop**: Good alternative to Adam, especially for RNNs
- **For sparse data**: Try AdaGrad or sparse variants of Adam
- **If all else fails**: Grid search over optimizers and their hyperparameters

## Special Cases

- **Computer Vision**: Adam, RMSprop, or SGD with momentum (with learning rate scheduling)
- **NLP**: Adam or variants like AdamW (Adam with weight decay)
- **Reinforcement Learning**: Often RMSprop or Adam
- **Generative Models (GANs)**: Adam for generator, SGD for discriminator often works well
