In [84]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import tensor
from matplotlib import cm
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

In [85]:
# One seed value shared by both RNG libraries seeded below, so changing it
# in a single place keeps torch and numpy in sync.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [86]:
class Optimizer:
    """Base class for iterative optimizers that move a 2-D point.

    Keeps the current position, a history of visited positions and the loss
    at each of them, and drives the update loop. The update rule itself is
    supplied by subclasses through ``step``.
    """

    def __init__(self, init_position, loss_fn, grad_fn, lr=0.01):
        """Store the problem definition and record the starting point.

        Args:
            init_position: Tensor whose elements 0 and 1 are the starting
                (x, y) coordinates. It is copied and never modified.
            loss_fn: Callable ``loss_fn(x, y)`` invoked with Python scalars;
                its return value is appended to ``losses``.
            grad_fn: Callable stored for subclasses; the base class never
                calls it.
            lr: Learning rate, stored for subclasses.
        """
        # detach() cuts any autograd history: the position is only ever read
        # through .item() and updated by hand, so tracking gradients through
        # it would just grow a graph that nothing uses.
        self.position = init_position.detach().clone()
        self.loss_fn = loss_fn
        self.grad_fn = grad_fn
        self.lr = lr
        # History starts with the initial point and its loss, so both lists
        # always hold (number of steps taken + 1) entries.
        self.trajectory = [self.position.clone()]
        self.losses = [self._compute_loss()]

    def _compute_loss(self):
        """Return ``loss_fn`` evaluated at the current (x, y) as scalars."""
        return self.loss_fn(self.position[0].item(), self.position[1].item())

    def step(self):
        """Apply one update to ``self.position``; must be overridden."""
        raise NotImplementedError("Subclasses must implement step method")

    def optimize(self, num_steps=100):
        """Run ``num_steps`` updates, logging position and loss after each.

        Args:
            num_steps: Number of times ``step`` is called.

        Returns:
            A ``(position, loss)`` tuple: a copy of the final position and
            the most recently recorded loss.
        """
        for _ in range(num_steps):
            self.step()
            self.trajectory.append(self.position.clone())
            self.losses.append(self._compute_loss())
        # Hand back a copy rather than the live tensor: a step() that updates
        # self.position in place would otherwise silently change a result the
        # caller already holds when optimize() is called again.
        return self.position.clone(), self.losses[-1]

In [87]:
def rosenbrock(x, y, a=1, b=100):
    """Evaluate the Rosenbrock function f(x, y) = (a - x)^2 + b(y - x^2)^2.

    Args:
        x, y: Coordinates of the point to evaluate.
        a, b: Shape parameters of the function.

    Returns:
        The function value at (x, y); it is zero at (a, a^2).
    """
    offset = a - x        # distance from x = a
    valley = y - x**2     # distance from the parabola y = x^2
    return offset**2 + b * valley**2

def rosenbrock_grad(x, y, a=1, b=100):
    """Gradient of f(x, y) = (a - x)^2 + b(y - x^2)^2.

    Args:
        x, y: Coordinates of the point to evaluate.
        a, b: Shape parameters of the function.

    Returns:
        A tensor ``[df/dx, df/dy]``.
    """
    # Both partial derivatives share the (y - x^2) factor.
    valley = y - x**2
    df_dx = -2*(a - x) - 4*b*x*valley
    df_dy = 2*b*valley
    return tensor([df_dx, df_dy])

In [88]:
class SGD(Optimizer):
    """Plain gradient descent: position <- position - lr * gradient."""

    def step(self):
        """Take one step against the gradient at the current point."""
        x, y = self.position[0].item(), self.position[1].item()
        grad = self.grad_fn(x, y)
        self.position -= self.lr * grad

class Momentum(Optimizer):
    """Gradient descent with a velocity term.

    Each step computes ``velocity = momentum * velocity - lr * gradient`` and
    then moves the position by that velocity.
    """

    def __init__(self, init_position, loss_fn, grad_fn, lr=0.01, momentum=0.9):
        """Set up the base optimizer plus a zero-initialised velocity.

        Args:
            init_position, loss_fn, grad_fn, lr: Forwarded to ``Optimizer``.
            momentum: Factor applied to the previous velocity on each step.
        """
        super().__init__(init_position, loss_fn, grad_fn, lr)
        self.velocity = torch.zeros_like(init_position)
        self.momentum = momentum

    def step(self):
        """Update the velocity from the current gradient, then move by it."""
        x, y = self.position[0].item(), self.position[1].item()
        grad = self.grad_fn(x, y)
        carried = self.momentum * self.velocity
        self.velocity = carried - self.lr * grad
        self.position += self.velocity

class Adam(Optimizer):
    """Adam update rule: bias-corrected running averages of the gradient and
    of its element-wise square scale each step."""

    def __init__(self, init_position, loss_fn, grad_fn, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Set up the base optimizer plus Adam's running statistics.

        Args:
            init_position, loss_fn, grad_fn, lr: Forwarded to ``Optimizer``.
            beta1: Decay factor for the running average of the gradient.
            beta2: Decay factor for the running average of the squared gradient.
            epsilon: Added to the denominator of the update.
        """
        super().__init__(init_position, loss_fn, grad_fn, lr)
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        # Running averages start at zero; t counts the steps taken so far and
        # drives the bias correction in step().
        self.m = torch.zeros_like(init_position)
        self.v = torch.zeros_like(init_position)
        self.t = 0

    def step(self):
        """Advance the step counter, refresh both averages and move."""
        self.t += 1
        x, y = self.position[0].item(), self.position[1].item()
        grad = self.grad_fn(x, y)

        # Exponential moving averages of the gradient and its square.
        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
        self.v = self.beta2 * self.v + (1 - self.beta2) * grad**2

        # Both averages start at zero; dividing by (1 - beta^t) compensates
        # for that early bias.
        first_moment = self.m / (1 - self.beta1**self.t)
        second_moment = self.v / (1 - self.beta2**self.t)

        # Per-coordinate step: lr * m_hat / (sqrt(v_hat) + epsilon).
        numerator = self.lr * first_moment
        denominator = torch.sqrt(second_moment) + self.epsilon
        self.position -= numerator / denominator


In [108]:
# All three optimizers start from the same point and run for the same number
# of steps. Each one copies init_position on construction, so building them
# all up front does not let one run affect another.
init_position = tensor([-1.0, 1.0])
num_steps = 100

sgd = SGD(init_position, rosenbrock, rosenbrock_grad, lr=0.001)
momentum = Momentum(init_position, rosenbrock, rosenbrock_grad, lr=0.001)
adam = Adam(init_position, rosenbrock, rosenbrock_grad, lr=0.002)

# Run them in the original order and report where each one ended up.
for optimizer in (sgd, momentum, adam):
    final_pos, final_loss = optimizer.optimize(num_steps=num_steps)
    print(f"{optimizer.__class__.__name__}: Final position = {final_pos}, Final loss = {final_loss:.6f}")

SGD: Final position = tensor([-0.9127,  0.8410]), Final loss = 3.664728
Momentum: Final position = tensor([0.2936, 0.0817]), Final loss = 0.501033
Adam: Final position = tensor([-0.8887,  0.7992]), Final loss = 3.576109
