In [2]:
import numpy as np

In [3]:
def gradient_descent(f, gradient, x0, alpha, eps, iters):
    """Minimise ``f`` with fixed-step gradient descent.

    Every iteration proposes ``x_new = x - alpha * gradient(x)`` and stops as
    soon as the objective changes by less than ``eps`` between ``x`` and
    ``x_new``.

    Parameters
    ----------
    f : callable
        Objective, called as ``f(x)``; its result is compared via ``abs``.
    gradient : callable
        Called as ``gradient(x)``; the result is scaled by ``alpha`` and
        subtracted from ``x``.
    x0 :
        Starting point (the notebook passes a NumPy array). Not modified.
    alpha :
        Step size.
    eps :
        Stopping tolerance on ``abs(f(x_new) - f(x))``.
    iters : int
        Maximum number of iterations.

    Returns
    -------
    dict
        ``'converged'`` -- True only if the tolerance test passed within
        ``iters`` iterations.
        ``'num_iters'`` -- number of steps accepted before the loop ended;
        the final, converging step is not counted, and the value equals
        ``iters`` when the budget was exhausted.
        ``'x'`` -- the last point computed (``x0`` if no iteration ran).
    """
    x = x0
    # Pre-bind so the result is well defined even when the loop body never runs.
    x_new = x0
    converged = False
    num_iters = 0

    for _ in range(iters):
        x_new = x - alpha * gradient(x)

        if abs(f(x_new) - f(x)) < eps:
            # Track convergence explicitly: the loop index never reaches
            # `iters`, so it cannot be used to detect an exhausted budget.
            converged = True
            break

        x = x_new
        num_iters += 1

    return {'converged': converged, 'num_iters': num_iters, 'x': x_new}

In [4]:
def f(x):
    """Evaluate the quadratic ``0.5 * (x[0]**2 + 10 * x[1]**2)``.

    Only the first two entries of ``x`` are read.
    """
    first, second = x[0], x[1]
    weighted_squares = first**2 + 10*second**2
    return 0.5 * weighted_squares

In [5]:
def gradient(x):
    """Return ``[x[0], 10 * x[1]]`` as a NumPy array.

    This is the gradient of ``0.5 * (x[0]**2 + 10 * x[1]**2)``; only the
    first two entries of ``x`` are read.
    """
    d_first = x[0]
    d_second = 10*x[1]
    return np.array([d_first, d_second])

In [6]:
# Starting point and solver settings; later cells read these same globals.
x0 = np.array([3, 5])
eps = 1e-5        # tolerance on the change in f between consecutive points
iters = 1000      # iteration budget
alpha = 0.1       # step size

gradient_descent(f=f, gradient=gradient, x0=x0, alpha=alpha, eps=eps, iters=iters)

{'converged': True, 'num_iters': 54, 'x': array([0.00912976, 0.        ])}

In [7]:
def momentum(f, gradient, x0, alpha, eps, iters, beta):
    """Minimise ``f`` with gradient descent plus a momentum term.

    Every iteration updates the velocity ``d = beta * d + alpha * gradient(x)``
    and proposes ``x_new = x - d``. The loop stops as soon as the objective
    changes by less than ``eps`` between ``x`` and ``x_new``.

    Parameters
    ----------
    f : callable
        Objective, called as ``f(x)``; its result is compared via ``abs``.
    gradient : callable
        Called as ``gradient(x)``.
    x0 :
        Starting point (the notebook passes a NumPy array). Not modified.
    alpha :
        Step size applied to the gradient.
    eps :
        Stopping tolerance on ``abs(f(x_new) - f(x))``.
    iters : int
        Maximum number of iterations.
    beta :
        Weight given to the previous velocity.

    Returns
    -------
    dict
        ``'converged'`` -- True only if the tolerance test passed within
        ``iters`` iterations.
        ``'num_iters'`` -- number of steps accepted before the loop ended;
        the final, converging step is not counted, and the value equals
        ``iters`` when the budget was exhausted.
        ``'x'`` -- the last point computed (``x0`` if no iteration ran).
    """
    x = x0
    d = 0  # velocity; a scalar zero is enough because the first update adds alpha * gradient(x)
    # Pre-bind so the result is well defined even when the loop body never runs.
    x_new = x0
    converged = False
    num_iters = 0

    for _ in range(iters):
        d = beta * d + alpha * gradient(x)
        x_new = x - d

        if abs(f(x_new) - f(x)) < eps:
            # Track convergence explicitly: the loop index never reaches
            # `iters`, so it cannot be used to detect an exhausted budget.
            converged = True
            break

        x = x_new
        num_iters += 1

    return {'converged': converged, 'num_iters': num_iters, 'x': x_new}

In [8]:
# Momentum run using the globals set in the configuration cell.
momentum(f=f, gradient=gradient, x0=x0, alpha=alpha, eps=eps, iters=iters, beta=0.5)

{'converged': True, 'num_iters': 24, 'x': array([-0.00088475,  0.00063032])}

In [9]:
def nesterov(f, gradient, x0, alpha, eps, iters, beta):
    """Minimise ``f`` with Nesterov-style look-ahead momentum.

    Every iteration evaluates the gradient at the look-ahead point
    ``x - beta * d``, updates the velocity
    ``d = beta * d + alpha * gradient(x - beta * d)`` and proposes
    ``x_new = x - d``. The loop stops as soon as the objective changes by
    less than ``eps`` between ``x`` and ``x_new``.

    Parameters
    ----------
    f : callable
        Objective, called as ``f(x)``; its result is compared via ``abs``.
    gradient : callable
        Called with the look-ahead point.
    x0 :
        Starting point (the notebook passes a NumPy array). Not modified.
    alpha :
        Step size applied to the gradient.
    eps :
        Stopping tolerance on ``abs(f(x_new) - f(x))``.
    iters : int
        Maximum number of iterations.
    beta :
        Weight given to the previous velocity (also sets the look-ahead).

    Returns
    -------
    dict
        ``'converged'`` -- True only if the tolerance test passed within
        ``iters`` iterations.
        ``'num_iters'`` -- number of steps accepted before the loop ended;
        the final, converging step is not counted, and the value equals
        ``iters`` when the budget was exhausted.
        ``'x'`` -- the last point computed (``x0`` if no iteration ran).
    """
    x = x0
    d = 0  # velocity; a scalar zero makes the first look-ahead point equal to x
    # Pre-bind so the result is well defined even when the loop body never runs.
    x_new = x0
    converged = False
    num_iters = 0

    for _ in range(iters):
        look_ahead = x - beta * d
        d = beta * d + alpha * gradient(look_ahead)
        x_new = x - d

        if abs(f(x_new) - f(x)) < eps:
            # Track convergence explicitly: the loop index never reaches
            # `iters`, so it cannot be used to detect an exhausted budget.
            converged = True
            break

        x = x_new
        num_iters += 1

    return {'converged': converged, 'num_iters': num_iters, 'x': x_new}

In [10]:
# Nesterov run using the globals set in the configuration cell.
nesterov(f=f, gradient=gradient, x0=x0, alpha=alpha, eps=eps, iters=iters, beta=0.5)

{'converged': True, 'num_iters': 24, 'x': array([0.00450673, 0.        ])}

In [16]:
def adam(f, gradient, x0, alpha, eps, iters, beta1, beta2, delta):
    """Minimise ``f`` with the Adam update rule.

    Every iteration updates exponential moving averages of the gradient
    (``m``) and of its element-wise square (``v``), divides each by
    ``1 - beta**i`` to correct the bias from the zero initialisation, and
    proposes ``x_new = x - alpha * m_hat / (sqrt(v_hat) + delta)``. The loop
    stops as soon as the objective changes by less than ``eps`` between
    ``x`` and ``x_new``.

    Parameters
    ----------
    f : callable
        Objective, called as ``f(x)``; its result is compared via ``abs``.
    gradient : callable
        Called as ``gradient(x)``; the result is squared and passed through
        ``np.sqrt``, so it must support those operations.
    x0 :
        Starting point (the notebook passes a NumPy array). Not modified.
    alpha :
        Step size.
    eps :
        Stopping tolerance on ``abs(f(x_new) - f(x))``.
    iters : int
        Maximum number of iterations.
    beta1, beta2 :
        Decay rates of the first- and second-moment averages.
    delta :
        Small constant added to the denominator to avoid division by zero.

    Returns
    -------
    dict
        ``'converged'`` -- True only if the tolerance test passed within
        ``iters`` iterations (including on the very last one).
        ``'num_iters'`` -- number of iterations executed, counting the
        converging one; 0 if no iteration ran.
        ``'x'`` -- the last point computed (``x0`` if no iteration ran).
    """
    x = x0
    m = 0
    v = 0
    # Pre-bind so the result is well defined even when the loop body never runs.
    x_new = x0
    converged = False
    num_iters = 0

    # The counter starts at 1 because it is the exponent in the bias correction.
    for step in range(1, iters + 1):
        num_iters = step
        grad = gradient(x)
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad**2

        m_hat = m / (1 - beta1**step)
        v_hat = v / (1 - beta2**step)

        x_new = x - alpha * m_hat / (np.sqrt(v_hat) + delta)

        if abs(f(x_new) - f(x)) < eps:
            # Track convergence explicitly: comparing the counter with `iters`
            # misreports a run that converges on its final allowed iteration.
            converged = True
            break

        x = x_new

    return {'converged': converged, 'num_iters': num_iters, 'x': x_new}

In [17]:
# Adam run using the globals set in the configuration cell; hyper-parameters are named for readability.
adam(f, gradient, x0, alpha, eps, iters, beta1=0.9, beta2=0.999, delta=1e-7)

{'converged': True, 'num_iters': 142, 'x': array([-0.00108211, -0.00152502])}