In [35]:
import numpy as np

In [36]:
# def alpha(i):
#     return 1 / i

In [37]:
def gradient_descent(f, x0, alpha, num_iters, eps):
    result = {}
    
    x = x0
    
    for i in range(num_iters):
        x_new = x - alpha*gradient(x)
        
        if abs(f(x_new) - f(x)) < eps:
            result['converged'] = True
            break
        
        x = x_new
    else:
        result['converged'] = False
    result['iter'] = i
    result['x'] = x
    return result

In [38]:
def f(x):
    return 0.5*(x[0]**2 + 10*x[1]**2)

In [39]:
def gradient(x):
    return np.array([x[0], 10*x[1]])

In [40]:
x0 = np.array([2,5])
alpha = 0.1
num_iters = 1000
eps = 0.0001

In [41]:
gradient_descent(f, x0, alpha, num_iters, eps)

{'converged': True, 'iter': 40, 'x': array([0.02956177, 0.        ])}

In [42]:
def momentum(f, x0, alpha, num_iters, eps, beta):
    x = x0
    result = {}
    
    d = 0
    
    for i in range(num_iters):
        d = beta * d - alpha * gradient(x)
        x_new = x + d
        
        if abs(f(x_new) - f(x)) < eps:
            result['converged'] = True
            break
        
        x = x_new
    else:
        result['converged'] = False
    result['iter'] = i
    result['x'] = x
    return result

In [43]:
momentum(f, x0, alpha, num_iters, eps, beta=0.5)

{'converged': True, 'iter': 19, 'x': array([-0.00011178, -0.00162125])}

In [44]:
def nesterov(f, x0, alpha, num_iters, eps, beta):
    x = x0
    result = {}
    
    d = 0
    
    for i in range(num_iters):
        d = beta * d
        d = d - alpha * gradient(x + d)
        x_new = x + d
        
        if abs(f(x_new) - f(x)) < eps:
            result['converged'] = True
            break
        
        x = x_new
    else:
        result['converged'] = False
    result['iter'] = i
    result['x'] = x
    return result

In [45]:
nesterov(f, x0, alpha, num_iters, eps, beta=0.5)

{'converged': True, 'iter': 19, 'x': array([0.01679126, 0.        ])}

In [48]:
def adam(f, x0, alpha, num_iters, eps, beta1, beta2, delta):
    x = x0
    result = {}
    
    m = 0
    v = 0
    for i in range(1, num_iters+1):
        grad = gradient(x)
        m = beta1 * m + (1 - beta1) * grad 
        v = beta2 * v + (1 - beta2) * grad**2

        m_hat = m / (1 - beta1**i)
        v_hat = v / (1 - beta2**i)
        
        x_new = x - m_hat / (np.sqrt(v_hat) + delta)
        
        if abs(f(x_new) - f(x)) < eps:
            result['converged'] = True
            break
        
        x = x_new
    else:
        result['converged'] = False
    result['iter'] = i
    result['x'] = x
    return result

In [49]:
adam(f, x0, alpha, num_iters, eps, beta1=0.9, beta2=0.999, delta=1e-6)

{'converged': True, 'iter': 100, 'x': array([-0.00662001, -0.02229997])}