In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.core.display import Image

In [None]:
# Render the PNG at ../img//boromir.png (path relative to the notebook's working directory) as this cell's output.
Image(filename='../img//boromir.png')

<!-- <img src='../img//boromir.png'/> -->

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch, torchvision

In [None]:
import sys
sys.path.append('../../')
from mchlearn import utils

# Optimizers

[An overview of gradient descent optimization algorithms](http://ruder.io/optimizing-gradient-descent/)

## Gradient Descent

$$\begin{align}
\theta_{t+1}& = \theta_{t}-\eta\nabla_\theta L(\theta_t)
\end{align}
$$

In [None]:
# Load the example array from disk and cast it to float32.
# Later cells read columns 0 and 1 of its first 400 rows.
data = np.load("../data/sgd_data.npy").astype('float32')

In [None]:
# Helper object from mchlearn.utils built on the loaded array; the sine-fit cells below call
# its display_data() and run_example() methods.
sin_example = utils.SinFitExample(data)

In [None]:
# Presumably plots the data set (behaviour is defined in mchlearn.utils — confirm there).
# The trailing ';' suppresses the textual repr of the return value.
sin_example.display_data();

In [None]:
def fitf(x, o, t):
    """Evaluate the sine model ``sin(x*o + t)``.

    ``x``, ``o`` and ``t`` are combined with ordinary arithmetic, so they may be
    scalars or arrays that broadcast against each other.
    """
    phase = x * o + t
    return np.sin(phase)

def fitf_tensor(x, o, t):
    """Evaluate ``sin(x*o + t)`` for every pairing of a sample in ``x`` with an entry of ``o``.

    ``x`` is promoted to at least 1-D and combined with ``o`` through an outer
    product, ``t`` is added by broadcasting, and the sample axis is finally moved
    to the last position, so the result has shape ``o.shape + (len(x),)``.
    """
    samples = np.atleast_1d(x)
    # axes=0 -> outer product: result shape is samples.shape + o.shape
    outer = np.tensordot(samples, o, 0)
    values = np.sin(outer + t)
    # put the sample axis last so reductions over samples can use axis=-1
    return np.moveaxis(values, 0, -1)

def mse(f, x, y, o, t):
    """Half mean-squared error of the model ``f`` on the samples ``(x, y)``.

    Parameters
    ----------
    f : callable
        Model called as ``f(x, o, t)``; its last axis must run over the samples.
    x, y : scalar or array-like
        Inputs and targets. A scalar ``x`` is treated as a single sample.
    o, t : scalar or array-like
        Model parameters forwarded unchanged to ``f``.

    Returns
    -------
    ``0.5 * sum(err**2, axis=-1) / n`` where ``err = f(x, o, t) - y`` and ``n``
    is the number of samples in ``x``.
    """
    err = f(x, o, t) - y
    # len(x) fails for a scalar x; atleast_1d gives the same count for any
    # sequence/array and 1 for a scalar.
    n_samples = len(np.atleast_1d(x))
    return 0.5 * np.sum(err * err, axis=-1) / n_samples

In [None]:
# First 400 rows of `data`: column 0 becomes t_rxs, column 1 becomes t_rys.
# torch.from_numpy returns tensors that share memory with the NumPy slices.
t_rxs, t_rys = (torch.from_numpy(data[:400, column]) for column in (0, 1))

In [None]:
# Mean-squared-error criterion, constructed with its default arguments.
# NOTE(review): no other visible cell references loss_f — presumably used elsewhere or left over; confirm.
loss_f = torch.nn.MSELoss()

In [None]:
# Pair the two tensors sample by sample so a DataLoader can serve (t_rxs[i], t_rys[i]) batches.
rdataset = torch.utils.data.TensorDataset(t_rxs, t_rys)

In [None]:
# Loader that delivers the whole dataset as one unshuffled batch (batch_size equals the dataset length),
# i.e. every iteration over it yields exactly one full batch.
onebatchloader = torch.utils.data.DataLoader(rdataset, batch_size=len(rdataset), shuffle=False);

In [None]:
# SGD optimizer (lr=0.2) fed by the single-full-batch loader; start point p = (3.2, -0.4).
p = torch.tensor([3.2, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.SGD([p], lr=0.2)
sin_example.run_example(p, gd, onebatchloader);

In [None]:
# Mini-batch loader: 50 samples per batch, reshuffled on every pass over the data.
# The shuffle draws from a dedicated, explicitly seeded generator so that the
# batch order is reproducible after "Restart Kernel -> Run All".
batch_data_loader = torch.utils.data.DataLoader(
    rdataset,
    batch_size=50,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)

In [None]:
# Same optimizer and start point as the full-batch run, but fed by the shuffled
# 50-sample mini-batch loader.
p = torch.tensor([3.2, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.SGD([p], lr=0.2)
sin_example.run_example(p, gd, batch_data_loader);

## "Ravine"

In [None]:
# Two float32 parameters (1.0 and 10.0) handed to the ravine example below.
rav_par = np.asarray([1.0, 10.0], dtype='float32')

In [None]:
# Helper object from mchlearn.utils parameterised by rav_par; the "ravine" cells below call its run_example().
# Presumably rav_par sets the curvature along each axis — confirm in mchlearn.utils.
ravine_example = utils.RavineExample(rav_par)

In [None]:
# Plain SGD (lr=0.01) on the ravine, starting from p = (-8, 3).
p = torch.tensor([-8, 3], dtype=torch.float32, requires_grad=True)
gd = torch.optim.SGD([p], lr=0.01)
# 100 and 1.0 are forwarded positionally; their meaning is defined by RavineExample.run_example.
ravine_example.run_example(p, gd, 100, 1.0);

## Gradient Descent with Momentum

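$$\begin{align}
v_{t+1}& = \mu v_{t} + (1-\beta)\nabla_\theta L(\theta_t)\\
\theta_{t+1}& = \theta_{t}-\eta v_{t+1}
\end{align}
$$

Here $\mu$ is the momentum coefficient and $\beta$ is the dampening (the `momentum` and `dampening` arguments of `torch.optim.SGD`; the dampening is $0$ by default).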

In [None]:
# SGD with momentum 0.5 (lr=0.021) on the ravine, starting from p = (-8, 3).
p = torch.tensor([-8, 3], dtype=torch.float32, requires_grad=True)
gd = torch.optim.SGD([p], lr=0.021, momentum=0.5)
# 100 and 1.0 are forwarded positionally; their meaning is defined by RavineExample.run_example.
ravine_example.run_example(p, gd, 100, 1.0);

In [None]:
# SGD with momentum 0.9 (lr=0.01) on the sine fit with shuffled mini-batches; start point p = (3.2, -0.4).
p = torch.tensor([3.2, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.SGD([p], lr=0.01, momentum=0.9)
sin_example.run_example(p, gd, batch_data_loader);

$$\begin{align}
v_{t+1}& = \mu v_{t} + (1-\beta)\nabla_\theta L(\theta_t)\\
\theta_{t+1}& = \theta_{t}-\eta v_{t+1}
\end{align}
$$

$$v_{t+1} = \mu v_{t} + (1-\beta)g_t$$

$$v_1 = (1-\beta)g_0$$

$$v_2 = \mu (1-\beta)g_0+(1-\beta) g_1$$

$$v_3 = \mu\left(\mu (1-\beta)g_0+(1-\beta) g_1\right)+(1-\beta)g_2$$

$$v_3 = \mu^2 (1-\beta)g_0+\mu (1-\beta) g_1
+(1-\beta)g_2$$

$$v_t = (1-\beta)\sum_{i=1}^t \mu^{i-1}g_{t-i}$$

In [None]:
# Weight mu**n that a gradient from n steps back carries in the momentum sum,
# drawn for several momentum coefficients.
ns = np.arange(0, 100)
for mu in [0.9, 0.7, 0.5, 0.25]:
    plt.plot(ns, mu**ns, '.', label="%4.2f" % (mu,))
# Label the figure so that it can be read without the surrounding cells.
plt.xlabel("n (steps back)")
plt.ylabel(r"$\mu^n$")
plt.title("Decay of past gradients in the momentum term")
plt.legend(title=r"$\mu$");

## Nesterov Accelerated Gradient Descent

In [None]:
# Render the illustration of the Nesterov update vector as this cell's output.
# NOTE(review): this path is relative to the notebook's own directory, whereas the other image
# in this notebook is loaded from ../img/ — confirm the file really lives here.
Image(filename="nesterov_update_vector.png")

$$\begin{align}
v_{t+1}& = \mu v_{t} + \nabla_\theta L(\theta_t-\eta \mu v_t)\\
\theta_{t+1}& = \theta_{t}-\eta v_{t+1}
\end{align}
$$

$$r_t = \theta_t-\eta\mu v_t$$
$$\theta_t = r_t+\eta\mu v_t$$

$$\begin{align}
v_{t+1}& = \mu v_{t} + \nabla_r L(r_t)\\
r_{t+1}& = r_{t}-\eta\left(\nabla_r L(r_t) +\mu v_{t+1}\right)
\end{align}
$$

In [None]:
# SGD with Nesterov momentum 0.9 (lr=0.01) on the ravine, starting from p = (-8, 3).
p = torch.tensor([-8, 3], dtype=torch.float32, requires_grad=True)
gd = torch.optim.SGD([p], lr=0.01, momentum=0.9, nesterov=True)
# 100 and 1.0 are forwarded positionally; their meaning is defined by RavineExample.run_example.
ravine_example.run_example(p, gd, 100, 1.0);

In [None]:
# SGD with Nesterov momentum 0.9 (lr=0.03) on the sine fit with shuffled mini-batches; start point p = (3.2, -0.4).
p = torch.tensor([3.2, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.SGD([p], lr=0.03, momentum=0.9, nesterov=True)
sin_example.run_example(p, gd, batch_data_loader);

## Adaptive gradient: Adagrad

$$\begin{align}
v_{t+1}& =\nabla_\theta L(\theta_t)\\
G_{t+1}&=G_{t}+\left(\nabla_\theta L(\theta_t)\right)^2\\
\theta_{t+1}& = \theta_{t}-\frac{\eta}{\sqrt{G_{t+1}+\epsilon}} v_{t+1}
\end{align}
$$

In [None]:
# Adagrad (lr=2.0) on the ravine, starting from p = (-8, 3).
p = torch.tensor([-8, 3], dtype=torch.float32, requires_grad=True)
gd = torch.optim.Adagrad([p], lr=2.0)
# 100 and 1.0 are forwarded positionally; their meaning is defined by RavineExample.run_example.
ravine_example.run_example(p, gd, 100, 1.0);

In [None]:
# Adagrad (lr=0.2) on the sine fit with shuffled mini-batches; start point p = (3.2, -0.4).
p = torch.tensor([3.2, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.Adagrad([p], lr=0.2)
sin_example.run_example(p, gd, batch_data_loader);

## RMSProp

$$\begin{align}
v_{t+1}& =\nabla_\theta L(\theta_t)\\
E[g^2]_{t+1}&=\gamma E[g^2]_t+(1-\gamma)\left(\nabla_\theta L(\theta_t)\right)^2\\
\theta_{t+1}& = \theta_{t}-\frac{\eta}{\sqrt{E[g^2]_{t+1}+\epsilon}} v_{t+1}
\end{align}
$$

In [None]:
# RMSprop (lr=1.0) on the ravine, starting from p = (-8, 3).
p = torch.tensor([-8, 3], dtype=torch.float32, requires_grad=True)
gd = torch.optim.RMSprop([p], lr=1.0)
# 100 and 1.0 are forwarded positionally; their meaning is defined by RavineExample.run_example.
ravine_example.run_example(p, gd, 100, 1.0);

In [None]:
# RMSprop (lr=0.1) on the sine fit with shuffled mini-batches; start point p = (3.2, -0.4).
p = torch.tensor([3.2, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.RMSprop([p], lr=0.1)
sin_example.run_example(p, gd, batch_data_loader);

## Adadelta

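Units do not match in the RMSProp update: $g/\sqrt{E[g^2]}$ is dimensionless, so the step $\Delta\theta$ does not carry the units of $\theta$. Adadelta rescales the step using a running average of past parameter updates.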

$$\begin{align}
v_{t+1}& =\nabla_\theta L(\theta_t)\\
E[g^2]_{t+1}&=\gamma E[g^2]_t+(1-\gamma)\left(\nabla_\theta L(\theta_t)\right)^2\\
E[\Delta\theta^2]_t & = \gamma E[\Delta\theta^2]_{t-1} +(1-\gamma)\Delta\theta^2_t\\
\theta_{t+1}& = \theta_{t}-\frac{\eta \sqrt{E[\Delta\theta^2]_t+\epsilon}}{\sqrt{E[g^2]_{t+1}+\epsilon}} v_{t+1}
\end{align}
$$

In [None]:
# Adadelta (lr=4.0) on the ravine, starting from p = (-8, 3).
p = torch.tensor([-8, 3], dtype=torch.float32, requires_grad=True)
gd = torch.optim.Adadelta([p], lr=4.0)
# 2100 and 1.0 are forwarded positionally; their meaning is defined by RavineExample.run_example.
# Most other ravine cells pass 100 as the first of these two values.
ravine_example.run_example(p, gd, 2100, 1.0);

In [None]:
# Adadelta (lr=4.0) on the sine fit with shuffled mini-batches; start point p = (3.2, -0.4).
p = torch.tensor([3.2, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.Adadelta([p], lr=4.0)
sin_example.run_example(p, gd, batch_data_loader);

## Adam: Adaptive Moment Estimation

$$\begin{split}
m_t &= \beta_1 m_{t-1} + (1-\beta_1) g_t \\
v_t &= \beta_2 v_{t-1} + (1-\beta_2) g^2_t \\
\end{split}
$$

$$\theta_{t+1} = \theta_t -\frac{\eta}{\sqrt{v_t}+\epsilon}m_t $$

In [None]:
# Adam (lr=0.5) on the ravine, starting from p = (-8, 3).
p = torch.tensor([-8, 3], dtype=torch.float32, requires_grad=True)
gd = torch.optim.Adam([p], lr=0.5)
# 2100 and 1.0 are forwarded positionally; their meaning is defined by RavineExample.run_example.
# Most other ravine cells pass 100 as the first of these two values.
ravine_example.run_example(p, gd, 2100, 1.0);

In [None]:
# Adam (lr=0.5) on the sine fit with shuffled mini-batches.
# NOTE(review): the start point is (3.4, -0.4) here, while every other sine-fit cell
# starts from (3.2, -0.4) — confirm that the difference is intentional.
p = torch.tensor([3.4, -0.4], dtype=torch.float32, requires_grad=True)
gd = torch.optim.Adam([p], lr=0.5)
sin_example.run_example(p, gd, batch_data_loader);