# Physics 494/594
## Stochastic Gradient Descent

In [None]:
# %load ./include/header.py
import numpy as np
import matplotlib.pyplot as plt
import sys
from tqdm import trange,tqdm
sys.path.append('./include')
import ml4s
import jax.numpy as jnp 
from jax import grad
from IPython import display

# render matplotlib figures inline in the notebook, using the SVG format
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# apply the local matplotlib style sheet and widen numpy's printed lines to 120 characters
plt.style.use('./include/notebook.mplstyle')
np.set_printoptions(linewidth=120)
# hand the bootstrap CSS file to the ml4s helper (presumably styles the notebook's HTML output)
ml4s.set_css_style('./include/bootstrap.css')
# list of colors in the active matplotlib property cycle (read after the style sheet is applied)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# shorthand for pi used in the cells below
π = np.pi

## Last Time

### [Notebook Link: 10_Model_Complexity_Regularization.ipynb](./10_Model_Complexity_Regularization.ipynb)

- Improved gradient descent methods based on momentum and adaptation

## Today

- Introduction of approximate gradients for stochastic downhill steps.

Let us return to our original task, the minimization of a cost function: 

\begin{equation}
\boxed{
\mathcal{C} = \frac{1}{2N} \sum_{n=1}^N  \left( F^{(n)} - y^{(n)} \right)^2 = \frac{1}{2N} \lvert \lvert \vec{F} - \vec{y}\rvert\rvert^2
}
\end{equation}

which represents the goodness of fit of some data set $\{(x^{(n)},y^{(n)})\}_{n=1}^N$ to some proposed model $F$. Here, for simplicity we will assume $x^{(n)},y^{(n)} \in \mathbb{R}$ such that  $F : \mathbb{R} \mapsto \mathbb{R}$.

Consider the following data set located at `../data/stochastic_GD.dat`

<!--
x = np.linspace(0,6.0/(π),100)
header = f"{'x':>13s}\t{'y':>15s}"
k1 = np.random.normal(loc=2*π,scale=0.1,size=x.size)
k2 = np.random.normal(loc=4*π,scale=0.2,size=x.size)
y = np.cos(k1*x)*np.sin(k2*x) + np.random.normal(loc=0,scale=0.05,size=x.size)
data_out = np.column_stack([x,y])
np.savetxt('../data/stochastic_GD.dat', data_out,fmt='% 15.8e', header=header, delimiter='\t')
-->

In [None]:
# read the data file as an array with one column per variable and split it into x and y
x, y = np.loadtxt('../data/stochastic_GD.dat').T

In [None]:
# show the raw data as unconnected circular markers
plt.plot(x, y, marker='o', linestyle='none')
plt.xlabel('x')
plt.ylabel('y')

### Model

The proposed model to fit this data is given by:

\begin{equation}
F(x,\vec{w}) = \cos(w_0 x)\sin(w_1 x)
\end{equation}

Our goal is to extract the weights $\vec{w}$ from the data.

<div class="span alert alert-warning">
    <strong>Note:</strong> if we want <tt>jax</tt> to differentiate a function, it must be written with <tt>jnp</tt> (<tt>jax.numpy</tt>) operations instead of <tt>np</tt>.
</div>

In [None]:
def F(x,w):
    """Model prediction cos(w[0]*x) * sin(w[1]*x).

    Parameters
    ----------
    x : scalar or array
        Input value(s) at which the model is evaluated.
    w : indexable with at least two entries
        Weights; w[0] multiplies x inside the cosine and w[1] inside the sine.

    Returns
    -------
    The elementwise product of the two factors, computed with jax.numpy so
    that it can be differentiated by jax.
    """
    cos_term = jnp.cos(w[0]*x)
    sin_term = jnp.sin(w[1]*x)
    return cos_term*sin_term

## The Cost Function 

We define the cost function as the least squares difference (maximum likelihood) between the model and data. 

In [None]:
def C(w,x,y):
    """Cost: one half of the mean squared difference between F(x,w) and y.

    Parameters
    ----------
    w : weights passed through to the model F
    x : input value(s) passed through to the model F
    y : target value(s) compared against the model output

    Returns
    -------
    0.5 times the average of (F(x,w) - y)**2 over all entries.
    """
    residual = F(x,w) - y
    return 0.5*jnp.mean(residual**2)

# gradient of the cost with respect to its first argument (the weights w)
dC_dw = grad(C,argnums=0)

### Visualize

As before, we can visualize the cost function as a 2D function in weight space.  Note that, unlike a simple quadratic bowl, this cost surface is not convex: it has many local minima.  Let's use our previously defined functions.

In [None]:
# 50 evenly spaced trial values in [π, 5π], used along both weight axes
grid_w = np.linspace(start=π, stop=5*π, num=50)

# plot the cost as a function of the weights only, with the data (x,y) held fixed
fig,ax,ax3d = ml4s.plot_2D_function(grid_w, grid_w, lambda w: C(w,x,y))

#### Visualize the direction and size of the gradient

In [None]:
# range of the plotted weight grid (also reused by later cells)
min_w = np.min(grid_w)
max_w = np.max(grid_w)

# draw a random sampling point inside the plotted weight range
wₒ = np.random.uniform(low=min_w,high=max_w,size=2)

# gradient and cost at the sampling point, both over the full data set;
# the cost is evaluated once here and reused for the 3D arrow and the label
dC = dC_dw(wₒ,x,y)
C_at_wₒ = C(wₒ,x,y)

# plot the surface and the sampling point
fig,ax,ax3d = ml4s.plot_2D_function(grid_w,grid_w,lambda w: C(w,x,y))
ax.plot(wₒ[0],wₒ[1],'x', ms=5)

# plot the gradients: an arrow from wₒ to wₒ+dC in the 2D panel, and the
# corresponding arrow between the cost values at those two points in the 3D panel
arrow_prop_dict = dict(mutation_scale=10, arrowstyle='-|>', color='k', fc='w', shrinkA=0, shrinkB=0)
ax.annotate('',xy=wₒ+dC,xytext=wₒ,xycoords='data', textcoords='data',arrowprops=dict(arrowstyle="-|>", fc='w',
                             shrinkA=0,shrinkB=0))
a = ml4s.Arrow3D([wₒ[0], wₒ[0]+dC[0]], [wₒ[1], wₒ[1]+dC[1]], [C_at_wₒ,C(wₒ+dC,x,y)], **arrow_prop_dict)
ax3d.add_artist(a);

# annotate the figure with the sampled point, its cost and its gradient
ax.text(1,1.1,rf'$w_0 = ({wₒ[0]:.3f},{wₒ[1]:.3f}),\; C(w_0) = {C_at_wₒ:.3f},\;  \nabla C(w_0) = ({dC[0]:.3f},{dC[1]:.3f})$', 
        fontsize=13, transform=ax.transAxes, ha='center');

## Gradient Descent

Implement our conventional gradient descent procedure.

In [None]:
fig,ax,ax3d = ml4s.plot_2D_function(grid_w,grid_w,lambda w: C(w,x,y))

# hyperparameters: learning rate and number of update steps
η = 0.5
num_iter = 100

# let's start near the minimum
w = np.array([6.75,14.5])

# store the cost function during the trajectory
C_traj = {}
C_traj['GD'] = np.zeros([num_iter])

ax.plot(*w, marker='.', color='k', ms=15)  

# cost at the current weights; evaluated once per step and reused for the
# trajectory record, the 3D line segment and the title
C_new = C(w,x,y)

for i in range(num_iter):

    # we keep a copy of the previous weights and cost for plotting;
    # C_traj['GD'][i] holds the cost *before* update i is applied
    w_old = np.copy(w)
    C_old = C_new
    C_traj['GD'][i] = C_old
    
    # perform the GD update using the gradient over the full data set
    w += -η*dC_dw(w,x,y)
    C_new = C(w,x,y)
    
    # plot the step from the old to the new weights in both panels
    ax.plot([w_old[0], w[0]], [w_old[1], w[1]], marker='.', linestyle='-', color='k',lw=1) 
    ax3d.plot([w_old[0], w[0]], [w_old[1], w[1]], [C_old,C_new], marker='.', linestyle='-', color='k',lw=1, zorder=100)

    # redraw the figure in place so the descent appears animated
    ax.set_title(f'$i={i}, w=[{w[0]:.2f},{w[1]:.2f}]$' + '\n' + f'$C(w) = {C_new:.6f}$', fontsize=14);
    display.display(fig)
    display.clear_output(wait=True)

### Plot the cost function during minimization

In [None]:
# cost recorded at every gradient descent step, drawn on the current axes
cost_ax = plt.gca()
cost_ax.plot(C_traj['GD'], label='Gradient Descent')
cost_ax.set_xlabel('Iteration Step')
cost_ax.set_ylabel('Cost')
cost_ax.legend()

## Stochastic Gradient Descent

Instead of computing the gradient using the entire data set $\mathcal{D} = \{(x^{(n)},y^{(n)})\}_{n=1}^N$ we select a minibatch of size $N_B$.  Let's take $N_B = 5$ here.  We start by computing the random indices.

In [None]:
# total number of data points and the minibatch size
N,NB = x.size,5

# NB distinct indices drawn at random from 0,...,N-1
idx = np.random.choice(N, size=NB, replace=False)

Compare the exact and approximate gradient for a minibatch.

In [None]:
# pick a random point in the plotted weight range
wₒ = np.random.uniform(min_w, max_w, 2)

# cost over the full data set at wₒ; it is the value shown on both lines below
C_full = C(wₒ,x,y)

# gradient computed from the entire data set
print('Exact:')
dC = dC_dw(wₒ,x,y)
out = rf'$w_0 = ({wₒ[0]:.3f},{wₒ[1]:.3f}),\; C(w_0) = {C_full:.3f},\;  \nabla C(w_0) = ({dC[0]:.3f},{dC[1]:.3f})$'
ml4s.mdtex(out)

# gradient estimated from the minibatch selected by idx
# (only the gradient changes; the reported cost is still the full-data cost)
print('MiniBatch:')
dC = dC_dw(wₒ,x[idx],y[idx])
out = rf'$w_0 = ({wₒ[0]:.3f},{wₒ[1]:.3f}),\; C(w_0) = {C_full:.3f},\;  \nabla C(w_0) = ({dC[0]:.3f},{dC[1]:.3f})$'
ml4s.mdtex(out)

In [None]:
fig,ax,ax3d = ml4s.plot_2D_function(grid_w,grid_w,lambda w: C(w,x,y))

# hyperparameters: learning rate and starting weights (same as for GD above)
η = 0.5
w = np.array([6.75,14.5])

# number of epochs, minibatch updates per epoch, and total update steps
num_epoch = 5
num_batch = N//NB
num_step = num_epoch*num_batch

# weights and full-data cost recorded *before* each update is applied
w_traj = np.zeros([num_step,2])
C_traj['SGD'] = np.zeros([num_step])

ax.plot(*w, marker='.', color='k', ms=15)  

# full-data cost at the current weights; evaluated once per step and reused
# for the trajectory record, the 3D line segment and the title
C_new = C(w,x,y)

# each epoch performs num_batch minibatch updates; the batches are drawn
# independently at random, so an epoch is not an exact pass over the data
i = 0
for epoch in range(num_epoch):
    for batch in range(num_batch):

        # get the batch: NB distinct random indices into the data set
        idx = np.random.choice(N, NB, replace=False)
        
        # we keep a copy of the previous weights and cost for plotting
        w_old = np.copy(w)
        C_old = C_new
        w_traj[i,:] = w_old
        C_traj['SGD'][i] = C_old

        # perform the stochastic GD update using only the minibatch gradient
        w += -η*dC_dw(w,x[idx],y[idx])
        C_new = C(w,x,y)

        # plot the step from the old to the new weights in both panels
        ax.plot([w_old[0], w[0]], [w_old[1], w[1]], marker='.', linestyle='-', color='k',lw=1) 
        ax3d.plot([w_old[0], w[0]], [w_old[1], w[1]], [C_old,C_new], marker='.', linestyle='-', color='k',lw=1, zorder=100)

        # redraw the figure in place so the descent appears animated
        ax.set_title(fr'$w=[{w[0]:.2f},{w[1]:.2f}]$' + '\n' + f'$C(w) = {C_new:.6f}$', fontsize=14);
        display.display(fig)
        display.clear_output(wait=True)
        i += 1

In [None]:
# overlay the recorded cost of both optimizers against the update step
for key,label in (('GD','Gradient Descent'),('SGD','Stochastic Gradient Descent')):
    plt.plot(C_traj[key], label=label)

plt.xlabel('Iteration Step')
plt.ylabel('Cost')
plt.legend()

### It Works!

Let's return to our original model and compare the fit with the data.

In [None]:
# draw the data and the model evaluated at the current weights w on the same axes
fit_ax = plt.gca()
fit_ax.plot(x, y, 'o', alpha=0.5, label='data')
fit_ax.plot(x, F(x,w), '-', color=colors[0], label='fit')
fit_ax.set_xlabel('x')
fit_ax.set_ylabel('y')
fit_ax.legend()