In [12]:
%pylab inline
import torch
import torch.nn as nn
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score

## Afternoon session 2:
### Optimization and Neural Networks

Read the following paper: [An overview of gradient descent optimization
algorithms](https://arxiv.org/pdf/1609.04747.pdf).  
(At least read about stochastic, full-batch and mini-batch gradient descent, SGD, Momentum and Adam.)

Then try to implement the following:

2. Compare different optimization methods using the Ackley-Exercise
    - Create plots for SGD, SGD+Momentum=0.9, RMSProp, Adam
    - Compare for a number of learning rates lr=[10, 1, 1e-1, 1e-2]
    - The final result should be a 4x4 matrix of plots of the Ackley function optimized from 100 starting locations (call `set_seed(42)` first to get the same locations every time).
    - What do you observe in terms of local and global minima?
3. For the final exercise from the morning session create a network that achieves at least 98% accuracy on the test set. You may add layers, add weights, change learning rates and optimization methods. Use full-batch training.

Additional Reading Material:  
[Visualizing the Loss Landscapes of Neural Networks](https://papers.nips.cc/paper/7875-visualizing-the-loss-landscape-of-neural-nets.pdf)  
[On the importance of initialization and momentum in deep learning](http://proceedings.mlr.press/v28/sutskever13.pdf)  
[Why Momentum really works distill.pub](https://distill.pub/2017/momentum/)

## Ackley Investigation

In [None]:
%matplotlib inline

In [10]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np

def ackley(x, y):
    """Evaluate the two-dimensional Ackley test function with NumPy.

    Computes
    ``-20*exp(-0.2*sqrt(0.5*(x^2 + y^2))) - exp(0.5*(cos(2*pi*x) + cos(2*pi*y))) + e + 20``
    element-wise.

    Parameters
    ----------
    x, y : float or numpy.ndarray
        Coordinates at which to evaluate the function; arrays are combined
        element-wise following NumPy broadcasting.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Function value(s); approximately 0 at the origin.
    """
    # Smooth radial bowl driven by the root-mean-square distance from the origin.
    rms_radius = np.sqrt(0.5 * (x * x + y * y))
    bowl = -20 * np.exp(-0.2 * rms_radius)

    # Periodic ripples that create the many local minima.
    ripples = -np.exp(0.5 * (np.cos(2 * np.pi * x) + np.cos(2 * np.pi * y)))

    # The constant offset e + 20 shifts the value at the origin to (about) zero.
    return bowl + ripples + np.exp(1) + 20

class Ackley(nn.Module):
    """Torch version of the 2-D Ackley function, usable with autograd.

    The module has no parameters; it only maps coordinates to function values.
    """

    def __init__(self):
        super().__init__()

    def forward(self, coords):
        """Evaluate the Ackley function for each row of ``coords``.

        Parameters
        ----------
        coords : torch.Tensor
            Tensor indexed as ``coords[:, 0]`` (x) and ``coords[:, 1]`` (y),
            i.e. one point per row.

        Returns
        -------
        torch.Tensor
            One function value per row of ``coords``.
        """
        x, y = coords[:, 0], coords[:, 1]

        # Smooth radial bowl driven by the root-mean-square distance from the origin.
        rms_radius = torch.sqrt(0.5 * (x * x + y * y))
        bowl = -20 * torch.exp(-0.2 * rms_radius)

        # Periodic ripples that create the many local minima.
        ripples = -torch.exp(
            0.5 * (torch.cos(2 * np.pi * x) + torch.cos(2 * np.pi * y))
        )

        # The constant offset e + 20 shifts the value at the origin to (about) zero.
        return bowl + ripples + np.exp(1) + 20

In [9]:
def optimize_ackley(method, lr, N_points=100, n_steps=50):
    """Minimise the Ackley function from random starting points.

    Every starting point is optimised independently with a fresh optimizer,
    and the position after each update is recorded.

    Parameters
    ----------
    method : str
        Optimizer to use: ``"SGD"``, ``"SGD+Momentum"`` (momentum 0.9),
        ``"RMSProp"`` or ``"Adam"``.
    lr : float
        Learning rate passed to the optimizer.
    N_points : int, optional
        Number of independent starting locations (default 100).
    n_steps : int, optional
        Number of optimizer updates per starting location (default 50).

    Returns
    -------
    list of list of numpy.ndarray
        One trajectory per starting location. Each trajectory contains
        ``n_steps + 1`` arrays of shape ``(1, 2)``: the starting point
        followed by the position after every update.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported optimizer names.
    """
    optimizer_factories = {
        "SGD": lambda params: torch.optim.SGD(params, lr=lr),
        "SGD+Momentum": lambda params: torch.optim.SGD(params, lr=lr, momentum=0.9),
        "RMSProp": lambda params: torch.optim.RMSprop(params, lr=lr),
        "Adam": lambda params: torch.optim.Adam(params, lr=lr),
    }
    if method not in optimizer_factories:
        raise ValueError(
            "Unknown method {!r}; expected one of {}".format(
                method, sorted(optimizer_factories)
            )
        )
    make_optimizer = optimizer_factories[method]

    # The starting points are drawn with torch.randn only, so seeding torch's
    # generator is enough to get the same locations on every call.
    torch.manual_seed(42)
    ackley_torch = Ackley()

    attempts = []
    for _ in range(N_points):
        coords = torch.randn(1, 2) * 2
        coords.requires_grad = True

        # Fresh optimizer per start so momentum / running averages do not
        # leak from one trajectory into the next.
        optimizer = make_optimizer([coords])

        steps = [coords.detach().numpy().copy()]
        for _ in range(n_steps):
            optimizer.zero_grad()
            # forward() returns one value per row; sum() reduces it to a
            # scalar so backward() needs no explicit gradient argument.
            loss = ackley_torch(coords).sum()
            loss.backward()
            optimizer.step()
            # copy() because detach().numpy() shares memory with coords,
            # which the optimizer updates in place.
            steps.append(coords.detach().numpy().copy())
        attempts.append(steps)
    return attempts

In [7]:
def plot_ackley_and_trajectories(ax, attempts, method, lr):
    """Draw Ackley contours on ``ax`` and overlay optimisation trajectories.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw into.
    attempts : list
        Trajectories; each one must convert via ``np.array`` to an array
        indexable as ``[:, 0, :]`` with x in column 0 and y in column 1.
    method : str
        Optimizer name, used in the title.
    lr : float
        Learning rate, used in the title.

    Returns
    -------
    bool
        Always ``True``.
    """
    # Evaluate the function on a 100x100 grid over [-3, 3] x [-3, 3].
    axis_values = np.linspace(-3, 3, 100)
    grid_x, grid_y = np.meshgrid(axis_values, np.linspace(-3, 3, 100))
    grid_z = ackley(grid_x, grid_y)

    # Filled contours with 100 levels between 0 and 10.
    contour_levels = np.linspace(0, 10, 100)
    surf = ax.contourf(grid_x, grid_y, grid_z, contour_levels, cmap=cm.coolwarm)

    # Drop the singleton middle axis once per trajectory: (steps, 1, 2) -> (steps, 2).
    trajectories = [np.array(attempt)[:, 0, :] for attempt in attempts]

    # Faint dashed path for every trajectory.
    for path in trajectories:
        ax.plot(
            path[:, 0],
            path[:, 1],
            linewidth=2,
            c="black",
            linestyle="--",
            alpha=0.1,
        )

    # Start (yellow) and end (magenta) markers, raised above the paths via zorder.
    for path in trajectories:
        first, last = path[[0]], path[[-1]]
        ax.scatter(first[:, 0], first[:, 1], marker="o", color="yellow", s=50, zorder=100)
        ax.scatter(last[:, 0], last[:, 1], marker="o", color="magenta", s=50, zorder=100)

    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    title = method + " " + str(lr)
    ax.set_title(title)
    return True

In [11]:
# Optimizers and learning rates requested in the exercise statement.
methods = ["SGD", "SGD+Momentum", "RMSProp", "Adam"]
learning_rates = [10, 1, 1e-1, 1e-2]

fig, axarr = plt.subplots(len(methods), len(learning_rates), figsize=(24, 24))
# One row per optimizer, one column per learning rate.
for row, method in enumerate(methods):
    for col, lr in enumerate(learning_rates):
        attempts = optimize_ackley(method, lr)
        plot_ackley_and_trajectories(axarr[row, col], attempts, method, lr)
fig.tight_layout()
        

Learning Rate 10: Trajectories diverge; neither the global minimum nor any local minima are found.  
Learning Rate 1: Most trajectories diverge; only Adam finds the global minimum and some local minima.  
Learning Rate 1e-2: SGD finds points close to the global minimum, SGD+Momentum overshoots, and Adam finds all local minima.  
Learning Rate 1e-3: All methods descend into local minima; SGD+Momentum and Adam have longer trajectories and find the global minimum more often.  


## Improving the two-moons neural network