In [None]:
%pip install --upgrade plotly

In [3]:
import plotly
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import torch
from torch import nn, Tensor
from torch.optim import Optimizer
from torch.utils.data import DataLoader, Subset, random_split

from torchvision import datasets
from torchvision.utils import make_grid
import torchvision.transforms as T

from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from plot_loss import get_loss_grid, plot_contour, plot_surface, plot_losses, get_state_directions
from convert_net import get_conv, get_lin
from utils import get_batch, plot_examples

In [4]:
def conf_pltly():
    """Emit the RequireJS setup that plotly figures need in the cell output.

    Displays an HTML snippet that loads RequireJS from the server-relative
    path ``/static/components/requirejs/require.js`` and registers the
    locations of the ``base`` and ``plotly`` modules. The notebook calls this
    right before ``iplot`` in the cells that show a figure.
    """
    # Local import of the public display API; the bare `display` name is only
    # injected by IPython and is not guaranteed outside an interactive cell.
    from IPython.display import HTML, display

    # RequireJS expects `paths` to be an object mapping module ids to
    # locations, so `base` and `plotly` both have to live inside it.
    display(HTML("""
      <script src="/static/components/requirejs/require.js"></script>
      <script>
        requirejs.config({
          paths: {
            base: '/static/base',
            plotly: 'https://cdn.plot.ly/plotly-2.23.2.min.js?noext'
          }
        });
      </script>
    """))

# Optimizing Neural Networks

In this exercise, you will learn how to implement optimizers such as Gradient Descent for use in PyTorch. You will evaluate the impact of learning rates and different parameter update strategies. For this, you will use the dataset from the previous exercise and improve the convergence of a simple classifier.

## Prepare Dataset

We first define useful functions and data loaders. For this, we need a pipeline to convert the data samples into Tensors.

In [5]:
# Preprocessing applied to every sample: convert the image to a tensor, then
# normalise it with mean 0.5 and standard deviation 0.5.
transforms = T.Compose([T.ToTensor(), T.Normalize(mean=0.5, std=0.5)])

We then load the data and create a small subset of the dataset to speed up tests of the implemented optimizers. Look at the [pytorch](https://pytorch.org/docs/stable/data.html) documentation and create

- a small subset of 50 batches with size 64
- two dataloaders that iterate over all samples and batched samples, respectively

In [None]:
# Size of one mini-batch and number of mini-batches in the reduced dataset
# (the task text asks for a subset of 50 batches with size 64).
batch_size = 64
batches = 50

# download=True fetches MNIST into the current directory if it is missing.
data = datasets.MNIST(root="./", transform=transforms, target_transform=None, download=True)
# Exercise: replace `...` so that `data` only holds the small subset.
data = Subset(...)

# Exercise: replace `...` in both loaders. Presumably `gd_data_loader` yields
# all samples in one batch and `data_loader` yields mini-batches -- confirm
# against how the later cells use them.
gd_data_loader = DataLoader(...)
data_loader = DataLoader(...)

You can use `plot_examples()` to visualize some examples of the dataset

In [None]:
# `plot_examples` is imported from the local `utils` module.
plot_examples(data)

## Optimizer

In this section we will compare different optimizers that were introduced to improve the convergence behavior of neural networks. The nice thing about PyTorch is that all gradients are computed automatically by the built-in backward pass and stored for a Tensor `x` in `x.grad`. Therefore, we first implement the network used in the previous exercise using the PyTorch implementations of:

- nn.Sequential
- nn.Flatten
- nn.Linear

Transfer the net to the GPU before returning it, to use all available compute.

In [8]:
def get_OneFCNet():
    """Return the fully connected network used in this exercise (scaffold).

    Exercise: replace ``...``. The task text asks for a model built from
    ``nn.Sequential``, ``nn.Flatten`` and ``nn.Linear`` that is transferred to
    the GPU before it is returned.
    """
    return ...

## Gradient Descent

Now that we have a network to optimize, let's have a look at the optimizer class of PyTorch. To implement an optimizer, we need to implement the `step()` function to actually update the parameters.

In [9]:
class GD(Optimizer):
    """Plain gradient descent optimizer (exercise scaffold).

    ``step()`` still contains a ``...`` placeholder where the parameter update
    has to be implemented.
    """
    
    def __init__(self, params, lr=0.2) -> None:
        """Register ``params`` with the learning rate ``lr`` as the only default."""
        super().__init__(params, {'lr': lr})
        # kept as a plain attribute in addition to the 'lr' default above
        self.lr = lr
    
    def step(self):
        """Visit every parameter of the first parameter group that has a gradient.

        Only ``self.param_groups[0]`` is iterated, so additional parameter
        groups would not be visited.
        """
        # do not consider the next steps for gradient calculations
        with torch.no_grad():
            
            # iter over all parameters
            for p in self.param_groups[0]['params']:
                
                # if the gradient is set, update it
                if p.grad is not None:
                    
                    # update parameters
                    # hint: in torch each function (e.g. Tensor.add()) has an inplace variant
                    # which modifies the tensor inplace: Tensor.add_()
                    ...

Let's define a training step by defining the `loss_fn` globally and implementing a step with the following substeps:

- all gradients must be reset by `optimizer.zero_grad()`
- get the result of a forward pass of the network
- calculate the loss for this batch
- do a backward pass using `.backward()` on the calculated loss
- do an optimizer step

In [10]:
def training_step(net, optimizer, loss_fn, batch):
    """Run one optimisation step on a single batch (exercise scaffold).

    Args:
        net: the network to train.
        optimizer: the optimizer whose step is to be applied.
        loss_fn: the loss function to evaluate.
        batch: a pair that is unpacked into ``img`` and ``gt``.

    Returns:
        ``loss``. The name is only bound once the ``...`` placeholder has been
        replaced; until then the ``return`` raises ``NameError``.
    """
    # presumably the input images and their ground-truth labels -- confirm
    img, gt = batch
    
    # implement training step
    ...
    
    return loss

To do multiple steps, we implement a function `the_loop` that iterates over a dataloader. It should do a training step per batch for `epochs`. After one epoch, the loss on the validation set should be calculated.

In [11]:
def the_loop(net, optimizer, train_loader, val_loader=None, epochs=None, swa_model=None, swa_start=5):
    """Train ``net`` for ``epochs`` passes over ``train_loader`` and record the run.

    Exercise scaffold: the two ``...`` placeholders still have to be filled
    in. The loss function is not a parameter of this function, so the code
    added at the placeholders has to obtain it from the enclosing scope.

    Args:
        net: network to train.
        optimizer: optimizer to use for the training steps.
        train_loader: iterable of training batches, iterated once per epoch.
        val_loader: optional iterable of validation batches; when given, the
            validation placeholder runs after every epoch.
        epochs: number of epochs; required.
        swa_model: optional object with ``update_parameters(net)``, called
            after every epoch whose index is greater than ``swa_start``.
        swa_start: epoch index after which ``swa_model`` is updated.

    Returns:
        ``(losses, states, val_losses)`` if ``val_loader`` is given, otherwise
        ``(losses, states)``.

    Raises:
        ValueError: if ``epochs`` is ``None``.
    """
    if epochs is None:
        raise ValueError("a training duration must be given: set epochs")

    log_interval = 1
    running_mean = 0.

    # Initial loss value shown in the progress bar. It is created on the
    # device of the network instead of unconditionally on CUDA, so the loop
    # also starts on machines without a GPU.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    loss = torch.zeros(1, device=device)

    losses = []
    val_losses = []
    states = []
    i, j = 0, 0

    # The bar is only used for display; it is advanced manually below.
    pbar = tqdm(train_loader, desc=f"epoch {i}", postfix={"loss": loss.item(), "step": j})

    for i in range(epochs):
        running_mean = 0.
        j = 0

        pbar.set_description(f"epoch {i}")
        pbar.refresh()
        pbar.reset()
        for j, batch in enumerate(train_loader):

            # implement training step by
            # - appending the current states to `states`
            # - doing a training_step
            # - appending the current loss to the `losses` list
            # - update the running_mean for logging

            ...

            if j % log_interval == 0 and j != 0:
                # float() accepts a plain number as well as a one-element
                # tensor, whichever way the running mean was accumulated
                pbar.set_postfix({"loss": float(running_mean), "step": j})
                running_mean = 0.
            pbar.update()

        if i > swa_start and swa_model is not None:
            swa_model.update_parameters(net)

        if val_loader is not None:

            # evaluate the current net on the validation data loader and
            # collect all losses in the `val_losses` list

            ...

    pbar.refresh()

    if val_loader is not None:
        return losses, states, val_losses

    return losses, states

Now train a OneFCNet with a Cross Entropy loss (hint: PyTorch provides an implementation of this loss), using your Gradient Descent optimizer and the data loader that iterates over all samples in one batch. For testing the optimizers, we are not yet interested in the validation loss, so there is no need to provide a validation loader.

In [None]:
# Exercise: create the network to train.
net = ...
epochs = 10
# GD optimizer over the network parameters with a learning rate of 0.002.
optimizer = GD(net.parameters(), 0.002)
# Exercise: the loss function; the task text asks for a Cross Entropy loss.
loss_fn = ...

# Exercise: run the training; the right-hand side has to yield the two values
# unpacked here.
losses, states = ...

fig = plot_losses(losses)
iplot(fig)

We can now use a method to plot the loss surface of the network by projecting the parameter updates into two dimensions. You can find more information on that [here](https://proceedings.neurips.cc/paper_files/paper/2018/hash/a41b3bb3e6b050b6c9067c67f663b915-Abstract.html). But you can just use the provided code. The contour plot will show how the loss will change if you would follow the two main directions of the past parameter updates.

Think about the challenges and the optimization process of this landscape. What could impede the convergence of the net?

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 0, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=0,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

### Garment Classifier

Below, a net is given that uses convolutions to exploit the spatial structure of images. Test the gradient descent method on this net and compare the results.


In [None]:
def get_GarmentClassifier():
    """Build the convolutional classifier used in the following experiments.

    Layers: two Conv2d(5x5) -> ReLU -> MaxPool2d(2, 2) stages with 6 and 16
    output channels, a flatten, and three linear layers 16*4*4 -> 120 -> 84
    -> 10. The flatten size of 16*4*4 corresponds to single-channel 28x28
    inputs.

    Returns:
        The ``nn.Sequential`` model, moved to the GPU when CUDA is available
        and left on the CPU otherwise.
    """
    net = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            nn.Flatten(),
            nn.Linear(16 * 4 * 4, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, 10)
        )
    # An unconditional .cuda() raises on CPU-only machines; only move the
    # model when a GPU is actually present.
    return net.cuda() if torch.cuda.is_available() else net

In [None]:
# Exercise: create the network to train.
net = ...
epochs = 10
# GD optimizer over the network parameters with a learning rate of 0.02.
optimizer = GD(net.parameters(), 0.02)

# Exercise: run the training; the right-hand side has to yield the two values
# unpacked here.
losses, states = ...

fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 0, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=0,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

What are the differences between the contour plots and convergence behaviors?

## Stochastic Gradient Descent

For Gradient Descent, we calculated the gradients for each sample individually. To improve the convergence behavior and to speed up the training process, we now consider batches of gradients. This stabilizes the convergence and prevents the parameters from being pushed in one direction and directly afterwards being pulled in the opposite direction. Therefore, we can use the gradient descent implementation and just swap the data loader to use the batched variant. The downside is that we now need to iterate multiple times over the dataset.

Is there any difference?

In [None]:
# Exercise: create the network to train.
net = ...
epochs = 10
# GD optimizer over the network parameters with a learning rate of 0.02.
optimizer = GD(net.parameters(), 0.02)

# Exercise: run the training; the task text asks to swap in the batched data
# loader here. The right-hand side has to yield the two values unpacked here.
losses, states = ...

fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 400, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=400,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

### Stochastic Gradient Descent with Momentum

A next improvement to our optimizer is inspired by physical momentum. A typical problem for gradient descent on complex optimization functions is that one could end up in a local minimum. Therefore, we not only consider the current gradient but also the last update, weighted by a momentum factor:

$$ v = \eta \cdot ( -\frac{\partial L(\theta)}{\partial \theta} ) + v^\text{old} \cdot mtm $$

with $\eta$ being the learning rate, $mtm$ the momentum factor, $\theta$ denoting the network parameters, and $L(\theta)$ being the loss for the current set of parameters. The update is given by:

$$ \theta = \theta + v $$

Now use the implementation of Stochastic Gradient Descent and add momentum to it. Again, train a network and see if you can spot any differences to the previous optimization.

In [None]:
class SGDwM(Optimizer):
    """Stochastic Gradient Descent with momentum (exercise scaffold).

    Target update from the task text:
    ``v = lr * (-grad) + v_old * momentum`` followed by ``theta = theta + v``.
    The update itself is still a ``...`` placeholder in ``step()``.

    Args:
        params: parameters to optimize.
        lr: learning rate.
        momentum: momentum factor.
    """
    
    def __init__(self, params, lr=0.2, momentum=0.9) -> None:
        # Register momentum next to lr so it is available through
        # self.param_groups, the same way NAGD and SGDwMLD register it.
        super().__init__(params, {'lr': lr, 'momentum': momentum})
        self.lr = lr
        
        # maybe you need some more code here
    
    def step(self):
        """Visit every parameter of the first parameter group that has a gradient."""
        with torch.no_grad():
            for i, p in enumerate(self.param_groups[0]['params']):
                if p.grad is not None:
                    
                    # update parameters
                    ...

In [None]:
# Train a fresh GarmentClassifier for 10 epochs on `data_loader` using SGD
# with momentum (learning rate 0.02, default momentum).
epochs = 10
net = get_GarmentClassifier()
optimizer = SGDwM(params=net.parameters(), lr=0.02)

losses, states = the_loop(
    net=net,
    optimizer=optimizer,
    train_loader=data_loader,
    epochs=epochs,
)

# Plot the recorded training losses.
fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 400, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=400,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

### Nesterov Accelerated Gradient Descent

A variant of Stochastic Gradient Descent also considers the penultimate update to include more statistics for optimization:

$$ \tilde{\theta} = \theta + v^\text{old} \cdot mtm $$
$$ v = v^\text{old} \cdot mtm + \eta \cdot (-\frac{\partial L(\tilde{\theta})}{\partial \theta}) $$
$$ \theta = \theta + v $$

Are there differences to the previous methods, and why do they exist?

In [None]:
class NAGD(Optimizer):
    """Nesterov Accelerated Gradient Descent optimizer (exercise scaffold).

    ``step()`` still contains a ``...`` placeholder where the parameter update
    has to be implemented.
    """
    
    def __init__(self, params, lr=0.2, momentum=0.9) -> None:
        """Register ``params`` with ``lr`` and ``momentum`` as defaults."""
        super().__init__(params, {'lr': lr, 'momentum': momentum})
        # kept as a plain attribute in addition to the 'lr' default above
        self.lr = lr
        
        # maybe you need some more code here
    
    def step(self):
        """Visit every parameter of the first parameter group that has a gradient.

        Only ``self.param_groups[0]`` is iterated; ``i`` is the position of
        the parameter within that group.
        """
        with torch.no_grad():
            for i, p in enumerate(self.param_groups[0]['params']):
                if p.grad is not None:
                    
                    # update parameters
                    ...


In [None]:
# Train a fresh GarmentClassifier for 10 epochs on `data_loader` using the
# NAGD optimizer (learning rate 0.02, default momentum).
epochs = 10
net = get_GarmentClassifier()
optimizer = NAGD(params=net.parameters(), lr=0.02)

losses, states = the_loop(
    net=net,
    optimizer=optimizer,
    train_loader=data_loader,
    epochs=epochs,
)

# Plot the recorded training losses.
fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 400, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=400,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; a fixed scale
# of 1 is passed here.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=1,
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

### Learning Rate Decay

Let's apply a last idea. What about adjusting the learning rate for each update so that later updates have less impact and therefore prevent oscillations around the optimum. Use the implementation of SGD with momentum as a starting point and adjust the learning rate for each step $t$ by a decay factor $d$:

$$ \eta_t = \frac{\eta_0}{1 + d \cdot t} $$

Which decay factor works the best?

In [None]:
class SGDwMLD(Optimizer):
    """SGD with momentum and learning-rate decay (exercise scaffold).

    Target schedule from the task text: at step ``t`` the learning rate is
    ``lr_t = lr_0 / (1 + decay * t)``. The update itself is still a ``...``
    placeholder in ``step()``.

    Args:
        params: parameters to optimize.
        lr: initial learning rate.
        momentum: momentum factor.
        decay: decay factor of the learning-rate schedule.
    """
    
    def __init__(self, params, lr=0.2, momentum=0.9, decay=0.01) -> None:
        super().__init__(params, {'lr': lr, 'momentum': momentum, 'decay': decay})
        self.lr = lr
        
        # maybe you need some more code here
    
    def step(self):
        """Visit every parameter of the first parameter group that has a gradient."""
        with torch.no_grad():
            for i, p in enumerate(self.param_groups[0]['params']):
                if p.grad is not None:
                    
                    # update parameters here
                    # (the explicit placeholder keeps the cell valid Python
                    # until the update is implemented)
                    ...


In [None]:
net = get_GarmentClassifier()
epochs = 20
# Decay factor d of the learning-rate schedule. It was referenced below
# without ever being defined; 0.01 equals the default of SGDwMLD and is only
# a starting point -- the exercise is to try other values.
lr_d = 0.01
optimizer = SGDwMLD(net.parameters(), 0.02, 0.9, lr_d)

losses, states = the_loop(net, optimizer, data_loader, epochs=epochs)

fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 800, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=800,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; a fixed scale
# of 1 is passed here.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=1,
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

## Adaptive Learning Rates

### Adaptive Gradient (AdaGrad)

Taking the learning rate decay a step further, we take the past gradients into account and with this adjust the learning rate influence:

$$ g_t = \frac{\partial L(\theta_t)}{\partial \theta} $$
$$ V_t = \sqrt{\sum_{i=1}^t (g_i)^2} + \epsilon $$
$$ \theta_{t+1} = \theta_t - \eta \frac{g_t}{V_t} $$

- How does the AdaGrad perform compared to the Learning Rate Decay?
- What could be a disadvantage of these update rules?

In [None]:
class AdaGrad(Optimizer):
    """AdaGrad optimizer (exercise scaffold).

    Target update from the task text, with ``g_t`` the current gradient:
    ``V_t = sqrt(sum_{i<=t} g_i^2) + eps`` and
    ``theta_{t+1} = theta_t - lr * g_t / V_t``.
    The update itself is still a ``...`` placeholder in ``step()``.

    Args:
        params: parameters to optimize.
        lr: learning rate.
        eps: small constant added to ``V_t``.
    """
    
    def __init__(self, params, lr=0.01, eps=1e-10) -> None:
        # Register eps next to lr; it was accepted but dropped before, so it
        # could not be read back from self.param_groups.
        super().__init__(params, {'lr': lr, 'eps': eps})
        self.lr = lr
        
        # maybe you need more code here
    
    def step(self):
        """Visit every parameter of the first parameter group that has a gradient."""
        with torch.no_grad():
            for i, p in enumerate(self.param_groups[0]['params']):
                if p.grad is not None:
                    
                    # update parameters
                    ...

In [None]:
# Train a fresh GarmentClassifier for 20 epochs on `data_loader` using
# AdaGrad with its default hyperparameters.
epochs = 20
net = get_GarmentClassifier()
optimizer = AdaGrad(params=net.parameters())

losses, states = the_loop(
    net=net,
    optimizer=optimizer,
    train_loader=data_loader,
    epochs=epochs,
)

# Plot the recorded training losses.
fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 900, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=900,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

### RMSProp

RMSProp improves AdaGrad by using a decaying average of the second-order moment (the squared gradients), controlled by a decay factor $\beta$:

$$ V_t = \sqrt{\beta V_{t-1} + (1-\beta)(g_t)^2} + \epsilon $$

- What is the benefit of this update rule compared to the $V_t$ of AdaGrad?

In [None]:
class RMSProb(Optimizer):
    """RMSProp optimizer (exercise scaffold).

    Target quantity from the task text:
    ``V_t = sqrt(beta * V_{t-1} + (1 - beta) * g_t^2) + eps``.
    The update itself is still a ``...`` placeholder in ``step()``.

    Args:
        params: parameters to optimize.
        lr: learning rate.
        eps: small constant added to ``V_t``.
        decay: decay factor. NOTE(review): confirm whether this is ``beta``
            or ``1 - beta`` in the formula above; the default is 0.1.
    """
    
    def __init__(self, params, lr=0.01, eps=1e-10, decay=0.1) -> None:
        # Register eps and decay next to lr; they were accepted but dropped
        # before, so they could not be read back from self.param_groups.
        super().__init__(params, {'lr': lr, 'eps': eps, 'decay': decay})
        self.lr = lr
        
        # maybe you need more code here
    
    def step(self):
        """Visit every parameter of the first parameter group that has a gradient."""
        with torch.no_grad():
            for i, p in enumerate(self.param_groups[0]['params']):
                if p.grad is not None:
                    
                    # update parameters
                    # (the explicit placeholder keeps the cell valid Python
                    # until the update is implemented)
                    ...

In [None]:
# Train a fresh GarmentClassifier for 20 epochs on `data_loader` using
# RMSProb with its default hyperparameters.
epochs = 20
net = get_GarmentClassifier()
optimizer = RMSProb(params=net.parameters())

losses, states = the_loop(
    net=net,
    optimizer=optimizer,
    train_loader=data_loader,
    epochs=epochs,
)

# Plot the recorded training losses.
fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 800, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=800,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

### Adaptive Moment Estimation (Adam)

For the lecture, you should know the Adam optimizer. Implement it and compare it to previous methods.

- What is the advantage of Adam compared to RMSProp?

In [None]:
class Adam(Optimizer):
    """Adam optimizer (exercise scaffold).

    The update itself is still a ``...`` placeholder in ``step()``.

    Args:
        params: parameters to optimize.
        lr: learning rate.
        grad_decay: presumably the decay rate of the running average of the
            gradients -- confirm against the lecture's notation.
        squared_grad_decay: presumably the decay rate of the running average
            of the squared gradients -- confirm against the lecture's notation.
        eps: small constant, presumably for numerical stability -- confirm.
    """
    
    def __init__(self, params, lr=0.01, grad_decay=0.9, squared_grad_decay=0.999, eps=1e-8) -> None:
        # Register every hyperparameter next to lr; they were accepted but
        # dropped before, so they could not be read back from
        # self.param_groups.
        super().__init__(params, {
            'lr': lr,
            'grad_decay': grad_decay,
            'squared_grad_decay': squared_grad_decay,
            'eps': eps,
        })
        self.lr = lr
        
        # maybe you need more code here
    
    def step(self):
        """Visit every parameter of the first parameter group that has a gradient."""
        with torch.no_grad():
            for i, p in enumerate(self.param_groups[0]['params']):
                if p.grad is not None:
                    
                    # update parameters
                    # (the explicit placeholder keeps the cell valid Python
                    # until the update is implemented)
                    ...

In [None]:
# Train a fresh GarmentClassifier for 20 epochs on `data_loader` using Adam
# with a learning rate of 0.002.
epochs = 20
net = get_GarmentClassifier()
optimizer = Adam(params=net.parameters(), lr=0.002)

losses, states = the_loop(
    net=net,
    optimizer=optimizer,
    train_loader=data_loader,
    epochs=epochs,
)

# Plot the recorded training losses.
fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 800, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=800,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

## Stochastic Weight Averaging (SWA)

As a last strategy to further improve convergence, stochastic weight averaging (SWA) averages the parameters of the last $n$ epochs to reduce oscillations in the final phase of training. PyTorch already offers an implementation (`torch.optim.swa_utils.AveragedModel`).

- Does this improve the convergence in our case?

In [None]:
from torch.optim.swa_utils import AveragedModel

In [None]:
# Train a fresh GarmentClassifier for 20 epochs on `data_loader` using Adam
# (learning rate 0.002); `avg_net` is passed as the averaged model with
# swa_start=15.
epochs = 20
net = get_GarmentClassifier()
avg_net = AveragedModel(net)
optimizer = Adam(params=net.parameters(), lr=0.002)

losses, states = the_loop(
    net=net,
    optimizer=optimizer,
    train_loader=data_loader,
    epochs=epochs,
    swa_model=avg_net,
    swa_start=15,
)

# Plot the recorded training losses.
fig = plot_losses(losses)
conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

In [None]:
# Project the recorded states onto the main directions of the parameter
# updates: 10 states are sampled starting from index 800, and the last state
# (reference_id=-1) is the reference for computing the directions.
directions, state_ids, loss_coordinates = get_state_directions(
    states,
    n_states=10,
    start_from=800,
    reference_id=-1,
)

# Evaluate the loss on a 20 x 20 grid along those directions; the scale passed
# is the largest absolute value in `loss_coordinates`.
x, y, Z, _ = get_loss_grid(
    net,
    data_loader,
    loss_fn,
    directions=directions,
    resolution=(20, 20),
    scale=loss_coordinates.abs().max().item(),
)

# Draw the landscape as a contour plot and add the projected states as a trace.
fig = plot_contour(np.copy(x), np.copy(y), np.copy(Z), scale=True)
fig.add_traces(
    go.Scatter(
        x=np.copy(loss_coordinates[0].cpu().numpy()),
        y=np.copy(loss_coordinates[1].cpu().numpy()),
    )
)

# Losses recorded at the sampled states.
print('loss samples:', np.array(losses)[state_ids])

conf_pltly()
init_notebook_mode(connected=False)
iplot(fig)

## Hyperparameter Tuning

As a last task, find the best training strategy from (SGDwMLD, AdaGrad, RMSProb, Adam) and their hyperparameters to classify MNIST with the GarmentClassifier with 1 epoch of training.

For this we load all data and use a train, validation, and test split of 90%, 5%, 5%.

In [None]:
# Load the full MNIST dataset (no download requested here, so the files are
# expected to be present under ./ already).
data = datasets.MNIST(root="./", transform=transforms, target_transform=None)

# Split 90% / 5% / 5% with a fixed seed: an unseeded split puts different
# samples into train/val/test on every run, which makes hyperparameter
# comparisons across runs unreliable.
split_generator = torch.Generator().manual_seed(0)
train, val, test = random_split(data, [0.9, 0.05, 0.05], generator=split_generator)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val, batch_size=batch_size)
test_loader = DataLoader(test, batch_size=batch_size)