# Deep Learning
## Exercise 6 - Optimization

### 1. Effect of Momentum
Momentum can accelerate training when the loss surface is very flat. Another effect of momentum is that it can keep the optimization algorithm from getting trapped in a local minimum and thereby help it find the global minimum. Assume our loss function to be
\begin{equation}
    \mathcal{L}(\theta)=
    \begin{cases}
        \theta \cdot \cos(\pi \theta) & \text{if } -1.0 \leq \theta \leq 2.0 \\
        1e10 & \text{else}
    \end{cases}
\end{equation}
where $\theta$ is the parameter to solve. 

In [None]:
import torch
import matplotlib.pyplot as plt
from math import pi
import numpy as np

def loss_function(theta):
    """Evaluate the piecewise exercise loss element-wise.

    The loss is ``theta * cos(pi * theta)`` for ``-1 <= theta <= 2`` and the
    constant penalty ``1e10`` everywhere else.

    Args:
        theta: Tensor of parameter values (any shape).

    Returns:
        Tensor of the same shape as ``theta`` holding the loss values.
    """
    inside = (theta >= -1) & (theta <= 2)
    values = theta * torch.cos(theta * pi)
    # Select per element instead of assigning the full-size `values` tensor
    # into a masked slice: the masked assignment raises a shape mismatch as
    # soon as only some of the entries lie inside the valid interval.
    return torch.where(inside, values, torch.full_like(values, 1e10))

# Sample 50 points between -1 and 2 and draw the loss curve over them.
x = torch.linspace(start=-1, end=2, steps=50)

plt.plot(x, loss_function(x))
plt.xlabel('Theta')
plt.ylabel('Loss')
plt.show()

#### 1. SGD without Momentum

Using PyTorch's `torch.optim.SGD`, find a minimum starting from the initial value $\theta = -0.95$ with learning rate $\alpha = 0.01$. Does your algorithm get stuck in a local minimum? Plot the loss values over the number of steps and on the loss surface.

In [None]:
#ToDo: Run SGD without momentum. How do your loss values evolve?

In [None]:
# Single trainable parameter, initialised at -0.95.
theta = torch.tensor([-0.95]).requires_grad_(True)

# Plain SGD: only the learning rate is set, momentum stays at its default.
optimizer = torch.optim.SGD(params=[theta], lr=0.01)

def run_SGD(theta, optimizer, loss_function):
    """Perform one optimizer update on ``theta``.

    Evaluates ``loss_function(theta)``, backpropagates it, lets ``optimizer``
    take a single step and then clears the gradients.

    Returns:
        The loss tensor that was computed before the parameter update.
    """
    current_loss = loss_function(theta)
    current_loss.backward()
    optimizer.step()
    # Clear the gradients right after the step so the next call does not
    # accumulate on top of them.
    optimizer.zero_grad()
    return current_loss

# Record, for each of the 100 updates, the parameter value before the step
# and the loss evaluated at that value.
theta_steps = []
loss_values = []

for i in range(100):
    theta_steps.append(theta.item())
    loss_values.append(run_SGD(theta, optimizer, loss_function).item())

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(10, 5))

# Left panel: loss per optimisation step.
ax1.plot(range(len(loss_values)), loss_values, 'o', color='tab:orange')
ax1.set_xlabel('Step')
ax1.set_ylabel('Loss')

# Right panel: the visited points drawn on top of the loss curve.
ax2.plot(x, loss_function(x))
ax2.plot(theta_steps, loss_values, 'o')
ax2.set_xlabel('Theta')

plt.show()

#### 2. SGD with Momentum

Use momentum (by setting the `momentum` argument). Which momentum value lets the algorithm converge to the global minimum? Plot the loss values over the number of steps and on the loss surface.

In [None]:
#ToDo: Run SGD with momentum. How do your loss values evolve now?

In [None]:
# Same starting point as the run without momentum.
theta = torch.tensor([-0.95]).requires_grad_(True)

# SGD with a momentum term of 0.95 on top of the same learning rate.
optimizer = torch.optim.SGD(params=[theta], lr=0.01, momentum=0.95)

def run_SGD(theta, optimizer, loss_function):
    """Take a single optimisation step and report the loss it started from.

    The loss is evaluated at the current ``theta``, backpropagated, and
    ``optimizer`` applies one update; gradients are cleared afterwards.

    Returns:
        The loss tensor evaluated before the update was applied.
    """
    step_loss = loss_function(theta)
    step_loss.backward()
    optimizer.step()
    # Gradients are reset after the update, ready for the following call.
    optimizer.zero_grad()
    return step_loss

# Track the parameter value before each of the 100 updates together with the
# loss evaluated at that value.
theta_steps = []
loss_values = []

for i in range(100):
    theta_steps.append(theta.item())
    loss_values.append(run_SGD(theta, optimizer, loss_function).item())

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(10, 5))

# Left panel: loss per optimisation step.
ax1.plot(range(len(loss_values)), loss_values, 'o', color='tab:orange')
ax1.set_xlabel('Step')
ax1.set_ylabel('Loss')

# Right panel: the trajectory drawn on top of the loss curve.
ax2.plot(x, loss_function(x))
ax2.plot(theta_steps, loss_values, 'o')
ax2.set_xlabel('Theta')

plt.show()

### 2. Effect of Optimization Techniques on Training Time and Performance
In this task, we revisit the CNN from Exercise 4.2, which you built to classify images into the 10 classes of the CIFAR-10 dataset. You can re-use your solution for that task.

Same data loading as in Exercise 4.2:

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

# Debug switch: when True, only the first 100 samples of each split are kept.
use_subset = False

# ToTensor is the only preprocessing applied to the images.
transform = transforms.ToTensor()

# Build the train=True split first, then the train=False split that serves
# as validation data.
train_dataset, val_dataset = (
    torchvision.datasets.CIFAR10(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)
classes = train_dataset.classes

if use_subset:
    train_dataset, val_dataset = (
        torch.utils.data.Subset(full_split, torch.arange(0, 100))
        for full_split in (train_dataset, val_dataset)
    )

print(f'classes: {classes}\nnumber of instances:\n\ttrain: {len(train_dataset)}\n\tval: {len(val_dataset)}')

def show_examples(n):
    """Print shape and label of ``n`` random training samples and plot them."""
    for _ in range(n):
        # Draw one random position in the training set.
        idx = torch.randint(0, len(train_dataset), size=(1,))
        image, target = train_dataset[idx]
        print(f'image of shape: {image.shape}')
        print(f'label: {classes[target]}')
        # Move the first axis to the end before handing the image to imshow.
        pixels = image.permute(1,2,0).numpy()
        plt.imshow(pixels)
        plt.show()

show_examples(4)

batch_size = 32

# Only the training loader shuffles; both loaders use two worker processes.
train_dl = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
val_dl = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=2,
)

#### 1. Optimizers

Re-train the model for a maximum of $10$ epochs with
* SGD without momentum
* SGD with momentum (set to $0.9$)
* AdaGrad
* ADAM

Use a learning rate of $0.001$. Make sure to fix the random seed before creating the model in each training run for a better comparison. To set the random seed, use `torch.manual_seed(0)` before initializing each model.

For each epoch, print:
* the average train loss
* the average validation loss
* the validation accuracy

How do the optimizers compare in terms of convergence speed and resulting validation performance?

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

In [None]:
#ToDo: Set up model training

In [None]:
def set_up_model():
    """Create the CNN classifier as a single ``nn.Sequential``.

    Three 3x3 convolutions (48, 96 and 192 channels) with ReLU activations
    and two 2x2 max-pooling stages are followed by a two-layer fully
    connected head with 10 outputs. The first linear layer expects
    ``192 * 8 * 8`` flattened features.

    Returns:
        The freshly initialised ``nn.Sequential`` model.
    """
    # Layers are instantiated in network order so that a fixed random seed
    # always yields the same initial weights.
    conv_stack = [
        nn.Conv2d(3, 48, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(48, 96, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d((2,2)),
        nn.Conv2d(96, 192, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d((2,2)),
    ]
    classifier_head = [
        nn.Flatten(),
        nn.Linear(192 * 8 * 8, 64),
        nn.ReLU(),
        nn.Linear(64, 10),
    ]
    return nn.Sequential(*conv_stack, *classifier_head)

# Classification criterion that the training runs pass to `train` as its
# `loss_function` argument.
loss = nn.CrossEntropyLoss()


def train(epochs, model, loss_function, opt, train_loader, val_loader, evaluation):
    """Train ``model`` and validate it after every epoch.

    Args:
        epochs: Number of passes over ``train_loader``.
        model: Module to optimise; put into train mode at the start of each
            epoch.
        loss_function: Callable ``(output, label)`` returning the loss tensor.
        opt: Optimizer that updates the model parameters.
        train_loader: Iterable yielding ``(img, label)`` batches.
        val_loader: Handed unchanged to ``evaluation``.
        evaluation: Callable ``(model, loss_function, val_loader)`` returning
            ``(accuracy, val_loss)``.

    Returns:
        A list with one ``[train_loss, val_loss, accuracy]`` entry per epoch,
        where ``train_loss`` is the mean of the per-batch losses.
    """
    res = []
    for epoch in range(epochs):
        model.train()
        cum_loss = 0.0
        num_batches = 0
        for img, label in tqdm(train_loader, desc='Train Iteration',ascii=True):
            output = model(img)
            batch_loss = loss_function(output, label)
            batch_loss.backward()
            opt.step()
            opt.zero_grad()
            cum_loss += batch_loss.item()
            num_batches += 1
        acc, val_loss = evaluation(model, loss_function, val_loader)
        # Average over the batches that were actually processed: this also
        # works for loaders without __len__ and reports NaN instead of
        # raising ZeroDivisionError when the loader yields nothing.
        cum_loss = cum_loss / num_batches if num_batches else float('nan')
        res.append([cum_loss, val_loss, acc])
        print(f"Epoch {epoch} \t ----> \t Loss {cum_loss:.5f} \t ----- \t Val Loss {val_loss:.5f} \t ----- \t Accuracy {acc:.5f}")
    return res


def evaluation(model, loss_funct, val_loader):
    """Compute accuracy and mean batch loss of ``model`` on ``val_loader``.

    The model is switched to eval mode and all batches are processed with
    gradient tracking disabled.

    Args:
        model: Module to evaluate.
        loss_funct: Callable ``(output, label)`` returning the loss tensor.
        val_loader: Iterable of ``(img, label)`` batches that supports ``len``.

    Returns:
        Tuple ``(accuracy, loss)``: the fraction of samples whose arg-max
        prediction equals the label (as a Python float) and the sum of the
        per-batch losses divided by ``len(val_loader)``.
    """
    model.eval()
    correct = 0
    seen = 0
    running_loss = 0
    with torch.no_grad():
        for img, label in tqdm(val_loader, desc='Val Iteration', ascii=True):
            scores = model(img)
            running_loss += loss_funct(scores, label).item()
            # The predicted class is the index of the largest score per sample.
            correct += (scores.argmax(dim=1) == label).sum()
            seen += len(img)
    accuracy = (correct / seen).item()
    mean_loss = running_loss / len(val_loader)
    return accuracy, mean_loss

In [None]:
#ToDo: Run Training with the different optimizers

In [None]:
# Shared hyper-parameters for all four runs.
lr = 1e-3
num_epochs = 10

# Every run re-seeds the RNG before building the model so that all optimizers
# start from identical initial weights.

# --- SGD without momentum ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.0)
sgd_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# --- SGD with momentum 0.9 ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
sgd_mom_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# --- AdaGrad ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
ada_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# --- Adam ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
adam_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# One comparison table per recorded metric. Each per-epoch entry of the
# *_res lists is [train loss, val loss, accuracy], so the position in the
# titles tuple doubles as the index of the metric to print.
# NOTE(review): `:4f` sets a minimum field width of 4 (six decimals are still
# printed); `.4f` may have been intended - confirm before changing the output.
for metric_idx, title in enumerate(("Train Loss", "\nVal Loss", "\nAccuracy")):
    print(title)
    print("Epoch \t SGD    \t SGD Momentum \t AdaGrad \t Adam")
    for epoch, (sgd, sgd_mom, ada, adam) in enumerate(zip(sgd_res, sgd_mom_res, ada_res, adam_res)):
        print(f"{epoch} \t {sgd[metric_idx]:4f} \t {sgd_mom[metric_idx]:4f} \t {ada[metric_idx]:4f} \t {adam[metric_idx]:4f}")
    

#### 2. Batch Normalization

Repeat the experiment from 1., but add batch normalization after every convolutional layer (after the ReLU activation). What effect does adding batch normalization have?

In [None]:
#ToDo: Set up the model

In [None]:
def set_up_model():
    """Create the CNN classifier with batch normalisation as ``nn.Sequential``.

    Each of the three 3x3 convolutions (48, 96 and 192 channels) is followed
    by a ReLU and then a ``BatchNorm2d`` over its output channels. Two 2x2
    max-pooling stages and a two-layer fully connected head with 10 outputs
    complete the network.

    Returns:
        The freshly initialised ``nn.Sequential`` model.
    """
    # Layers are instantiated in network order so that a fixed random seed
    # always yields the same initial weights.
    conv_stack = [
        nn.Conv2d(3, 48, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.BatchNorm2d(48),
        nn.Conv2d(48, 96, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.BatchNorm2d(96),
        nn.MaxPool2d((2,2)),
        nn.Conv2d(96, 192, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.BatchNorm2d(192),
        nn.MaxPool2d((2,2)),
    ]
    classifier_head = [
        nn.Flatten(),
        nn.Linear(192 * 8 * 8, 64),
        nn.ReLU(),
        nn.Linear(64, 10),
    ]
    return nn.Sequential(*conv_stack, *classifier_head)

In [None]:
#ToDo: Run training with different optimizers

In [None]:
# Shared hyper-parameters for all four runs.
lr = 1e-3
num_epochs = 10

# Every run re-seeds the RNG before building the model so that all optimizers
# start from identical initial weights.

# --- SGD without momentum ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.0)
sgd_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# --- SGD with momentum 0.9 ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
sgd_mom_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# --- AdaGrad ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
ada_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# --- Adam ---
torch.manual_seed(0)
model = set_up_model()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
adam_res = train(
    epochs=num_epochs, model=model, loss_function=loss, opt=optimizer,
    train_loader=train_dl, val_loader=val_dl, evaluation=evaluation,
)

# One comparison table per recorded metric. Each per-epoch entry of the
# *_res lists is [train loss, val loss, accuracy], so the position in the
# titles tuple doubles as the index of the metric to print.
# NOTE(review): `:4f` sets a minimum field width of 4 (six decimals are still
# printed); `.4f` may have been intended - confirm before changing the output.
for metric_idx, title in enumerate(("Train Loss", "\nVal Loss", "\nAccuracy")):
    print(title)
    print("Epoch \t SGD    \t SGD Momentum \t AdaGrad \t Adam")
    for epoch, (sgd, sgd_mom, ada, adam) in enumerate(zip(sgd_res, sgd_mom_res, ada_res, adam_res)):
        print(f"{epoch} \t {sgd[metric_idx]:4f} \t {sgd_mom[metric_idx]:4f} \t {ada[metric_idx]:4f} \t {adam[metric_idx]:4f}")