

```
# This is formatted as code
```

# CSC413 Final Project

**Version Date**: 2023-04-07



#### Imports

In [None]:
import argparse
import math
import time
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr
import scipy.misc
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import math
import torchvision
import matplotlib
import matplotlib.pyplot as plt
import time
import argparse
import math
import time


#### MNIST dataset & FCN

In [None]:
class MNIST_FullyConnected(nn.Module):
    """
    Two-layer fully-connected classifier used for the MNIST experiments.

    The network is a plain ``nn.Module``: it can be optimized, but it is not
    an optimizer itself.
    """
    def __init__(self, num_inp, num_hid, num_out):
        super().__init__()
        self.layer1 = nn.Linear(num_inp, num_hid)
        self.layer2 = nn.Linear(num_hid, num_out)

    def initialize(self):
        """Re-draw both weight matrices with Kaiming-uniform init.

        Only the weights are touched; the biases keep their current values.
        """
        slope = math.sqrt(5)
        for layer in (self.layer1, self.layer2):
            nn.init.kaiming_uniform_(layer.weight, a=slope)

    def forward(self, x):
        """Return log-softmax scores (over dim 1) for the input batch ``x``."""
        hidden = torch.tanh(self.layer1(x))
        # The output layer is also squashed by tanh before the log-softmax.
        scores = torch.tanh(self.layer2(hidden))
        return F.log_softmax(scores, dim=1)

# Settings shared by every experiment in this notebook.
BATCH_SIZE = 256
EPOCHS = 5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# MNIST is downloaded into ./data when missing; images are converted to tensors.
_to_tensor = torchvision.transforms.ToTensor()
mnist_train = torchvision.datasets.MNIST(
    './data', train=True, download=True, transform=_to_tensor)
mnist_test = torchvision.datasets.MNIST(
    './data', train=False, download=True, transform=_to_tensor)

# Training batches are shuffled; the test loader uses a batch size of 10000
# and keeps the dataset order.
dl_train = torch.utils.data.DataLoader(
    mnist_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(
    mnist_test, batch_size=10000, shuffle=False)

#### Hyperoptimizers by Chandra et al.

In [None]:
# Gradient Descent: The Ultimate Optimizer (Chandra et al.)
class Optimizable:
    '''
    Interface for anything that owns parameters to be optimized. It plays a
    role similar to torch.nn.Module, but with the plumbing required for
    hyperoptimization. (torch.nn.Module relies on the Parameter interface,
    which does not give enough control over where tensors get detached.)
    At the lowest level an Optimizable is driven like this:
        o = MyOptimizable(...)
        o.initialize()
        loop {
            o.begin()
            o.zero_grad()
            loss = --compute loss function from parameters--
            loss.backward()
            o.step()
        }
    Each Optimizable forwards begin()/zero_grad() to its own optimiz*er*, so
    a whole stack of optimizers is handled recursively.
    '''
    def __init__(self, parameters, optimizer):
        # Tensors that currently carry gradient information; rebuilt by begin().
        self.all_params_with_gradients = []
        self.parameters = parameters  # dict: parameter name -> tensor
        self.optimizer = optimizer    # must itself behave like an Optimizable

    def initialize(self):
        ''' Hook for parameter initialization (e.g. Kaiming); no-op here. '''
        pass

    def begin(self):
        ''' Drop stale gradients and enable gradient tracking on the
        current parameters, then do the same for the optimizer above. '''
        tracked = self.all_params_with_gradients
        for stale in tracked:
            stale.grad = None
        tracked.clear()
        for tensor in self.parameters.values():
            tensor.requires_grad_()  # track gradients...
            tensor.retain_grad()     # ...and keep them even on non-leaf tensors
            tracked.append(tensor)
        self.optimizer.begin()

    def zero_grad(self):
        ''' Reset the gradient of every tracked tensor to zeros, then ask
        the optimizer above to do the same. '''
        for tensor in self.all_params_with_gradients:
            tensor.grad = torch.zeros_like(tensor)
        self.optimizer.zero_grad()

    # Note: between zero_grad() and step() the caller is expected to call
    # .backward() on the loss.

    def step(self):
        ''' Update the parameters; subclasses override this. '''
        pass

class NoOpOptimizer(Optimizable):
    '''
    Terminator for a stack of optimizers: every hook does nothing, so
    whatever sits below it is left untouched.
    '''
    def __init__(self):
        ''' Nothing to set up; Optimizable.__init__ is deliberately not called. '''

    def initialize(self):
        ''' No parameters to initialize. '''

    def begin(self):
        ''' No gradients to track. '''

    def zero_grad(self):
        ''' No gradients to reset. '''

    def step(self, params):
        ''' Leave ``params`` exactly as they are. '''

    def __str__(self):
        return ''

class SGD(Optimizable):
    '''
    Hyperoptimizable stochastic gradient descent with optional momentum.

    ``alpha`` (step size) and ``mu`` (momentum) are stored as tensors so the
    optimizer stacked on top can update them.
    '''
    def __init__(self, alpha=0.01, mu=0.0, optimizer=NoOpOptimizer()):
        # The plain float decides whether momentum is used at all.
        self.mu = mu
        self.state = {}
        hyper = {'alpha': torch.tensor(alpha), 'mu': torch.tensor(mu)}
        super().__init__(hyper, optimizer)

    def step(self, params):
        ''' Update our own hyperparameters first, then replace every entry
        of ``params`` with its SGD-updated tensor. '''
        self.optimizer.step(self.parameters)
        for name, param in params.items():
            grad = param.grad.detach()
            value = param.detach()
            if self.mu != 0.0:
                if name in self.state:
                    velocity = self.state[name].detach() * self.parameters['mu'] + grad
                else:
                    # First step for this parameter: the buffer starts at g.
                    velocity = grad
                self.state[name] = velocity
                grad = velocity
            params[name] = value - grad * self.parameters['alpha']

    def __str__(self):
        return 'sgd / ' + str(self.optimizer)

class SGDPerParam(Optimizable):
    '''
    Optimizes parameters individually with SGD: each listed parameter gets
    its own learning rate, stored under the key ``<name>_alpha``.
    '''
    def __init__(self, params, optimizer=NoOpOptimizer()):
        '''
        params: either an iterable of ``(name, learning_rate)`` pairs or a
            dict mapping parameter names to learning rates.
        optimizer: the optimizer that updates the per-parameter learning rates.
        '''
        # Iterating a dict yields only its keys, which cannot be unpacked
        # into (name, value) pairs, so dicts are iterated through items().
        pairs = params.items() if isinstance(params, dict) else params
        parameters = {k + '_alpha': torch.tensor(v) for k, v in pairs}
        super().__init__(parameters, optimizer)

    def step(self, params):
        ''' Update the learning rates first, then take one SGD step per
        parameter. Parameters without a learning rate are only detached. '''
        self.optimizer.step(self.parameters)
        for name, param in params.items():
            g = param.grad.detach()
            p = param.detach()
            alpha_key = name + '_alpha'
            if alpha_key in self.parameters:
                params[name] = p - g * self.parameters[alpha_key]
            else:
                params[name] = p

    def __str__(self):
        return 'sgdPerParam / ' + str(self.optimizer)

class AdaGrad(Optimizable):
    '''
    Hyperoptimizable AdaGrad; the step size ``alpha`` is the only
    hyperparameter exposed to the optimizer above.
    '''
    def __init__(self, alpha=0.01, optimizer=NoOpOptimizer()):
        self.eps = 1e-10
        self.cache = {}
        super().__init__({'alpha': torch.tensor(alpha)}, optimizer)

    def step(self, params):
        ''' Update ``alpha`` first, then replace every entry of ``params``
        with its AdaGrad-updated tensor. '''
        self.optimizer.step(self.parameters)
        for name, param in params.items():
            if name not in self.cache:
                # The squared-gradient accumulator starts at 0.1, not zero.
                self.cache[name] = {'G': torch.zeros_like(param) + 1e-1}
            slot = self.cache[name]
            grad = param.grad.detach()
            accum = slot['G'].detach() + torch.square(grad)
            slot['G'] = accum
            # The scaling term is detached, so only alpha carries a gradient.
            scale = torch.sqrt(accum + self.eps).detach()
            params[name] = param.detach() - self.parameters['alpha'] * grad / scale

    def __str__(self):
        return 'adagrad / ' + str(self.optimizer)

class RMSProp(Optimizable):
    '''
    A hyperoptimizable RMSProp.

    Both hyperparameters are stored in a transformed form: ``alpha`` as its
    square root (squared again on every step) and ``gamma`` through
    ``unclamp`` (mapped back into (0, 1) by ``clamp`` on every step).
    '''
    @staticmethod
    def clamp(x):
        ''' Map an unconstrained tensor into (0, 1): (tanh(x) + 1) / 2. '''
        return (x.tanh() + 1.) / 2.

    @staticmethod
    def unclamp(y):
        ''' Inverse of ``clamp`` for y strictly between 0 and 1. '''
        z = y * 2. - 1.
        return ((1. + z) / (1. - z)).log() / 2.

    def __init__(self, alpha=0.01, gamma=0.99, optimizer=NoOpOptimizer()):
        self.eps = 1e-8
        parameters = {
            'alpha': torch.sqrt(torch.tensor(alpha)),
            'gamma': RMSProp.unclamp(torch.tensor(gamma))
        }
        super().__init__(parameters, optimizer)
        self.cache = {}

    def step(self, params):
        ''' Update our own hyperparameters first, then replace every entry
        of ``params`` with its RMSProp-updated tensor. '''
        self.optimizer.step(self.parameters)
        # Decode the stored hyperparameters after they have been updated.
        gamma = RMSProp.clamp(self.parameters['gamma'])
        alpha = torch.square(self.parameters['alpha'])
        for name, param in params.items():
            if name not in self.cache:
                self.cache[name] = {
                    's': torch.zeros_like(param)
                }
            g = param.grad.detach()
            self.cache[name]['s'] = s = gamma * self.cache[name]['s'].detach() + (1. - gamma) * torch.square(g)
            # s depends on gamma, so it is tracked and has its gradient
            # cleared by the next begin().
            self.all_params_with_gradients.append(s)
            params[name] = param.detach() - alpha * g / torch.sqrt(s + self.eps)

    def __str__(self):
        return 'rmsprop / ' + str(self.optimizer)

class RMSPropAlpha(Optimizable):
    '''
    RMSProp variant in which only ``alpha`` is hyperoptimizable; ``gamma``
    stays a fixed constant.
    '''
    def __init__(self, alpha=0.01, gamma=0.99, optimizer=NoOpOptimizer()):
        self.eps = 1e-8
        self.gamma = gamma
        # alpha is stored as its square root and squared again in step().
        root_alpha = torch.sqrt(torch.tensor(alpha))
        super().__init__({'alpha': root_alpha}, optimizer)
        self.cache = {}

    def step(self, params):
        ''' Update ``alpha`` first, then replace every entry of ``params``
        with its RMSProp-updated tensor. '''
        self.optimizer.step(self.parameters)
        lr = torch.square(self.parameters['alpha'])
        decay = self.gamma
        for name, param in params.items():
            if name not in self.cache:
                self.cache[name] = {'s': torch.zeros_like(param)}
            slot = self.cache[name]
            grad = param.grad.detach()
            mean_sq = decay * slot['s'].detach() + (1. - decay) * torch.square(grad)
            slot['s'] = mean_sq
            self.all_params_with_gradients.append(mean_sq)
            params[name] = param.detach() - lr * grad / torch.sqrt(mean_sq + self.eps)

    def __str__(self):
        return 'rmspropAlpha / ' + str(self.optimizer)

class Adam(Optimizable):
    '''
    A hyperoptimizable Adam optimizer.

    ``alpha`` is stored directly. ``beta1`` and ``beta2`` are stored in an
    unconstrained form (see ``unclamp``) and mapped back into (0, 1) with
    ``clamp`` on every step.
    '''
    @staticmethod
    def clamp(x):
        ''' Map an unconstrained tensor into (0, 1): (tanh(x) + 1) / 2. '''
        return (x.tanh() + 1.) / 2.

    @staticmethod
    def unclamp(y):
        ''' Inverse of ``clamp`` for y strictly between 0 and 1. '''
        z = y * 2. - 1.
        return ((1. + z) / (1. - z)).log() / 2.

    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, log_eps=-8., optimizer=NoOpOptimizer()):
        self.eps = 10. ** log_eps
        parameters = {
            'alpha': torch.tensor(alpha),
            'beta1': Adam.unclamp(torch.tensor(beta1)),
            'beta2': Adam.unclamp(torch.tensor(beta2)),
        }
        super().__init__(parameters, optimizer)
        self.num_stepments = 0
        self.cache = {}

    def step(self, params):
        ''' Update our own hyperparameters first, then replace every entry
        of ``params`` with its Adam-updated tensor. '''
        self.num_stepments += 1
        self.optimizer.step(self.parameters)
        t = self.num_stepments
        # Decode the betas after the optimizer above has updated them.
        beta1 = Adam.clamp(self.parameters['beta1'])
        beta2 = Adam.clamp(self.parameters['beta2'])
        for name, param in params.items():
            if name not in self.cache:
                self.cache[name] = {
                    'm': torch.zeros_like(param),
                    # NOTE that we add a little `fudge factor' here because
                    # sqrt is not differentiable at exactly zero
                    'v': torch.zeros_like(param) + self.eps
                }
            g = param.grad.detach()
            self.cache[name]['m'] = m =\
                beta1 * self.cache[name]['m'].detach() + (1. - beta1) * g
            self.cache[name]['v'] = v =\
                beta2 * self.cache[name]['v'].detach() + (1. - beta2) * g * g
            # m and v depend on the betas, so they are tracked and have
            # their gradients cleared by the next begin().
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)

            # Bias-corrected moment estimates.
            m_hat = m / (1. - beta1 ** float(t))
            v_hat = v / (1. - beta2 ** float(t))

            dparam = m_hat / (v_hat ** 0.5 + self.eps)
            params[name] = param.detach() - self.parameters['alpha'] * dparam

    def __str__(self):
        return 'adam / ' + str(self.optimizer)

class AdamBaydin(Optimizable):
    ''' Adam in which only the learning rate ``alpha`` is hyperoptimizable;
    beta1, beta2 and epsilon are kept as plain constants. '''

    def __init__(
        self,
        alpha=0.001, beta1=0.9, beta2=0.999, log_eps=-8.,
        optimizer=NoOpOptimizer()
    ):
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.log_eps = log_eps
        super().__init__({'alpha': torch.tensor(alpha)}, optimizer)
        self.num_stepments = 0
        self.cache = {}

    def step(self, params):
        ''' Update ``alpha`` first, then replace every entry of ``params``
        with its Adam-updated tensor. '''
        self.num_stepments += 1
        self.optimizer.step(self.parameters)
        step_no = float(self.num_stepments)
        b1, b2 = self.beta1, self.beta2
        eps = 10. ** self.log_eps
        # The bias-correction factors are the same for every parameter.
        m_correction = 1. - b1 ** step_no
        v_correction = 1. - b2 ** step_no
        for name, param in params.items():
            if name not in self.cache:
                # v starts at eps rather than zero because sqrt is not
                # differentiable at exactly zero.
                self.cache[name] = {
                    'm': torch.zeros_like(param),
                    'v': torch.zeros_like(param) + eps,
                }
            slot = self.cache[name]

            grad = param.grad.detach()
            m = b1 * slot['m'].detach() + (1. - b1) * grad
            v = b2 * slot['v'].detach() + (1. - b2) * grad * grad
            slot['m'] = m
            slot['v'] = v

            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)

            m_hat = m / m_correction
            v_hat = v / v_correction

            direction = m_hat / (v_hat ** 0.5 + eps)
            params[name] = param.detach() - self.parameters['alpha'] * direction

    def __str__(self):
        return 'adamBaydin / ' + str(self.optimizer)


class ModuleWrapper(Optimizable):
    '''
    This class tries to convert a torch.nn.Module to an Optimizable, handling
    the internal plumbing needed to update parameters correctly.

    The wrapper keeps its own name -> tensor dict (``self.parameters``), built
    from ``module.named_parameters``. After every optimizer step the updated
    tensors are written back into the wrapped module.
    '''
    def __init__(self, module, optimizer=NoOpOptimizer()):
        ''' Wrap ``module`` and register all of its (recursively collected)
        named parameters as the parameters to optimize. '''
        self.module = module
        parameters = {k:v for k, v in module.named_parameters(recurse=True)}
        super().__init__(parameters, optimizer)
    
    def initialize(self):
        ''' Delegate initialization to the optimizer; the module itself is
        not re-initialized here. '''
        self.optimizer.initialize()
    
    def zero_grad(self):
        """ Set all gradients to zero: first through the module's own
        zero_grad(), then on every tracked tensor, then on the optimizer. """
        self.module.zero_grad()
        for param in self.all_params_with_gradients:
            param.grad = torch.zeros_like(param)
        self.optimizer.zero_grad()
    
    def forward(self, *xyz):
        ''' Call the wrapped module with the given positional arguments. '''
        return self.module(*xyz)
    
    def train(self):
        ''' Put the wrapped module into training mode. '''
        self.module.train()
    
    def eval(self):
        ''' Put the wrapped module into evaluation mode. '''
        self.module.eval()
    
    def step(self):
        ''' Let the optimizer update ``self.parameters``, then rebind every
        parameter slot of the wrapped module to its updated tensor. '''
        self.optimizer.step(self.parameters)
        def set_param(m, k, v):
            # k is a dotted parameter name such as "layer1.weight"; kk keeps
            # the full name for the lookup in self.parameters. v is unused.
            kk = k
            while '.' in k:
                # Descend one submodule per dotted component.
                sm = k[:k.index('.')]
                k = k[k.index('.') + 1:]
                m = m._modules[sm]

            # NOTE(review): the slot is cleared before it is rebound; the
            # reason is not evident from this file, so keep the order as is.
            m._parameters[k] = None
            m._parameters[k] = self.parameters[kk]

        for k, v in self.module.named_parameters(recurse=True):
            set_param(self.module, k, v)

#### New hyperoptimizers for AdaBelief and AMSGrad

In [None]:
# Adabelief
class AdaBelief(Optimizable):
    '''
    A hyperoptimizable AdaBelief optimizer.

    ``alpha`` is stored directly. ``beta1`` and ``beta2`` are stored in an
    unconstrained form (see ``unclamp``) and mapped back into (0, 1) with
    ``clamp`` on every step.
    '''
    @staticmethod
    def clamp(x):
        ''' Map an unconstrained tensor into (0, 1): (tanh(x) + 1) / 2. '''
        return (x.tanh() + 1.) / 2.

    @staticmethod
    def unclamp(y):
        ''' Inverse of ``clamp`` for y strictly between 0 and 1. '''
        z = y * 2. - 1.
        return ((1. + z) / (1. - z)).log() / 2.

    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, log_eps=-8., optimizer=NoOpOptimizer()):
        self.eps = 10. ** log_eps
        # Use this class's own helpers rather than reaching into Adam.
        parameters = {
            'alpha': torch.tensor(alpha),
            'beta1': AdaBelief.unclamp(torch.tensor(beta1)),
            'beta2': AdaBelief.unclamp(torch.tensor(beta2)),
        }
        super().__init__(parameters, optimizer)
        self.num_stepments = 0
        self.cache = {}

    def step(self, params):
        ''' Update our own hyperparameters first, then replace every entry
        of ``params`` with its AdaBelief-updated tensor. '''
        self.num_stepments += 1
        self.optimizer.step(self.parameters)
        t = self.num_stepments
        # Decode the betas after the optimizer above has updated them.
        beta1 = AdaBelief.clamp(self.parameters['beta1'])
        beta2 = AdaBelief.clamp(self.parameters['beta2'])
        for name, param in params.items():
            if name not in self.cache:
                self.cache[name] = {
                    'm': torch.zeros_like(param),
                    # NOTE that we add a little `fudge factor' here because
                    # sqrt is not differentiable at exactly zero
                    's': torch.zeros_like(param) + self.eps
                }
            g = param.grad.detach()
            m = beta1 * self.cache[name]['m'].detach() + (1. - beta1) * g
            # s tracks the squared deviation of the gradient from its
            # running mean m; eps is added on every update.
            self.cache[name]['s'] = s =\
                beta2 * self.cache[name]['s'].detach() + (1. - beta2) * (
                    g - m) ** 2 + self.eps
            self.cache[name]['m'] = m
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(s)

            # Bias-corrected estimates.
            m_hat = m / (1. - beta1 ** float(t))
            s_hat = s / (1. - beta2 ** float(t))

            dparam = m_hat / (s_hat ** 0.5 + self.eps)
            params[name] = param.detach() - self.parameters['alpha'] * dparam

    def __str__(self):
        return 'adabelief / ' + str(self.optimizer)


class AdaBeliefBaydin(Optimizable):
    '''
    AdaBelief in which only the learning rate ``alpha`` is hyperoptimizable;
    beta1, beta2 and epsilon are kept as plain constants.
    '''
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, log_eps=-8., optimizer=NoOpOptimizer()):
        self.eps = 10. ** log_eps
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.log_eps = log_eps
        super().__init__({'alpha': torch.tensor(alpha)}, optimizer)
        self.num_stepments = 0
        self.cache = {}

    def step(self, params):
        ''' Update ``alpha`` first, then replace every entry of ``params``
        with its AdaBelief-updated tensor. '''
        self.num_stepments += 1
        self.optimizer.step(self.parameters)
        step_no = float(self.num_stepments)
        b1, b2 = self.beta1, self.beta2
        # The bias-correction factors are the same for every parameter.
        m_correction = 1. - b1 ** step_no
        s_correction = 1. - b2 ** step_no
        for name, param in params.items():
            if name not in self.cache:
                # s starts at eps rather than zero because sqrt is not
                # differentiable at exactly zero.
                self.cache[name] = {
                    'm': torch.zeros_like(param),
                    's': torch.zeros_like(param) + self.eps,
                }
            slot = self.cache[name]
            grad = param.grad.detach()
            m = b1 * slot['m'].detach() + (1. - b1) * grad
            # Squared deviation of the gradient from its running mean.
            s = b2 * slot['s'].detach() + (1. - b2) * (grad - m) ** 2 + self.eps
            slot['s'] = s
            slot['m'] = m
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(s)

            m_hat = m / m_correction
            s_hat = s / s_correction

            direction = m_hat / (s_hat ** 0.5 + self.eps)
            params[name] = param.detach() - self.parameters['alpha'] * direction

    def __str__(self):
        return 'adabeliefbaydin / ' + str(self.optimizer)




# Amsgrad
class AMSGrad(Optimizable):
    '''
    A hyperoptimizable AMSGrad optimizer.

    AMSGrad differs from Adam by dividing by the running maximum of the
    second-moment estimate, so the denominator never decreases between
    steps. No bias correction is applied.

    ``alpha`` is stored directly. ``beta1`` and ``beta2`` are stored in an
    unconstrained form (see ``unclamp``) and mapped back into (0, 1) with
    ``clamp`` on every step.
    '''
    @staticmethod
    def clamp(x):
        ''' Map an unconstrained tensor into (0, 1): (tanh(x) + 1) / 2. '''
        return (x.tanh() + 1.) / 2.

    @staticmethod
    def unclamp(y):
        ''' Inverse of ``clamp`` for y strictly between 0 and 1. '''
        z = y * 2. - 1.
        return ((1. + z) / (1. - z)).log() / 2.

    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, log_eps=-8., optimizer=NoOpOptimizer()):
        self.eps = 10. ** log_eps
        parameters = {
            'alpha': torch.tensor(alpha),
            'beta1': AMSGrad.unclamp(torch.tensor(beta1)),
            'beta2': AMSGrad.unclamp(torch.tensor(beta2)),
        }
        super().__init__(parameters, optimizer)
        self.num_stepments = 0
        self.cache = {}

    def step(self, params):
        ''' Update our own hyperparameters first, then replace every entry
        of ``params`` with its AMSGrad-updated tensor. '''
        self.num_stepments += 1
        self.optimizer.step(self.parameters)
        # Decode the betas after the optimizer above has updated them.
        beta1 = AMSGrad.clamp(self.parameters['beta1'])
        beta2 = AMSGrad.clamp(self.parameters['beta2'])
        for name, param in params.items():
            if name not in self.cache:
                # v starts at eps rather than zero because sqrt is not
                # differentiable at exactly zero; the running maximum
                # starts from the same value.
                v_init = torch.zeros_like(param) + self.eps
                self.cache[name] = {
                    'm': torch.zeros_like(param),
                    'v': v_init,
                    'v_hat': v_init.clone(),
                }
            state = self.cache[name]
            g = param.grad.detach()
            m = beta1 * state['m'].detach() + (1. - beta1) * g
            v = beta2 * state['v'].detach() + (1. - beta2) * g * g
            # Running maximum over ALL past v, not just the previous one.
            # Comparing against only the previous v would let the
            # denominator shrink again two steps later.
            v_hat = torch.maximum(state['v_hat'].detach(), v)

            state['m'] = m
            state['v'] = v
            state['v_hat'] = v_hat
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)

            dparam = m / (v_hat ** 0.5 + self.eps)
            params[name] = param.detach() - self.parameters['alpha'] * dparam

    def __str__(self):
        return 'amsgrad / ' + str(self.optimizer)
  

class AMSGradBaydin(Optimizable):
    '''
    AMSGrad in which only the learning rate ``alpha`` is hyperoptimizable;
    beta1, beta2 and epsilon are kept as plain constants.

    The update divides by the running maximum of the second-moment
    estimate, so the denominator never decreases between steps. No bias
    correction is applied.
    '''
    @staticmethod
    def clamp(x):
        ''' Map an unconstrained tensor into (0, 1): (tanh(x) + 1) / 2.
        Not used by this class itself; kept for interface compatibility. '''
        return (x.tanh() + 1.) / 2.

    @staticmethod
    def unclamp(y):
        ''' Inverse of ``clamp`` for y strictly between 0 and 1. '''
        z = y * 2. - 1.
        return ((1. + z) / (1. - z)).log() / 2.

    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, log_eps=-8., optimizer=NoOpOptimizer()):
        self.eps = 10. ** log_eps
        parameters = {
            'alpha': torch.tensor(alpha),
        }
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.log_eps = log_eps
        super().__init__(parameters, optimizer)
        self.num_stepments = 0
        self.cache = {}

    def step(self, params):
        ''' Update ``alpha`` first, then replace every entry of ``params``
        with its AMSGrad-updated tensor. '''
        self.num_stepments += 1
        self.optimizer.step(self.parameters)
        beta1 = self.beta1
        beta2 = self.beta2
        for name, param in params.items():
            if name not in self.cache:
                # v starts at eps rather than zero because sqrt is not
                # differentiable at exactly zero; the running maximum
                # starts from the same value.
                v_init = torch.zeros_like(param) + self.eps
                self.cache[name] = {
                    'm': torch.zeros_like(param),
                    'v': v_init,
                    'v_hat': v_init.clone(),
                }
            state = self.cache[name]
            g = param.grad.detach()
            m = beta1 * state['m'].detach() + (1. - beta1) * g
            v = beta2 * state['v'].detach() + (1. - beta2) * g * g
            # Running maximum over ALL past v, not just the previous one.
            v_hat = torch.maximum(state['v_hat'].detach(), v)

            state['m'] = m
            state['v'] = v
            state['v_hat'] = v_hat
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)

            dparam = m / (v_hat ** 0.5 + self.eps)
            params[name] = param.detach() - self.parameters['alpha'] * dparam

    def __str__(self):
        # Distinct label so this variant is not confused with plain AMSGrad.
        return 'amsgradbaydin / ' + str(self.optimizer)

## Part 1: Reproduction of the original Paper
In this part, we compare the performance of classical optimizers and hyperoptimized optimizers on the MNIST dataset. Specifically, we compare the losses and accuracies of:


1. Classical SGD vs. SGD with SGD/Adam/AdaGrad/RMSProp hyperoptimizers.

2. Classical Adam v.s. Adam with Adam/SGD hyperoptimizers.

3. AdaGrad v.s. AdaGrad with SGD/AdaGrad hyperoptimizers.

4. RMSProp vs. RMSProp with SGD/RMSProp hyperoptimizers.


#### Training and generating optimizers

In [None]:
def train(model, mw):
    """Train for EPOCHS epochs on dl_train, evaluating on dl_test after each.

    Args:
        model: the network; used directly for evaluation and for switching
            between train and eval mode.
        mw: the wrapper whose begin/forward/zero_grad/step methods drive the
            hyperoptimized training loop.

    Returns:
        A tuple ``(train_losses, val_losses, accuracy, final_parameters)``.
        The first three are lists with one entry per epoch (accuracy in
        percent). ``final_parameters`` maps each hyperparameter of
        ``mw.optimizer`` to its final value in human-readable form.
    """
    train_losses = []
    val_losses = []
    accuracy = []
    final_parameters = {}
    total_time = 0
    num_test = len(dl_test.dataset)

    for epoch in range(1, EPOCHS + 1):
        # Evaluation below switches the model to eval mode; switch back so
        # every epoch (not only the first) trains in training mode.
        model.train()
        running_loss = 0.0
        start_time = time.time()
        for features_, labels_ in dl_train:
            mw.begin()  # call this before each step, enables gradient tracking on desired params
            features = torch.reshape(features_, (-1, 28 * 28)).to(DEVICE)
            labels = labels_.to(DEVICE)
            loss = F.nll_loss(mw.forward(features), labels)
            mw.zero_grad()
            loss.backward(create_graph=True)  # important! use create_graph=True
            mw.step()
            running_loss += loss.item() * features_.size(0)
        # Only the training pass is timed, not the evaluation.
        total_time += time.time() - start_time
        train_loss = running_loss / len(dl_train.dataset)
        train_losses.append(train_loss)

        # Evaluate on the test dataset.
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in dl_test:
                data, target = data.to(DEVICE), target.to(DEVICE)
                data = data.view(data.size(0), -1)  # flatten the input data
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                predicted = output.argmax(dim=1, keepdim=True)
                correct += predicted.eq(target.view_as(predicted)).sum().item()
        test_loss /= num_test
        val_losses.append(test_loss)
        accuracy.append(100. * correct / num_test)
        print('Epoch: {}, Train loss: {:.4f}, Test loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
              epoch, train_loss, test_loss, correct, num_test,
              100. * correct / num_test))

    print("The training time is: " + str(total_time))

    # Several optimizers store hyperparameters in a transformed form; decode
    # them so the reported values are the usual ones.
    beta_family = (Adam, AdamBaydin, AdaBelief, AdaBeliefBaydin, AMSGrad, AMSGradBaydin)
    rms_family = (RMSProp, RMSPropAlpha)
    opt = mw.optimizer
    # An optimizer without a `parameters` attribute has nothing to report.
    for name, raw in getattr(opt, 'parameters', {}).items():
        is_rms = isinstance(opt, rms_family)
        stored_unclamped = (
            (isinstance(opt, beta_family) and name in ('beta1', 'beta2'))
            or (is_rms and name == 'gamma')
        )
        if stored_unclamped:
            value = Adam.clamp(raw).item()
        elif is_rms and name == 'alpha':
            value = raw.item() ** 2  # stored as sqrt(alpha)
        else:
            value = raw.item()
        final_parameters[name] = value
        print(name + ": " + str(value))
    return train_losses, val_losses, accuracy, final_parameters

In [None]:
def generate_optimizers(optimizer1, hyperoptimizer1=False, lr=False):
  """
  Build a fresh MNIST model wrapped with the requested optimizer stack.

  Args:
    optimizer1: name of the optimizer applied to the model; one of 'Adam',
      'AdaGrad', 'RMSProp', 'SGD', 'AdamBaydin', 'RMSPropAlpha', 'AdaBelief',
      'AMSGrad'.
    hyperoptimizer1: name of the optimizer stacked on top of optimizer1
      ('Adam', 'AdaGrad', 'RMSProp', 'SGD', 'AdaBelief' or 'AMSGrad'), or a
      falsy value to use NoOpOptimizer.
    lr: passed as the single positional argument to the hyperoptimizer's
      constructor; unused when no hyperoptimizer is requested.
  Returns:
    (model, mw): the new MNIST_FullyConnected model and the ModuleWrapper
    that couples it with the optimizer stack.
  Raises:
    ValueError: if optimizer1 or hyperoptimizer1 is not a supported name.
  """
  hyper_classes = {'Adam': Adam, 'AdaGrad': AdaGrad, 'RMSProp': RMSProp,
                   'SGD': SGD, 'AdaBelief': AdaBelief, 'AMSGrad': AMSGrad}
  # The two extra variants are only available as the inner optimizer.
  optimizer_classes = dict(hyper_classes, AdamBaydin=AdamBaydin,
                           RMSPropAlpha=RMSPropAlpha)

  # Only the hyperoptimizer that is actually requested gets constructed.
  if not hyperoptimizer1:
    hyperopt = NoOpOptimizer()
  elif hyperoptimizer1 in hyper_classes:
    hyperopt = hyper_classes[hyperoptimizer1](lr)
  else:
    raise ValueError("Unknown hyperoptimizer: {!r} (expected one of {})".format(
        hyperoptimizer1, sorted(hyper_classes)))

  if optimizer1 not in optimizer_classes:
    # Previously an unknown name fell through every `if` and failed later
    # with an UnboundLocalError on `opt`.
    raise ValueError("Unknown optimizer: {!r} (expected one of {})".format(
        optimizer1, sorted(optimizer_classes)))

  model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)
  opt = optimizer_classes[optimizer1](optimizer=hyperopt)

  mw = ModuleWrapper(model, optimizer=opt)
  return model, mw
    


### Train by classical and hyperoptimized optimizers

#### SGD

In [None]:
# SGD:
# Every generate_optimizers call builds a fresh model; train returns
# (train losses, validation losses, accuracies, final hyperparameters).
# Baseline: no hyperoptimizer (NoOpOptimizer), final hyperparameters discarded.
model, mw = generate_optimizers('SGD')
SGD_train_losses, SGD_val_losses, SGD_accuracy, _ = train(model, mw)
# SGD with an SGD hyperoptimizer constructed with lr=0.01.
model, mw = generate_optimizers('SGD','SGD',0.01)
SGD_SGD_train_losses, SGD_SGD_val_losses, SGD_SGD_accuracy, SGD_SGD_params = train(model, mw)
# SGD with an Adam hyperoptimizer constructed with lr=0.1.
model, mw = generate_optimizers('SGD','Adam',0.1)
SGD_Adam_train_losses, SGD_Adam_val_losses, SGD_Adam_accuracy, SGD_Adam_params = train(model, mw)
# SGD with an AdaGrad hyperoptimizer constructed with lr=0.01.
model, mw = generate_optimizers('SGD','AdaGrad',0.01)
SGD_AdaGrad_train_losses, SGD_AdaGrad_val_losses, SGD_AdaGrad_accuracy, SGD_AdaGrad_params = train(model, mw)

In [None]:
# Fifth SGD run, kept in its own cell: SGD with an RMSProp hyperoptimizer
# constructed with lr=0.1.
model, mw = generate_optimizers('SGD','RMSProp',0.1)
SGD_RMSProp_train_losses, SGD_RMSProp_val_losses, SGD_RMSProp_accuracy, SGD_RMSProp_params = train(model, mw)


#### Adam

In [None]:
# NOTE: the raw beta values held by the optimizer can look odd (about 3).
# train() passes beta1/beta2 through Adam.clamp before reporting them, so the
# clamp/unclamp operation is what converts between the stored and usable values.


# Baseline: Adam with no hyperoptimizer (NoOpOptimizer).
model, mw = generate_optimizers('Adam')
Adam_train_losses, Adam_val_losses, Adam_accuracy, _ = train(model, mw)
# Adam with an SGD hyperoptimizer constructed with lr=0.00001.
model, mw = generate_optimizers('Adam','SGD',0.00001)
Adam_SGD_train_losses, Adam_SGD_val_losses, Adam_SGD_accuracy, Adam_SGD_params = train(model, mw)
# AdamBaydin variant (plotted as "learning-rate only") with the same SGD hyperoptimizer.
model, mw = generate_optimizers('AdamBaydin','SGD',0.00001)
AdamBaydin_SGD_train_losses, AdamBaydin_SGD_val_losses, AdamBaydin_SGD_accuracy, AdamBaydin_SGD_params = train(model, mw)
# Adam with an Adam hyperoptimizer constructed with lr=0.001.
model, mw = generate_optimizers('Adam','Adam',0.001)
Adam_Adam_train_losses, Adam_Adam_val_losses, Adam_Adam_accuracy, Adam_Adam_params = train(model, mw)
# AdamBaydin variant with the same Adam hyperoptimizer.
model, mw = generate_optimizers('AdamBaydin','Adam',0.001)
AdamBaydin_Adam_train_losses, AdamBaydin_Adam_val_losses, AdamBaydin_Adam_accuracy, AdamBaydin_Adam_params = train(model, mw)

#### AdaGrad

In [None]:
# Baseline: AdaGrad with no hyperoptimizer (NoOpOptimizer).
model, mw = generate_optimizers('AdaGrad')
AdaGrad_train_losses, AdaGrad_val_losses,AdaGrad_accuracy, _ = train(model, mw)
# AdaGrad with an SGD hyperoptimizer constructed with lr=0.01.
model, mw = generate_optimizers('AdaGrad','SGD',0.01)
AdaGrad_SGD_train_losses, AdaGrad_SGD_val_losses, AdaGrad_SGD_accuracy, AdaGrad_SGD_params = train(model, mw)
# AdaGrad with an AdaGrad hyperoptimizer constructed with lr=0.01.
model, mw = generate_optimizers('AdaGrad','AdaGrad',0.01)
AdaGrad_AdaGrad_train_losses, AdaGrad_AdaGrad_val_losses, AdaGrad_AdaGrad_accuracy, AdaGrad_AdaGrad_params = train(model, mw)


#### RMSProp

In [None]:
# Re-assign the global epoch count (same value as the notebook-level default).
EPOCHS=5
# Baseline: RMSProp with no hyperoptimizer (NoOpOptimizer).
model, mw = generate_optimizers('RMSProp')
RMSProp_train_losses, RMSProp_val_losses,RMSProp_accuracy, _ = train(model, mw)

# RMSPropAlpha variant (plotted as "learning rate only") with an SGD
# hyperoptimizer constructed with lr=0.0001.
model, mw = generate_optimizers('RMSPropAlpha','SGD',0.0001)
RMSPropAlpha_SGD_train_losses, RMSPropAlpha_SGD_val_losses, RMSPropAlpha_SGD_accuracy, RMSPropAlpha_SGD_params = train(model, mw)
# RMSProp with the same SGD hyperoptimizer.
model, mw = generate_optimizers('RMSProp','SGD',0.0001)
RMSProp_SGD_train_losses, RMSProp_SGD_val_losses, RMSProp_SGD_accuracy, RMSProp_SGD_params = train(model, mw)
# RMSPropAlpha variant with an RMSProp hyperoptimizer constructed with lr=0.0001.
model, mw = generate_optimizers('RMSPropAlpha','RMSProp',0.0001)
RMSPropAlpha_RMSProp_train_losses, RMSPropAlpha_RMSProp_val_losses, RMSPropAlpha_RMSProp_accuracy, RMSPropAlpha_RMSProp_params = train(model, mw)
# RMSProp with the same RMSProp hyperoptimizer.
model, mw = generate_optimizers('RMSProp','RMSProp',0.0001)
RMSProp_RMSProp_train_losses, RMSProp_RMSProp_val_losses, RMSProp_RMSProp_accuracy, RMSProp_RMSProp_params = train(model, mw)

#### AdaBelief

In [None]:
# AdaBelief:
# Re-assign the global epoch count (same value as the notebook-level default).
EPOCHS=5
# Baseline: AdaBelief with no hyperoptimizer (NoOpOptimizer).
model, mw = generate_optimizers('AdaBelief')
AdaBelief_train_losses, AdaBelief_val_losses, AdaBelief_accuracy, _ = train(model, mw)
# AdaBelief with an SGD hyperoptimizer constructed with lr=0.0001.
model, mw = generate_optimizers('AdaBelief','SGD',0.0001)
AdaBelief_SGD_train_losses, AdaBelief_SGD_val_losses, AdaBelief_SGD_accuracy, AdaBelief_SGD_params = train(model, mw)
# AdaBelief with an Adam hyperoptimizer constructed with lr=0.0001.
model, mw = generate_optimizers('AdaBelief','Adam',0.0001)
AdaBelief_Adam_train_losses, AdaBelief_Adam_val_losses, AdaBelief_Adam_accuracy, AdaBelief_Adam_params = train(model, mw)


#### AMSGrad

In [None]:
# AMSGrad:
# Baseline: AMSGrad with no hyperoptimizer (NoOpOptimizer).
model, mw = generate_optimizers('AMSGrad')
AMSGrad_train_losses, AMSGrad_val_losses, AMSGrad_accuracy, _ = train(model, mw)
# AMSGrad with an SGD hyperoptimizer constructed with lr=0.0001.
model, mw = generate_optimizers('AMSGrad','SGD',0.0001)
AMSGrad_SGD_train_losses, AMSGrad_SGD_val_losses, AMSGrad_SGD_accuracy, AMSGrad_SGD_params = train(model, mw)
# AMSGrad with an Adam hyperoptimizer constructed with lr=0.0001.
model, mw = generate_optimizers('AMSGrad','Adam',0.0001)
AMSGrad_Adam_train_losses, AMSGrad_Adam_val_losses, AMSGrad_Adam_accuracy, AMSGrad_Adam_params = train(model, mw)


### Test the optimizers with hyperoptimized hyperparameters

#### SGD

In [None]:
# The two commented-out lines below would select the hyperparameters of the
# most accurate hyperoptimized SGD run; they are disabled and a fixed value is
# used instead.
# SGD_params = [(SGD_SGD_accuracy[-1], SGD_SGD_params),(SGD_Adam_accuracy[-1], SGD_Adam_params),(SGD_AdaGrad_accuracy[-1], SGD_AdaGrad_params),(SGD_RMSProp_accuracy[-1], SGD_RMSProp_params)]
# params = sorted(SGD_params, key=lambda x: x[0])[-1][1]
# Train a fresh model with plain SGD (no hyperoptimizer).
model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)
# NOTE(review): hard-coded value, presumably copied from an earlier
# hyperoptimized run's printed output -- confirm.
optimizer = SGD(0.1137143075466156)
mw = ModuleWrapper(model, optimizer=optimizer)
mw.initialize()
Hyperoptimized_SGD_train_losses, Hyperoptimized_SGD_val_losses, Hyperoptimized_SGD_accuracy, _ = train(model, mw)



#### Adam

In [None]:
# Candidates: (final validation accuracy, final hyperparameters) per
# hyperoptimized Adam run.
Adam_params = [
    (Adam_SGD_accuracy[-1], Adam_SGD_params),
    (AdamBaydin_SGD_accuracy[-1], AdamBaydin_SGD_params),
    (Adam_Adam_accuracy[-1], Adam_Adam_params),
    (AdamBaydin_Adam_accuracy[-1], AdamBaydin_Adam_params),
]
# Ascending sort by accuracy; the last entry is the most accurate run.
params = sorted(Adam_params, key=lambda pair: pair[0])[-1][1]
model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)

# Some runs only report 'alpha'; pass the betas along only when present.
adam_kwargs = {'alpha': params['alpha']}
if 'beta1' in params:
    adam_kwargs['beta1'] = params['beta1']
    adam_kwargs['beta2'] = params['beta2']
optimizer = Adam(**adam_kwargs)

mw = ModuleWrapper(model, optimizer=optimizer)
mw.initialize()
(Hyperoptimized_Adam_train_losses,
 Hyperoptimized_Adam_val_losses,
 Hyperoptimized_Adam_accuracy,
 _) = train(model, mw)

#### AdaGrad

In [None]:
# Candidates: (final validation accuracy, final hyperparameters) per
# hyperoptimized AdaGrad run. Each accuracy is paired with the parameters of
# the SAME run (the SGD-hyperoptimized entry used to carry the AdaGrad one's).
AdaGrad_params = [(AdaGrad_SGD_accuracy[-1], AdaGrad_SGD_params),(AdaGrad_AdaGrad_accuracy[-1], AdaGrad_AdaGrad_params)]
# Ascending sort by accuracy; the last entry is the most accurate run.
params = sorted(AdaGrad_params, key=lambda x: x[0])[-1][1]
# Train a fresh model with plain AdaGrad using the selected step size.
model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)
optimizer = AdaGrad(alpha=params['alpha'])
mw = ModuleWrapper(model, optimizer=optimizer)
mw.initialize()
Hyperoptimized_AdaGrad_train_losses, Hyperoptimized_AdaGrad_val_losses, Hyperoptimized_AdaGrad_accuracy,_ = train(model, mw)

#### RMSProp

In [None]:
# Start from a fresh model, as the SGD/Adam/AdaGrad cells do; without this
# line the run would continue from whatever `model` the previous cell trained.
model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)
# NOTE(review): hard-coded step size, presumably copied from an earlier
# hyperoptimized run's printed output -- confirm.
optimizer = RMSProp(alpha= 0.0031602321243759945)
mw = ModuleWrapper(model, optimizer=optimizer)
mw.initialize()
Hyperoptimized_RMSProp_train_losses, Hyperoptimized_RMSProp_val_losses, Hyperoptimized_RMSProp_accuracy,_ = train(model, mw)

#### AdaBelief

In [None]:
# AdaBelief
# Start from a fresh model, as the SGD/Adam/AdaGrad cells do; without this
# line the run would continue from whatever `model` the previous cell trained.
model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)
# NOTE(review): hard-coded hyperparameters, presumably copied from an earlier
# hyperoptimized run's printed output -- confirm.
optimizer = AdaBelief(alpha= 0.008482803590595722,beta1= 0.8999614715576172,beta2= 0.9989999532699585)
mw = ModuleWrapper(model, optimizer=optimizer)
mw.initialize()
Hyperoptimized_AdaBelief_train_losses, Hyperoptimized_AdaBelief_val_losses, Hyperoptimized_AdaBelief_accuracy, _ = train(model, mw)

#### AMSGrad

In [None]:
# Start from a fresh model, as the SGD/Adam/AdaGrad cells do; without this
# line the run would continue from whatever `model` the previous cell trained.
model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)
# NOTE(review): hard-coded hyperparameters, presumably copied from an earlier
# hyperoptimized run's printed output -- confirm.
optimizer = AMSGrad(alpha= 0.001403641072101891,
beta1= 0.8986871838569641,
beta2= 0.9990024566650391)
mw = ModuleWrapper(model, optimizer=optimizer)
mw.initialize()
Hyperoptimized_AMSGrad_train_losses, Hyperoptimized_AMSGrad_val_losses, Hyperoptimized_AMSGrad_accuracy, _ = train(model, mw)

### Plots

#### SGD

In [None]:

# One (title, y-axis label) pair per subplot, top to bottom. Subplot k draws
# element k of every run tuple (element 0 is the legend label).
plot_panels = (('Training Loss', 'Loss'),
               ('Validation Loss', 'Loss'),
               ('Validation Accuracy', 'Accuracy'))

# Figure 1: plain SGD against every hyperoptimized SGD run.
# Each run is (legend label, train losses, validation losses, accuracies).
plot_runs = [
    ('SGD', SGD_train_losses, SGD_val_losses, SGD_accuracy),
    ('SGD with SGD hyperoptimizer', SGD_SGD_train_losses, SGD_SGD_val_losses, SGD_SGD_accuracy),
    ('SGD with Adam hyperoptimizer', SGD_Adam_train_losses, SGD_Adam_val_losses, SGD_Adam_accuracy),
    ('SGD with AdaGrad hyperoptimizer', SGD_AdaGrad_train_losses, SGD_AdaGrad_val_losses, SGD_AdaGrad_accuracy),
    # Label previously read 'SGD wih RMSProp hyperoptimizer' on the top panel.
    ('SGD with RMSProp hyperoptimizer', SGD_RMSProp_train_losses, SGD_RMSProp_val_losses, SGD_RMSProp_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# Figure 2: plain SGD against SGD run with the tuned hyperparameters.
plot_runs = [
    ('SGD', SGD_train_losses, SGD_val_losses, SGD_accuracy),
    ('SGD with tuned hyperparameters', Hyperoptimized_SGD_train_losses, Hyperoptimized_SGD_val_losses, Hyperoptimized_SGD_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

#### Adam

In [None]:
# Figure 1: plain Adam against every hyperoptimized Adam run.
# Each run is (legend label, train losses, validation losses, accuracies);
# subplot k draws element k of every run tuple.
plot_panels = (('Training Loss', 'Loss'),
               ('Validation Loss', 'Loss'),
               ('Accuracy', 'Accuracy'))
plot_runs = [
    ('Adam', Adam_train_losses, Adam_val_losses, Adam_accuracy),
    ('Adam with SGD hyperoptimizer', Adam_SGD_train_losses, Adam_SGD_val_losses, Adam_SGD_accuracy),
    ('Adam with SGD hyperoptimizer (learning-rate only)', AdamBaydin_SGD_train_losses, AdamBaydin_SGD_val_losses, AdamBaydin_SGD_accuracy),
    ('Adam with Adam hyperoptimizer', Adam_Adam_train_losses, Adam_Adam_val_losses, Adam_Adam_accuracy),
    ('Adam with Adam hyperoptimizer (learning-rate only)', AdamBaydin_Adam_train_losses, AdamBaydin_Adam_val_losses, AdamBaydin_Adam_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# Figure 2: plain Adam against Adam run with the tuned hyperparameters.
plot_panels = (('Training Loss', 'Loss'),
               ('Validation Loss', 'Loss'),
               ('Validation Accuracy', 'Accuracy'))
plot_runs = [
    ('Adam', Adam_train_losses, Adam_val_losses, Adam_accuracy),
    ('Adam with tuned hyperparameters', Hyperoptimized_Adam_train_losses, Hyperoptimized_Adam_val_losses, Hyperoptimized_Adam_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

#### AdaGrad

In [None]:
# One (title, y-axis label) pair per subplot, top to bottom. Subplot k draws
# element k of every run tuple (element 0 is the legend label).
# The accuracy panel of the first figure used to be titled 'Validation Loss'.
plot_panels = (('Training Loss', 'Loss'),
               ('Validation Loss', 'Loss'),
               ('Validation Accuracy', 'Accuracy'))

# Figure 1: plain AdaGrad against every hyperoptimized AdaGrad run.
# Each run is (legend label, train losses, validation losses, accuracies).
plot_runs = [
    ('AdaGrad', AdaGrad_train_losses, AdaGrad_val_losses, AdaGrad_accuracy),
    ('AdaGrad with SGD hyperoptimizer', AdaGrad_SGD_train_losses, AdaGrad_SGD_val_losses, AdaGrad_SGD_accuracy),
    ('AdaGrad with AdaGrad hyperoptimizer', AdaGrad_AdaGrad_train_losses, AdaGrad_AdaGrad_val_losses, AdaGrad_AdaGrad_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# Figure 2: plain AdaGrad against AdaGrad run with the tuned hyperparameters.
plot_runs = [
    ('AdaGrad', AdaGrad_train_losses, AdaGrad_val_losses, AdaGrad_accuracy),
    ('AdaGrad with tuned hyperparameters', Hyperoptimized_AdaGrad_train_losses, Hyperoptimized_AdaGrad_val_losses, Hyperoptimized_AdaGrad_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

#### RMSProp

In [None]:
# One (title, y-axis label) pair per subplot, top to bottom. Subplot k draws
# element k of every run tuple (element 0 is the legend label).
# The accuracy panel of the first figure used to carry the y-label 'Loss'.
plot_panels = (('Training Loss', 'Loss'),
               ('Validation Loss', 'Loss'),
               ('Validation Accuracy', 'Accuracy'))

# Figure 1: plain RMSProp against every hyperoptimized RMSProp run.
# Each run is (legend label, train losses, validation losses, accuracies).
plot_runs = [
    ('RMSProp', RMSProp_train_losses, RMSProp_val_losses, RMSProp_accuracy),
    ('RMSProp with SGD hyperoptimizer (learning rate only)', RMSPropAlpha_SGD_train_losses, RMSPropAlpha_SGD_val_losses, RMSPropAlpha_SGD_accuracy),
    ('RMSProp with SGD hyperoptimizer', RMSProp_SGD_train_losses, RMSProp_SGD_val_losses, RMSProp_SGD_accuracy),
    ('RMSProp with RMSProp hyperoptimizer (learning rate only)', RMSPropAlpha_RMSProp_train_losses, RMSPropAlpha_RMSProp_val_losses, RMSPropAlpha_RMSProp_accuracy),
    ('RMSProp with RMSProp hyperoptimizer', RMSProp_RMSProp_train_losses, RMSProp_RMSProp_val_losses, RMSProp_RMSProp_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# Figure 2: plain RMSProp against RMSProp run with the tuned hyperparameters.
plot_runs = [
    ('RMSProp', RMSProp_train_losses, RMSProp_val_losses, RMSProp_accuracy),
    ('RMSProp with tuned hyperparameters', Hyperoptimized_RMSProp_train_losses, Hyperoptimized_RMSProp_val_losses, Hyperoptimized_RMSProp_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

#### AdaBelief

In [None]:
# One (title, y-axis label) pair per subplot, top to bottom. Subplot k draws
# element k of every run tuple (element 0 is the legend label).
# The accuracy panel of the first figure used to carry the y-label 'Loss'.
plot_panels = (('Training Loss', 'Loss'),
               ('Validation Loss', 'Loss'),
               ('Validation Accuracy', 'Accuracy'))

# Figure 1: plain AdaBelief against every hyperoptimized AdaBelief run.
# Each run is (legend label, train losses, validation losses, accuracies).
plot_runs = [
    ('AdaBelief', AdaBelief_train_losses, AdaBelief_val_losses, AdaBelief_accuracy),
    ('AdaBelief with SGD hyperoptimizer', AdaBelief_SGD_train_losses, AdaBelief_SGD_val_losses, AdaBelief_SGD_accuracy),
    ('AdaBelief with Adam hyperoptimizer', AdaBelief_Adam_train_losses, AdaBelief_Adam_val_losses, AdaBelief_Adam_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# Figure 2: plain AdaBelief against AdaBelief run with the tuned hyperparameters.
plot_runs = [
    ('AdaBelief', AdaBelief_train_losses, AdaBelief_val_losses, AdaBelief_accuracy),
    ('AdaBelief with tuned hyperparameters', Hyperoptimized_AdaBelief_train_losses, Hyperoptimized_AdaBelief_val_losses, Hyperoptimized_AdaBelief_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

#### AMSGrad

In [None]:
# One (title, y-axis label) pair per subplot, top to bottom. Subplot k draws
# element k of every run tuple (element 0 is the legend label).
# The accuracy panel of the first figure used to carry the y-label 'Loss'.
plot_panels = (('Training Loss', 'Loss'),
               ('Validation Loss', 'Loss'),
               ('Validation Accuracy', 'Accuracy'))

# Figure 1: plain AMSGrad against every hyperoptimized AMSGrad run.
# Each run is (legend label, train losses, validation losses, accuracies).
plot_runs = [
    ('AMSGrad', AMSGrad_train_losses, AMSGrad_val_losses, AMSGrad_accuracy),
    ('AMSGrad with SGD hyperoptimizer', AMSGrad_SGD_train_losses, AMSGrad_SGD_val_losses, AMSGrad_SGD_accuracy),
    ('AMSGrad with Adam hyperoptimizer', AMSGrad_Adam_train_losses, AMSGrad_Adam_val_losses, AMSGrad_Adam_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# Figure 2: plain AMSGrad against AMSGrad run with the tuned hyperparameters.
plot_runs = [
    ('AMSGrad', AMSGrad_train_losses, AMSGrad_val_losses, AMSGrad_accuracy),
    ('AMSGrad with tuned hyperparameters', Hyperoptimized_AMSGrad_train_losses, Hyperoptimized_AMSGrad_val_losses, Hyperoptimized_AMSGrad_accuracy),
]
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))
for series_idx, (panel_ax, (panel_title, panel_ylabel)) in enumerate(zip((ax1, ax2, ax3), plot_panels), start=1):
    for plot_run in plot_runs:
        panel_ax.plot(plot_run[series_idx], label=plot_run[0])
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

## Part 2: Extension of the original paper.
In this part, we will construct a more sophisticated neural network, UNet, and apply the concept of hyperoptimization to implement hyperoptimized AdaBelief and AMSGrad. The performance of the hyperoptimizers and the classical optimizers on UNet will then be compared.

### CIFAR-10 Setup Code from CSC413 Assignment 2

#### import

In [None]:
######################################################################
# Setup working directory
######################################################################
%mkdir -p /content/csc413/a2/
%cd /content/csc413/a2

######################################################################
# Helper functions for loading data
######################################################################
# adapted from
# https://github.com/fchollet/keras/blob/master/keras/datasets/cifar10.py
HORSE_CATEGORY=7  # label value that process() keeps (the "horse" category)
import os
import pickle
import sys
import tarfile

import numpy as np
from PIL import Image
from six.moves.urllib.request import urlretrieve


def get_file(fname, origin, untar=False, extract=False, archive_format="auto", cache_dir="data"):
    """Download a file into a local cache directory unless it is already there.

    # Arguments
        fname: name of the file inside `cache_dir`; with `untar=True` this is
            the name of the extracted path and the archive is stored as
            `fname + ".tar.gz"`.
        origin: URL to download from when the file is not cached.
        untar: if True, extract the downloaded tar archive into `cache_dir`
            and return the extracted path.
        extract: if True (and `untar` is False), hand the file to
            `_extract_archive`.
        archive_format: forwarded to `_extract_archive`.
        cache_dir: directory used as the cache; created if missing.
    # Returns
        Path to the cached file, or to the extracted path when `untar=True`.
    # Raises
        Exception: if the download fails with an HTTP or URL error; any
            partially written file is removed first.
    """
    # Imported locally: the notebook's imports only bring in `urlretrieve`,
    # so these names were previously undefined when a download failed.
    from urllib.error import HTTPError, URLError

    datadir = os.path.join(cache_dir)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    if untar:
        untar_fpath = os.path.join(datadir, fname)
        fpath = untar_fpath + ".tar.gz"
    else:
        fpath = os.path.join(datadir, fname)

    print("File path: %s" % fpath)
    if not os.path.exists(fpath):
        print("Downloading data from", origin)

        error_msg = "URL fetch failure on {}: {} -- {}"
        try:
            try:
                urlretrieve(origin, fpath)
            # HTTPError is a subclass of URLError, so it must be tested first.
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg)) from e
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason)) from e
        except (Exception, KeyboardInterrupt):
            # Do not leave a truncated download behind: it would be mistaken
            # for a valid cached file on the next call.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise

    if untar:
        if not os.path.exists(untar_fpath):
            print("Extracting file.")
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; only use this with archives from a trusted origin.
            with tarfile.open(fpath) as archive:
                archive.extractall(datadir)
        return untar_fpath

    if extract:
        # NOTE(review): `_extract_archive` is not defined in the visible part
        # of this notebook -- confirm it exists before using extract=True.
        _extract_archive(fpath, datadir, archive_format)

    return fpath


def load_batch(fpath, label_key="labels"):
    """Internal utility for parsing CIFAR data.
    # Arguments
        fpath: path of the pickled batch file to parse.
        label_key: key for label data in the retrieved
            dictionary.
    # Returns
        A tuple `(data, labels)` where `data` has been reshaped to
        `(num_samples, 3, 32, 32)`.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this on batch files obtained from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(fpath, "rb") as f:
        if sys.version_info < (3,):
            d = pickle.load(f)
        else:
            raw = pickle.load(f, encoding="bytes")
            # Keys come back as bytes under Python 3; decode them to str.
            d = {k.decode("utf8"): v for k, v in raw.items()}
    data = d["data"]
    labels = d[label_key]

    data = data.reshape(data.shape[0], 3, 32, 32)
    return data, labels


def load_cifar10(transpose=False):
    """Fetch (if needed) and load the CIFAR10 dataset.
    # Arguments
        transpose: if True, move the channel axis last, i.e. apply
            `transpose(0, 2, 3, 1)` to both image arrays.
    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
        Label arrays are reshaped to a single column.
    """
    root = get_file(
        "cifar-10-batches-py",
        origin="http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz",
        untar=True,
    )

    per_batch = 10000
    x_train = np.zeros((50000, 3, 32, 32), dtype="uint8")
    y_train = np.zeros((50000,), dtype="uint8")

    # The training set is stored as five files, data_batch_1 .. data_batch_5.
    for batch_no in range(1, 6):
        lo = (batch_no - 1) * per_batch
        hi = batch_no * per_batch
        data, labels = load_batch(os.path.join(root, "data_batch_" + str(batch_no)))
        x_train[lo:hi, :, :, :] = data
        y_train[lo:hi] = labels

    x_test, y_test = load_batch(os.path.join(root, "test_batch"))

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    if transpose:
        channel_last = (0, 2, 3, 1)
        x_train = x_train.transpose(*channel_last)
        x_test = x_test.transpose(*channel_last)
    return (x_train, y_train), (x_test, y_test)

In [None]:
# Download cluster centers for k-means over colours.
# With untar=True, get_file returns the extracted path rather than the archive.
colours_fpath = get_file(
    fname="colours", origin="http://www.cs.toronto.edu/~jba/kmeans_colour_a2.tar.gz", untar=True
)
# Download CIFAR dataset; `m` holds ((x_train, y_train), (x_test, y_test)).
m = load_cifar10()

#### Data related code

In [None]:
def get_rgb_cat(xs, colours):
    """
    Get colour categories given RGB values. This function doesn't
    actually do the work, instead it splits the work into smaller
    chunks that can fit into memory, and calls helper function
    _get_rgb_cat

    Args:
      xs: float numpy array of RGB images in [B, C, H, W] format
      colours: numpy array of colour categories and their RGB values
    Returns:
      result: int numpy array of shape [B, 1, H, W]
    """
    if np.shape(xs)[0] < 100:
        # Small inputs are handled in one go. `colours` must be forwarded:
        # the helper takes two arguments and raised TypeError without it.
        return _get_rgb_cat(xs, colours)
    batch_size = 100
    chunks = [
        _get_rgb_cat(xs[start : start + batch_size, :, :, :], colours)
        for start in range(0, np.shape(xs)[0], batch_size)
    ]
    return np.concatenate(chunks, axis=0)


def _get_rgb_cat(xs, colours):
    """
    Get colour categories given RGB values. This is done by choosing
    the colour in `colours` that is the closest (in RGB space) to
    each point in the image `xs`. This function is a little memory
    intensive, and so the size of `xs` should not be too large.

    Args:
      xs: float numpy array of RGB images in [B, C, H, W] format
      colours: numpy array of colour categories and their RGB values
    Returns:
      result: int numpy array of shape [B, 1, H, W]
    """
    num_colours = np.shape(colours)[0]
    # Broadcast images [1, B, 3, H, W] against colours [K, 1, 3, 1, 1].
    xs = np.expand_dims(xs, 0)
    cs = np.reshape(colours, [num_colours, 1, 3, 1, 1])
    dists = np.linalg.norm(xs - cs, axis=2)  # 2 = colour axis
    # Index of the nearest colour per pixel, then restore a channel axis.
    cat = np.argmin(dists, axis=0)
    cat = np.expand_dims(cat, axis=1)
    return cat


def get_cat_rgb(cats, colours):
    """
    Look up the RGB colour of every colour category in `cats`.

    Args:
      cats: integer numpy array of colour categories
      colours: numpy array of colour categories and their RGB values
    Returns:
      numpy tensor of RGB colours, i.e. `colours` indexed by `cats`
    """
    rgb = colours[cats]
    return rgb


def process(xs, ys, max_pixel=256.0, downsize_input=False):
    """
    Prepare CIFAR10 images for training: scale the pixel values, keep only
    the horse category and shuffle the remaining images.

    Args:
      xs: the colour RGB pixel values
      ys: the category labels
      max_pixel: value the pixel values are divided by
      downsize_input: if True, the second return value is a down-sampled and
        re-upsampled copy of the colour images instead of a greyscale version
    Returns:
      xs: scaled and shuffled colour images
      grey: channel-averaged (greyscale) images, or the down/up-sampled
        colour images when downsize_input is True
    """
    scaled = xs / max_pixel
    horse_rows = np.where(ys == HORSE_CATEGORY)[0]
    horses = scaled[horse_rows, :, :, :]
    npr.shuffle(horses)  # in-place shuffle along the first axis

    if not downsize_input:
        return (horses, np.mean(horses, axis=1, keepdims=True))

    # Two 2x average pools followed by two 2x upsamplings.
    shrink_then_grow = nn.Sequential(
        nn.AvgPool2d(2),
        nn.AvgPool2d(2),
        nn.Upsample(scale_factor=2),
        nn.Upsample(scale_factor=2),
    )
    coarse = shrink_then_grow.forward(torch.from_numpy(horses).float())
    return (horses, coarse.data.numpy())


def get_batch(x, y, batch_size):
    """
    Generator that walks over `x` and `y` in lockstep, one batch at a time.

    Args:
      x: input values
      y: output values
      batch_size: size of each batch
    Yields:
      batch_x: a batch of inputs of size at most batch_size
      batch_y: a batch of outputs of size at most batch_size
    """
    total = np.shape(x)[0]
    assert total == np.shape(y)[0]
    for lo in range(0, total, batch_size):
        window = slice(lo, lo + batch_size)
        yield (x[window, :, :, :], y[window, :, :, :])

#### Torch helper

In [None]:
def get_torch_vars(xs, ys, gpu=False):
    """
    Turn a pair of numpy arrays into pytorch tensors, optionally on the GPU.

    Args:
      xs (float numpy tensor): greyscale input, converted to a float tensor
      ys (int numpy tensor): categorical labels, converted to a long tensor
      gpu (bool): whether to move pytorch tensor to GPU
    Returns:
      Variable(xs), Variable(ys)
    """
    inputs = torch.from_numpy(xs).float()
    targets = torch.from_numpy(ys).long()
    if gpu:
        inputs, targets = inputs.cuda(), targets.cuda()
    return Variable(inputs), Variable(targets)


def compute_loss(criterion, outputs, labels, batch_size, num_colours):
    """
    Helper function to compute the loss. Since this is a pixelwise
    prediction task we need to reshape the output and ground truth
    tensors into a 2D tensor before passing it in to the loss criterion.

    Args:
      criterion: pytorch loss criterion
      outputs (pytorch tensor): predicted labels from the model; axis 1 must
        have num_colours entries
      labels (pytorch tensor): ground truth labels; axis 1 must have one entry
      batch_size (int): batch size used for training (unused; the number of
        rows is taken from the tensors themselves)
      num_colours (int): number of colour categories
    Returns:
      pytorch tensor for loss
    """
    # Move the class axis last, then flatten every pixel into one row. Using
    # -1 lets the image size be anything rather than a fixed 32x32.
    loss_out = outputs.transpose(1, 3).contiguous().view([-1, num_colours])
    loss_lab = labels.transpose(1, 3).contiguous().view([-1])
    return criterion(loss_out, loss_lab)


def run_validation_step(
    cnn,
    criterion,
    test_grey,
    test_rgb_cat,
    batch_size,
    colours,
    plotpath=None,
    visualize=True,
    downsize_input=False
):
    """
    Evaluate `cnn` on the validation data, batch by batch.

    Args:
      cnn: model called on each batch of inputs
      criterion: pytorch loss criterion passed to compute_loss
      test_grey: validation inputs
      test_rgb_cat: validation colour-category labels
      batch_size: number of examples per batch
      colours: numpy array of colour categories and their RGB values
      plotpath: if truthy, the last batch is passed to plot()
      visualize: forwarded to plot()
      downsize_input: forwarded to plot() as compare_bilinear
    Returns:
      val_loss: mean of the per-batch losses
      val_acc: percentage of pixels whose predicted category matches the label

    NOTE(review): this function reads a global `args` (args.gpu and
    args.batch_size) that is not one of its parameters -- confirm it is
    defined before this is called.
    """
    correct = 0.0
    total = 0.0
    losses = []
    num_colours = np.shape(colours)[0]
    for i, (xs, ys) in enumerate(get_batch(test_grey, test_rgb_cat, batch_size)):
        images, labels = get_torch_vars(xs, ys, args.gpu)
        outputs = cnn(images)

        val_loss = compute_loss(
            criterion, outputs, labels, batch_size=args.batch_size, num_colours=num_colours
        )
        losses.append(val_loss.data.item())

        # Predicted category = index of the largest output along axis 1.
        _, predicted = torch.max(outputs.data, 1, keepdim=True)
        # Every pixel counts as one prediction (32 x 32 pixels per image).
        total += labels.size(0) * 32 * 32
        # `.sum()` returns a tensor, so `correct` (and val_acc below) is
        # presumably a 0-d tensor rather than a Python float -- TODO confirm.
        correct += (predicted == labels.data).sum()

    # xs, ys and predicted still hold the values from the LAST batch here.
    if plotpath:  # only plot if a path is provided
        plot(
            xs,
            ys,
            predicted.cpu().numpy(),
            colours,
            plotpath,
            visualize=visualize,
            compare_bilinear=downsize_input,
        )

    val_loss = np.mean(losses)
    val_acc = 100 * correct / total
    return val_loss, val_acc

#### Visualization

In [None]:
def plot(input, gtlabel, output, colours, path, visualize=True, compare_bilinear=False):
    """
    Show the first ten inputs, ground truths and predictions as stacked
    image rows in a single figure.

    Args:
      input: the greyscale input to the colourization CNN
      gtlabel: the ground truth categories for each pixel
      output: the predicted categories for each pixel
      colours: numpy array of colour categories and their RGB values
      path: output path (currently unused: the figure is always shown)
      visualize: currently unused
      compare_bilinear: if True, the input row is shown as-is (not tiled to
        three channels) and a fourth row is added with the ground truth
        passed through two average pools and two bilinear upsamplings
    """
    inputs_hwc = np.transpose(input[:10, :, :, :], [0, 2, 3, 1])
    truth_rgb = get_cat_rgb(gtlabel[:10, 0, :, :], colours)
    pred_rgb = get_cat_rgb(output[:10, 0, :, :], colours)

    if compare_bilinear:
        blur = nn.Sequential(
            nn.AvgPool2d(2),
            nn.AvgPool2d(2),
            nn.Upsample(scale_factor=2, mode="bilinear"),
            nn.Upsample(scale_factor=2, mode="bilinear"),
        )
        # Channels-first for the torch modules, channels-last again after.
        truth_chw = np.transpose(truth_rgb, [0, 3, 1, 2])
        blurred = blur.forward(torch.from_numpy(truth_chw).float())
        bilinear_rgb = np.transpose(blurred.data.numpy(), [0, 2, 3, 1])
        rows = [
            np.hstack(inputs_hwc),
            np.hstack(truth_rgb),
            np.hstack(pred_rgb),
            np.hstack(bilinear_rgb),
        ]
    else:
        # Repeat the last axis three times so the input row has RGB depth.
        rows = [
            np.hstack(np.tile(inputs_hwc, [1, 1, 1, 3])),
            np.hstack(truth_rgb),
            np.hstack(pred_rgb),
        ]
    canvas = np.vstack(rows)

    plt.grid(None)
    plt.imshow(canvas, vmin=0.0, vmax=1.0)
    plt.show()


def toimage(img, cmin, cmax):
    """Clip ``img`` to ``[cmin, cmax]``, scale by 255 and return a PIL image."""
    # NOTE(review): multiplying by 255 only spans the uint8 range when the
    # clipped values lie in [0, 1] -- confirm callers pass cmin=0, cmax=1.
    clipped = img.clip(cmin, cmax)
    pixels = (clipped * 255).astype(np.uint8)
    return Image.fromarray(pixels)


def plot_activation(args, cnn):
    """
    Run ``cnn`` on a single CIFAR-10 test image and colour its prediction.

    Writing the input/output images and the per-layer activation grids to
    disk is currently disabled (those calls are commented out), so nothing is
    returned or saved; the only lasting side effect is ``cnn.cpu()``.

    Args:
      args: settings object; ``args.colours``, ``args.downsize_input`` and
        ``args.index`` (index of the test image) are read
      cnn: colourization network to run
    """
    # LOAD THE COLOURS CATEGORIES
    # encoding="bytes" mirrors how train_UNet loads the same file.
    colours = np.load(args.colours, allow_pickle=True, encoding="bytes")[0]

    (x_train, y_train), (x_test, y_test) = load_cifar10()
    test_rgb, test_grey = process(x_test, y_test, downsize_input=args.downsize_input)
    test_rgb_cat = get_rgb_cat(test_rgb, colours)

    # Index of the test image (named `index` so the builtin `id` is not shadowed)
    index = args.index
    # outdir = "outputs/" + args.experiment_name + "/act" + str(index)
    # if not os.path.exists(outdir):
    #     os.makedirs(outdir)
    images, labels = get_torch_vars(
        np.expand_dims(test_grey[index], 0), np.expand_dims(test_rgb_cat[index], 0)
    )
    cnn.cpu()
    outputs = cnn(images)
    _, predicted = torch.max(outputs.data, 1, keepdim=True)
    predcolor = get_cat_rgb(predicted.cpu().numpy()[0, 0, :, :], colours)

    # Image export is disabled. To re-enable it, create `outdir` and save:
    #   prediction:
    #     toimage(predcolor, cmin=0, cmax=1).save(os.path.join(outdir, "output_%d.png" % index))
    #   input (tile the last axis 3x unless args.downsize_input is set):
    #     img = np.transpose(test_grey[index], [1, 2, 0])
    #     toimage(img, cmin=0, cmax=1).save(os.path.join(outdir, "input_%d.png" % index))
    #   ground truth:
    #     img = np.transpose(test_rgb[index], [1, 2, 0])
    #     toimage(img, cmin=0, cmax=1).save(os.path.join(outdir, "input_%d_gt.png" % index))

    def add_border(img):
        """Pad ``img`` by one pixel of value 1.0 on every side."""
        return np.pad(img, 1, "constant", constant_values=1.0)

    def draw_activations(path, activation, imgwidth=4):
        """Save the feature maps in ``activation`` as a bordered grid image.

        Maps are laid out ``imgwidth`` per row; maps beyond the last full row
        are not drawn.
        """
        grid_rows = [
            np.hstack(
                [
                    add_border(fmap)
                    for fmap in activation[row * imgwidth : (row + 1) * imgwidth, :, :]
                ]
            )
            for row in range(activation.shape[0] // imgwidth)
        ]
        img = np.vstack(grid_rows)
        # scipy.misc.imsave was removed in SciPy 1.2; matplotlib's imsave with
        # a grey colormap writes the same kind of min/max-normalised image.
        plt.imsave(path, img, cmap="gray")

    # for i, tensor in enumerate([cnn.out1, cnn.out2, cnn.out3, cnn.out4, cnn.out5]):
    #     draw_activations(
    #         os.path.join(outdir, "conv%d_out_%d.png" % (i + 1, index)), tensor.data.cpu().numpy()[0]
    #     )
    # print("visualization results are saved to %s" % outdir)

### UNet & Training Function (From Assignment 2)

#### UNet 

In [None]:
class UNet(nn.Module):
    """
    Colourization network with two strided-conv stages, two transposed-conv
    stages and a final convolution, joined by skip connections.

    The output of ``first`` is concatenated with the output of ``third``
    before ``fourth``, and the network input is concatenated with the output
    of ``fourth`` before ``fifth``.
    """

    def __init__(self, kernel, num_filters, num_colours, num_in_channels):
        super().__init__()

        # Geometry shared by every strided stage.
        stride = 2
        padding = kernel // 2
        output_padding = 1

        def down_stage(c_in, c_out):
            # Strided convolution -> batch norm -> ReLU.
            return nn.Sequential(
                nn.Conv2d(c_in, c_out, kernel, stride=stride, padding=padding),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
            )

        def up_stage(c_in, c_out):
            # Strided transposed convolution -> batch norm -> ReLU.
            return nn.Sequential(
                nn.ConvTranspose2d(
                    c_in,
                    c_out,
                    kernel_size=kernel,
                    stride=stride,
                    dilation=1,
                    padding=padding,
                    output_padding=output_padding,
                ),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
            )

        doubled = 2 * num_filters
        self.first = down_stage(num_in_channels, num_filters)
        self.second = down_stage(num_filters, doubled)
        self.third = up_stage(doubled, num_filters)
        # `fourth` receives `first` and `third` concatenated on the channel axis.
        self.fourth = up_stage(doubled, num_colours)
        # `fifth` receives the raw input concatenated with `fourth`.
        self.fifth = nn.Conv2d(
            num_colours + num_in_channels, num_colours, kernel, padding=padding
        )

    def initialize(self):
        """Re-draw every (transposed) convolution weight with Kaiming-uniform."""
        conv_layers = (
            self.first[0],
            self.second[0],
            self.third[0],
            self.fourth[0],
            self.fifth,
        )
        for layer in conv_layers:
            nn.init.kaiming_uniform_(layer.weight, a=math.sqrt(5))

    def forward(self, x):
        """Return the output of the final convolution for input ``x``."""
        enc1 = self.first(x)
        enc2 = self.second(enc1)
        dec1 = self.third(enc2)
        dec2 = self.fourth(torch.cat([enc1, dec1], dim=1))
        return self.fifth(torch.cat([x, dec2], dim=1))

#### Training Function

In [None]:
class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Using the mapping itself as the instance dict makes ``d.key`` and
        # ``d["key"]`` refer to the same storage.
        self.__dict__ = self

def _current_hyperparameters(optimizer):
    """Return ``optimizer.parameters`` as a ``{name: Python number}`` dict."""
    kind = type(optimizer)
    # Exact type matches (not isinstance): only the classes listed here get
    # the special handling below.
    uses_betas = kind in (Adam, AdamBaydin, AMSGrad, AdaBelief)
    is_rmsprop = kind in (RMSProp, RMSPropAlpha)

    values = {}
    for name in optimizer.parameters:
        raw = optimizer.parameters[name]
        if (uses_betas and name in ('beta1', 'beta2')) or (is_rmsprop and name == 'gamma'):
            # Reported after passing through Adam.clamp.
            values[name] = Adam.clamp(raw).item()
        elif is_rmsprop and name == 'Alpha':
            # NOTE(review): the key is spelled 'Alpha' (capital A) -- confirm
            # it matches the name the RMSProp classes register.
            values[name] = raw.item() ** 2
        else:
            values[name] = raw.item()
    return values


def train_UNet(args, optimizer, cnn=None):
    """
    Train a colourization network with a (hyper)optimizer and validate it
    after every epoch.

    Args:
      args: settings object; reads ``seed``, ``experiment_name``, ``colours``,
        ``downsize_input``, ``model``, ``kernel``, ``num_filters``, ``gpu``,
        ``epochs``, ``batch_size`` and ``visualize``
      optimizer: optimizer instance handed to ``ModuleWrapper``
      cnn: optional existing network; when None, the class named by
        ``args.model`` is looked up in ``globals()`` and instantiated

    Returns:
      ``(cnn, train_losses, valid_losses, valid_accs, final_parameters)``
      where the three lists hold one entry per epoch and
      ``final_parameters`` maps hyperparameter names to their values after
      the last epoch (an empty dict when ``args.epochs`` is 0).
    """
    torch.set_num_threads(5)
    # Numpy random seed
    npr.seed(args.seed)
    save_dir = "outputs/" + args.experiment_name
    # LOAD THE COLOURS CATEGORIES
    colours = np.load(args.colours, allow_pickle=True, encoding="bytes")[0]
    num_colours = np.shape(colours)[0]
    # INPUT CHANNEL
    num_in_channels = 1 if not args.downsize_input else 3
    # LOAD THE MODEL
    if cnn is None:
        Net = globals()[args.model]
        cnn = Net(args.kernel, args.num_filters, num_colours, num_in_channels)
    cnn.initialize()
    # LOSS FUNCTION
    criterion = nn.CrossEntropyLoss()
    # DATA
    print("Loading data...")
    (x_train, y_train), (x_test, y_test) = load_cifar10()

    print("Transforming data...")
    train_rgb, train_grey = process(x_train, y_train, downsize_input=args.downsize_input)
    train_rgb_cat = get_rgb_cat(train_rgb, colours)
    test_rgb, test_grey = process(x_test, y_test, downsize_input=args.downsize_input)
    test_rgb_cat = get_rgb_cat(test_rgb, colours)

    print("Beginning training ...")
    if args.gpu:
        cnn.cuda()

    train_losses = []
    valid_losses = []
    valid_accs = []
    total_time = 0
    # Bound before the loop so the return below is valid even with 0 epochs.
    final_parameters = {}

    mw = ModuleWrapper(cnn, optimizer=optimizer)
    mw.initialize()
    for epoch in range(args.epochs):
        # Train the Model
        start_time = time.time()
        cnn.train()  # Change model to 'train' mode

        losses = []
        for xs, ys in get_batch(train_grey, train_rgb_cat, args.batch_size):
            mw.begin()
            images, labels = get_torch_vars(xs, ys, args.gpu)
            pred = mw.forward(images)
            loss = compute_loss(
                criterion, pred, labels, batch_size=args.batch_size, num_colours=num_colours
            )
            mw.zero_grad()
            # create_graph=True keeps the graph of the backward pass itself.
            loss.backward(create_graph=True)
            mw.step()
            losses.append(loss.item())
        time_elapsed = time.time() - start_time
        avg_loss = np.mean(losses)
        train_losses.append(avg_loss)

        print(
            "Epoch [%d/%d], Loss: %.4f, Time (s): %.2f"
            % (epoch + 1, args.epochs, avg_loss, time_elapsed)
        )
        total_time += time_elapsed

        # Evaluate the model
        cnn.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
        val_loss, val_acc = run_validation_step(
            cnn,
            criterion,
            test_grey,
            test_rgb_cat,
            args.batch_size,
            colours,
            save_dir + "/test_%d.png" % epoch,
            args.visualize,
            args.downsize_input,
        )

        valid_losses.append(val_loss)
        valid_accs.append(val_acc.item())
        print(
            "Epoch [%d/%d], Val Loss: %.4f, Val Acc: %.1f%%"
            % (epoch + 1, args.epochs, val_loss, val_acc)
        )

        # Report the hyperparameter values reached after this epoch.
        final_parameters = _current_hyperparameters(mw.optimizer)
        print(final_parameters)
    print('Total training time: {:.2f} seconds'.format(total_time))

    return cnn, train_losses, valid_losses, valid_accs, final_parameters

#### Argument Setup

In [None]:
# Settings passed as `args` to the training calls below.
args_dict = dict(
    gpu=True,
    valid=False,
    checkpoint="",
    colours="./data/colours/colour_kmeans24_cat7.npy",
    model="UNet",
    kernel=3,
    num_filters=32,
    learn_rate=0.001,
    batch_size=100,
    epochs=50,
    seed=0,
    plot=True,
    experiment_name="colourization_cnn",
    visualize=True,
    downsize_input=False,
)
args = AttrDict(args_dict)




### Train by classical and hyperoptimized optimizers


#### SGD

In [None]:
# AdaBelief with its default constructor arguments; the whole result tuple
# is kept in `s`.
s = train_UNet(
    args,
    optimizer=AdaBelief(),
)

In [None]:
# AdaBeliefBaydin with its default constructor arguments; the whole result
# tuple is kept in `s`.
s = train_UNet(
    args,
    optimizer=AdaBeliefBaydin(),
)

In [None]:
# Baseline: SGD with its default constructor arguments.
(
    cnn,
    SGD_train_losses,
    SGD_valid_losses,
    SGD_valid_accs,
    SGD_params,
) = train_UNet(args, optimizer=SGD())

In [None]:
# SGD with a nested SGD optimizer. The final hyperparameters get their own
# name so they do not overwrite `SGD_params` from the plain-SGD run.
(
    cnn,
    SGD_SGD_train_losses,
    SGD_SGD_valid_losses,
    SGD_SGD_valid_accs,
    SGD_SGD_params,
) = train_UNet(args, optimizer=SGD(optimizer=SGD()))

In [None]:
# SGD with a nested Adam optimizer (constructed with 1e-5).
(
    cnn,
    SGD_Adam_train_losses,
    SGD_Adam_valid_losses,
    SGD_Adam_valid_accs,
    SGD_Adam_params,
) = train_UNet(
    args,
    optimizer=SGD(optimizer=Adam(1e-5)),
)

In [None]:
# SGD with a nested AMSGrad optimizer (constructed with 1e-5).
(
    cnn,
    SGD_AMSGrad_train_losses,
    SGD_AMSGrad_valid_losses,
    SGD_AMSGrad_valid_accs,
    SGD_AMSGrad_params,
) = train_UNet(
    args,
    optimizer=SGD(optimizer=AMSGrad(1e-5)),
)

#### Adam

In [None]:
# Baseline: Adam with alpha=0.01 and no nested optimizer.
(
    cnn,
    Adam_train_losses,
    Adam_valid_losses,
    Adam_valid_accs,
    Adam_params,
) = train_UNet(args, optimizer=Adam(alpha=0.01))

In [None]:
# Adam (alpha=0.01) with a nested Adam optimizer (alpha=1e-5). The final
# hyperparameters get their own name so they do not overwrite `Adam_params`
# from the plain-Adam run.
(
    cnn,
    Adam_Adam_train_losses,
    Adam_Adam_valid_losses,
    Adam_Adam_valid_accs,
    Adam_Adam_params,
) = train_UNet(
    args,
    optimizer=Adam(alpha=0.01, optimizer=Adam(alpha=1e-5)),
)

In [None]:
# Adam (alpha=0.01) with a nested AMSGrad optimizer (alpha=1e-5).
(
    cnn,
    Adam_AMSGrad_train_losses,
    Adam_AMSGrad_valid_losses,
    Adam_AMSGrad_valid_accs,
    Adam_AMSGrad_params,
) = train_UNet(
    args,
    optimizer=Adam(alpha=0.01, optimizer=AMSGrad(alpha=1e-5)),
)

#### AdaBelief

In [None]:
# AdaBelief (alpha=0.01) with a nested SGD optimizer (constructed with 1e-6).
(
    cnn,
    AdaBelief_SGD_train_losses,
    AdaBelief_SGD_valid_losses,
    AdaBelief_SGD_valid_accs,
    AdaBelief_SGD_params,
) = train_UNet(
    args,
    optimizer=AdaBelief(alpha=0.01, optimizer=SGD(1e-6)),
)

In [None]:
# Baseline: AdaBelief with alpha=0.01 and no nested optimizer.
(
    cnn,
    AdaBelief_train_losses,
    AdaBelief_valid_losses,
    AdaBelief_valid_accs,
    AdaBelief_params,
) = train_UNet(args, optimizer=AdaBelief(alpha=0.01))

In [None]:
# AdaBelief (alpha=0.01) with a nested Adam optimizer (alpha=1e-6).
(
    cnn,
    AdaBelief_Adam_train_losses,
    AdaBelief_Adam_valid_losses,
    AdaBelief_Adam_valid_accs,
    AdaBelief_Adam_params,
) = train_UNet(
    args,
    optimizer=AdaBelief(alpha=0.01, optimizer=Adam(alpha=1e-6)),
)

In [None]:
# AdaBelief (constructed with 0.01) with a nested AdaBelief optimizer
# (alpha=1e-6).
(
    cnn,
    AdaBelief_AdaBelief_train_losses,
    AdaBelief_AdaBelief_valid_losses,
    AdaBelief_AdaBelief_valid_accs,
    AdaBelief_AdaBelief_params,
) = train_UNet(
    args,
    optimizer=AdaBelief(0.01, optimizer=AdaBelief(alpha=1e-6)),
)

In [None]:
# AdaBeliefBaydin (constructed with 0.01) with a nested AdaBelief optimizer
# (alpha=1e-6).
(
    cnn,
    AdaBeliefBaydin_AdaBelief_train_losses,
    AdaBeliefBaydin_AdaBelief_valid_losses,
    AdaBeliefBaydin_AdaBelief_valid_accs,
    AdaBeliefBaydin_AdaBelief_params,
) = train_UNet(
    args,
    optimizer=AdaBeliefBaydin(0.01, optimizer=AdaBelief(alpha=1e-6)),
)

In [None]:
# AdaBeliefBaydin (constructed with 0.01) with a nested Adam optimizer
# (alpha=1e-6). The validation-loss name must be spelled
# `AdaBeliefBaydin_Adam_valid_losses`: the AdaBelief and overall plots read it.
(
    cnn,
    AdaBeliefBaydin_Adam_train_losses,
    AdaBeliefBaydin_Adam_valid_losses,
    AdaBeliefBaydin_Adam_valid_accs,
    AdaBeliefBaydin_Adam_params,
) = train_UNet(
    args,
    optimizer=AdaBeliefBaydin(0.01, optimizer=Adam(alpha=1e-6)),
)

#### AMSGrad

In [None]:
# AMSGrad (alpha=0.02) with a nested SGD optimizer (constructed with 1e-5).
(
    cnn,
    AMSGrad_SGD_train_losses,
    AMSGrad_SGD_valid_losses,
    AMSGrad_SGD_valid_accs,
    AMSGrad_SGD_params,
) = train_UNet(
    args,
    optimizer=AMSGrad(alpha=0.02, optimizer=SGD(1e-5)),
)

In [None]:
# Baseline: AMSGrad with alpha=0.02 and no nested optimizer.
(
    cnn,
    AMSGrad_train_losses,
    AMSGrad_valid_losses,
    AMSGrad_valid_accs,
    AMSGrad_params,
) = train_UNet(args, optimizer=AMSGrad(alpha=0.02))

In [None]:
# AMSGrad (constructed with 0.02) with a nested Adam optimizer (alpha=1e-4).
(
    cnn,
    AMSGrad_Adam_train_losses,
    AMSGrad_Adam_valid_losses,
    AMSGrad_Adam_valid_accs,
    AMSGrad_Adam_params,
) = train_UNet(
    args,
    optimizer=AMSGrad(0.02, optimizer=Adam(alpha=1e-4)),
)


In [None]:
# AMSGrad (constructed with 0.02) with a nested AMSGrad optimizer
# (alpha=1e-5).
(
    cnn,
    AMSGrad_AMSGrad_train_losses,
    AMSGrad_AMSGrad_valid_losses,
    AMSGrad_AMSGrad_valid_accs,
    AMSGrad_AMSGrad_params,
) = train_UNet(
    args,
    optimizer=AMSGrad(0.02, optimizer=AMSGrad(alpha=1e-5)),
)

In [None]:
# AMSGradBaydin (constructed with 0.02) with a nested Adam optimizer
# (constructed with 1e-5).
(
    cnn,
    AMSGradBaydin_Adam_train_losses,
    AMSGradBaydin_Adam_valid_losses,
    AMSGradBaydin_Adam_valid_accs,
    AMSGradBaydin_Adam_params,
) = train_UNet(
    args,
    optimizer=AMSGradBaydin(0.02, optimizer=Adam(1e-5)),
)

In [None]:
# AMSGradBaydin (constructed with 0.02) with a nested AMSGrad optimizer
# (constructed with 1e-5).
(
    cnn,
    AMSGradBaydin_AMSGrad_train_losses,
    AMSGradBaydin_AMSGrad_valid_losses,
    AMSGradBaydin_AMSGrad_valid_accs,
    AMSGradBaydin_AMSGrad_params,
) = train_UNet(
    args,
    optimizer=AMSGradBaydin(0.02, optimizer=AMSGrad(1e-5)),
)

### Plot

#### SGD

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies). Keeping the label next to its series means a run
# cannot end up with different legend text in different panels.
_runs = [
    ('SGD', SGD_train_losses, SGD_valid_losses, SGD_valid_accs),
    ('SGD with Adam hyperoptimizer',
     SGD_Adam_train_losses, SGD_Adam_valid_losses, SGD_Adam_valid_accs),
    ('SGD with AMSGrad hyperoptimizer',
     SGD_AMSGrad_train_losses, SGD_AMSGrad_valid_losses, SGD_AMSGrad_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

#### Adam

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('Adam', Adam_train_losses, Adam_valid_losses, Adam_valid_accs),
    ('Adam with Adam hyperoptimizer',
     Adam_Adam_train_losses, Adam_Adam_valid_losses, Adam_Adam_valid_accs),
    ('Adam with AMSGrad hyperoptimizer',
     Adam_AMSGrad_train_losses, Adam_AMSGrad_valid_losses, Adam_AMSGrad_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()


#### AdaBelief

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('AdaBelief', AdaBelief_train_losses, AdaBelief_valid_losses, AdaBelief_valid_accs),
    ('AdaBelief with AdaBelief hyperoptimizer (learning rate only)',
     AdaBeliefBaydin_AdaBelief_train_losses,
     AdaBeliefBaydin_AdaBelief_valid_losses,
     AdaBeliefBaydin_AdaBelief_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()



In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('AdaBelief', AdaBelief_train_losses, AdaBelief_valid_losses, AdaBelief_valid_accs),
    ('AdaBelief with Adam hyperoptimizer',
     AdaBelief_Adam_train_losses, AdaBelief_Adam_valid_losses, AdaBelief_Adam_valid_accs),
    ('AdaBelief with AdaBelief hyperoptimizer',
     AdaBelief_AdaBelief_train_losses,
     AdaBelief_AdaBelief_valid_losses,
     AdaBelief_AdaBelief_valid_accs),
    ('AdaBelief with Adam hyperoptimizer (learning rate only)',
     AdaBeliefBaydin_Adam_train_losses,
     AdaBeliefBaydin_Adam_valid_losses,
     AdaBeliefBaydin_Adam_valid_accs),
    ('AdaBelief with AdaBelief hyperoptimizer (learning rate only)',
     AdaBeliefBaydin_AdaBelief_train_losses,
     AdaBeliefBaydin_AdaBelief_valid_losses,
     AdaBeliefBaydin_AdaBelief_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()


#### AMSGrad

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('AMSGrad', AMSGrad_train_losses, AMSGrad_valid_losses, AMSGrad_valid_accs),
    ('AMSGrad with Adam hyperoptimizer',
     AMSGrad_Adam_train_losses, AMSGrad_Adam_valid_losses, AMSGrad_Adam_valid_accs),
    ('AMSGrad with AMSGrad hyperoptimizer',
     AMSGrad_AMSGrad_train_losses,
     AMSGrad_AMSGrad_valid_losses,
     AMSGrad_AMSGrad_valid_accs),
    ('AMSGrad with Adam hyperoptimizer (learning rate only)',
     AMSGradBaydin_Adam_train_losses,
     AMSGradBaydin_Adam_valid_losses,
     AMSGradBaydin_Adam_valid_accs),
    ('AMSGrad with AMSGrad hyperoptimizer (learning rate only)',
     AMSGradBaydin_AMSGrad_train_losses,
     AMSGradBaydin_AMSGrad_valid_losses,
     AMSGradBaydin_AMSGrad_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('AMSGrad', AMSGrad_train_losses, AMSGrad_valid_losses, AMSGrad_valid_accs),
    ('AMSGrad with AMSGrad hyperoptimizer',
     AMSGrad_AMSGrad_train_losses,
     AMSGrad_AMSGrad_valid_losses,
     AMSGrad_AMSGrad_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

#### Overall Plot

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 17))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies). AMSGrad-based runs first, then AdaBelief-based.
_runs = [
    ('AMSGrad', AMSGrad_train_losses, AMSGrad_valid_losses, AMSGrad_valid_accs),
    ('AMSGrad with Adam hyperoptimizer',
     AMSGrad_Adam_train_losses, AMSGrad_Adam_valid_losses, AMSGrad_Adam_valid_accs),
    ('AMSGrad with AMSGrad hyperoptimizer',
     AMSGrad_AMSGrad_train_losses,
     AMSGrad_AMSGrad_valid_losses,
     AMSGrad_AMSGrad_valid_accs),
    ('AMSGrad with Adam hyperoptimizer (learning rate only)',
     AMSGradBaydin_Adam_train_losses,
     AMSGradBaydin_Adam_valid_losses,
     AMSGradBaydin_Adam_valid_accs),
    ('AMSGrad with AMSGrad hyperoptimizer (learning rate only)',
     AMSGradBaydin_AMSGrad_train_losses,
     AMSGradBaydin_AMSGrad_valid_losses,
     AMSGradBaydin_AMSGrad_valid_accs),
    ('AdaBelief', AdaBelief_train_losses, AdaBelief_valid_losses, AdaBelief_valid_accs),
    ('AdaBelief with Adam hyperoptimizer',
     AdaBelief_Adam_train_losses, AdaBelief_Adam_valid_losses, AdaBelief_Adam_valid_accs),
    ('AdaBelief with AdaBelief hyperoptimizer',
     AdaBelief_AdaBelief_train_losses,
     AdaBelief_AdaBelief_valid_losses,
     AdaBelief_AdaBelief_valid_accs),
    ('AdaBelief with Adam hyperoptimizer (learning rate only)',
     AdaBeliefBaydin_Adam_train_losses,
     AdaBeliefBaydin_Adam_valid_losses,
     AdaBeliefBaydin_Adam_valid_accs),
    ('AdaBelief with AdaBelief hyperoptimizer (learning rate only)',
     AdaBeliefBaydin_AdaBelief_train_losses,
     AdaBeliefBaydin_AdaBelief_valid_losses,
     AdaBeliefBaydin_AdaBelief_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

#### Overall Plot with only best hyperoptimizers

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 17))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies). Each optimizer contributes its plain baseline and
# one hyperoptimized variant, and every label names the series it is drawn
# from:
#   * 'Adam' draws the plain-Adam run (Adam_*), not the Adam_Adam_* run;
#   * the Adam_AMSGrad_* run uses the full `Adam` class, so its label carries
#     no "(learning rate only)" suffix;
#   * the plain AMSGrad run is labelled 'AMSGrad'.
_runs = [
    ('SGD', SGD_train_losses, SGD_valid_losses, SGD_valid_accs),
    ('SGD with Adam hyperoptimizer',
     SGD_Adam_train_losses, SGD_Adam_valid_losses, SGD_Adam_valid_accs),
    ('Adam with AMSGrad hyperoptimizer',
     Adam_AMSGrad_train_losses, Adam_AMSGrad_valid_losses, Adam_AMSGrad_valid_accs),
    ('Adam', Adam_train_losses, Adam_valid_losses, Adam_valid_accs),
    ('AdaBelief', AdaBelief_train_losses, AdaBelief_valid_losses, AdaBelief_valid_accs),
    ('AdaBelief with AdaBelief hyperoptimizer (learning rate only)',
     AdaBeliefBaydin_AdaBelief_train_losses,
     AdaBeliefBaydin_AdaBelief_valid_losses,
     AdaBeliefBaydin_AdaBelief_valid_accs),
    ('AMSGrad', AMSGrad_train_losses, AMSGrad_valid_losses, AMSGrad_valid_accs),
    ('AMSGrad with AMSGrad hyperoptimizer',
     AMSGrad_AMSGrad_train_losses,
     AMSGrad_AMSGrad_valid_losses,
     AMSGrad_AMSGrad_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

### Test Optimizers with hyperoptimized hyperparameters

#### Training Function

In [None]:
def train_UNet_general_optimizer(args, cnn=None, optimizer=None):
    """
    Train a colourization network with the given optimizer and validate it
    after every epoch.

    Args:
      args: settings object; reads ``seed``, ``experiment_name``, ``colours``,
        ``downsize_input``, ``model``, ``kernel``, ``num_filters``, ``gpu``,
        ``epochs``, ``batch_size`` and ``visualize``
      cnn: optional existing network; when None, the class named by
        ``args.model`` is looked up in ``globals()`` and instantiated
      optimizer: optimizer instance handed to ``ModuleWrapper``

    Returns:
      ``(cnn, train_losses, valid_losses, valid_accs, final_parameters)``;
      the lists hold one entry per epoch and ``final_parameters`` is always
      an empty dict.
    """
    torch.set_num_threads(5)
    npr.seed(args.seed)
    out_prefix = "outputs/" + args.experiment_name

    # Colour categories and the network dimensions derived from them.
    colours = np.load(args.colours, allow_pickle=True, encoding="bytes")[0]
    num_colours = np.shape(colours)[0]
    num_in_channels = 3 if args.downsize_input else 1
    if cnn is None:
        model_cls = globals()[args.model]
        cnn = model_cls(args.kernel, args.num_filters, num_colours, num_in_channels)
    criterion = nn.CrossEntropyLoss()

    print("Loading data...")
    (x_train, y_train), (x_test, y_test) = load_cifar10()

    print("Transforming data...")
    train_rgb, train_grey = process(x_train, y_train, downsize_input=args.downsize_input)
    train_rgb_cat = get_rgb_cat(train_rgb, colours)
    test_rgb, test_grey = process(x_test, y_test, downsize_input=args.downsize_input)
    test_rgb_cat = get_rgb_cat(test_rgb, colours)

    print("Beginning training ...")
    if args.gpu:
        cnn.cuda()

    train_losses, valid_losses, valid_accs = [], [], []
    total_time = 0
    wrapper = ModuleWrapper(cnn, optimizer=optimizer)
    wrapper.initialize()

    for epoch in range(args.epochs):
        epoch_start = time.time()

        # --- training pass ---
        cnn.train()
        batch_losses = []
        for xs, ys in get_batch(train_grey, train_rgb_cat, args.batch_size):
            wrapper.begin()
            images, labels = get_torch_vars(xs, ys, args.gpu)
            pred = wrapper.forward(images)
            loss = compute_loss(
                criterion, pred, labels, batch_size=args.batch_size, num_colours=num_colours
            )
            wrapper.zero_grad()
            # create_graph=True keeps the graph of the backward pass itself.
            loss.backward(create_graph=True)
            wrapper.step()
            batch_losses.append(loss.data.item())

        time_elapsed = time.time() - epoch_start
        avg_loss = np.mean(batch_losses)
        train_losses.append(avg_loss)
        print(
            "Epoch [%d/%d], Loss: %.4f, Time (s): %d"
            % (epoch + 1, args.epochs, avg_loss, time_elapsed)
        )
        total_time += time_elapsed

        # --- validation pass (BN uses moving mean/var in eval mode) ---
        cnn.eval()
        val_loss, val_acc = run_validation_step(
            cnn,
            criterion,
            test_grey,
            test_rgb_cat,
            args.batch_size,
            colours,
            out_prefix + "/test_%d.png" % epoch,
            args.visualize,
            args.downsize_input,
        )
        valid_losses.append(val_loss)
        valid_accs.append(val_acc.item())
        print(
            "Epoch [%d/%d], Val Loss: %.4f, Val Acc: %.1f%%"
            % (epoch + 1, args.epochs, val_loss, val_acc)
        )

    print('Total training time: {:.2f} seconds'.format(total_time))

    # No hyperparameters are collected here; the empty dict keeps the return
    # shape a five-element tuple.
    return cnn, train_losses, valid_losses, valid_accs, {}

#### SGD

In [None]:

# SGD with a fixed alpha and no nested optimizer; the fifth return value is
# discarded.
optimizer = SGD(alpha=0.008516764268279076)
(
    cnn,
    Hyperoptimized_SGD_train_losses,
    Hyperoptimized_SGD_valid_losses,
    Hyperoptimized_SGD_valid_accs,
    _,
) = train_UNet_general_optimizer(args, cnn=None, optimizer=optimizer)



In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('SGD', SGD_train_losses, SGD_valid_losses, SGD_valid_accs),
    ('SGD with tuned hyperparameters',
     Hyperoptimized_SGD_train_losses,
     Hyperoptimized_SGD_valid_losses,
     Hyperoptimized_SGD_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

#### Adam



In [None]:

# Adam with fixed alpha/beta1/beta2 and no nested optimizer; the fifth
# return value is discarded.
optimizer = Adam(
    alpha=0.000535776955075562,
    beta1=0.8984992504119873,
    beta2=0.9990102648735046,
)
(
    cnn,
    Hyperoptimized_Adam_train_losses,
    Hyperoptimized_Adam_valid_losses,
    Hyperoptimized_Adam_valid_accs,
    _,
) = train_UNet_general_optimizer(args, cnn=None, optimizer=optimizer)



In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('Adam', Adam_train_losses, Adam_valid_losses, Adam_valid_accs),
    ('Adam with tuned hyperparameters',
     Hyperoptimized_Adam_train_losses,
     Hyperoptimized_Adam_valid_losses,
     Hyperoptimized_Adam_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

#### AdaBelief

In [None]:

# AdaBelief with a fixed alpha and no nested optimizer; the fifth return
# value is discarded.
optimizer = AdaBelief(alpha=0.00835033692419529)
(
    cnn,
    Hyperoptimized_AdaBelief_train_losses,
    Hyperoptimized_AdaBelief_valid_losses,
    Hyperoptimized_AdaBelief_valid_accs,
    _,
) = train_UNet_general_optimizer(args, cnn=None, optimizer=optimizer)



In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies).
_runs = [
    ('AdaBelief', AdaBelief_train_losses, AdaBelief_valid_losses, AdaBelief_valid_accs),
    ('AdaBelief with tuned hyperparameters',
     Hyperoptimized_AdaBelief_train_losses,
     Hyperoptimized_AdaBelief_valid_losses,
     Hyperoptimized_AdaBelief_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

#### AMSGrad

In [None]:
# AMSGrad with fixed alpha/beta1/beta2 and no nested optimizer.
optimizer = AMSGrad(
    alpha=0.013616234995424747,
    beta1=0.8991430997848511,
    beta2=0.9989991784095764,
)
(
    cnn,
    Hyperoptimized_AMSGrad_train_losses,
    Hyperoptimized_AMSGrad_valid_losses,
    Hyperoptimized_AMSGrad_valid_accs,
    Hyperoptimized_AMSGrad_final_parameters,
) = train_UNet_general_optimizer(args, cnn=None, optimizer=optimizer)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 13))

# One row per run: (legend label, training losses, validation losses,
# validation accuracies). The 'AMSGrad' baseline uses the AMSGrad_* series in
# every panel, including the training-loss panel.
_runs = [
    ('AMSGrad', AMSGrad_train_losses, AMSGrad_valid_losses, AMSGrad_valid_accs),
    ('AMSGrad with tuned hyperparameters',
     Hyperoptimized_AMSGrad_train_losses,
     Hyperoptimized_AMSGrad_valid_losses,
     Hyperoptimized_AMSGrad_valid_accs),
]

# One row per panel, top to bottom: (axes, title, y-axis label, column of
# `_runs` to draw).
_panels = [
    (ax1, 'Training Loss', 'Loss', 1),
    (ax2, 'Validation Loss', 'Loss', 2),
    (ax3, 'Validation Accuracy', 'Accuracy', 3),
]
for _ax, _title, _ylabel, _column in _panels:
    for _run in _runs:
        _ax.plot(_run[_column], label=_run[0])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.set_title(_title)

plt.tight_layout()
plt.show()