# Task 3, project 1

## Contents

1. Loading data  
    1. Normalizing dataset and loading CIFAR-10  
    2. From CIFAR-10 to CIFAR-2  
2. MLP Architecture and data inspection
    1. MLP
    2. Data inspection
3. Train() and train_manual()
    1. Train() with learning rate
    2. train_manual() with learning rate
    3. Train() with learning rate and weight decay
    4. train_manual() with learning rate and weight decay
    5. Train() with learning rate, weight decay and momentum
    6. train_manual() with learning rate, weight decay and momentum
4. Several models and model selection

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from datetime import datetime
from torch.utils.data import random_split
import numpy as np
import plotly.express as px
import pandas as pd

torch.manual_seed(123)
torch.set_default_dtype(torch.double)

## 1. Loading data

### 1.1 Normalizing dataset and loading CIFAR-10

In [21]:
def mean_std():
    cifar_trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())

    imgs = [item[0] for item in cifar_trainset] # item[0] and item[1] are image and its label
    imgs = torch.stack(imgs, dim=0).numpy()
    mean = []
    std = []
    # calculate mean over each channel (r,g,b)

    for i in range (3):
        mean.append(imgs[:,i,:,:].mean())
        std.append(imgs[:,i,:,:].std())

    prepross = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((mean[0],mean[1],mean[2]),(std[0],std[1],std[2]))
    ])
    
    return (prepross)

In [22]:
def load_cifar(train_val_split=0.9, data_path='../data/', preprocessor=None):
    
    # Define preprocessor
    if preprocessor is None:
        preprocessor = mean_std()

    
    # load datasets
    data_train_val = datasets.CIFAR10(
        data_path,       
        train=True,      
        download=True,  
        transform=preprocessor)

    data_test = datasets.CIFAR10(
        data_path, 
        train=False,
        download=True,
        transform=preprocessor)

    # train/validation split
    n_train = int(len(data_train_val)*train_val_split)
    n_val =  len(data_train_val) - n_train

    data_train, data_val = random_split(
        data_train_val, 
        [n_train, n_val],
        generator=torch.Generator().manual_seed(123)
    )

    print("Size of the train dataset:        ", len(data_train))
    print("Size of the validation dataset:   ", len(data_val))
    print("Size of the test dataset:         ", len(data_test))
    
    return (data_train, data_val, data_test)

cifar10_train, cifar10_val, cifar10_test = load_cifar()

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Size of the train dataset:         45000
Size of the validation dataset:    5000
Size of the test dataset:          10000


### 1.2 From CIFAR-10 to CIFAR-2

We define a lighter version of CIFAR-10, which is now CIFAR-2, containing only the planes and birds

In [23]:
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']

# For each dataset, keep only airplanes and birds
cifar2_train = [(img, label_map[label]) for img, label in cifar10_train if label in [0, 2]]
cifar2_val = [(img, label_map[label]) for img, label in cifar10_val if label in [0, 2]]
cifar2_test = [(img, label_map[label]) for img, label in cifar10_test if label in [0, 2]]

print('Size of the training dataset: ', len(cifar2_train))
print('Size of the validation dataset: ', len(cifar2_val))
print('Size of the test dataset: ', len(cifar2_test))

Size of the training dataset:  9017
Size of the validation dataset:  983
Size of the test dataset:  2000


# 2. MLP Architecture and data inspection

### 2.1 MLP

In [24]:
class MyMLP(nn.Module):
    def __init__(self, n_in, n_hidden1, n_hidden2, n_hidden3, n_out):
        torch.manual_seed(123)
        super(MyMLP, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(n_in, n_hidden1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(n_hidden1, n_hidden2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(n_hidden2, n_hidden3)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(n_hidden3, n_out)

    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        return x

# Setting up the model with the specified dimensions
n_in = 32*32*3   # Determined by our dataset: 32x32 RGB images
n_hidden1 = 512  
n_hidden2 = 128
n_hidden3 = 32
n_out = 2        # Determined by our number of classes, so 2: birds and planes

model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)


### 2.2 Data inspection

In [25]:
# Shape of an image
print("Shape of an image:                       ", cifar2_train[0][0].shape)
# Add a extra dimension for the batch dimension
batch_t = torch.unsqueeze(cifar2_train[0][0], 0)
print("Shape of our input batch of one image:   ", batch_t.shape)
# Feed our batch into our network and get the output
out = model_seq(batch_t)
print("Shape of our output batch of one image:  ", out.shape)   
print("Output tensor (values are just rubbish because the nn is not trained yet!):\n Ouput: ", out)

Shape of an image:                        torch.Size([3, 32, 32])
Shape of our input batch of one image:    torch.Size([1, 3, 32, 32])
Shape of our output batch of one image:   torch.Size([1, 2])
Output tensor (values are just rubbish because the nn is not trained yet!):
 Ouput:  tensor([[ 0.1908, -0.0874]], grad_fn=<AddmmBackward0>)


In [26]:
print("Inspecting parameters")
# Iterate over all the named parameters of our network
for p in model_seq.named_parameters():
    # p is a tuple: 
    # - p[0] is the name of parameter
    # - p[1] is a tensor containing the current parameter values
    print("name: ", p[0], "   length: ", p[1].numel())
    

print("\nTotal number of trainable parameters: ", sum([p.numel() for p in model_seq.parameters() if p.requires_grad == True]))

print("\nInspecting modules")
# Iterate over all the named modules of our network
for m in model_seq.named_modules():
    print(m)

Inspecting parameters
name:  fc1.weight    length:  1572864
name:  fc1.bias    length:  512
name:  fc2.weight    length:  65536
name:  fc2.bias    length:  128
name:  fc3.weight    length:  4096
name:  fc3.bias    length:  32
name:  fc4.weight    length:  64
name:  fc4.bias    length:  2

Total number of trainable parameters:  1643234

Inspecting modules
('', MyMLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=3072, out_features=512, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=128, out_features=32, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=32, out_features=2, bias=True)
))
('flatten', Flatten(start_dim=1, end_dim=-1))
('fc1', Linear(in_features=3072, out_features=512, bias=True))
('relu1', ReLU())
('fc2', Linear(in_features=512, out_features=128, bias=True))
('relu2', ReLU())
('fc3', Linear(in_features=128, out_features=32, bias=True))
('relu3', ReLU(

# 3. Train() and Train_manual()

## 3.1 Train() with learning rate

In [27]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    
    n_batch = len(train_loader)
    
    # We'll store the training loss for each epoch
    losses_train = []
    
    # Set the network in training mode
    model.train()
    
    # Re-initialize gradients, just in case the model has been inappropriately 
    # manipulated before the training
    optimizer.zero_grad(set_to_none=True)
    
    for epoch in range(1, n_epochs + 1): 
        
        # Training loss for the current epoch
        loss_train = 0

        # Loop over our dataset (in batches the data loader creates for us)
        for imgs, labels in train_loader:
            
            
            imgs.to(device=device, dtype=torch.double)
            labels.to(device=device, dtype=torch.double)
            
            # Feed a batch into our model
            outputs = model(imgs)
            
            # Compute the loss we wish to minimize 
            # Note that by default, it is the mean loss that is computed
            # (so entire_batch_loss / batch_size)
            loss = loss_fn(outputs, labels) 
            
            
            # Perform the backward step. That is, compute the gradients of all parameters we want the network to learn
            loss.backward()
            
            # Update the model
            optimizer.step() 
            
            # Zero out gradients before the next round (or the end of training)
            optimizer.zero_grad() 

            # Update loss for this epoch
            # It is important to transform the loss to a number with .item()
            loss_train += loss.item()
            
        # Store current epoch loss. 
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
            
    fig = px.line(y=losses_train) 
    fig.show()      
    return losses_train

In [28]:
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f"Training on device {device}.")

# The DataLoader batches up the examples of our cifar dataset
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False) 

# Instantiate the optimizer, here:
# 1. Stochastic Gradient Descent optimizer, 
# 2. that has to be applied to our parameters (model.parameters())
# 3. With a learning rate of 1e-2
optimizer = optim.SGD(model_seq.parameters(), lr=1e-2)

# Instantiate the loss function (here we use cross entropy)
loss_fn = nn.CrossEntropyLoss()

# Now all we have to do is calling the training loop
train(
    n_epochs = 21,
    optimizer = optimizer,
    model = model_seq,
    loss_fn = loss_fn,
    train_loader = train_loader,
)
print('')

Training on device cpu.


14:20:48.623912  |  Epoch 1  |  Training loss 0.645
14:21:03.223303  |  Epoch 10  |  Training loss 0.326
14:21:19.593329  |  Epoch 20  |  Training loss 0.193





In [30]:
# Here we use shuffle = False
# Because it is easier to check the predictions made.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(cifar2_test, batch_size=64, shuffle=False)


def compute_accuracy(model, loader):
    model.eval()
    correct = 0
    total = 0

    # We do not want gradients here, as we will not want to update the parameters.
    with torch.no_grad():
        for imgs, labels in loader:

            outputs = model(imgs)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.shape[0]
            correct += int((predicted == labels).sum())

    acc =  correct / total
    print("Accuracy: {:.2f}".format(acc))
    return acc

print("Training accuracy:")
compute_accuracy(model_seq, train_loader)
print("Validation accuracy:")
compute_accuracy(model_seq, val_loader)
print("Test accuracy:")
compute_accuracy(model_seq, test_loader)

Training accuracy:
Accuracy: 0.93
Validation accuracy:
Accuracy: 0.84
Test accuracy:
Accuracy: 0.85


0.8475

In [14]:
def plot_loss(x, t_loss, v_loss):
    df = pd.DataFrame({
                'epoch':[i for i in range(1, x+1)],
                'training_loss':t_loss,
                'validation_loss':v_loss
            })            
    fig = px.line(df, x='epoch', y=['training_loss', 'validation_loss'], title='training loss and val loss')
    fig.update_layout(yaxis_title="loss")
    fig.show()

## 3.2 train_manual() with learning rate

In [15]:
def train_manual_update(n_epochs, lr, model, loss_fn, train_loader,val_loader, weight_decay=None, momentum=None, earlystopping = False):
    
    n_batch = len(train_loader)
    n_batch_val = len(val_loader)
    stop_count = 2
    # We'll store the training loss for each epoch
    losses_train = []
    val_loss_list = []
    # Set the network in training mode
    model.train()

    # If hyperparameter "momentum" is initialized
    if momentum is not None:
        # Initialize the velocity for each parameter to zero
        velocities = [torch.zeros_like(param) for param in model.parameters()]
    
    for epoch in range(1, n_epochs + 1): 
        
        # Training loss for the current epoch
        loss_train = 0
        loss_val = 0
        # Loop over our dataset (in batches the data loader creates for us)
        for imgs, labels in train_loader:
            
            
            imgs.to(device=device, dtype=torch.double)
            labels.to(device=device, dtype=torch.double)
            
            # Feed a batch into our model
            outputs = model(imgs)
            
            # Compute the loss we wish to minimize, including L2 regularization with weight decay
            loss = loss_fn(outputs, labels) 
            
            # Perform the backward step. That is, compute the gradients of all parameters we want the network to learn
            loss.backward()
            
            # If only hyperparameter "learning rate" is initialized.
            if (weight_decay is None) and (momentum is None):
                # Manual update of all trainable parameters using gradient descent
                with torch.no_grad():
                    for param in model.parameters():
                        if param.requires_grad:
                            param -= lr * param.grad
            
            
            # If hyperparameter "learning rate" and "weight decay" is initialized.
            elif (weight_decay is not None) and (momentum is None):
                # Manual update of all trainable parameters using gradient descent with L2 regularization
                with torch.no_grad():
                    for param in model.parameters():
                        if param.requires_grad:
                            param -= lr * (param.grad + weight_decay * param)
            
            
            # If all hyperparameters are initialized (learning rate, weight decay and momentum).    
            else:
                # Update each parameter based on the gradients and the previous velocity
                with torch.no_grad():
                    for i, param in enumerate(model.parameters()):
                        if param.requires_grad:
                            velocities[i] = momentum * velocities[i] - lr * (param.grad + weight_decay * param)
                            param += velocities[i]
            
            
            # Zero out gradients before the next round (or the end of training)
            with torch.no_grad():
                for param in model.parameters():
                    if param.grad is not None:
                        param.grad.zero_()
 

            # Update loss for this epoch
            # It is important to transform the loss to a number with .item()
            loss_train += loss.item()

        for imgs2, lab2 in val_loader:
            out_val = model(imgs2)
            val_loss = loss_fn(out_val, lab2)
            loss_val += val_loss.item()  
     
        current_val_loss = loss_val / n_batch_val
        # Store current epoch loss. 
        losses_train.append(loss_train / n_batch)

        

        if earlystopping :
            if epoch == 1:
                val_loss_list.append(current_val_loss)

            elif val_loss_list[-1] < current_val_loss:
                stop_count+=1
                val_loss_list.append(current_val_loss)

                if(stop_count==2):
                    n_epochs = epoch
                        
                    return (epoch, losses_train, val_loss_list)
                
            else:
                stop_count=0
                val_loss_list.append(current_val_loss)

        else: val_loss_list.append(current_val_loss)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    
    
    return (epoch, losses_train, val_loss_list)
            
    

In [17]:
# Initialize MLP
torch.manual_seed(123)
model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
# The DataLoader batches up the examples of our cifar dataset
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False) 

# Instantiate the loss function (here we use cross entropy)
loss_fn = nn.CrossEntropyLoss()

# Now all we have to do is calling the training loop
train_manual_update(
    n_epochs = 21,
    lr = 1e-2,
    model = model_seq,
    loss_fn = loss_fn,
    train_loader = train_loader,
    val_loader= val_loader,
    earlystopping = False,
)
print('')

14:18:25.892078  |  Epoch 1  |  Training loss 0.645
14:18:46.938846  |  Epoch 10  |  Training loss 0.326
14:19:11.955925  |  Epoch 20  |  Training loss 0.193



In [None]:
# Here we use shuffle = False
# Because it is easier to check the predictions made.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(cifar2_test, batch_size=64, shuffle=False)


print("Training accuracy:")
compute_accuracy(model_seq, train_loader)
print("Validation accuracy:")
compute_accuracy(model_seq, val_loader)
print("Test accuracy:")
compute_accuracy(model_seq, test_loader)

Training accuracy:
Accuracy: 0.91
Validation accuracy:
Accuracy: 0.83
Test accuracy:
Accuracy: 0.84


0.8435

## 3.3 Train() with learning rate and weight decay

In [None]:
# Initialize MLP
torch.manual_seed(123)
model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
# The DataLoader batches up the examples of our cifar dataset
# Here we use shuffle = True in order to shuffle the dataset for the training
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False) 

# Instantiate the optimizer, here:
# 1. Stochastic Gradient Descent optimizer, 
# 2. that has to be applied to our parameters (model.parameters())
# 3. With a learning rate of 1e-2
optimizer = optim.SGD(model_seq.parameters(), lr=1e-2, weight_decay=0.01)

# Instantiate the loss function (here we use cross entropy)
loss_fn = nn.CrossEntropyLoss()

# Now all we have to do is calling the training loop
train(
    n_epochs = 21,
    optimizer = optimizer,
    model = model_seq,
    loss_fn = loss_fn,
    train_loader = train_loader,
)
print('')

23:17:34.265887  |  Epoch 1  |  Training loss 0.647
23:17:51.017986  |  Epoch 10  |  Training loss 0.340
23:18:09.715077  |  Epoch 20  |  Training loss 0.223





In [None]:
# Here we use shuffle = False
# Because it is easier to check the predictions made.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(cifar2_test, batch_size=64, shuffle=False)


print("Training accuracy:")
compute_accuracy(model_seq, train_loader)
print("Validation accuracy:")
compute_accuracy(model_seq, val_loader)
print("Test accuracy:")
compute_accuracy(model_seq, test_loader)

Training accuracy:
Accuracy: 0.94
Validation accuracy:
Accuracy: 0.85
Test accuracy:
Accuracy: 0.85


0.851

## 3.4 train_manual() with learning rate and weight decay

In [None]:
# Initialize MLP
torch.manual_seed(123)
model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
# The DataLoader batches up the examples of our cifar dataset
# Here we use shuffle = True in order to shuffle the dataset for the training
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False) 

# Instantiate the loss function (here we use cross entropy)
loss_fn = nn.CrossEntropyLoss()

# Now all we have to do is calling the training loop
train_manual_update(
    n_epochs = 21,
    lr = 1e-2,
    model = model_seq,
    loss_fn = loss_fn,
    train_loader = train_loader,
    val_loader= val_loader,
    weight_decay = 0.01,
    earlystopping = False,
)
print('')

23:18:15.089365  |  Epoch 1  |  Training loss 0.647
23:18:40.413478  |  Epoch 10  |  Training loss 0.340
23:19:09.201984  |  Epoch 20  |  Training loss 0.223



In [None]:
# Here we use shuffle = False
# Because it is easier to check the predictions made.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(cifar2_test, batch_size=64, shuffle=False)


print("Training accuracy:")
compute_accuracy(model_seq, train_loader)
print("Validation accuracy:")
compute_accuracy(model_seq, val_loader)
print("Test accuracy:")
compute_accuracy(model_seq, test_loader)

Training accuracy:
Accuracy: 0.94
Validation accuracy:
Accuracy: 0.85
Test accuracy:
Accuracy: 0.85


0.851

## 3.5 Train() with learning rate, weight decay and momentum

In [None]:
# Initialize MLP
torch.manual_seed(123)
model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
# The DataLoader batches up the examples of our cifar dataset
# Here we use shuffle = True in order to shuffle the dataset for the training
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False) 

# Instantiate the optimizer, here:
# 1. Stochastic Gradient Descent optimizer, 
# 2. that has to be applied to our parameters (model.parameters())
# 3. With a learning rate of 1e-2
optimizer = optim.SGD(model_seq.parameters(), lr=1e-2, weight_decay=0.01, momentum=0.1)

# Instantiate the loss function (here we use cross entropy)
loss_fn = nn.CrossEntropyLoss()

# Now all we have to do is calling the training loop
train(
    n_epochs = 21,
    optimizer = optimizer,
    model = model_seq,
    loss_fn = loss_fn,
    train_loader = train_loader,
)
print('')

23:19:15.159995  |  Epoch 1  |  Training loss 0.641
23:19:36.418203  |  Epoch 10  |  Training loss 0.328
23:19:59.780204  |  Epoch 20  |  Training loss 0.207





In [None]:
# Here we use shuffle = False
# Because it is easier to check the predictions made.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(cifar2_test, batch_size=64, shuffle=False)


print("Training accuracy:")
compute_accuracy(model_seq, train_loader)
print("Validation accuracy:")
compute_accuracy(model_seq, val_loader)
print("Test accuracy:")
compute_accuracy(model_seq, test_loader)

Training accuracy:
Accuracy: 0.95
Validation accuracy:
Accuracy: 0.84
Test accuracy:
Accuracy: 0.85


0.853

## 3.6 train_manual() with learning rate, weight decay and momentum

In [None]:
# Initialize MLP
torch.manual_seed(123)
model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
# The DataLoader batches up the examples of our cifar dataset
# Here we use shuffle = True in order to shuffle the dataset for the training
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False) 

# Instantiate the loss function (here we use cross entropy)
loss_fn = nn.CrossEntropyLoss()

# Now all we have to do is calling the training loop
train_manual_update(
    n_epochs = 21,
    lr = 1e-2,
    model = model_seq,
    loss_fn = loss_fn,
    train_loader = train_loader,
    val_loader= val_loader,
    weight_decay = 0.01,
    momentum = 0.1,
)
print('')

23:20:06.079855  |  Epoch 1  |  Training loss 0.641
23:20:35.724817  |  Epoch 10  |  Training loss 0.328
23:21:08.872297  |  Epoch 20  |  Training loss 0.207



In [None]:
# Here we use shuffle = False
# Because it is easier to check the predictions made.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(cifar2_test, batch_size=64, shuffle=False)


print("Training accuracy:")
compute_accuracy(model_seq, train_loader)
print("Validation accuracy:")
compute_accuracy(model_seq, val_loader)
print("Test accuracy:")
compute_accuracy(model_seq, test_loader)

Training accuracy:
Accuracy: 0.95
Validation accuracy:
Accuracy: 0.84
Test accuracy:
Accuracy: 0.85


0.853

# 4. Several models and model selection

In [None]:
def model_selection(train_load, val_load, test_load, early_stopping):
    #Here we use shuffle = True in order to shuffle the dataset for the training
    train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=64, shuffle=True) 
    val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)

    # Instantiate the loss function (here we use cross entropy)
    loss_fn = nn.CrossEntropyLoss()
    
    losses = []
    models_train = []
    models_val = []
    models_test = []
    lr_list = []
    weight_decay_list = []
    momentum_list = []
    i = 1


    models = []
    for x in [0, 0.01, 0.1, 0.5]:
        for y in [0, 0.01,0.1, 0.8]:
            for z in [0.1, 0.01,0.001,0.0001]:
                # Initialize MLP
                torch.manual_seed(123)
                model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
                
                print(" ====================== model %d ====================== " %i)
                print(f"Iteration using weight value {x}, and momentum value {y}:")
                
                losses.append(train_manual_update(
                    n_epochs = 21,
                    lr = z,
                    model = model_seq,
                    loss_fn = loss_fn,
                    train_loader = train_loader,
                    val_loader = val_loader,
                    weight_decay = x,
                    momentum = y,
                    earlystopping = early_stopping,
                ))
                print('')
                
                lr_list.append(z)
                weight_decay_list.append(x)
                momentum_list.append(y)
                models.append(model_seq)
                # Here we use shuffle = False
                # Because it is easier to check the predictions made.
                train_loader = torch.utils.data.DataLoader(train_load, batch_size=64, shuffle=False)
                val_loader = torch.utils.data.DataLoader(val_load, batch_size=64, shuffle=False)
                test_loader = torch.utils.data.DataLoader(test_load, batch_size=64, shuffle=False)

                train_acc = compute_accuracy(model_seq, train_loader)
                models_train.append(train_acc)
                print("Training accuracy: \n", train_acc)
                val_acc = compute_accuracy(model_seq, val_loader)
                models_val.append(val_acc)
                print("Validation accuracy: \n", val_acc)
            
                i += 1
    
    best_model = np.argmax(models_val)
    #compute accuracy on test data with the model that performed best on validationdata
    test_acc = compute_accuracy(models[best_model], test_loader)
    models_test.append(test_acc)
    best_loss = losses[best_model]
    plot_loss(best_loss[0],best_loss[1],best_loss[2])
    return models_train, models_val, models_test, lr_list, weight_decay_list, momentum_list, best_model

models_train, models_val, models_test, lr, weight_decay, momentum, best_model = model_selection(cifar2_train, cifar2_val, cifar2_test, False)
models_train_es, models_val_es, models_test_es, lr_es, weight_decay_es, momentum_es, best_model_es = model_selection(cifar2_train, cifar2_val, cifar2_test, True)

Iteration using weight value 0, and momentum value 0:
23:21:16.189668  |  Epoch 1  |  Training loss 0.504
23:21:46.378841  |  Epoch 10  |  Training loss 0.207
23:22:20.425196  |  Epoch 20  |  Training loss 0.097

Accuracy: 0.96
Training accuracy: 
 0.9573028723522236
Accuracy: 0.84
Validation accuracy: 
 0.8413021363173957
Iteration using weight value 0, and momentum value 0:
23:22:27.838354  |  Epoch 1  |  Training loss 0.645
23:22:58.736369  |  Epoch 10  |  Training loss 0.326
23:23:32.720613  |  Epoch 20  |  Training loss 0.193

Accuracy: 0.93
Training accuracy: 
 0.9311300876122879
Accuracy: 0.84
Validation accuracy: 
 0.8382502543234995
Iteration using weight value 0, and momentum value 0:
23:23:39.896094  |  Epoch 1  |  Training loss 0.690
23:24:10.591069  |  Epoch 10  |  Training loss 0.589
23:24:45.198522  |  Epoch 20  |  Training loss 0.495

Accuracy: 0.79
Training accuracy: 
 0.7939447709881335
Accuracy: 0.79
Validation accuracy: 
 0.7894201424211598
Iteration using weight va

Iteration using weight value 0, and momentum value 0:
00:36:24.619715  |  Epoch 1  |  Training loss 0.504

Accuracy: 0.92
Training accuracy: 
 0.9185982033935899
Accuracy: 0.83
Validation accuracy: 
 0.8311291963377416
Iteration using weight value 0, and momentum value 0:
00:36:51.085020  |  Epoch 1  |  Training loss 0.645
00:37:19.972382  |  Epoch 10  |  Training loss 0.326

Accuracy: 0.93
Training accuracy: 
 0.9322391039148276
Accuracy: 0.85
Validation accuracy: 
 0.8504577822990844
Iteration using weight value 0, and momentum value 0:
00:37:43.217000  |  Epoch 1  |  Training loss 0.690
00:38:12.188337  |  Epoch 10  |  Training loss 0.589
00:38:44.497519  |  Epoch 20  |  Training loss 0.495

Accuracy: 0.79
Training accuracy: 
 0.7939447709881335
Accuracy: 0.79
Validation accuracy: 
 0.7894201424211598
Iteration using weight value 0, and momentum value 0:
00:38:51.464392  |  Epoch 1  |  Training loss 0.694
00:39:20.406597  |  Epoch 10  |  Training loss 0.686
00:39:53.065254  |  Epoch

In [None]:
torch.manual_seed(123)
model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
print("\nThe best model without earlystopping is Model", best_model+1)
print(" ----------------------------------------- ")
print(" Model:               ", model_seq)
print(" Learning rate:       ", lr[best_model])
print(" Weight decay:        ", weight_decay[best_model])
print(" Momentum:            ", momentum[best_model])
print(" ----------------------------------------- ")
print("\n Performance:   ")
print(" ----------------------------------------- ")
print(' Training accuracy:        %.2f' %models_train[best_model])
print(' Validation accuracy:      %.2f' %models_val[best_model])
print(' Test accuracy:      %.2f' %models_test[0])
print(" ----------------------------------------- ")




The best model is Model 18
 ----------------------------------------- 
 Model:                MyMLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=3072, out_features=512, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=128, out_features=32, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=32, out_features=2, bias=True)
)
 Learning rate:        0.01
 Weight decay:         0.01
 Momentum:             0
 ----------------------------------------- 

 Performance:   
 ----------------------------------------- 
 Training accuracy:        0.94
 Validation accuracy:      0.85
 Test accuracy:      0.85
 ----------------------------------------- 


In [None]:
torch.manual_seed(123)
model_seq = MyMLP(n_in, n_hidden1, n_hidden2, n_hidden3, n_out)
print("\nThe best model with earlystopping is Model", best_model_es+1)
print(" ----------------------------------------- ")
print(" Model:               ", model_seq)
print(" Learning rate:       ", lr_es[best_model_es])
print(" Weight decay:        ", weight_decay_es[best_model_es])
print(" Momentum:            ", momentum_es[best_model_es])
print(" ----------------------------------------- ")
print("\n Performance:   ")
print(" ----------------------------------------- ")
print(' Training accuracy:        %.2f' %models_train_es[best_model_es])
print(' Validation accuracy:      %.2f' %models_val_es[best_model_es])
print(' Test accuracy:      %.2f' %models_test_es[0])
print(" ----------------------------------------- ")


The best model with earlystopping is Model 18
 ----------------------------------------- 
 Model:                MyMLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=3072, out_features=512, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=128, out_features=32, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=32, out_features=2, bias=True)
)
 Learning rate:        0.01
 Weight decay:         0
 Momentum:             0
 ----------------------------------------- 

 Performance:   
 ----------------------------------------- 
 Training accuracy:        0.93
 Validation accuracy:      0.85
 Test accuracy:      0.86
 ----------------------------------------- 
