In [24]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import random_split
from datetime import datetime
import numpy as np

# Getting the same results with train and train_manual_update
- Write torch.manual_seed(42) at the beginning of your notebook.
- Write torch.set_default_dtype(torch.double) at the beginning of your notebook to alleviate precision errors

In [25]:
torch.manual_seed(42)
torch.set_default_dtype(torch.double)

device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))

# Tasks
Load, analyse and preprocess the CIFAR-10 dataset. Split it into 3
datasets: training, validation and test. Take a subset of these datasets
by keeping only 2 labels: cat and car

In [26]:
def load_cifar(train_val_split=0.9, data_path='../data/', preprocessor=None):

    # transformer to resize images to 16x16 pixels
    if preprocessor is None:
        preprocessor = transforms.Compose([
            transforms.Resize(16),
            transforms.ToTensor()
            ])
    
    # load datasets
    train_val_data = datasets.CIFAR10(data_path, train=True, download=True, transform=preprocessor)
    test_data = datasets.CIFAR10(data_path, train=False, download=True, transform=preprocessor)
    
    # sizes of train and validation data
    train_size = int(train_val_split * len(train_val_data))
    val_size = len(train_val_data) - train_size

    # split train_val_data into train and validation sets
    train_data, val_data = random_split(train_val_data, [train_size, val_size])

    # create subsets with only cat (0) and car (1)
    label_map = {3: 0, 1: 1} 

    train = [(img, label_map[label]) for img, label in train_data if label in [1,3]]
    val = [(img, label_map[label]) for img, label in val_data if label in [1,3]]
    test = [(img, label_map[label]) for img, label in test_data if label in [1,3]]

    # create dataloaders?

    return train, val, test


def compute_accuracy(model, loader):
    # set the model to evaluation mode
    model.eval()
    correct = 0
    total = 0

    # disable gradient tracking
    with torch.no_grad():
        for images, labels in loader:
            # forward pass
            outputs = model(images)
            predicted = torch.argmax(outputs, dim=1)
            total += labels.size(0)
            # compare predicted with labels
            correct += torch.eq(predicted, labels).sum().item()

    accuracy = correct / total * 100.0
    return accuracy


Write a MyMLP class that implements a MLP in PyTorch (so only fully
connected layers) such that:
    
    - The input dimension is 768(= 16 ∗ 16 ∗ 3) and the output dimension is 2 (for the 2 classes).
    - The hidden layers have respectively 128 and 32 hidden units.
    - All activation functions are ReLU. The last layer has no activation function since the cross-entropy loss already includes a softmax activation
function.

In [27]:
class MyNet(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(16*16*3, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )

    def forward(self, x):
        x = self.flatten(x)
        x = self.layers(x)
        return x

Write a train(n_epochs, optimizer, model, loss_fn, train_loader) function that trains model for n_epochs epochs given an optimizer optimizer, a loss function loss_fn and a dataloader train_loader.

In [28]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):

    n_batch = len(train_loader)
    train_losses = []
    model.train()
    optimizer.zero_grad(set_to_none=True)

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            
            imgs.to(device=device, dtype=torch.double) 
            labels = labels.to(device=device)

            outputs = model(imgs)

            loss = loss_fn(outputs, labels)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            loss_train += loss.item()

        train_losses.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    
    return train_losses


Write a similar function train manual_update that has no optimizer parameter, but a learning rate lr parameter instead and that manually updates each trainable parameter of model using equation (2). Do not forget to zero out all gradients after each iteration. 

Train 2 instances of MyMLP, one using train and the other using train_manual_update (use the same parameter values for both models). Compare their respective training losses. To get exactly the same results with both functions, see section 3.3

In [29]:
def train_manual_update(n_epochs, model, loss_fn, train_loader, lr=1e-2, momentum_coeff=0., weight_decay=0.):
    
    n_batch = len(train_loader)
    train_losses = []
    model.train()
    model.zero_grad()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, label in train_loader:
            
            outputs = model(imgs)
            loss = loss_fn(outputs, label)
            loss.backward()

            # update all parameters
            for p in model.parameters():
                
                # compute new parameter value
                p.data -=  (lr * p.grad)
                #p.grad = 0

            # zero out the gradients
            model.zero_grad()

            loss_train += loss.item()

        train_losses.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
            
    return train_losses

In [40]:
def train_manual_update_L2(n_epochs, model, loss_fn, train_loader, lr=1e-2, momentum_coeff=0., weight_decay=0.):
    
    n_batch = len(train_loader)
    train_losses = []
    model.train()
    model.zero_grad()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, label in train_loader:
            
            outputs = model(imgs)
            loss = loss_fn(outputs, label)
            loss.backward()

            # update all parameters
            for p in model.parameters():
                
                # compute new parameter value
                p.data -=  weight_decay * lr * p.grad
                #p.grad = 0

            # zero out the gradients
            model.zero_grad()

            loss_train += loss.item()

        train_losses.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
            
    return train_losses

### Task 7 (not finished)

In [59]:
def train_manual_update_L2_my(n_epochs, model, loss_fn, train_loader, lr=1e-2, momentum_coeff=0., weight_decay=0.):
    
    n_batch = len(train_loader)
    train_losses = []
    model.train()
    model.zero_grad()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, label in train_loader:
            
            outputs = model(imgs)
            loss = loss_fn(outputs, label)
            loss.backward()

            # update all parameters
            t = 1
            for p in model.parameters():
                
                g_t = p.grad

                # with weight decay
                if weight_decay != 0:
                    g_t += weight_decay * p.data #p.data here or grad? data in SGD notes?

                # with momentum
                if momentum_coeff != 0: 
                    if t > 1:
                        print(b_t.shape, g_t.shape)
                        b_t = (momentum_coeff * b_t) + g_t
                    else:
                        b_t = g_t

                    g_t = b_t

                # compute new parameter value
                p.data -=  lr * g_t

                t += 1

            # zero out the gradients
            model.zero_grad()

            loss_train += loss.item()

        train_losses.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
            
    return train_losses

In [30]:
# load datasets
data_train, data_val, data_test = load_cifar()

# create an instance of MyNet
torch.manual_seed(42)
model_1 = MyNet()
torch.manual_seed(42)
model_2 = MyNet()

# optimizer, loss function, dataloader
optimizer = optim.SGD(model_1.parameters(), lr=1e-2)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train, batch_size=64, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [34]:
train_losses = train(
    n_epochs = 10,
    optimizer= optimizer, 
    model = model_1, 
    loss_fn = loss_fn, 
    train_loader = train_loader
)
print(train_losses)

12:39:43.732543  |  Epoch 1  |  Training loss 0.435
12:39:44.287683  |  Epoch 10  |  Training loss 0.349
[0.435492932385567, 0.42678182860324104, 0.41777411167456774, 0.40836909292769846, 0.39845052451651397, 0.38817815523412974, 0.3778513767732199, 0.36760161499761973, 0.3576546298809369, 0.3485385585318173]


In [35]:
train_manual_losses = train_manual_update(
    n_epochs = 10,
    model = model_2, 
    loss_fn = loss_fn, 
    train_loader = train_loader
)
print(train_manual_losses)

12:39:44.985468  |  Epoch 1  |  Training loss 0.435
12:39:45.579355  |  Epoch 10  |  Training loss 0.349
[0.43549293238556697, 0.42678182860324104, 0.41777411167456774, 0.4083690929276986, 0.39845052451651397, 0.38817815523412974, 0.3778513767732198, 0.36760161499761973, 0.3576546298809369, 0.3485385585318173]


In [38]:
print(train_losses == train_manual_losses)


False


In [None]:
train_manual_losses_L2 = train_manual_update_L2(
    n_epochs = 10,
    model = model_2, 
    loss_fn = loss_fn, 
    train_loader = train_loader, 
    weight_decay=0.9 # what to put to?
)
print(train_manual_losses_L2)

08:44:08.553518  |  Epoch 1  |  Training loss 0.149
08:44:09.197209  |  Epoch 10  |  Training loss 0.135
[0.1486143608688836, 0.14691703552031246, 0.1450711569680034, 0.1434314581946205, 0.1412237691715824, 0.13970836757279684, 0.13839612823876332, 0.13731512321168213, 0.1356373076203175, 0.13499526262928313]


In [60]:
train_manual_losses_L2_my = train_manual_update_L2_my(
    n_epochs = 10,
    model = model_2, 
    loss_fn = loss_fn, 
    train_loader = train_loader, 
    momentum_coeff=0.1,
    weight_decay=0.9 # what to put to?
)
print(train_manual_losses_L2_my)

torch.Size([128, 768]) torch.Size([128])


RuntimeError: The size of tensor a (768) must match the size of tensor b (128) at non-singleton dimension 1