<a href="https://colab.research.google.com/github/Memento2121/Dataflowr_Practicals/blob/main/prac_module_5_Stacking_layers_MLP_CIFAR10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Dataflowr](https://raw.githubusercontent.com/dataflowr/website/master/_assets/dataflowr_logo.png)](https://dataflowr.github.io/website/)

# [Module 5](https://dataflowr.github.io/website/modules/5-stacking-layers/): overfitting an MLP on CIFAR10

Training loop over CIFAR10 (50,000 train images, 10,000 test images). What happens if you
- switch the training to a GPU? Is it faster?
- remove the `ReLU()`?
- increase the learning rate?
- stack more layers?
- perform more epochs?

Can you completely overfit the training set (i.e., reach 100% training accuracy)?

This code is not modular at all. Create functions for each specific task.
(hint: see [this](https://github.com/pytorch/examples/blob/master/mnist/main.py))

Your training went well. Good. Why not save the weights of the network (`net.state_dict()`) using `torch.save()`?

In [None]:
import torch
import torchvision
import torch.nn as nn
import torchvision.transforms as t

# Network: one hidden layer of 1000 ReLU units between a flattened
# 3 * 32 * 32 input and 10 output scores.
net = nn.Sequential(
    nn.Linear(3 * 32 * 32, 1000),
    nn.ReLU(),
    nn.Linear(1000, 10),
)

# Cross-entropy objective, minimised by SGD with momentum.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Per-image preprocessing: convert to a tensor, normalise each of the three
# channels with mean 0.5 and std 0.5, then flatten to a 1-D vector so the
# image can be fed to the first Linear layer.
to_tensor = t.ToTensor()
normalize = t.Normalize((0.5,) * 3, (0.5,) * 3)
flatten = t.Lambda(lambda x: x.view(-1))
transform_list = t.Compose([to_tensor, normalize, flatten])

# Train split first, then test split (both downloaded into the current
# directory if they are not already there).
train_set, test_set = (
    torchvision.datasets.CIFAR10(
        root='.', train=is_train, transform=transform_list, download=True)
    for is_train in (True, False)
)

# Mini-batches of 64 samples, served in dataset order.
train_loader, test_loader = (
    torch.utils.data.DataLoader(split, batch_size=64)
    for split in (train_set, test_set)
)

# === Train === ###
# Put the network in training mode before optimising.
net.train()

# train loop
for epoch in range(3):
    # Running totals, reset at the start of every epoch.
    train_correct = 0
    train_loss = 0.0
    train_seen = 0
    print('Epoch {}'.format(epoch))

    # loop per epoch
    for i, (batch, targets) in enumerate(train_loader):

        output = net(batch)
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_size = targets.size(0)
        pred = output.argmax(dim=1)
        train_correct += (pred == targets).sum().item()
        # Accumulate a plain Python float: adding the `loss` tensor itself
        # would keep autograd history alive for the whole epoch.
        # `nn.CrossEntropyLoss` with its default reduction returns the mean
        # over the batch, so weight it by the batch size to get a per-sample
        # sum; the reported value is then the true mean loss per sample.
        train_loss += loss.item() * batch_size
        # Count the samples actually seen instead of assuming full batches.
        train_seen += batch_size

        if i % 100 == 10:
            print('Train loss {:.4f}, Train accuracy {:.2f}%'.format(
                train_loss / train_seen, 100 * train_correct / train_seen))

print('End of training.\n')

# === Test === ###
test_correct = 0
test_seen = 0
# Switch the network to evaluation mode for inference.
net.eval()

# loop, over whole test set
# No gradients are needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    for batch, targets in test_loader:

        output = net(batch)
        pred = output.argmax(dim=1)
        test_correct += (pred == targets).sum().item()
        # The last batch may hold fewer than 64 samples, so count the samples
        # actually evaluated rather than using `len(test_loader) * 64`.
        test_seen += targets.size(0)

print('End of testing. Test accuracy {:.2f}%'.format(
    100 * test_correct / test_seen))

Files already downloaded and verified
Files already downloaded and verified
Epoch 0
Train loss 0.0358, Train accuracy 13.78%
Train loss 0.0333, Train accuracy 24.72%
Train loss 0.0321, Train accuracy 27.73%
Train loss 0.0313, Train accuracy 29.81%
Train loss 0.0306, Train accuracy 31.77%
Train loss 0.0301, Train accuracy 32.96%
Train loss 0.0297, Train accuracy 33.97%
Train loss 0.0294, Train accuracy 34.69%
Epoch 1
Train loss 0.0261, Train accuracy 41.76%
Train loss 0.0262, Train accuracy 41.12%
Train loss 0.0263, Train accuracy 41.83%
Train loss 0.0262, Train accuracy 42.01%
Train loss 0.0260, Train accuracy 42.56%
Train loss 0.0259, Train accuracy 42.71%
Train loss 0.0258, Train accuracy 42.83%
Train loss 0.0258, Train accuracy 42.84%
Epoch 2
Train loss 0.0244, Train accuracy 46.45%
Train loss 0.0245, Train accuracy 45.14%
Train loss 0.0246, Train accuracy 45.52%
Train loss 0.0246, Train accuracy 45.48%
Train loss 0.0244, Train accuracy 45.87%
Train loss 0.0244, Train accuracy 45.95

In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as t

# Prefer a CUDA device when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# define network structure
class model(nn.Module):
    """MLP with one hidden layer: 3 * 32 * 32 inputs -> 1000 (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        # The attribute names double as the `state_dict` key prefixes.
        self.fc1 = nn.Linear(3 * 32 * 32, 1000)
        self.fc2 = nn.Linear(1000, 10)

    def forward(self, x):
        """Return the 10 output scores for `x` (last dimension 3 * 32 * 32)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# Build the network and move its parameters to the selected device before
# handing them to the optimiser.
net = model()
net = net.to(device)

# Cross-entropy objective, minimised by plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)


# Per-image preprocessing: convert to a tensor, normalise each of the three
# channels with mean 0.5 and std 0.5, then flatten to a 1-D vector.
to_tensor = t.ToTensor()
normalize = t.Normalize((0.5,) * 3, (0.5,) * 3)
flatten = t.Lambda(lambda x: x.view(-1))
transform_list = t.Compose([to_tensor, normalize, flatten])

# Train split first, then test split (both downloaded into the current
# directory if they are not already there).
train_set, test_set = (
    torchvision.datasets.CIFAR10(
        root='.', train=is_train, transform=transform_list, download=True)
    for is_train in (True, False)
)

# Mini-batches of 64 samples, served in dataset order.
train_loader, test_loader = (
    torch.utils.data.DataLoader(split, batch_size=64)
    for split in (train_set, test_set)
)

# === Train === ###
# Put the network in training mode before optimising.
net.train()

# train loop
for epoch in range(3):
    # Running totals, reset at the start of every epoch.
    train_correct = 0
    train_loss = 0.0
    train_seen = 0
    print('Epoch {}'.format(epoch))

    # loop per epoch
    for i, (batch, targets) in enumerate(train_loader):
        # Inputs and labels must live on the same device as the network.
        batch, targets = batch.to(device), targets.to(device)
        output = net(batch)
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_size = targets.size(0)
        pred = output.argmax(dim=1)
        train_correct += (pred == targets).sum().item()
        # Accumulate a plain Python float: adding the `loss` tensor itself
        # would keep autograd history alive for the whole epoch.
        # `nn.CrossEntropyLoss` with its default reduction returns the mean
        # over the batch, so weight it by the batch size to get a per-sample
        # sum; the reported value is then the true mean loss per sample.
        train_loss += loss.item() * batch_size
        # Count the samples actually seen instead of assuming full batches.
        train_seen += batch_size

        if i % 100 == 10:
            print('Train loss {:.4f}, Train accuracy {:.2f}%'.format(
                train_loss / train_seen, 100 * train_correct / train_seen))

print('End of training.\n')

# === Test === ###
test_correct = 0
test_seen = 0
# Switch the network to evaluation mode for inference.
net.eval()

# loop, over whole test set
# No gradients are needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    for batch, targets in test_loader:
        # Inputs and labels must live on the same device as the network.
        batch, targets = batch.to(device), targets.to(device)
        output = net(batch)
        pred = output.argmax(dim=1)
        test_correct += (pred == targets).sum().item()
        # The last batch may hold fewer than 64 samples, so count the samples
        # actually evaluated rather than using `len(test_loader) * 64`.
        test_seen += targets.size(0)

print('End of testing. Test accuracy {:.2f}%'.format(
    100 * test_correct / test_seen))

Using device: cuda
Files already downloaded and verified
Files already downloaded and verified
Epoch 0
Train loss 0.0364, Train accuracy 18.61%
Train loss 0.0320, Train accuracy 29.31%
Train loss 0.0306, Train accuracy 32.37%
Train loss 0.0297, Train accuracy 34.37%
Train loss 0.0289, Train accuracy 36.26%
Train loss 0.0285, Train accuracy 37.01%
Train loss 0.0282, Train accuracy 37.73%
Train loss 0.0279, Train accuracy 38.33%
Epoch 1
Train loss 0.0250, Train accuracy 43.61%
Train loss 0.0245, Train accuracy 45.71%
Train loss 0.0243, Train accuracy 46.02%
Train loss 0.0242, Train accuracy 46.45%
Train loss 0.0239, Train accuracy 47.20%
Train loss 0.0238, Train accuracy 47.26%
Train loss 0.0238, Train accuracy 47.36%
Train loss 0.0237, Train accuracy 47.42%
Epoch 2
Train loss 0.0222, Train accuracy 51.42%
Train loss 0.0222, Train accuracy 51.15%
Train loss 0.0221, Train accuracy 51.09%
Train loss 0.0221, Train accuracy 51.38%
Train loss 0.0218, Train accuracy 51.86%
Train loss 0.0218, T

## Autograd tips and tricks

Pointers are everywhere!

In [None]:
# Demo: `w` is bound to the very same Parameter object that the layer stores;
# no copy is made.
net = nn.Linear(2, 2)
w = net.weight
print(w)

# One forward/backward pass on a random input fills `net.weight.grad`.
x = torch.rand(1, 2)
y = net(x).sum()
y.backward()
# Hand-written gradient-descent step (step size 0.01) applied in place through
# `.data`. Since `w` refers to the same object as `net.weight`, the second
# print shows the updated values.
net.weight.data -= 0.01 * net.weight.grad # <--- What is this?
print(w)

Parameter containing:
tensor([[-0.2617,  0.6555],
        [-0.0558, -0.2010]], requires_grad=True)
Parameter containing:
tensor([[-0.2622,  0.6462],
        [-0.0563, -0.2103]], requires_grad=True)


In [None]:
# Demo: this time `w` is a clone of the weight, i.e. a separate tensor holding
# a copy of the values rather than another name for the layer's Parameter.
net = nn.Linear(2, 2)
w = net.weight.clone()
print(w)

# One forward/backward pass on a random input fills `net.weight.grad`.
x = torch.rand(1, 2)
y = net(x).sum()
y.backward()
# Same hand-written gradient-descent step as before; it modifies the layer's
# weight only, so the second print of `w` shows the original values.
net.weight.data -= 0.01 * net.weight.grad # <--- What is this?
print(w)

tensor([[-0.6454, -0.3360],
        [-0.2955,  0.1087]], grad_fn=<CloneBackward0>)
tensor([[-0.6454, -0.3360],
        [-0.2955,  0.1087]], grad_fn=<CloneBackward0>)


Sharing weights

In [None]:
# Demo: after the assignment both layers hold the same Parameter object, so
# they also share a single `.grad` tensor.
net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
net[0].weight = net[1].weight  # weight sharing

# One forward/backward pass on a random input.
x = torch.rand(1, 2)
y = net(x).sum()
y.backward()
# Both prints show the same gradient: it is one tensor that accumulates the
# contributions from the two places where the shared weight is used.
print(net[0].weight.grad)
print(net[1].weight.grad)

tensor([[-1.0636, -1.1708],
        [-1.5021, -1.6026]])
tensor([[-1.0636, -1.1708],
        [-1.5021, -1.6026]])


[![Dataflowr](https://raw.githubusercontent.com/dataflowr/website/master/_assets/dataflowr_logo.png)](https://dataflowr.github.io/website/)