In [1]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

In [20]:
!nvidia-smi

Der Befehl "nvidia-smi" ist entweder falsch geschrieben oder
konnte nicht gefunden werden.


# PyTorch
In this notebook you will gain some hands-on experience with [PyTorch](https://pytorch.org/), one of the major frameworks for deep learning. To install PyTorch run `conda install pytorch torchvision cudatoolkit=10.1 -c pytorch`, with cudatoolkit set to whichever CUDA version you have installed. You can check this by running `nvcc --version`. If you do not have an Nvidia GPU you can run `conda install pytorch torchvision cpuonly -c pytorch` instead. However, in this case we recommend using [Google Colab](https://colab.research.google.com/).

You will start by re-implementing some common features of deep neural networks (dropout and batch normalization) and then implement a very popular modern architecture for image classification (ResNet) and improve its training loop.

# 1. Dropout
Dropout is a form of regularization for neural networks. It works by randomly setting activations (values) to 0, each one with equal probability `p`. The values are then scaled by a factor $\frac{1}{1-p}$ to conserve their mean.

Dropout effectively trains a pseudo-ensemble of models with stochastic gradient descent. During evaluation we want to use the full ensemble and therefore have to turn off dropout. Use `self.training` to check if the model is in training or evaluation mode.

In [3]:
class Dropout(nn.Module):
    """
    Dropout, as discussed in the lecture and described here:
    https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
    
    Args:
        p: float, dropout probability
    """
    def __init__(self, p):
        super().__init__()
        self.p = p
        
    def forward(self, input):
        """
        The module's forward pass.
        This has to be implemented for every PyTorch module.
        PyTorch then automatically generates the backward pass
        by dynamically generating the computational graph during
        execution.
        
        Args:
            input: PyTorch tensor, arbitrary shape

        Returns:
            PyTorch tensor, same shape as input
        """
        if self.training:
            mask = np.random.random(input.shape)
            return torch.from_numpy(np.where(mask <= self.p, 0, (1 / (1 - self.p))))
        
        # TODO: Set values randomly to 0.

In [4]:
# Test dropout
test = torch.ones(10_000)
dropout = Dropout(0.5)
test_dropped = dropout(test)

print(test_dropped.sum().item())
print((test_dropped > 0).sum().item())

print(np.isclose(test_dropped.sum().item(), 10_000, atol=400))
print(np.isclose((test_dropped > 0).sum().item(), 5_000, atol=200))

# These assertions can in principle fail due to bad luck, but
# if implemented correctly they should almost always succeed.
assert np.isclose(test_dropped.sum().item(), 10_000, atol=400)
assert np.isclose((test_dropped > 0).sum().item(), 5_000, atol=200)

10060.0
5030
True
True


# 2. Batch normalization
Batch normalization is a trick used to smoothen the loss landscape and improve training. It is defined as the function
$$y = \frac{x - \mu_x}{\sigma_x + \epsilon} \cdot \gamma + \beta$$,
where $\gamma$ and $\beta$ and learnable parameters and $\epsilon$ is a some small number to avoid dividing by zero. The Statistics $\mu_x$ and $\sigma_x$ are taken separately for each feature. In a CNN this means averaging over the batch and all pixels.

In [5]:
class BatchNorm(nn.Module):
    """
    Batch normalization, as discussed in the lecture and similar to
    https://pytorch.org/docs/stable/nn.html#torch.nn.BatchNorm1d
    
    Only uses batch statistics (no running mean for evaluation).
    Batch statistics are calculated for a single dimension.
    Gamma is initialized as 1, beta as 0.
    
    Args:
        num_features: Number of features to calculate batch statistics for.
    """
    def __init__(self, num_features):
        super().__init__()
        
        # TODO: Initialize the required parameters
        self.gamma = nn.Parameter(torch.ones(num_features)).unsqueeze(0).unsqueeze(-1)
        #self.gamma = self.gamma.
        self.beta = nn.Parameter(torch.zeros(num_features)).unsqueeze(0).unsqueeze(-1)
        #self.beta = self.beta.
        
    def forward(self, input):
        """
        Batch normalization over the dimension C of (N, C, L).
        
        Args:
            input: PyTorch tensor, shape [N, C, L]
            
        Return:
            PyTorch tensor, same shape as input
        """
        eps = 1e-5

        mean = input.mean(dim=[0, 2], keepdim=True)
        input_mean_norm = input - mean
        var = torch.sqrt(input.var(dim=[0, 2],  keepdim=True))

        return (input_mean_norm / (var + eps)) * self.gamma + self.beta
        
        # TODO: Implement the required transformation

In [6]:
# Tests the batch normalization implementation
torch.random.manual_seed(42)
test = torch.randn(8, 2, 4)

b1 = BatchNorm(2)
test_b1 = b1(test)

b2 = nn.BatchNorm1d(2, affine=False, track_running_stats=False)
test_b2 = b2(test)

print(test_b1)
print("-----")
print(test_b2)

print(torch.allclose(test_b1, test_b2, rtol=0.02))
assert torch.allclose(test_b1, test_b2, rtol=0.02)

tensor([[[ 1.6380,  1.2470,  0.7253, -1.9484],
         [ 0.6971, -1.2133, -0.0234, -1.5829]],

        [[-0.7447,  1.3905, -0.4249, -1.3242],
         [-0.7073, -0.5391, -0.7482,  0.7810]],

        [[ 1.3849, -0.2178, -0.5182,  0.3152],
         [-0.7375,  1.0964,  0.8193,  1.6979]],

        [[ 1.0618,  1.0772,  0.4671,  1.1113],
         [-0.2117,  0.0613, -0.2317,  0.8782]],

        [[-1.3073, -0.8507, -0.2745,  1.4516],
         [ 0.3380, -0.4044,  0.3249, -0.7540]],

        [[-1.4611,  0.8097, -0.8583, -0.6105],
         [-1.2529,  2.1395, -1.2134, -0.4677]],

        [[-0.8886, -0.6611, -0.0064,  0.3918],
         [-0.4678,  1.2093, -0.7933, -0.7154]],

        [[-1.3238, -0.0438, -0.1323,  0.5251],
         [-0.0781,  1.8616, -1.1634,  1.4012]]], grad_fn=<AddBackward0>)
-----
tensor([[[ 1.6642,  1.2669,  0.7369, -1.9796],
         [ 0.7082, -1.2327, -0.0238, -1.6083]],

        [[-0.7567,  1.4128, -0.4317, -1.3454],
         [-0.7186, -0.5477, -0.7602,  0.7935]],

        [[

# 3. ResNet
ResNet is the model that first introduced residual connections (a form of skip connections). It is a rather simple, but successful and very popular architecture. In this part of the exercise we will re-implement it step by step.

Note that there is also an [improved version of ResNet](https://arxiv.org/abs/1603.05027) with optimized residual blocks. Here we will implement the [original version](https://arxiv.org/abs/1512.03385) for CIFAR-10.

This is just a convenience function to make e.g. `nn.Sequential` more flexible. It is e.g. useful in combination with `x.squeeze()`.

In [7]:
class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        return self.func(x)

We begin by implementing the residual blocks. The block is illustrated by this sketch:

![Residual connection](attachment:residual_connection.png)

Note that we use 'SAME' padding, no bias, and batch normalization after each convolution.

In [8]:
class ResidualBlock(nn.Module):
    """
    The residual block used by ResNet.
    
    Args:
        in_channels: The number of channels (feature maps) of the incoming embedding
        out_channels: The number of channels after the first convolution
        stride: Stride size of the first convolution, used for downsampling
    """
    
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()        
        if stride > 1 or in_channels != out_channels:
            # Add strides in the skip connection and zeros for the new channels.
            self.skip = Lambda(lambda x: F.pad(x[:, :, ::stride, ::stride],
                                               (0, 0, 0, 0, 0, out_channels - in_channels),
                                               mode="constant", value=0))
        else:
            self.skip = nn.Sequential()
            
        # TODO: Initialize the required layers
        #self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding="SAME", bias=False, stride=stride)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False, stride=stride, groups=1)
        self.bn1 = nn.BatchNorm2d(out_channels) #BatchNorm(out_channels)
        #self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding="SAME", bias=False, stride=stride) 
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False, stride=1, groups=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, input):
        # TODO: Execute the required layers and functions
        #print("input.shape:", input.shape)
        identity = self.skip(input)
        #print("identity.shape:", identity.shape)

        out = self.conv1(input)
        out = self.bn1(out)
        out = self.relu(out)
        #print("out1.shape:", out.shape)

        out = self.conv2(out)
        out = self.bn2(out)

        #print("out2.shape:", out.shape)
        out += identity
        out = self.relu(out)

        #print("out.shape:", out.shape)
        #print("")
        return out

Next we implement a stack of residual blocks for convenience. The first layer in the block is the one changing the number of channels and downsampling. You can use `nn.ModuleList` to use a list of child modules.

In [9]:
class ResidualStack(nn.Module):
    """
    A stack of residual blocks.
    
    Args:
        in_channels: The number of channels (feature maps) of the incoming embedding
        out_channels: The number of channels after the first layer
        stride: Stride size of the first layer, used for downsampling
        num_blocks: Number of residual blocks
    """
    
    def __init__(self, in_channels, out_channels, stride, num_blocks):
        super().__init__()
        
        # TODO: Initialize the required layers (blocks)
        self.layers = nn.ModuleList([ResidualBlock(in_channels, out_channels, stride)])
        for i in range(1, num_blocks):
          self.layers.append(ResidualBlock(out_channels, out_channels, 1))
        
    def forward(self, input):
        # TODO: Execute the layers (blocks)
        #print("Execute " + str(len(self.layers)) + " ResidualBlock layers")
        for layer in self.layers:
          input = layer(input)
        return input

Now we are finally ready to implement the full model! To do this, use the `nn.Sequential` API and carefully read the following paragraph from the paper (Fig. 3 is not important):

![ResNet CIFAR10 description](attachment:resnet_cifar10_description.png)

Note that a convolution layer is always convolution + batch norm + activation (ReLU), that each ResidualBlock contains 2 layers, and that you might have to `squeeze` the embedding before the dense (fully-connected) layer.

In [10]:
n = 5
num_classes = 10

class Squeeze(torch.nn.Module):
    def forward(self, input):
      out = input.squeeze()
      return out

# TODO: Implement ResNet via nn.Sequential
resnet = nn.Sequential(
            ResidualStack(3, 16, 1, 1),
            ResidualStack(16, 16, 1, 2 * n),
            ResidualStack(16, 32, 2, 2 * n),
            ResidualStack(32, 64, 2, 2 * n),
            nn.AdaptiveAvgPool2d((1,1)),
            #nn.AdaptiveAvgPool2d(64),
            Squeeze(),
            nn.Linear(64, num_classes)
          )

Next we need to initialize the weights of our model.

In [11]:
def initialize_weight(module):
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
    elif isinstance(module, nn.BatchNorm2d):
        nn.init.constant_(module.weight, 1)
        nn.init.constant_(module.bias, 0)
        
resnet.apply(initialize_weight);

# 4. Training
So now we have a shiny new model, but that doesn't really help when we can't train it. So that's what we do next.

First we need to load the data. Note that we split the official training data into train and validation sets, because you must not look at the test set until you are completely done developing your model and report the final results. Some people don't do this properly, but you should not copy other people's bad habits.

In [12]:
class CIFAR10Subset(torchvision.datasets.CIFAR10):
    """
    Get a subset of the CIFAR10 dataset, according to the passed indices.
    """
    def __init__(self, *args, idx=None, **kwargs):
        super().__init__(*args, **kwargs)
        
        if idx is None:
            return
        
        self.data = self.data[idx]
        targets_np = np.array(self.targets)
        self.targets = targets_np[idx].tolist()

We next define transformations that change the images into PyTorch tensors, standardize the values according to the precomputed mean and standard deviation, and provide data augmentation for the training set.

In [13]:
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, 4),
    transforms.ToTensor(),
    normalize,
])
transform_eval = transforms.Compose([
    transforms.ToTensor(),
    normalize
])

In [14]:
ntrain = 45_000
train_set = CIFAR10Subset(root='./data', train=True, idx=range(ntrain),
                          download=True, transform=transform_train)
val_set = CIFAR10Subset(root='./data', train=True, idx=range(ntrain, 50_000),
                        download=True, transform=transform_eval)
test_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=transform_eval)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [15]:
dataloaders = {}
dataloaders['train'] = torch.utils.data.DataLoader(train_set, batch_size=128,
                                                   shuffle=True, num_workers=0,
                                                   pin_memory=True)
dataloaders['val'] = torch.utils.data.DataLoader(val_set, batch_size=128,
                                                 shuffle=False, num_workers=0,
                                                 pin_memory=True)
dataloaders['test'] = torch.utils.data.DataLoader(test_set, batch_size=128,
                                                  shuffle=False, num_workers=0,
                                                  pin_memory=True)

Next we push the model to our GPU (if there is one).

In [22]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(torch.cuda.is_available())
resnet.to(device);

Next we define a helper method that does one epoch of training or evaluation. We have only defined training here, so you need to implement the necessary changes for evaluation!

In [23]:
def run_epoch(model, optimizer, dataloader, train):
    """
    Run one epoch of training or evaluation.
    
    Args:
        model: The model used for prediction
        optimizer: Optimization algorithm for the model
        dataloader: Dataloader providing the data to run our model on
        train: Whether this epoch is used for training or evaluation
        
    Returns:
        Loss and accuracy in this epoch.
    """
    # TODO: Change the necessary parts to work correctly during evaluation (train=False)
    
    device = next(model.parameters()).device
    
    # Set model to training mode (for e.g. batch normalization, dropout)
    model.train()

    epoch_loss = 0.0
    epoch_acc = 0.0

    # Iterate over data
    for xb, yb in dataloader:
        xb, yb = xb.to(device), yb.to(device)
        
        # zero the parameter gradients
        if train != False:
            optimizer.zero_grad()

        # forward
        with torch.set_grad_enabled(True):
            pred = model(xb)
            loss = F.cross_entropy(pred, yb)
            top1 = torch.argmax(pred, dim=1)
            ncorrect = torch.sum(top1 == yb)

            if train != False:
                loss.backward()
                optimizer.step()

        # statistics
        epoch_loss += loss.item()
        epoch_acc += ncorrect.item()
    
    epoch_loss /= len(dataloader.dataset)
    epoch_acc /= len(dataloader.dataset)
    return epoch_loss, epoch_acc

Next we implement a method for fitting (training) our model. For many models early stopping can save a lot of training time. Your task is to add early stopping to the loop (based on validation accuracy)! And don't forget to save the best model parameters according to validation accuracy. You will need `copy.deepcopy` and the `state_dict` for this.

In [24]:
def fit(model, optimizer, lr_scheduler, dataloaders, max_epochs, patience):
    """
    Fit the given model on the dataset.
    
    Args:
        model: The model used for prediction
        optimizer: Optimization algorithm for the model
        lr_scheduler: Learning rate scheduler that improves training
                      in late epochs with learning rate decay
        dataloaders: Dataloaders for training and validation
        max_epochs: Maximum number of epochs for training
        patience: Number of epochs to wait with early stopping the
                  training if validation loss has decreased
                  
    Returns:
        Loss and accuracy in this epoch.
    """
    
    best_acc = 0
    curr_patience = 0
    best_model_weights = None
    for epoch in range(max_epochs):
        train_loss, train_acc = run_epoch(model, optimizer, dataloaders['train'], train=True)
        lr_scheduler.step()
        print(f"Epoch {epoch + 1: >3}/{max_epochs}, train loss: {train_loss:.2e}, accuracy: {train_acc * 100:.2f}%")
        
        val_loss, val_acc = run_epoch(model, None, dataloaders['val'], train=False)
        print(f"Epoch {epoch + 1: >3}/{max_epochs}, val loss: {val_loss:.2e}, accuracy: {val_acc * 100:.2f}%")
        
        # TODO: Add early stopping and save the best weights (in best_model_weights)
        #best_model_weights = copy.deepcopy(model.load_state_dict())
    
    model.load_state_dict(best_model_weights)

In most cases you should just use the Adam optimizer for training, because it works well out of the box. However, a well-tuned SGD (with momentum) will in most cases outperform Adam. And since the original paper gives us a well-tuned SGD we will just use that.

In [None]:
optimizer = torch.optim.SGD(resnet.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)

# Fit model
fit(resnet, optimizer, lr_scheduler, dataloaders, max_epochs=200, patience=50)

Once the model is trained we run it on the test set to obtain our final accuracy.
Note that we can only look at the test set once, everything else would lead to overfitting. So you _must_ ignore the test set while developing your model!

In [None]:
test_loss, test_acc = run_epoch(resnet, None, dataloaders['test'], train=False)
print(f"Test loss: {test_loss:.1e}, accuracy: {test_acc * 100:.2f}%")

That's almost what was reported in the paper (92.49%) and we didn't even train on the full training set.

# Optional task: Squeeze out all the juice!

Can you do even better? Have a look at [A Recipe for Training Neural Networks](https://karpathy.github.io/2019/04/25/recipe/) and at the [EfficientNet architecture](https://ai.googleblog.com/2019/05/efficientnet-improving-accuracy-and.html) we discussed in the lecture. Play around with the possibilities PyTorch offers you and see how close you can get to the [state of the art on CIFAR-10](https://paperswithcode.com/sota/image-classification-on-cifar-10).

Hint: You can use [Google Colab](https://colab.research.google.com/) to access some free GPUs for your experiments.