In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

import numpy as np
import matplotlib.pyplot as plt

# Exercise 1: Convolutional Neural Networks


In this exercise, you will train a simple CNN to classify images from the CIFAR10 dataset.


1. Download the CIFAR10 dataset using `torchvision.datasets.CIFAR10`, and build the train and test dataloaders, setting the batch size to 32 and activating reshuffling at each epoch for the train data by setting `shuffle=True`. Visualize some images and their different color channels.

In [None]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

training_data = torchvision.datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform
)

test_data = ### YOUR CODE ###

train_dataloader = ### YOUR CODE ###
test_dataloader = ### YOUR CODE ###

In [None]:
# Human-readable CIFAR10 class names, indexed by the integer label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

def imshow(img):
    """Display an image tensor that was normalized with mean 0.5 and std 0.5.

    Args:
        img: Channel-first tensor (channels, height, width), e.g. a single
            image or a grid produced by `torchvision.utils.make_grid`.
    """
    # Undo the normalization: x * 0.5 + 0.5
    unnormalized = 0.5 * img + 0.5
    # matplotlib expects channels last: (height, width, channels)
    channels_last = unnormalized.permute(1, 2, 0).numpy()
    plt.imshow(channels_last)
    plt.show()

# Pull one mini-batch from the training loader.
dataiter = iter(train_dataloader)
images, labels = next(dataiter)

# Show the whole batch as a single image grid.
imshow(torchvision.utils.make_grid(images))
# Class name of every image in the batch. Iterating over `labels` directly
# keeps this independent of a separately defined batch-size variable and
# always matches the number of images actually returned.
print([classes[label] for label in labels])

In [None]:
# Shape of the sampled batch; presumably (batch, channels, height, width) — verify against the output.
images.shape

In [None]:
# Display the first image of the sampled batch.
imshow(images[0])

In [None]:
# Work on a copy so the batch itself is left untouched, then blank
# channels 1 and 2 so that only channel 0 (red) is left.
im_red = images[0].clone()
im_red[1:3].zero_()
imshow(im_red)  # Red channel of the image

2. Define a function returning a convolutional neural network built with `nn.Sequential`. Use a first layer of 6 convolutional channels with filter size 5, a max-pooling layer over a $2 \times 2$ window, a second convolutional layer made of 16 channels with filter size 5, another $2 \times 2$ max-pooling layer, two dense layers with 120 and 84 neurons respectively, and a final linear layer with 10 outputs.

In [None]:
def initialize_cnn():
    """Build and return the CNN used in the exercises as an `nn.Sequential`.

    Architecture required by the task statement:
      * convolution with 6 output channels, filter size 5
      * 2 x 2 max-pooling
      * convolution with 16 output channels, filter size 5
      * 2 x 2 max-pooling
      * dense layers with 120 and 84 neurons
      * final linear layer with 10 outputs

    NOTE: the feature-map cell in Exercise 3 reads nodes '0', '1' and '2' as
    the first convolution, its ReLU, and the first pooling layer, so the
    layers are presumably ordered conv -> ReLU -> pool — confirm when
    implementing.
    """

    ### YOUR CODE ###

3. Using the cross-entropy loss and SGD with learning rate 0.01, train the model for 5 epochs. After training, compute the accuracy on the test set.

In [None]:
# Build the network; the training loop and the test-accuracy computation go below.
model = initialize_cnn()

### YOUR CODE ###

# Exercise 2: Momentum

1. Making an analogy with a physical system, we can think of the negative gradient as a force moving a particle through parameter space, following Newton’s laws. Adding a momentum or inertia term, the optimization algorithm remembers the directions of the past gradients and continues to move in their direction. Mathematically,
\begin{equation}
    v_t = \gamma v_{t-1} + \eta \nabla_\theta L(\theta_t)
\end{equation}
\begin{equation}
    \theta_{t+1} = \theta_t - v_t,
\end{equation}
where $\gamma \in [0,1]$ is the momentum parameter, $\eta$ the learning rate, and $\theta$ the parameters of the model. Momentum helps the optimization dynamics gain speed in directions with persistent small gradients and suppresses oscillations. Repeat training, adding `momentum=0.9` to the SGD dynamics.

In [None]:
# Rebinds `model` to a new initialize_cnn() result for the momentum run. When
# the notebook is run top to bottom, the feature-map cells in Exercise 3
# therefore operate on this model.
model = initialize_cnn()

### YOUR CODE ###

# Exercise 3: Feature Maps

1. Using `torch.fx`, we can visualize the transformations of an input inside our neural network. For different input images, check the outputs of the first convolutional layer, of the first ReLU application, and of the first pooling layer.


In [None]:
from torchvision.models.feature_extraction import (
    create_feature_extractor,
    get_graph_node_names,
)

# Only the first returned list of node names is needed here.
nodes, _ = get_graph_node_names(model)
print(nodes) # Prints the nn.Sequential layer names

# Outputs of first conv. layer, ReLU, and first pooling layer
feature_extractor = create_feature_extractor(model, return_nodes=['0', '1', '2'])

['input', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']


In [None]:
dataloader = DataLoader(training_data, batch_size=batch_size)

image = ### YOUR CODE (Select one CIFAR Image) ###

out = feature_extractor(image.unsqueeze(0)) # Return dictionary with the feature maps of 'image'

imshow(image)

print('conv1')
for i in range(6):
    plt.imshow(out['0'][0,i].detach())
    plt.show()

print('relu')
for i in range(6):
    plt.imshow(out['1'][0,i].detach())
    plt.show()

print('pool')
for i in range(6):
    plt.imshow(out['2'][0,i].detach())
    plt.show()

2. Can we look at what a particular neuron reacts to? What are the features learned by deep models? A simple idea to visualize these features, called activation maximization, consists of looking for the unit-norm input that maximizes the activation of a given neuron ($x^* = \arg \max_{x: \; \|x\|=1} h_i^{\ell}(x,\theta^*)$, where $h_i^\ell$ is the activation of neuron $i$ at layer $\ell$ of a trained network). Open https://distill.pub/2017/feature-visualization/appendix and check how the neurons in different layers of the GoogLeNet network specialize to recognize features of varying complexity, from simple textures to meaningful semantic concepts!