# Exercise 4

## Import packages

In [3]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

## Task 1
(3 points)

Implement the training loop for one training epoch.
An epoch trains on the whole training dataset once.

In [43]:
def train(model, use_cuda, train_loader, optimizer, epoch, log_interval):
    """
    Train one epoch
    
    model -- the neural network
    use_cuda -- true if GPU should be used
    train_loader -- data loader
    optimizer -- network optimizer
    epoch -- number of current epoch
    log_interval -- number of training steps between logs
    """
    # TODO: set the model to train mode
    device = torch.device("cuda")
    model.to(device)
    model.train()
    
    # TODO: enumerate over the dataloader to get mini batches
    #       of images and ground truth labels
    # HINT: the builtin python function enumerate() also gives you indices
    for i,(item, groundtruth) in enumerate(train_loader):
        
        # TODO: set the optimizers gradients to zero
        model.zero_grad()
        print(item)
        
        # TODO: run the network
        prediction = model(item)
        
        
        # TODO: compute negative log likelihood loss
        loss = model.NLLLoss(prediction, groundtruth)
        print(loss)
        
        # TODO: do backpropagation
        loss.backward()
       
        
        # TODO: optimize
        optimizer.step()
       
        
        # TODO: print current loss for every nth ("log_interval"th) iteration
        if i % log_interval:
            print("Current Loss:", loss, "Cycle:", i)


        

We already implemented the validation function for you (this is essentially validate() from the last exercise)

In [5]:
def validate(model, use_cuda, test_loader):
    """
    Compute test metrics
    
    model -- the neural network
    use_cuda -- true if GPU should be used
    test_loader -- data loader
    """
    # create a 10x10 grid of subplots
    _, axis = plt.subplots(10, 10)
    
    # set model to evaluation mode
    model.eval()
    test_loss = 0
    correct = 0
    plotted = 0
    
    # disable gradients globally
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            # for each batch
            if use_cuda:
                # transfer to GPU
                data = data.cuda()
                target = target.cuda()
            
            # run network and compute metrics
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            
            img_correct = pred.eq(target.view_as(pred))
            correct += pred.eq(target.view_as(pred)).sum().item()
            
            # plot the first 100 images
            img_idx = 0
            data = data.cpu().numpy()
            
            while plotted < 100 and img_idx < data.shape[0]:
                # compute position of ith image in the grid
                y = plotted % 10
                x = plotted // 10
                
                # convert image tensor to numpy array and normalize to [0, 1]
                img = data[img_idx, 0]
                img = (img - np.min(img)) / (np.max(img) - np.min(img))
                
                # make wrongly predicted images red
                img = np.stack([img] * 3, 2)
                if img_correct[img_idx] == 0:
                    img[:, :, 1:] = 0.0
                
                # disable axis and show image
                axis[y][x].axis('off')
                axis[y][x].imshow(img)
                
                # show the predicted class next to each image
                axis[y][x].text(30, 25, pred[img_idx].item())
                
                plotted += 1
                img_idx += 1
            
    test_loss /= len(test_loader.dataset)

    # show results
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    plt.show()

## Task 2
(4 points)

Implement a five-layer fully connected neural network.
The dimensions (without batch size) should change like this: 784->200->100->60->30->10
Use log softmax to compute the class predictions.

Run the code at the end of the notebook to train and validate your implementation.

### Task 2.1
* sigmoid non-linear activation function
* note that the last layer does not need an activation function!

### Task 2.2
* add a new class "FCNet2"
* replace sigmoid with ReLU

### Task 2.3
* add a new class "FCNet2"
* add batch normalization to the first and third layers (note the difference between 1D/2D/3D versions)


**NOTE:** The perfomance should improve slightly with each step. However, due to the random weight initialization applied by PyTorch, your results may vary a bit between trainings.

In [6]:
class FCNet1(nn.Module):
    """
    Fully Connected Neural Network
    
    Five fully connected layers with sigmoid non-linearity
    
    Dimensions
    784->200->100->60->30->10
    """
    def __init__(self):
        super(FCNet1, self).__init__()
        #Check for hardware acceleration
        device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
        print(f"=== Using {device} device ===")
        
        # TODO: initialize network layers
        # HINT: take a look at "torch.nn" (imported as "nn")

        layers = [784, 200, 100, 60, 30, 10]

        self.network = nn.Sequential(
            nn.Linear(layers[0], layers[1]),
            nn.Sigmoid(),
            nn.Linear(layers[1], layers[2]),
            nn.Sigmoid(),
            nn.Linear(layers[2], layers[3]),
            nn.Sigmoid(),
            nn.Linear(layers[3], layers[4]),
            nn.Sigmoid(),
            nn.Linear(layers[4], layers[5])
        )

       

    def forward(self, x):
        # TODO: reshape batch of images to batch of 1D vectors
        input = torch.flatten(x)
        
        # TODO: run network layers
        output = self.network(input)
        
        # TODO: compute log softmax over the output
        # HINT: take a look at "torch.nn.functional" (imported as "F")
        output = F.softmax(output)
        
        return output
model = FCNet1()
data_rnd = np.abs(torch.randn(28, 28))
#print(data_rnd)
model(data_rnd)


=== Using cuda device ===


  output = F.softmax(output)


tensor([0.1319, 0.0659, 0.0920, 0.0769, 0.1042, 0.1422, 0.1158, 0.0859, 0.1119,
        0.0734], grad_fn=<SoftmaxBackward0>)

In [15]:
class FCNet2(nn.Module):
    def __init__(self):
        super(FCNet2, self).__init__()

        #Check for hardware acceleration
        device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
        print(f"=== Using {device} device ===")


        layers = [784, 200, 100, 60, 30, 10]

        self.network = nn.Sequential(
            nn.Linear(layers[0], layers[1]),
            nn.BatchNorm1d(layers[1]),
            nn.ReLU(),
            nn.Linear(layers[1], layers[2]),
            nn.ReLU(),
            nn.Linear(layers[2], layers[3]),
            nn.BatchNorm1d(layers[3]),
            nn.ReLU(),
            nn.Linear(layers[3], layers[4]),
            nn.ReLU(),
            nn.Linear(layers[4], layers[5])
        )



    def forward(self, x):
        # TODO: reshape batch of images to batch of 1D vectors
        input = torch.flatten(x)
        
        # TODO: run network layers
        output = self.network(input)
        
        # TODO: compute log softmax over the output
        # HINT: take a look at "torch.nn.functional" (imported as "F")
        output = F.softmax(output)
        
        return output

model = FCNet1()
data_rnd = np.abs(torch.randn(28, 28))
#print(data_rnd)
model(data_rnd)

=== Using cuda device ===


  output = F.softmax(output)


tensor([0.1124, 0.1180, 0.0992, 0.0909, 0.0671, 0.0627, 0.1057, 0.1053, 0.1414,
        0.0973], grad_fn=<SoftmaxBackward0>)

In [None]:
class FCNet3(nn.Module):
    def __init__(self):
        super(FCNet3, self).__init__()
        
        #Check for hardware acceleration
        device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
        print(f"=== Using {device} device ===")


        layers = [784, 200, 100, 60, 30, 10]

        self.network = nn.Sequential(
            nn.Linear(layers[0], layers[1]),
            nn.ReLU(),
            nn.Linear(layers[1], layers[2]),
            nn.ReLU(),
            nn.Linear(layers[2], layers[3]),
            nn.ReLU(),
            nn.Linear(layers[3], layers[4]),
            nn.ReLU(),
            nn.Linear(layers[4], layers[5])
        )


        
    def forward(self, x):
        # TODO: reshape batch of images to batch of 1D vectors
        input = torch.flatten(x)
        
        # TODO: run network layers
        output = self.network(input)
        
        # TODO: compute log softmax over the output
        # HINT: take a look at "torch.nn.functional" (imported as "F")
        output = F.softmax(output)
        
        return output
        pass

model = FCNet1()
data_rnd = np.abs(torch.randn(28, 28))
#print(data_rnd)
model(data_rnd)

=== Using cuda device ===


  output = F.softmax(output)


tensor([0.1156, 0.0849, 0.1096, 0.0807, 0.1244, 0.0883, 0.1553, 0.1004, 0.0778,
        0.0631], grad_fn=<SoftmaxBackward0>)

## Task 3
(3 points)

Implement a convolutional neural network, consisting of two convolutional and two fully connected layers.
This time, the dimensions (without batch size) should change like this: 1x28x28->32x26x26->64x12x12->128->10

### Task 3.1
* two convolutional layers (kernel size 3)
* two fully-connected layers
* ReLU activation function

### Task 3.2
* add batch normalization to first convolutional and first fully connected layer

### Task 3.3
* use max pooling instead of stride to reduce the dimensions to 64x12x12

In [9]:
class ConvNet1(nn.Module):
    """
    Convolutional Neural Network
    
    Two convolutional layers and two fully connected layers
    
    Dimensions:
    1x28x28->32x26x26->64x12x12->128->10
    """
    def __init__(self):
        super(ConvNet1, self).__init__()
        
        # TODO: initialize network layers
        layers = [[1,28,28], [32,26,26], [64,12,12], 128, 10]

        self.conv_network = nn.Sequential(
            nn.Conv2d(layers[0][0], layers[1][0], 3, 1),
            nn.ReLU(),
            nn.Conv2d(layers[1][0], layers[2][0], 3, 2),
            nn.ReLU()
        )
        self.lin_network = nn.Sequential(
            nn.Linear(layers[2][0], layers[3]),
            nn.ReLU(),
            nn.Linear(layers[3], layers[4]),
        )
        

    def forward(self, x):
        # TODO: run convolutional layers
        output = self.conv_network(x)
        
        
        # TODO: reshape batch of images to batch of 1D vectors
        output.Flatten()
        
        
        # TODO: run fully connected layers
        output = self.lin_network(output)

       
        
        # TODO: compute log softmax over the output
        output = nn.Softmax(output)
        
        
        return output
model = ConvNet1()

In [10]:
class ConvNet2(nn.Module):
    """
    Convolutional Neural Network
    
    Two convolutional layers and two fully connected layers
    
    Dimensions:
    1x28x28->32x26x26->64x12x12->128->10
    """
    def __init__(self):
        super(ConvNet2, self).__init__()
        
        # TODO: initialize network layers
        layers = [[1,28,28], [32,26,26], [64,12,12], 128, 10]

        self.conv_network = nn.Sequential(
            nn.Conv2d(layers[0][0], layers[1][0], 3, 1),
            nn.BatchNorm2d(layers[1][0]),
            nn.ReLU(),
            nn.Conv2d(layers[1][0], layers[2][0], 3, 2),
            nn.ReLU()
        )
        self.lin_network = nn.Sequential(
            nn.Linear(layers[2][0], layers[3]),
            nn.BatchNorm2d(layers[3]),
            nn.ReLU(),
            nn.Linear(layers[3], layers[4]),
        )
        

    def forward(self, x):
        # TODO: run convolutional layers
        output = self.conv_network(x)
        
        
        # TODO: reshape batch of images to batch of 1D vectors
        output.Flatten()
        
        
        # TODO: run fully connected layers
        output = self.lin_network(output)

       
        
        # TODO: compute log softmax over the output
        output = nn.Softmax(output)
        
        
        return output
model = ConvNet2()

In [11]:
class ConvNet3(nn.Module):
    """
    Convolutional Neural Network
    
    Two convolutional layers and two fully connected layers
    
    Dimensions:
    1x28x28->32x26x26->64x12x12->128->10
    """
    def __init__(self):
        super(ConvNet3, self).__init__()
        
        # TODO: initialize network layers
        layers = [[1,28,28], [32,26,26], [64,12,12], 128, 10]

        self.conv_network = nn.Sequential(
            nn.Conv2d(layers[0][0], layers[1][0], 3, 1),
            nn.BatchNorm2d(layers[1][0]),
            nn.ReLU(),
            nn.Conv2d(layers[1][0], layers[2][0], 3, 1),
            nn.MaxPool2d(3, 2),
            nn.ReLU()
        )
        self.lin_network = nn.Sequential(
            nn.Linear(layers[2][0], layers[3]),
            nn.BatchNorm2d(layers[3]),
            nn.ReLU(),
            nn.Linear(layers[3], layers[4])
        )
        

    def forward(self, x):
        # TODO: run convolutional layers
        output = self.conv_network(x)
        
        
        # TODO: reshape batch of images to batch of 1D vectors
        output.Flatten()
        
        
        # TODO: run fully connected layers
        output = self.lin_network(output)

       
        
        # TODO: compute log softmax over the output
        output = nn.Softmax(output)
        
        
        return output
model = ConvNet3()

In [44]:
# hyper parameters
batch_size = 64
test_batch_size = 1000
epochs = 10
lr = 1.0
gamma = 0.7
log_interval = 100

# use GPU if available
use_cuda = torch.cuda.is_available()
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

# initialize data loaders
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])), batch_size=batch_size, shuffle=True, **kwargs)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, 
    transform=transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=test_batch_size, shuffle=True, **kwargs)

device = torch.device("cuda" if use_cuda else "cpu")
model = FCNet2().to(device)
if use_cuda:
    model = model.cuda()

# initialize optimizer and scheduler
optimizer = optim.Adadelta(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

for epoch in range(1, epochs + 1):
    # train one epoch
    train(model, use_cuda, train_loader, optimizer, epoch, log_interval)
    
    # run on test dataset
    validate(model, use_cuda, test_loader)
    scheduler.step()
    
    
    torch.save(model.state_dict(), "models/mnist/checkpoint.pt")

=== Using cuda device ===
tensor([[[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)