In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.optim

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [2]:
cuda0 = torch.device('cuda:0')

# 1 Introduction

<span style="color:green;font-weight:bold">
Comment: equal to the sample solution. A plot to visualize the training progress would be nice (also for the following tasks) but this is also missing in the sample solution.
</span>

In [3]:
mb_size = 100 # mini-batch size of 100


trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Normalize((0.5,), (0.5,))])


dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)

test_dataset = dset.MNIST("./", download=True,
                          train=False,
                          transform = trans)


dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=mb_size,
                                          shuffle=True, num_workers=1,
                                          pin_memory=True)

In [4]:
def init_weights(shape):
    # Kaiming He initialization (a good initialization is important)
    # https://arxiv.org/abs/1502.01852
    std = np.sqrt(2. / shape[0])
    w = torch.randn(size=shape, device=cuda0) * std
    w.requires_grad = True

    return w.to(cuda0)


def rectify(X):
    return torch.max(torch.zeros_like(X, device=cuda0), X).to(cuda0)


# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.5, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(grad, grad, value=1-alpha)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(grad, avg, value=-group['lr'])


def model(X, w_h, w_h2, w_o):
    h = rectify(X @ w_h)
    h2 = rectify(h @ w_h2)
    pre_softmax = h2 @ w_o
    return pre_softmax


In [5]:
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_h, w_h2, w_o])


# put this into a training loop over 100 epochs
for i in range(101):
    print("Epoch: {}".format(i+1))
    avg_train_loss = 0.
    for (j, (X, y)) in enumerate(dataloader):
        noise_py_x = model(X.reshape(mb_size, 784).to(cuda0), w_h, w_h2, w_o).to(cuda0)
        optimizer.zero_grad()
        # the cross-entropy loss function already contains the softmax
        cost = torch.nn.functional.cross_entropy(noise_py_x, y.to(cuda0), reduction="mean").to(cuda0)
        avg_train_loss += cost
        cost.backward()
        optimizer.step()

    if i % 10 == 0:
        print("Average Train Loss: {}".format(avg_train_loss / (j + 1)))

        # no need to calculate gradients for validation
        with torch.no_grad():
            avg_test_loss = 0.
            for (k, (X, y)) in enumerate(test_loader):
                noise_py_x = model(X.reshape(mb_size, 784).to(cuda0), w_h, w_h2, w_o).to(cuda0)
                cost = torch.nn.functional.cross_entropy(noise_py_x, y.to(cuda0), reduction="mean").to(cuda0)
                avg_test_loss += cost

            print("Average Test Loss: {}".format(avg_test_loss / (k + 1)))

Epoch: 1
Average Train Loss: 0.40141400694847107
Average Test Loss: 0.2639123797416687
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Average Train Loss: 0.15982545912265778
Average Test Loss: 0.23186850547790527
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Average Train Loss: 0.10280334204435349
Average Test Loss: 0.3892219364643097
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Average Train Loss: 0.08715856820344925
Average Test Loss: 0.616568386554718
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Average Train Loss: 0.05260054022073746
Average Test Loss: 0.6245554089546204
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Average Train Loss: 0.0411297045648098
Average Test Loss: 0.7247546911239624
Epoch: 52
Epoch: 53
Epoch: 54
Ep

# 2 Dropout

<span style="color:green;font-weight:bold">
Comment: This is a nice way to calculate the dropout, since you don't have to use numpy as in the sample solution. The answers to the questions seem good and correct. The rest is essentially equal to the sample solution.
</span>

In [6]:
from scipy.stats import binom

def dropout(X, p_drop= 0.5):
    
    if 0 < p_drop < 1:
        binomial = torch.distributions.binomial.Binomial(probs=1-p_drop)
        return X * binomial.sample(X.size()).to(cuda0) * (1.0/(1-p_drop))
    else:
        return X
    
def dropout_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    
    h = rectify(dropout(X, p_drop_input) @ w_h)
    h2 = rectify(dropout(h, p_drop_hidden) @ w_h2)
    pre_softmax = dropout(h2, p_drop_hidden)  @ w_o
    return pre_softmax

Dropout increases robustness and reduces overfitting due to the network always training on a random subset of neurons. This reduces the potential overreliance of the network on only a couple single neurons. Without dropout we often see many co-adaptions in networks, meaining that some neurons cancel out the mistakes of others. This in general does to generalize well; dropout prevents this as the network can not rely on all neurons being present at all the time, significantly reducing the amount of co-adaption.

For the test-set we obviously do not want to use the dropout model, as we want to test the model on all the activations. At the same time we need to keep in mind that the network activations are now much larger that the networks was used to during training (since we are using all the neuros at once now. A simple fix is to downscale the weights after training by multiplying them with 1-p_drop.
Here we realize this by including the scaling directly in the dropout training model (inverted dropout), allowing us to use the simple model during evaluation.

In [7]:
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_h, w_h2, w_o])
p_drop = 0.5

# put this into a training loop over 100 epochs
for i in range(101):
    print("Epoch: {}".format(i+1))
    avg_train_loss = 0.
    for (j, (X, y)) in enumerate(dataloader):
        noise_py_x = dropout_model(X.reshape(mb_size, 784).to(cuda0), w_h, w_h2, w_o, p_drop, p_drop)
        optimizer.zero_grad()
        # the cross-entropy loss function already contains the softmax
        cost = torch.nn.functional.cross_entropy(noise_py_x, y.to(cuda0), reduction="mean")
        avg_train_loss += cost
        cost.backward()
        optimizer.step()

    if i % 10 == 0:
        print("Average Train Loss: {}".format(avg_train_loss / (j + 1)))

        # no need to calculate gradients for validation
        with torch.no_grad():
            avg_test_loss = 0.
            for (k, (X, y)) in enumerate(test_loader):
                noise_py_x = model(X.reshape(mb_size, 784).to(cuda0), w_h, w_h2, w_o)
                cost = torch.nn.functional.cross_entropy(noise_py_x, y.to(cuda0), reduction="mean")
                avg_test_loss += cost

            print("Average Test Loss: {}".format(avg_test_loss / (k + 1)))

Epoch: 1
Average Train Loss: 1.0107272863388062
Average Test Loss: 0.32930248975753784
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Average Train Loss: 0.9989118576049805
Average Test Loss: 0.30084410309791565
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Average Train Loss: 1.2981696128845215
Average Test Loss: 0.3829096257686615
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Average Train Loss: 1.5067752599716187
Average Test Loss: 0.4098247289657593
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Average Train Loss: 1.7145969867706299
Average Test Loss: 0.5594747066497803
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Average Train Loss: 1.853873372077942
Average Test Loss: 0.6022546291351318
Epoch: 52
Epoch: 53
Epoch: 54
Epoch:

As expected the test loss with dropout is smaller.

# Parametric Relu

<span style="color:green;font-weight:bold">
Comment: You use the same mask three times, which could be cached for efficiency. Also you reassign values from the input which, according to the sample solution, gives an error. Otherwise, 'PRelu' is implemented differently but gives the same results.
<br>
In the 'prelu_model' you didn't include dropout, but this wasn't required. This could be the reason though, that your PRelu-model overfits 
</span>

In [8]:
def PRelu(X, a):
    X[X <= 0] = X[X <= 0] * a.repeat(X.shape[0], 1)[X <= 0]
    return X


def prelu_model(X, w_h, w_h2, w_o, a):
    h = PRelu(X @ w_h, a[0])
    h2 = PRelu(h @ w_h2, a[1])
    pre_softmax = h2 @ w_o
    return pre_softmax

In [9]:
a = init_weights((2, 625))
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_h, w_h2, w_o, a])


# put this into a training loop over 100 epochs
for i in range(101):
    print("Epoch: {}".format(i+1))
    avg_train_loss = 0.
    for (j, (X, y)) in enumerate(dataloader):
        noise_py_x = prelu_model(X.reshape(mb_size, 784).to(cuda0), w_h, w_h2, w_o, a)
        optimizer.zero_grad()
        # the cross-entropy loss function already contains the softmax
        cost = torch.nn.functional.cross_entropy(noise_py_x, y.to(cuda0), reduction="mean")
        avg_train_loss += cost
        cost.backward()
        optimizer.step()

    if i % 10 == 0:
        print("Average Train Loss: {}".format(avg_train_loss / (j + 1)))

        # no need to calculate gradients for validation
        with torch.no_grad():
            avg_test_loss = 0.
            for (k, (X, y)) in enumerate(test_loader):
                noise_py_x = prelu_model(X.reshape(mb_size, 784).to(cuda0), w_h, w_h2, w_o, a)
                cost = torch.nn.functional.cross_entropy(noise_py_x, y.to(cuda0), reduction="mean")
                avg_test_loss += cost

            print("Average Test Loss: {}".format(avg_test_loss / (k + 1)))

Epoch: 1
Average Train Loss: 0.8047875761985779
Average Test Loss: 0.4409887194633484
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Average Train Loss: 0.16160310804843903
Average Test Loss: 0.3577435612678528
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Average Train Loss: 0.13123373687267303
Average Test Loss: 0.30757373571395874
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Average Train Loss: 0.10430516302585602
Average Test Loss: 0.5955327749252319
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Average Train Loss: 0.08805377781391144
Average Test Loss: 0.40722155570983887
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Average Train Loss: 0.07319553196430206
Average Test Loss: 0.7236766815185547
Epoch: 52
Epoch: 53
Epoch: 54


The resulting loss is smaller than the plain ReLu, but looks like of an improvement than the dropout method. But one would need to gather some statistic with more training runs.


# 4 Convolutional layers

<span style="color:green;font-weight:bold">
Comment: The convolutional network is correct, but most of task 4.2 is missing, i.e. the sketch of the cnn, the plots and the second part of 4.2
</span>

In [10]:
def convolutional_network(X, w_c1, w_c2, w_c3, w_h2, w_o, p_drop_input, p_drop_hidden):

    X = X.reshape(-1, 1, 28, 28)
    
    con_l1 = convolutional_layer(X, w_c1, p_drop_input)
    con_l2 = convolutional_layer(con_l1, w_c2, p_drop_input)
    con_l3 = convolutional_layer(con_l2, w_c3, p_drop_input)
    
    con_l3 = con_l3.reshape(con_l3.shape[0], -1)

    h2 = rectify(dropout(con_l3, p_drop_hidden) @ w_h2)
    pre_softmax = dropout(h2, p_drop_hidden)  @ w_o
    
    return pre_softmax

def convolutional_layer(input_layer, weights, p_drop):
    
    con_layer = rectify(conv2d(input_layer, weights))
    subsampling_layer = max_pool2d(con_layer, (2, 2))
    out_layer = dropout(subsampling_layer, p_drop)
    
    return out_layer

In [11]:
w_c1 = init_weights((32, 1, 5, 5))
w_c2 = init_weights((64, 32, 5, 5))
w_c3 = init_weights((128, 64, 2, 2))

number_of_output_pixel = 128

w_h2 = init_weights((number_of_output_pixel, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_c1, w_c2, w_c3, w_h2, w_o])
p_drop = 0.5

In [12]:
# put this into a training loop over 100 epochs
for i in range(101):
    print("Epoch: {}".format(i+1))
    avg_train_loss = 0.
    for (j, (X, y)) in enumerate(dataloader):
      
        noise_py_x = convolutional_network(X.reshape(mb_size, 784).to(cuda0), w_c1, w_c2, w_c3, w_h2, w_o, p_drop, p_drop)
        optimizer.zero_grad()
        # the cross-entropy loss function already contains the softmax
        cost = torch.nn.functional.cross_entropy(noise_py_x.to(cuda0), y.to(cuda0), reduction="mean")
        avg_train_loss += cost
        cost.backward()
        optimizer.step()

    if i % 10 == 0:
        print("Average Train Loss: {}".format(avg_train_loss / (j + 1)))

        # no need to calculate gradients for validation
        with torch.no_grad():
            avg_test_loss = 0.
            for (k, (X, y)) in enumerate(test_loader):
                noise_py_x = convolutional_network(X.reshape(mb_size, 784).to(cuda0), w_c1, w_c2, w_c3, w_h2, w_o, p_drop, p_drop)
                cost = torch.nn.functional.cross_entropy(noise_py_x, y.to(cuda0), reduction="mean")
                avg_test_loss += cost

            print("Average Test Loss: {}".format(avg_test_loss / (k + 1)))

Epoch: 1
Average Train Loss: 6.98406982421875
Average Test Loss: 2.3138933181762695
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Average Train Loss: 2.3272807598114014
Average Test Loss: 2.3050177097320557
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Average Train Loss: 2.3267812728881836
Average Test Loss: 2.370023727416992
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Average Train Loss: 2.331801414489746
Average Test Loss: 2.3651843070983887
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Average Train Loss: 2.3353028297424316
Average Test Loss: 2.2922468185424805
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Average Train Loss: 2.343891143798828
Average Test Loss: 2.3128747940063477
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 55
Ep

The implementation seems to be correct, but we had some troubles succesfully training the network. Even using a GPU (google collab), the loss did not seem to reduce enough. The training took extremely long compared to before which made it difficult to identify the mistakes.