# 1 Introduction

In [2]:

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d, cross_entropy

plt.rc("figure", dpi=100)

batch_size = 100

# transform images into normalized tensors
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,))
])

train_dataset = datasets.MNIST(
    "./",
    download=True,
    train=True,
    transform=transform,
)

test_dataset = datasets.MNIST(
    "./",
    download=True,
    train=False,
    transform=transform,
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

def init_weights(shape):
    # Kaiming He initialization (a good initialization is important)
    # https://arxiv.org/abs/1502.01852
    std = np.sqrt(2. / shape[0])
    w = torch.randn(size=shape) * std
    w.requires_grad = True
    return w


def rectify(x):
    # Rectified Linear Unit (ReLU)
    return torch.max(torch.zeros_like(x), x)


class RMSprop(optim.Optimizer):
    """
    This is a reduced version of the PyTorch internal RMSprop optimizer
    It serves here as an example
    """
    def __init__(self, params, lr=1e-3, alpha=0.5, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # state initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(grad, avg, value=-group['lr'])


# define the neural network
def model(x, w_h, w_h2, w_o):
    h = rectify(x @ w_h)
    h2 = rectify(h @ w_h2)
    pre_softmax = h2 @ w_o
    return pre_softmax


# initialize weights

# input shape is (B, 784)
w_h = init_weights((784, 625))
# hidden layer with 625 neurons
w_h2 = init_weights((625, 625))
# hidden layer with 625 neurons
w_o = init_weights((625, 10))
# output shape is (B, 10)

optimizer = RMSprop(params=[w_h, w_h2, w_o])


n_epochs = 100

train_loss = []
test_loss = []

# put this into a training loop over 100 epochs
for epoch in range(n_epochs + 1):
    train_loss_this_epoch = []
    for idx, batch in enumerate(train_dataloader):
        x, y = batch

        # our model requires flattened input
        x = x.reshape(batch_size, 784)
        # feed input through model
        noise_py_x = model(x, w_h, w_h2, w_o)

        # reset the gradient
        optimizer.zero_grad()

        # the cross-entropy loss function already contains the softmax
        loss = cross_entropy(noise_py_x, y, reduction="mean")

        train_loss_this_epoch.append(float(loss))

        # compute the gradient
        loss.backward()
        # update weights
        optimizer.step()

    train_loss.append(np.mean(train_loss_this_epoch))

    # test periodically
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}")
        print(f"Mean Train Loss: {train_loss[-1]:.2e}")
        test_loss_this_epoch = []

        # no need to compute gradients for validation
        with torch.no_grad():
            for idx, batch in enumerate(test_dataloader):
                x, y = batch
                x = x.reshape(batch_size, 784)
                noise_py_x = model(x, w_h, w_h2, w_o)

                loss = cross_entropy(noise_py_x, y, reduction="mean")
                test_loss_this_epoch.append(float(loss))

        test_loss.append(np.mean(test_loss_this_epoch))

        print(f"Mean Test Loss:  {test_loss[-1]:.2e}")

plt.plot(np.arange(n_epochs + 1), train_loss, label="Train")
plt.plot(np.arange(1, n_epochs + 2, 10), test_loss, label="Test")
plt.title("Train and Test Loss over Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz


0.3%

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ./MNIST\raw\train-images-idx3-ubyte.gz to ./MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST\raw\train-labels-idx1-ubyte.gz


100.0%


Extracting ./MNIST\raw\train-labels-idx1-ubyte.gz to ./MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting ./MNIST\raw\t10k-images-idx3-ubyte.gz to ./MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz


100.0%

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST\raw\t10k-labels-idx1-ubyte.gz
Extracting ./MNIST\raw\t10k-labels-idx1-ubyte.gz to ./MNIST\raw






Epoch: 0
Mean Train Loss: 4.04e-01
Mean Test Loss:  3.38e-01
Epoch: 10
Mean Train Loss: 1.53e-01
Mean Test Loss:  2.69e-01
Epoch: 20
Mean Train Loss: 1.03e-01
Mean Test Loss:  3.91e-01
Epoch: 30
Mean Train Loss: 7.52e-02
Mean Test Loss:  5.69e-01
Epoch: 40
Mean Train Loss: 4.93e-02
Mean Test Loss:  6.11e-01
Epoch: 50
Mean Train Loss: 3.99e-02
Mean Test Loss:  6.93e-01
Epoch: 60
Mean Train Loss: 2.15e-02
Mean Test Loss:  6.55e-01
Epoch: 70
Mean Train Loss: 2.19e-02
Mean Test Loss:  8.42e-01
Epoch: 80
Mean Train Loss: 1.26e-02
Mean Test Loss:  8.98e-01
Epoch: 90
Mean Train Loss: 1.53e-02
Mean Test Loss:  7.61e-01
Epoch: 100
Mean Train Loss: 9.44e-03
Mean Test Loss:  8.84e-01


<matplotlib.legend.Legend at 0x27af79a87d0>

: 

: 

# 2 Dropout

In [9]:

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d, cross_entropy

def dropout(X, p_drop=0.5):
    if 0 < p_drop < 1:
        mask = torch.bernoulli(torch.full(X.shape, 1 - p_drop))
        X_drop = torch.where(mask == 1, torch.zeros_like(X), X) / (1 - p_drop)
        return X_drop
    else:
        return X


In [10]:
def dropout_model(x, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    x_drop = dropout(x, p_drop_input)
    h = rectify(x_drop @ w_h)
    h_drop = dropout(h, p_drop_hidden)
    h2 = rectify(h_drop @ w_h2)
    h2_drop = dropout(h2, p_drop_hidden)
    pre_softmax = h2_drop @ w_o
    
    return pre_softmax


Task: Explain in a few sentences how the dropout method works and how it reduces overfitting.

Answer: During training, dropout randomly sets a fraction of input units or hidden units to zero at each update, effectively "dropping out" those units. This prevents the model from relying to heavily on specific input or hidden units, as they are sometimes "dropped out". During testing you obviously dont apply dropout. Instead you scale the input units units with the factor of 1-p_drop to compensate for the missing dropout.

Task: Why do we need a different model configuration for evaluating the test loss? 

Answer: During testing you obviously dont apply dropout, as this only leaves out information and simply wouldnt make any sense. Instead you scale the input units units with the factor of 1-p_drop to compensate for the missing dropout.


In [11]:
def test_model(x, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):

    scaled_x = (1 - p_drop_input) * x
    h = rectify(scaled_x @ w_h)
    scaled_h = (1 - p_drop_hidden) * h
    h2 = rectify(scaled_h @ w_h2)
    scaled_h2 = (1 - p_drop_hidden) * h2
    pre_softmax = scaled_h2 @ w_o
    
    return pre_softmax

In [13]:
plt.rc("figure", dpi=100)

batch_size = 100

# transform images into normalized tensors
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,))
])

train_dataset = datasets.MNIST(
    "./",
    download=False,
    train=True,
    transform=transform,
)

test_dataset = datasets.MNIST(
    "./",
    download=False,
    train=False,
    transform=transform,
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

def init_weights(shape):
    # Kaiming He initialization (a good initialization is important)
    # https://arxiv.org/abs/1502.01852
    std = np.sqrt(2. / shape[0])
    w = torch.randn(size=shape) * std
    w.requires_grad = True
    return w


def rectify(x):
    # Rectified Linear Unit (ReLU)
    return torch.max(torch.zeros_like(x), x)


class RMSprop(optim.Optimizer):
    """
    This is a reduced version of the PyTorch internal RMSprop optimizer
    It serves here as an example
    """
    def __init__(self, params, lr=1e-3, alpha=0.5, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # state initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(grad, avg, value=-group['lr'])
# initialize weights

# input shape is (B, 784)
w_h = init_weights((784, 625))
# hidden layer with 625 neurons
w_h2 = init_weights((625, 625))
# hidden layer with 625 neurons
w_o = init_weights((625, 10))
# output shape is (B, 10)

optimizer = RMSprop(params=[w_h, w_h2, w_o])


n_epochs = 100

train_loss = []
test_loss = []

# put this into a training loop over 100 epochs
for epoch in range(n_epochs + 1):
    train_loss_this_epoch = []
    for idx, batch in enumerate(train_dataloader):
        x, y = batch

        # our model requires flattened input
        x = x.reshape(batch_size, 784)
        # feed input through model
        noise_py_x = dropout_model(x, w_h, w_h2, w_o, 0.4, 0.4)

        # reset the gradient
        optimizer.zero_grad()

        # the cross-entropy loss function already contains the softmax
        loss = cross_entropy(noise_py_x, y, reduction="mean")

        train_loss_this_epoch.append(float(loss))

        # compute the gradient
        loss.backward()
        # update weights
        optimizer.step()

    train_loss.append(np.mean(train_loss_this_epoch))

    # test periodically
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}")
        print(f"Mean Train Loss: {train_loss[-1]:.2e}")
        test_loss_this_epoch = []

        # no need to compute gradients for validation
        with torch.no_grad():
            for idx, batch in enumerate(test_dataloader):
                x, y = batch
                x = x.reshape(batch_size, 784)
                noise_py_x = test_model(x, w_h, w_h2, w_o, 0.4, 0.4)

                loss = cross_entropy(noise_py_x, y, reduction="mean")
                test_loss_this_epoch.append(float(loss))

        test_loss.append(np.mean(test_loss_this_epoch))

        print(f"Mean Test Loss:  {test_loss[-1]:.2e}")

plt.plot(np.arange(n_epochs + 1), train_loss, label="Train")
plt.plot(np.arange(1, n_epochs + 2, 10), test_loss, label="Test")
plt.title("Train and Test Loss over Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

Epoch: 0
Mean Train Loss: 1.06e+00
Mean Test Loss:  4.95e-01
Epoch: 10
Mean Train Loss: 8.40e-01
Mean Test Loss:  3.39e-01
Epoch: 20
Mean Train Loss: 1.02e+00
Mean Test Loss:  2.75e-01
Epoch: 30
Mean Train Loss: 1.10e+00
Mean Test Loss:  2.56e-01
Epoch: 40
Mean Train Loss: 1.19e+00
Mean Test Loss:  2.74e-01
Epoch: 50
Mean Train Loss: 1.25e+00
Mean Test Loss:  2.54e-01
Epoch: 60
Mean Train Loss: 1.33e+00
Mean Test Loss:  2.94e-01
Epoch: 70
Mean Train Loss: 1.35e+00
Mean Test Loss:  2.77e-01
Epoch: 80
Mean Train Loss: 1.41e+00
Mean Test Loss:  2.85e-01
Epoch: 90
Mean Train Loss: 1.45e+00
Mean Test Loss:  3.14e-01
Epoch: 100
Mean Train Loss: 1.47e+00
Mean Test Loss:  2.93e-01


<matplotlib.legend.Legend at 0x14ca248cb50>

: 

: 

Task: Compare the test error with the test error from Section 1.

Answer: 

The training error in Section 1 is continously decreasing, even during the last epochs. However, the test error in Section 1 even increases slightly over time. This is a clear indication that the model overfits on the training data.

The training error in the Section with dropout doesnt really converge. The test error decreases slightly over time and is overall better than in the run without the dropout. This is a clear indication, that the model is better than the one from section 1 and does not overfit on the training data.

# 3 Parametric Relu