Code based on https://github.com/pytorch/examples/blob/master/mnist/main.py

This exercise covers two aspects:
* In tasks 1-6 you will implement mechanisms that allow training deeper models (better initialization, batch normalization). Note that for dropout and batch norm you are expected to implement them yourself, without relying on ready-made components from PyTorch.
* In task 7 you will implement a convnet using [conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html).


Tasks:


6. Implement batch normalization (use train mode also for testing - it should perform well enough):
    * compute batch mean and variance
    * add new variables beta and gamma
    * check that the network learns much faster for 5 layers
    * check that the network learns even for 10 hidden layers.
7. So far we worked with a fully connected network. Design and implement in pytorch (by using pytorch functions) a simple convolutional network and achieve 99% test accuracy. The architecture is up to you, but even a few convolutional layers should be enough.

In [92]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.parameter import Parameter
from torch.nn import init
import torchvision
import torchvision.transforms as transforms

In [93]:
class Linear(torch.nn.Module):
    """Fully connected layer computing ``x @ weight.T + bias``.

    ``weight`` has shape ``(out_features, in_features)`` and is drawn from a
    zero-mean normal distribution with a fixed standard deviation of 0.25;
    ``bias`` has shape ``(out_features,)`` and starts at zero.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Storage is allocated uninitialised; reset_parameters() fills it.
        self.weight = Parameter(torch.empty(out_features, in_features))
        self.bias = Parameter(torch.empty(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights from N(0, 0.25^2) and zero the bias."""
        init.normal_(self.weight, mean=0, std=0.25)
        init.zeros_(self.bias)

    def forward(self, x):
        """Apply the affine map to the last dimension of ``x``."""
        return x.matmul(self.weight.t()) + self.bias


class Net(nn.Module):
    """Multi-layer perceptron over inputs flattened to 28 * 28 features.

    ``dims`` lists the layer widths; each consecutive pair becomes one
    ``Linear`` layer. ReLU follows every layer except the last, whose raw
    outputs are returned.
    """

    def __init__(self, dims):
        super().__init__()
        layer_shapes = zip(dims[:-1], dims[1:])
        self.fc = torch.nn.ModuleList(
            [Linear(n_in, n_out) for n_in, n_out in layer_shapes])

    def forward(self, x):
        """Flatten ``x`` to rows of 784 values and run it through the layers."""
        activations = x.view(-1, 28 * 28)
        # Every layer but the last is followed by a ReLU.
        for idx in range(len(self.fc) - 1):
            activations = F.relu(self.fc[idx](activations))

        return self.fc[-1](activations)


In [94]:
class MnistTrainer(object):
    """Loads MNIST and runs an SGD train / evaluate loop on a given network."""

    def __init__(self, batch_size):
        """Create the train and test datasets and their loaders.

        Args:
            batch_size: mini-batch size of the shuffled training loader. The
                test loader always yields one image at a time, unshuffled.
        """
        to_tensor = transforms.Compose([transforms.ToTensor()])

        self.trainset = torchvision.datasets.MNIST(
            root='./data', train=True, download=True, transform=to_tensor)
        self.trainloader = torch.utils.data.DataLoader(
            self.trainset, batch_size=batch_size, shuffle=True, num_workers=2)

        self.testset = torchvision.datasets.MNIST(
            root='./data', train=False, download=True, transform=to_tensor)
        self.testloader = torch.utils.data.DataLoader(
            self.testset, batch_size=1, shuffle=False, num_workers=2)

    def train(self, net, epochs=20):
        """Train ``net`` with SGD and print test accuracy after every epoch.

        Args:
            net: module mapping an image batch to per-class scores.
            epochs: number of passes over the training set.
        """
        loss_fn = nn.CrossEntropyLoss()
        sgd = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)

        for epoch in range(epochs):
            net.train()
            window_loss = 0.0
            for step, (inputs, labels) in enumerate(self.trainloader, start=1):
                sgd.zero_grad()
                loss = loss_fn(net(inputs), labels)
                loss.backward()
                sgd.step()

                # Report the mean loss over each window of 100 mini-batches.
                window_loss += loss.item()
                if step % 100 == 0:
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, step, window_loss / 100))
                    window_loss = 0.0

            net.eval()
            hits = 0
            seen = 0
            with torch.no_grad():
                for images, labels in self.testloader:
                    # Index of the highest score along dim 1 is the prediction.
                    predicted = torch.max(net(images), 1)[1]
                    seen += labels.size(0)
                    hits += (predicted == labels).sum().item()

            print('Accuracy of the network on the {} test images: {} %'.format(
                seen, 100 * hits / seen))

## 1. Baseline

Check that the given implementation reaches 95% test accuracy for
   architecture input-64-64-10 in a few thousand batches.

In [None]:
# Baseline: two hidden layers of 64 units, trained for the default epoch count.
trainer = MnistTrainer(batch_size=128)
trainer.train(net=Net(dims=[784] + [64] * 2 + [10]))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

[1,   100] loss: 1.004
[1,   200] loss: 0.342
[1,   300] loss: 0.286
[1,   400] loss: 0.259
Accuracy of the network on the 10000 test images: 92.06 %
[2,   100] loss: 0.195
[2,   200] loss: 0.189
[2,   300] loss: 0.178
[2,   400] loss: 0.172
Accuracy of the network on the 10000 test images: 95.17 %
[3,   100] loss: 0.136
[3,   200] loss: 0.141
[3,   300] loss: 0.151
[3,   400] loss: 0.132
Accuracy of the network on the 10000 test images: 95.39 %
[4,   100] loss: 0.121
[4,   200] loss: 0.121
[4,   300] loss: 0.118
[4,   400] loss: 0.124
Accuracy of the network on the 10000 test images: 96.03 %
[5,   100] loss: 0.098
[5,   200] loss: 0.089
[5,   300] loss: 0.107
[5,   400] loss: 0.110
Accuracy of the network on the 10000 test images: 96.41 %
[6,   100] loss: 0.086
[6,   200] loss: 0.094
[6,   300] loss: 0.089
[6,   400] loss: 0.095
Accuracy of the network on the 10000 test images: 96.08 %
[7,   100] loss: 0.076
[7

## 2. Glorot Initialization (Xavier Normal)

Improve initialization and check that the network learns much faster
   and reaches over 97% test accuracy. A good basic initialization scheme is the so-called Glorot initialization. For a set of weights going from a layer with $n_{in}$ neurons to a layer with $n_{out}$ neurons, it samples each weight from a normal distribution with mean $0$ and standard deviation $\sqrt{\frac{2}{n_{in}+n_{out}}}$.

In [95]:
def glorot_init(m):
    """Glorot-initialise every layer held in a ``ModuleList``.

    Meant to be passed to ``Module.apply``. Modules that are not a
    ``ModuleList`` are ignored; for a ``ModuleList`` each member gets Xavier
    normal weights and a zero bias.
    """
    if not isinstance(m, nn.ModuleList):
        return
    for member in m:
        nn.init.xavier_normal_(member.weight)
        member.bias.data.zero_()

Net(
  (fc): ModuleList(
    (0): Linear()
    (1): Linear()
  )
)

In [None]:
# One hidden layer of 64 units, Glorot-initialised before training.
net = Net(dims=[784, 64, 10]).apply(glorot_init)

trainer = MnistTrainer(batch_size=128)
trainer.train(net=net)

[1,   100] loss: 0.653
[1,   200] loss: 0.300
[1,   300] loss: 0.247
[1,   400] loss: 0.223
Accuracy of the network on the 10000 test images: 94.17 %
[2,   100] loss: 0.181
[2,   200] loss: 0.170
[2,   300] loss: 0.157
[2,   400] loss: 0.151
Accuracy of the network on the 10000 test images: 95.82 %
[3,   100] loss: 0.119
[3,   200] loss: 0.123
[3,   300] loss: 0.121
[3,   400] loss: 0.117
Accuracy of the network on the 10000 test images: 96.73 %
[4,   100] loss: 0.092
[4,   200] loss: 0.098
[4,   300] loss: 0.094
[4,   400] loss: 0.089
Accuracy of the network on the 10000 test images: 96.76 %
[5,   100] loss: 0.084
[5,   200] loss: 0.075
[5,   300] loss: 0.082
[5,   400] loss: 0.082
Accuracy of the network on the 10000 test images: 97.19 %
[6,   100] loss: 0.065
[6,   200] loss: 0.068
[6,   300] loss: 0.067
[6,   400] loss: 0.072
Accuracy of the network on the 10000 test images: 97.29 %
[7,   100] loss: 0.058
[7,   200] loss: 0.056
[7,   300] loss: 0.062
[7,   400] loss: 0.063
Accuracy

## 3. 64-64-64-64-64-10

Check that with proper initialization we can train the architecture
   input-64-64-64-64-64-10, while with bad initialization it does
   not even get off the ground.

#### Bad initialization

In [None]:
# Six hidden layers of 64 units with the default Linear initialisation.
trainer = MnistTrainer(batch_size=128)
trainer.train(net=Net(dims=[784] + [64] * 6 + [10]), epochs=5)

[1,   100] loss: 4.348
[1,   200] loss: 2.302
[1,   300] loss: 2.302
[1,   400] loss: 2.302
Accuracy of the network on the 10000 test images: 11.35 %
[2,   100] loss: 2.302
[2,   200] loss: 2.303
[2,   300] loss: 2.302
[2,   400] loss: 2.301
Accuracy of the network on the 10000 test images: 11.35 %
[3,   100] loss: 2.302
[3,   200] loss: 2.302
[3,   300] loss: 2.301
[3,   400] loss: 2.302
Accuracy of the network on the 10000 test images: 11.35 %
[4,   100] loss: 2.302
[4,   200] loss: 2.302
[4,   300] loss: 2.302
[4,   400] loss: 2.303
Accuracy of the network on the 10000 test images: 11.35 %
[5,   100] loss: 2.303
[5,   200] loss: 2.302
[5,   300] loss: 2.302
[5,   400] loss: 2.302
Accuracy of the network on the 10000 test images: 11.35 %


In [None]:
# Repeat of the run above: six hidden layers, default Linear initialisation.
trainer = MnistTrainer(batch_size=128)
trainer.train(net=Net(dims=[784] + [64] * 6 + [10]), epochs=5)

#### Proper initialization

In [None]:
# Same six-hidden-layer architecture, but Glorot-initialised before training.
net = Net(dims=[784] + [64] * 6 + [10]).apply(glorot_init)

trainer = MnistTrainer(batch_size=128)
trainer.train(net=net, epochs=5)

[1,   100] loss: 1.114
[1,   200] loss: 0.360
[1,   300] loss: 0.261
[1,   400] loss: 0.253
Accuracy of the network on the 10000 test images: 94.4 %
[2,   100] loss: 0.169
[2,   200] loss: 0.173
[2,   300] loss: 0.162
[2,   400] loss: 0.165
Accuracy of the network on the 10000 test images: 94.67 %
[3,   100] loss: 0.136
[3,   200] loss: 0.140
[3,   300] loss: 0.128
[3,   400] loss: 0.117
Accuracy of the network on the 10000 test images: 95.89 %
[4,   100] loss: 0.097
[4,   200] loss: 0.099
[4,   300] loss: 0.108
[4,   400] loss: 0.100
Accuracy of the network on the 10000 test images: 96.51 %
[5,   100] loss: 0.086
[5,   200] loss: 0.081
[5,   300] loss: 0.091
[5,   400] loss: 0.091
Accuracy of the network on the 10000 test images: 96.91 %


## 4. Dropout

Add dropout implemented in pytorch

In [None]:
class Net_Dropout(nn.Module):
    """Multi-layer perceptron that applies dropout before each hidden layer.

    ``dims`` lists the layer widths; each consecutive pair becomes one
    ``Linear`` layer. In training mode the input of every layer except the
    last is passed through dropout with p=0.3; outside training mode dropout
    is skipped. ReLU follows every layer except the last.
    """

    def __init__(self, dims):
        super().__init__()
        layer_shapes = zip(dims[:-1], dims[1:])
        self.fc = torch.nn.ModuleList(
            [Linear(n_in, n_out) for n_in, n_out in layer_shapes])

    def forward(self, x):
        """Flatten ``x`` to rows of 784 values and run it through the layers."""
        activations = x.view(-1, 28 * 28)
        for idx in range(len(self.fc) - 1):
            # F.dropout is the identity when training=False.
            activations = F.dropout(activations, p=0.3, training=self.training)
            activations = F.relu(self.fc[idx](activations))

        return self.fc[-1](activations)

# Five hidden layers of 64 units with dropout, default Linear initialisation.
trainer = MnistTrainer(batch_size=128)
trainer.train(net=Net_Dropout(dims=[784] + [64] * 5 + [10]), epochs=5)

[1,   100] loss: 2.977
[1,   200] loss: 2.118
[1,   300] loss: 1.861
[1,   400] loss: 1.736
Accuracy of the network on the 10000 test images: 36.84 %
[2,   100] loss: 1.584
[2,   200] loss: 1.514
[2,   300] loss: 1.426
[2,   400] loss: 1.353
Accuracy of the network on the 10000 test images: 54.46 %
[3,   100] loss: 1.248
[3,   200] loss: 1.228
[3,   300] loss: 1.192
[3,   400] loss: 1.143
Accuracy of the network on the 10000 test images: 61.1 %
[4,   100] loss: 1.092
[4,   200] loss: 1.082
[4,   300] loss: 1.047
[4,   400] loss: 1.011
Accuracy of the network on the 10000 test images: 67.31 %
[5,   100] loss: 0.990
[5,   200] loss: 0.967
[5,   300] loss: 0.951
[5,   400] loss: 0.947
Accuracy of the network on the 10000 test images: 71.92 %


## 5. 64(10)-10

Check that with 10 hidden layers (64 units each), even with proper
    initialization, the network has a hard time starting to learn.

In [None]:
# Deep fully connected net with 13 hidden layers of 64 units each,
# Glorot-initialised before training.
net = Net(dims=[784, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 10]).apply(glorot_init)

trainer = MnistTrainer(batch_size=128)
trainer.train(net=net, epochs=5)

[1,   100] loss: 2.053
[1,   200] loss: 1.064
[1,   300] loss: 0.646
[1,   400] loss: 0.456
Accuracy of the network on the 10000 test images: 89.06 %
[2,   100] loss: 0.348
[2,   200] loss: 0.312
[2,   300] loss: 0.290
[2,   400] loss: 0.264
Accuracy of the network on the 10000 test images: 91.49 %
[3,   100] loss: 0.244
[3,   200] loss: 0.261
[3,   300] loss: 0.206
[3,   400] loss: 0.217
Accuracy of the network on the 10000 test images: 94.46 %
[4,   100] loss: 0.220
[4,   200] loss: 0.162
[4,   300] loss: 0.180
[4,   400] loss: 0.168
Accuracy of the network on the 10000 test images: 95.53 %
[5,   100] loss: 0.154
[5,   200] loss: 0.143
[5,   300] loss: 0.145
[5,   400] loss: 0.154
Accuracy of the network on the 10000 test images: 96.0 %
[6,   100] loss: 0.129
[6,   200] loss: 0.130
[6,   300] loss: 0.127
[6,   400] loss: 0.125
Accuracy of the network on the 10000 test images: 96.28 %
[7,   100] loss: 0.105
[7,   200] loss: 0.113
[7,   300] loss: 0.127
[7,   400] loss: 0.121
Accuracy 

## 6. BatchNorm1d

In [104]:
def glorot_init(m):
    """Glorot-initialise a single linear layer; ignore every other module.

    Meant to be passed to ``Module.apply``, which calls it once per
    sub-module. A module counts as linear when it is an ``nn.Linear`` or when
    it exposes ``in_features``, ``out_features`` and a tensor ``weight`` (the
    attribute set of a hand-written fully connected layer). Matching layers
    get Xavier normal weights and, when they have a bias, a zero bias.

    Args:
        m: the module currently visited by ``Module.apply``.
    """
    is_linear = isinstance(m, nn.Linear) or (
        hasattr(m, "in_features")
        and hasattr(m, "out_features")
        and isinstance(getattr(m, "weight", None), torch.Tensor)
    )
    if not is_linear:
        return

    # Operate on the visited module itself; there is no `layer` in scope here.
    torch.nn.init.xavier_normal_(m.weight)
    bias = getattr(m, "bias", None)
    if bias is not None:
        bias.data.fill_(0.00)


class BatchNorm1d(torch.nn.Module):
    """Hand-written batch normalisation for ``(batch, n_feat)`` inputs.

    In training mode each feature is normalised with the mean and biased
    variance of the current batch, and exponential running estimates are
    updated. In eval mode the running estimates are used instead. The
    normalised values are then scaled by ``gamma`` and shifted by ``beta``.

    Args:
        n_feat: number of features (size of dimension 1 of the input).
        eps: constant added to the variance before the square root.
        momentum: weight of the current batch in the running estimates.
    """

    def __init__(self, n_feat, eps=1e-5, momentum=0.1):
        super(BatchNorm1d, self).__init__()
        self.eps = eps
        self.m = momentum
        # Learnable shift and scale, broadcast over the batch dimension.
        self.beta = torch.nn.Parameter(torch.zeros(1, n_feat), requires_grad=True)
        self.gamma = torch.nn.Parameter(torch.ones(1, n_feat), requires_grad=True)
        # Running statistics are state, not parameters: saved but not trained.
        self.register_buffer('mu', tensor=torch.zeros(1, n_feat))
        self.register_buffer('sigma2', tensor=torch.ones(1, n_feat))

    def forward(self, x):
        """Normalise ``x`` per feature and apply the learned affine map."""
        if self.training:
            mean = x.mean(dim=0, keepdim=True)
            var = x.var(dim=0, keepdim=True, unbiased=False)
            self._update_running_stats(mean, var, x.size(0))
        else:
            mean = self.mu
            var = self.sigma2

        z = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * z + self.beta

    def _update_running_stats(self, mean, var, n):
        """Fold the batch statistics into the running estimates."""
        # Bessel's correction belongs on the *batch* variance; a batch of one
        # has no unbiased estimate, so the biased value is used as is.
        bessel = n / (n - 1) if n > 1 else 1.0
        # no_grad keeps the buffers out of the autograd graph; otherwise each
        # step would chain onto the graph of the previous one.
        with torch.no_grad():
            self.mu = (1 - self.m) * self.mu + self.m * mean
            self.sigma2 = (1 - self.m) * self.sigma2 + self.m * bessel * var


class Net_BatchNorm1d(nn.Module):
    """Fully connected network with batch normalisation before every ReLU.

    Every layer except the last is ``Linear -> BatchNorm1d -> ReLU``; the
    last layer is a plain ``Linear`` whose raw outputs are returned. Layers
    are registered as ``fc1, bn1, fc2, bn2, ..., fcK``.

    Args:
        dims: layer widths, input first and output last. Defaults to ten
            hidden layers of 64 units between 784 inputs and 10 outputs.

    Raises:
        ValueError: if ``dims`` has fewer than two entries.
    """

    def __init__(self, dims=None):
        super(Net_BatchNorm1d, self).__init__()
        if dims is None:
            dims = [784] + [64] * 10 + [10]
        if len(dims) < 2:
            raise ValueError("dims needs at least an input and an output size")

        self._n_layers = len(dims) - 1
        shapes = zip(dims[:-1], dims[1:])
        for idx, (dim_in, dim_out) in enumerate(shapes, start=1):
            setattr(self, 'fc%d' % idx, Linear(dim_in, dim_out))
            # The output layer is not normalised.
            if idx < self._n_layers:
                setattr(self, 'bn%d' % idx, BatchNorm1d(dim_out))

    def forward(self, x):
        """Flatten ``x`` to rows of ``dims[0]`` values and return class scores."""
        x = x.view(-1, self.fc1.in_features)
        for idx in range(1, self._n_layers):
            x = getattr(self, 'fc%d' % idx)(x)
            # The normalised activations (not the raw ones) feed the ReLU.
            x = F.relu(getattr(self, 'bn%d' % idx)(x))

        return getattr(self, 'fc%d' % self._n_layers)(x)

# Ten hidden layers of 64 units with batch normalisation. The layer widths
# are passed explicitly because the constructor declares a `dims` parameter.
net = Net_BatchNorm1d(dims=[784] + [64] * 10 + [10])
net.apply(glorot_init)

trainer = MnistTrainer(batch_size=128)
trainer.train(net, epochs=5)

[1,   100] loss: 1.040
[1,   200] loss: 0.343
[1,   300] loss: 0.279
[1,   400] loss: 0.254
Accuracy of the network on the 10000 test images: 94.21 %
[2,   100] loss: 0.197
[2,   200] loss: 0.181
[2,   300] loss: 0.180
[2,   400] loss: 0.173
Accuracy of the network on the 10000 test images: 95.46 %
[3,   100] loss: 0.138
[3,   200] loss: 0.135
[3,   300] loss: 0.140
[3,   400] loss: 0.143
Accuracy of the network on the 10000 test images: 95.86 %
[4,   100] loss: 0.114
[4,   200] loss: 0.112
[4,   300] loss: 0.119
[4,   400] loss: 0.113
Accuracy of the network on the 10000 test images: 96.32 %
[5,   100] loss: 0.092
[5,   200] loss: 0.105
[5,   300] loss: 0.096
[5,   400] loss: 0.093
Accuracy of the network on the 10000 test images: 96.47 %


## 7. Convolutional Network

In [None]:
def glorot_init(m):
    """Apply Glorot initialisation to the members of a ``ModuleList``.

    Intended for ``Module.apply``: anything that is not a ``ModuleList`` is
    skipped, and each member of a ``ModuleList`` receives Xavier normal
    weights and a zeroed bias.
    """
    if not isinstance(m, nn.ModuleList):
        return
    for member in m:
        nn.init.xavier_normal_(member.weight)
        member.bias.data.zero_()

In [115]:
class Conv_Net(nn.Module):
    """Small convolutional classifier with ten output scores.

    Two stride-2 3x3 convolutions (1 -> 16 -> 32 channels), each followed by
    ReLU, then a 2x2 max-pool and one linear layer over the flattened
    features. The linear layer expects 3 * 3 * 32 inputs, which is what a
    1x28x28 image produces after the convolution and pooling stages.
    """

    def __init__(self):
        super().__init__()
        feature_layers = [
            nn.Conv2d(1, 16, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.net = nn.Sequential(*feature_layers)
        self.fc = nn.Linear(3 * 3 * 32, 10)

    def forward(self, x):
        """Return the class scores for an image batch ``x``."""
        features = self.net(x)
        # Keep the batch dimension, collapse channels and spatial dims.
        return self.fc(features.flatten(start_dim=1))

# The convolutional net keeps PyTorch's default initialisation; glorot_init
# is deliberately not applied here.
net = Conv_Net()

trainer = MnistTrainer(batch_size=128)
trainer.train(net=net, epochs=5)

[1,   100] loss: 1.236
[1,   200] loss: 0.245
[1,   300] loss: 0.185
[1,   400] loss: 0.149
Accuracy of the network on the 10000 test images: 96.4 %
[2,   100] loss: 0.128
[2,   200] loss: 0.116
[2,   300] loss: 0.117
[2,   400] loss: 0.107
Accuracy of the network on the 10000 test images: 96.59 %
[3,   100] loss: 0.091
[3,   200] loss: 0.088
[3,   300] loss: 0.091
[3,   400] loss: 0.089
Accuracy of the network on the 10000 test images: 97.19 %
[4,   100] loss: 0.074
[4,   200] loss: 0.078
[4,   300] loss: 0.076
[4,   400] loss: 0.072
Accuracy of the network on the 10000 test images: 97.51 %
[5,   100] loss: 0.069
[5,   200] loss: 0.066
[5,   300] loss: 0.069
[5,   400] loss: 0.071
Accuracy of the network on the 10000 test images: 98.13 %


## 8. Conv2d + BatchNorm2d

In [134]:
class BatchNorm2d(torch.nn.Module):
    """Hand-written batch normalisation for ``(N, C, H, W)`` inputs.

    In training mode each channel is normalised with the mean and biased
    variance taken over the batch and both spatial dimensions, and
    exponential running estimates are updated. In eval mode the running
    estimates are used instead. The result is scaled by ``gamma`` and shifted
    by ``beta``, one value per channel.

    The layer is created on the default device like any other module; move it
    with ``.to(...)`` together with the rest of the model.

    Args:
        n_feat: number of channels (size of dimension 1 of the input).
        eps: constant added to the variance before the square root.
        momentum: weight of the current batch in the running estimates.
    """

    def __init__(self, n_feat, eps=1e-5, momentum=0.1):
        super(BatchNorm2d, self).__init__()
        self.eps = eps
        self.m = momentum
        # Assign the Parameters directly: `Parameter(...).to(device)` returns
        # a plain non-leaf tensor whenever a copy is made, which would leave
        # beta/gamma unregistered and untrained.
        self.beta = torch.nn.Parameter(torch.zeros(n_feat))
        self.gamma = torch.nn.Parameter(torch.ones(n_feat))
        self.register_buffer('mu', torch.zeros(n_feat))
        self.register_buffer('sigma2', torch.ones(n_feat))

    @property
    def device(self):
        """Device the layer's parameters currently live on."""
        return self.gamma.device

    def forward(self, x):
        """Normalise ``x`` per channel and apply the learned affine map."""
        if self.training:
            # Number of values contributing to each channel statistic.
            n = x.numel() / x.size(1)
            mean = x.mean(dim=[0, 2, 3])
            var = x.var(dim=[0, 2, 3], unbiased=False)
            # Bessel's correction belongs on the *batch* variance, so the
            # running value tracks the unbiased estimate; with a single value
            # per channel there is no unbiased estimate to apply.
            bessel = n / (n - 1) if n > 1 else 1.0
            with torch.no_grad():
                self.mu = (1 - self.m) * self.mu + self.m * mean
                self.sigma2 = (1 - self.m) * self.sigma2 + self.m * bessel * var
        else:
            mean = self.mu
            var = self.sigma2

        # Per-channel vectors are broadcast over batch and spatial dims.
        z = (x - mean[None, :, None, None]) / torch.sqrt(var[None, :, None, None] + self.eps)
        return self.gamma[None, :, None, None] * z + self.beta[None, :, None, None]
    

class BN2d_Conv_Net(nn.Module):
    """Convolutional classifier with batch normalisation after each conv.

    Two stride-2 3x3 convolutions (1 -> 16 -> 32 channels), each followed by
    ``BatchNorm2d`` and ReLU, then a 2x2 max-pool and one linear layer over
    the flattened features. The linear layer expects 3 * 3 * 32 inputs, which
    is what a 1x28x28 image produces after the convolution and pooling
    stages.
    """

    def __init__(self):
        super().__init__()
        feature_layers = [
            nn.Conv2d(1, 16, kernel_size=3, stride=2),
            BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2),
            BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.net = nn.Sequential(*feature_layers)
        self.fc = nn.Linear(3 * 3 * 32, 10)

    def forward(self, x):
        """Return the class scores for an image batch ``x``."""
        features = self.net(x)
        # Keep the batch dimension, collapse channels and spatial dims.
        return self.fc(features.flatten(start_dim=1))

# The batch-normalised conv net keeps PyTorch's default initialisation;
# glorot_init is deliberately not applied here.
net = BN2d_Conv_Net()

trainer = MnistTrainer(batch_size=128)
trainer.train(net=net, epochs=5)

[1,   100] loss: 0.477
[1,   200] loss: 0.160
[1,   300] loss: 0.127
[1,   400] loss: 0.115
Accuracy of the network on the 10000 test images: 96.66 %
[2,   100] loss: 0.082
[2,   200] loss: 0.081
[2,   300] loss: 0.082
[2,   400] loss: 0.080
Accuracy of the network on the 10000 test images: 97.99 %
[3,   100] loss: 0.067
[3,   200] loss: 0.062
[3,   300] loss: 0.069
[3,   400] loss: 0.068
Accuracy of the network on the 10000 test images: 97.92 %
[4,   100] loss: 0.052
[4,   200] loss: 0.056
[4,   300] loss: 0.050
[4,   400] loss: 0.051
Accuracy of the network on the 10000 test images: 97.97 %
[5,   100] loss: 0.041
[5,   200] loss: 0.043
[5,   300] loss: 0.048
[5,   400] loss: 0.046
Accuracy of the network on the 10000 test images: 98.49 %
