In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

In [23]:
# Mini-batch sizes used by the training and evaluation data loaders.
bs_train = 64
bs_test = 1000

# Fix the global RNG seed so that weight initialisation and the shuffling of
# training batches are repeatable after "Restart Kernel -> Run All".
seed = 1
torch.manual_seed(seed);

In [37]:
## prepare MNIST dataset
# Directory torchvision downloads MNIST into (and reads it back from).
mnist_root = '/files/'

# One preprocessing pipeline shared by both splits: convert to a tensor, then
# normalise with mean 0.1307 / std 0.3081 (presumably the MNIST training-set
# pixel statistics -- verify if the dataset changes).
mnist_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

trainset = torchvision.datasets.MNIST(mnist_root, train=True, download=True,
                                      transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=bs_train, shuffle=True)

testset = torchvision.datasets.MNIST(mnist_root, train=False, download=True,
                                     transform=mnist_transform)
# Evaluation sums over the whole split, so shuffling buys nothing; a fixed
# order keeps per-batch results reproducible and leaves the RNG untouched.
test_loader = torch.utils.data.DataLoader(testset, batch_size=bs_test, shuffle=False)

### Training Data Statistics

In [42]:
# Number of training samples per label value (index i holds the count of label i).
p = trainset.targets.bincount()
p

tensor([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949])

In [43]:
# Sanity check: the per-class counts add up to the total number of training samples.
torch.sum(p)

tensor(60000)

In [45]:
# Sum of q_i * log2(q_i) over the empirical class frequencies q_i, i.e. the
# negative of the Shannon entropy (in bits) of the training labels.
# The frequencies are derived from p itself rather than a hard-coded 60000.
class_probs = p / p.sum()
(torch.log2(class_probs) * class_probs).sum()

tensor(-3.3199)

### Training Functions

In [51]:
def test(network):
    network.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to('cuda'), target.to('cuda')
            output = network(data)
            test_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).sum()
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

def train(network, epoch=3):
    """Train ``network`` with SGD on ``train_loader``, evaluating after each epoch.

    Args:
        network: ``nn.Module`` returning per-class log-probabilities; it is
            optimised in place with SGD (lr=0.005, momentum=0.5) and the
            negative log-likelihood loss.
        epoch: number of full passes over ``train_loader``.

    After every epoch the loss of the *last* mini-batch is printed and
    ``test(network)`` is called to report held-out performance.
    """
    optimizer = torch.optim.SGD(network.parameters(), lr=0.005,
                                momentum=0.5)
    # Feed batches to the device the model already lives on rather than
    # assuming CUDA (the optimizer above guarantees at least one parameter).
    device = next(network.parameters()).device

    for e in range(epoch):
        # test() puts the model in eval mode, so training mode has to be
        # re-enabled at the start of every epoch, not just once up front.
        network.train()
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = network(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
        print("Epoch {}, Loss {}".format(e, loss.item()))
        test(network)

### Model Architectures

In [26]:
class Net1(nn.Module):
    """Two-block CNN (10 then 20 conv channels) for batches of 1x28x28 images.

    Each convolutional block is a 3x3 convolution with padding 1, a 2x2
    max-pool and a ReLU, so a 28x28 input ends up as 20 feature maps of 7x7.
    These feed a 120-unit hidden layer and a 10-way output layer.
    """

    def __init__(self):
        super(Net1, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(10, 20, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.fc1 = nn.Sequential(
            nn.Linear(20 * 7 * 7, 120),  # 20 channels x 7 x 7 = 980 features
            nn.ReLU()
        )
        self.fc2 = nn.Linear(120, 10)

    def forward(self, x):
        """Map a (batch, 1, 28, 28) tensor to (batch, 10) log-probabilities."""
        x = self.conv2(self.conv1(x))
        # Flatten per sample. Unlike view(-1, 980), this never folds surplus
        # features into the batch dimension, so a wrongly sized input fails
        # loudly in fc1 instead of silently changing the batch size.
        x = x.flatten(1)
        x = self.fc2(self.fc1(x))
        return F.log_softmax(x, dim=1)

In [30]:
class Net2(nn.Module):
    """Wider two-block CNN (15 then 30 conv channels) for batches of 1x28x28 images.

    Each convolutional block is a 3x3 convolution with padding 1, a 2x2
    max-pool and a ReLU, so a 28x28 input ends up as 30 feature maps of 7x7.
    These feed a 120-unit hidden layer and a 10-way output layer.
    """

    def __init__(self):
        super(Net2, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 15, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(15, 30, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.fc1 = nn.Sequential(
            nn.Linear(30 * 7 * 7, 120),  # 30 channels x 7 x 7 = 1470 features
            nn.ReLU()
        )
        self.fc2 = nn.Linear(120, 10)

    def forward(self, x):
        """Map a (batch, 1, 28, 28) tensor to (batch, 10) log-probabilities."""
        x = self.conv2(self.conv1(x))
        # Flatten per sample. Unlike view(-1, 1470), this never folds surplus
        # features into the batch dimension, so a wrongly sized input fails
        # loudly in fc1 instead of silently changing the batch size.
        x = x.flatten(1)
        x = self.fc2(self.fc1(x))
        return F.log_softmax(x, dim=1)

In [34]:
class Net3(nn.Module):
    """Narrower two-block CNN (5 then 10 conv channels) for batches of 1x28x28 images.

    Each convolutional block is a 3x3 convolution with padding 1, a 2x2
    max-pool and a ReLU, so a 28x28 input ends up as 10 feature maps of 7x7.
    These feed a 120-unit hidden layer and a 10-way output layer.
    """

    def __init__(self):
        super(Net3, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 5, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(5, 10, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.fc1 = nn.Sequential(
            nn.Linear(10 * 7 * 7, 120),  # 10 channels x 7 x 7 = 490 features
            nn.ReLU()
        )
        self.fc2 = nn.Linear(120, 10)

    def forward(self, x):
        """Map a (batch, 1, 28, 28) tensor to (batch, 10) log-probabilities."""
        x = self.conv2(self.conv1(x))
        # Flatten per sample. Unlike view(-1, 490), this never folds surplus
        # features into the batch dimension, so a wrongly sized input fails
        # loudly in fc1 instead of silently changing the batch size.
        x = x.flatten(1)
        x = self.fc2(self.fc1(x))
        return F.log_softmax(x, dim=1)

In [27]:
# Train a fresh Net1 on the GPU for five epochs.
net = Net1().to('cuda')
train(net, epoch=5)

Epoch 0, Loss 0.2979038655757904





Test set: Avg. loss: 0.2679, Accuracy: 9149/10000 (91%)

Epoch 1, Loss 0.07840929180383682

Test set: Avg. loss: 0.1360, Accuracy: 9580/10000 (96%)

Epoch 2, Loss 0.052997052669525146

Test set: Avg. loss: 0.1034, Accuracy: 9688/10000 (97%)

Epoch 3, Loss 0.014399625360965729

Test set: Avg. loss: 0.0757, Accuracy: 9762/10000 (98%)

Epoch 4, Loss 0.02289530262351036

Test set: Avg. loss: 0.0673, Accuracy: 9795/10000 (98%)



In [31]:
# Train a fresh Net2 on the GPU for five epochs.
net = Net2().to('cuda')
train(net, epoch=5)

Epoch 0, Loss 0.07484915852546692





Test set: Avg. loss: 0.2086, Accuracy: 9394/10000 (94%)

Epoch 1, Loss 0.05931271240115166

Test set: Avg. loss: 0.1337, Accuracy: 9590/10000 (96%)

Epoch 2, Loss 0.1019870713353157

Test set: Avg. loss: 0.0883, Accuracy: 9714/10000 (97%)

Epoch 3, Loss 0.012291406281292439

Test set: Avg. loss: 0.0711, Accuracy: 9780/10000 (98%)

Epoch 4, Loss 0.02485601045191288

Test set: Avg. loss: 0.0611, Accuracy: 9807/10000 (98%)



In [35]:
# Train a fresh Net3 on the GPU for five epochs.
net = Net3().to('cuda')
train(net, epoch=5)

Epoch 0, Loss 0.28483662009239197





Test set: Avg. loss: 0.2674, Accuracy: 9176/10000 (92%)

Epoch 1, Loss 0.15086805820465088

Test set: Avg. loss: 0.1536, Accuracy: 9529/10000 (95%)

Epoch 2, Loss 0.2042219042778015

Test set: Avg. loss: 0.1107, Accuracy: 9646/10000 (96%)

Epoch 3, Loss 0.14078378677368164

Test set: Avg. loss: 0.0974, Accuracy: 9668/10000 (97%)

Epoch 4, Loss 0.03554607927799225

Test set: Avg. loss: 0.0861, Accuracy: 9700/10000 (97%)



In [48]:
def count_trainable_params(model):
    """Return the total number of scalar entries in ``model``'s trainable parameters."""
    return sum(weight.numel() for weight in model.parameters() if weight.requires_grad)


# Fresh (untrained, CPU) instances are enough for counting parameters.
net1, net2, net3 = Net1(), Net2(), Net3()
for model in (net1, net2, net3):
    print(count_trainable_params(model))

120850
181960
60640


In [49]:
class Net4(nn.Module):
    """Smallest two-block CNN (4 then 8 conv channels) for batches of 1x28x28 images.

    Each convolutional block is a 3x3 convolution with padding 1, a 2x2
    max-pool and a ReLU, so a 28x28 input ends up as 8 feature maps of 7x7.
    These feed an 80-unit hidden layer and a 10-way output layer.
    """

    def __init__(self):
        super(Net4, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(4, 8, kernel_size=3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.fc1 = nn.Sequential(
            nn.Linear(8 * 7 * 7, 80),  # 8 channels x 7 x 7 = 392 features
            nn.ReLU()
        )
        self.fc2 = nn.Linear(80, 10)

    def forward(self, x):
        """Map a (batch, 1, 28, 28) tensor to (batch, 10) log-probabilities."""
        x = self.conv2(self.conv1(x))
        # Flatten per sample. Unlike view(-1, 392), this never folds surplus
        # features into the batch dimension, so a wrongly sized input fails
        # loudly in fc1 instead of silently changing the batch size.
        x = x.flatten(1)
        x = self.fc2(self.fc1(x))
        return F.log_softmax(x, dim=1)

In [56]:
# Train a fresh Net4 on the GPU for five epochs.
net4 = Net4().to('cuda')
train(net4, epoch=5)

Epoch 0, Loss 0.20946656167507172





Test set: Avg. loss: 0.3134, Accuracy: 9043/10000 (90%)

Epoch 1, Loss 0.07053764164447784

Test set: Avg. loss: 0.1742, Accuracy: 9487/10000 (95%)

Epoch 2, Loss 0.13274231553077698

Test set: Avg. loss: 0.1351, Accuracy: 9584/10000 (96%)

Epoch 3, Loss 0.050590403378009796

Test set: Avg. loss: 0.1044, Accuracy: 9680/10000 (97%)

Epoch 4, Loss 0.048892855644226074

Test set: Avg. loss: 0.0829, Accuracy: 9737/10000 (97%)



In [54]:
# Total number of scalar entries across Net4's trainable parameters.
n_trainable = sum(weight.numel() for weight in net4.parameters() if weight.requires_grad)
print(n_trainable)

32586
