### Inception

![Structure of the Inception block. ](http://www.d2l.ai/_images/inception.svg)

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time

In [2]:
class Inception(nn.Module):
    # c1 - c4 are the number of output channels for each layer in the path.
    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super(Inception, self).__init__(**kwargs)
        # Path 1 is a single 1 x 1 convolutional layer.
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # Path 2 is a 1 x 1 convolutional layer followed by a 3 x 3 convolutional layer.
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # Path 3 is a 1 x 1 convolutional layer followed by a 5 x 5 convolutional layer.
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1 convolutional layer.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)
        # Activation function
        self.relu = nn.ReLU()

    def forward(self, x):
        p1 = self.relu(self.p1_1(x))
        p2 = self.relu(self.p2_2(self.relu(self.p2_1(x))))
        p3 = self.relu(self.p3_2(self.relu(self.p3_1(x))))
        p4 = self.relu(self.p4_2(self.p4_1(x)))
        # Concatenate the outputs on the channel dimension
        return torch.cat((p1, p2, p3, p4), dim=1)

### Inception Model - Stage 1

In [3]:
b1 = nn.Sequential(
       nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
       nn.ReLU(),
       nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
       )

### Inception Model - Stage 2

In [4]:
b2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=1),
       nn.ReLU(),
       nn.Conv2d(64, 192, kernel_size=3, padding=1),
       nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
       )

### Inception Model - Stage 3

In [5]:
b3 = nn.Sequential(
       Inception(192, 64, (96, 128), (16, 32), 32),
       Inception(256, 128, (128, 192), (32, 96), 64),
       nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
       )

### Inception Model - Stage 4

We use a total of 512 channels (128 + 256 + 64 + 64) 

In [6]:
b4 = nn.Sequential(
       Inception(480, 192, (96, 208), (16, 48), 64),
       Inception(512, 160, (112, 224), (24, 64), 64),
       Inception(512, 128, (128, 256), (24, 64), 64),
       Inception(512, 112, (144, 288), (32, 64), 64),
       Inception(528, 256, (160, 320), (32, 128), 128),
       nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
       )

### Inception Model - Stage 5

In [7]:
b5 = nn.Sequential(
       Inception(832, 256, (160, 320), (32, 128), 128),
       Inception(832, 384, (192, 384), (48, 128), 128),
       nn.AdaptiveMaxPool2d((1,1))
       )

class flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

def weights_init(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)
net = nn.Sequential(
        *b1, 
        *b2, 
        *b3, 
        *b4, 
        *b5, 
        flatten(),
        nn.Linear(1024, 10)
        )

net = net.apply(weights_init)

Priming the network (at full size)

In [8]:
X = torch.randn(size=(1, 1, 96, 96))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)

Conv2d output shape:	 torch.Size([1, 64, 48, 48])
ReLU output shape:	 torch.Size([1, 64, 48, 48])
MaxPool2d output shape:	 torch.Size([1, 64, 24, 24])
Conv2d output shape:	 torch.Size([1, 64, 24, 24])
ReLU output shape:	 torch.Size([1, 64, 24, 24])
Conv2d output shape:	 torch.Size([1, 192, 24, 24])
MaxPool2d output shape:	 torch.Size([1, 192, 12, 12])
Inception output shape:	 torch.Size([1, 256, 12, 12])
Inception output shape:	 torch.Size([1, 480, 12, 12])
MaxPool2d output shape:	 torch.Size([1, 480, 6, 6])
Inception output shape:	 torch.Size([1, 512, 6, 6])
Inception output shape:	 torch.Size([1, 512, 6, 6])
Inception output shape:	 torch.Size([1, 512, 6, 6])
Inception output shape:	 torch.Size([1, 528, 6, 6])
Inception output shape:	 torch.Size([1, 832, 6, 6])
MaxPool2d output shape:	 torch.Size([1, 832, 3, 3])
Inception output shape:	 torch.Size([1, 832, 3, 3])
Inception output shape:	 torch.Size([1, 1024, 3, 3])
AdaptiveMaxPool2d output shape:	 torch.Size([1, 1024, 1, 1])
flatten 

## Data Acquisition and Training

As before, we train our model using the Fashion-MNIST dataset. We transform it to $96 \times 96$ pixel resolution before invoking the training procedure.

In [9]:
def evaluate_accuracy(data_iter, net):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = 0,0
    for (imgs, labels) in data_iter:
        # send data to the GPU if cuda is availabel
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        net.eval()
        with torch.no_grad():
            labels = labels.long()
            acc_sum += torch.sum((torch.argmax(net(imgs), dim=1) == labels)).float()
            n += labels.shape[0]
    return acc_sum.item()/n

if torch.cuda.is_available():
    print('Training using GPU.')
    net.cuda()
else:
    print('Training using CPU.')

#Initialize network parameters.
net.apply(weights_init)

lr, num_epochs, batch_size = 0.1, 5, 128
optimizer = torch.optim.SGD(net.parameters(), lr=lr)

# Apply resize to 96*96 at trasfrom
transform = transforms.Compose([transforms.Resize(96),
                                transforms.ToTensor()
                                ]) 
mnist_trainset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)
# Loading training set and test set using DataLoader.
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=batch_size,
    shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=batch_size,
    shuffle=True, num_workers=0)

criterion = nn.CrossEntropyLoss()

from tqdm import tqdm
for epoch in range(num_epochs):
    net.train() # Switch to training mode
    n, start = 0, time.time()
    train_l_sum = torch.tensor([0.0], dtype=torch.float32)
    train_acc_sum = torch.tensor([0.0], dtype=torch.float32)
    train_iter = iter(train_loader)
    # for _, (X, y) in tqdm(enumerate(train_iter)):
    for X, y in train_iter:
        optimizer.zero_grad()
        if torch.cuda.is_available():
            X = X.cuda()
            y = y.cuda()
            train_l_sum = train_l_sum.cuda()
            train_acc_sum = train_acc_sum.cuda()
        y_hat = net(X)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            y = y.long()
            train_l_sum += loss.float()
            train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == y))).float()
            n += y.shape[0]

    test_acc = evaluate_accuracy(iter(test_loader), net) 
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'\
        % (epoch + 1, train_l_sum/n, train_acc_sum/n, test_acc, time.time() - start))

Training using GPU.
epoch 1, loss 0.0150, train acc 0.287, test acc 0.549, time 47.0 sec
epoch 2, loss 0.0053, train acc 0.737, test acc 0.776, time 47.1 sec
epoch 3, loss 0.0036, train acc 0.827, test acc 0.817, time 47.4 sec
epoch 4, loss 0.0030, train acc 0.856, test acc 0.844, time 47.3 sec
epoch 5, loss 0.0027, train acc 0.870, test acc 0.871, time 47.3 sec
