# **NiN**

Network in Network (NiN) Get rid of the fully connected layers, it replaced AlexNet's densse layers with NiN blocks, and act like a global average pooling layer to comnine outputs. Although they may not perform as well as VGG, they were a key thing to go to inception or ResNet.

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time

batch_size = 128
num_epochs = 5
# Note not to flip two transform types, otherwise data type would be wrong.
transform = transforms.Compose([transforms.Resize(224),
                                transforms.ToTensor(),
                              ]) 

mnist_trainset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# **Implementation for NiN**

In [2]:
def nin_block(num_in_channels, num_out_channels, kernel_size, strides, padding):
    ''' NiN block.
        Inputs: 
                num_in_channels: number of input channels of the NiN block
                num_out_channels: number of output channels of the NiN block
        Output: the NiN block at the given shape
    '''
    blk = nn.Sequential(
            nn.Conv2d(num_in_channels, num_out_channels, kernel_size, strides, padding),
            nn.ReLU(),
            nn.Conv2d(num_out_channels, num_out_channels, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(num_out_channels, num_out_channels, kernel_size=1),
            nn.ReLU())
    return blk


class flatten(nn.Module):
    ''' Flatten convotional layers output for classifier.'''
    def forward(self, x):
        return x.view(x.shape[0], -1)

# Establish NiN using nin_block
def nin():
    nin_1 = nin_block(num_in_channels=1,num_out_channels=96, kernel_size=11, strides=4, padding=0)
    nin_2 = nin_block(num_in_channels=96,num_out_channels=256, kernel_size=5, strides=1, padding=2)
    nin_3 = nin_block(num_in_channels=256,num_out_channels=384, kernel_size=3, strides=1, padding=1)
    nin_4 = nin_block(num_in_channels=384,num_out_channels=10, kernel_size=3, strides=1, padding=1)
    nin_net=nn.Sequential(
                      *nin_1,
                      nn.MaxPool2d(3,stride=2),
                      *nin_2,
                      nn.MaxPool2d(3,stride=2),
                      *nin_3,
                      nn.MaxPool2d(3,stride=2),
                      nn.Dropout2d(0.5),
                      # Get 10 label classes
                      *nin_4,
                      # Global Average Pooling
                      nn.AdaptiveMaxPool2d((1,1)),
                      # Transform the four-dimensional output into 2D output
                      # flatten()
                      torch.nn.Flatten()
                     )
    return nin_net

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.normal_(0.0, 0.01)
    elif classname.find('Linear') != -1:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.normal_(0.0, 0.01)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.01)
        m.bias.data.fill_(0)

def evaluate_accuracy(data_iter, net):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = 0,0
    for (imgs, labels) in data_iter:
        # send data to the GPU if cuda is availabel
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        net.eval()
        with torch.no_grad():
            labels = labels.long()
            acc_sum += torch.sum((torch.argmax(net(imgs), dim=1) == labels)).float()
            n += labels.shape[0]
    return acc_sum.item()/n

NiN required significantly less memories to be computed, which is about 3.16 GB. In comparison, the VGG11 with same dataset and batch size consummes more than 14 GB of memory.

The computation time for NiN also computed much faster than VGG (~80 sec/epoch vs ~400 sec/epoch)

In [3]:
# Loading training set and test set using DataLoader.
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=batch_size,
    shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=batch_size,
    shuffle=True, num_workers=0)

if torch.cuda.is_available():
    print('Training using GPU.')
    net = nin().cuda()
else:
    print('Training using CPU.')
    net = nin()

#Initialize network parameters.
net.apply(weights_init)

#Loss function
if torch.cuda.is_available():
    loss = nn.CrossEntropyLoss().cuda()
else:
    loss = nn.CrossEntropyLoss()

# Train using SGD optimizer 
lr= 0.1 # Compare to LeNet, the learning rate is much smaller due to much larget images
opt_n = optim.SGD(net.parameters(), lr=lr)

# Training stage
from tqdm import tqdm
for epoch in range(1, num_epochs+1):
    train_loader_iter = iter(train_loader)
    train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
    
    for _, (imgs, labels) in tqdm(enumerate(train_loader_iter)):
        net.train()
        opt_n.zero_grad()
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        # Label prediction from LeNet
        y_hat = net(imgs)
        l = loss(y_hat, labels)
        # Backprobagation
        l.backward()
        opt_n.step()

        # Calculate tarining error
        with torch.no_grad():
            labels = labels.long()
            train_l_sum += l.item()
            train_acc_sum += (torch.sum(torch.argmax(y_hat, dim=1) == labels)).float().item()
            n += labels.shape[0]
    # calculate testing error every epoch.
    test_acc = evaluate_accuracy(iter(test_loader), net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
          % (epoch, train_l_sum/n, train_acc_sum/n, test_acc,
            time.time() - start))


            



Training using GPU.


469it [01:16,  6.11it/s]
1it [00:00,  6.08it/s]

epoch 1, loss 0.0134, train acc 0.358, test acc 0.503, time 81.1 sec


469it [01:17,  6.04it/s]
1it [00:00,  6.03it/s]

epoch 2, loss 0.0066, train acc 0.689, test acc 0.758, time 81.9 sec


469it [01:18,  5.97it/s]
1it [00:00,  5.77it/s]

epoch 3, loss 0.0048, train acc 0.775, test acc 0.767, time 82.8 sec


469it [01:18,  5.95it/s]
1it [00:00,  5.94it/s]

epoch 4, loss 0.0041, train acc 0.808, test acc 0.745, time 83.1 sec


469it [01:19,  5.93it/s]


epoch 5, loss 0.0037, train acc 0.821, test acc 0.830, time 83.3 sec
