# NiN Net with MNIST Dataset

`Author: YUAN Yanzhe`

- This notebook is a reproduction of the [NiN paper](https://arxiv.org/abs/1312.4400).
  - If you want to tune hyperparameters, it is recommended to expose them as arguments of the model's constructor.
    - e.g. `def __init__(self, param)`
- The code runs on Google Colab, GPU mode

**一些细节**
- NiN：Network in Network.
  - LeNet、AlexNet和VGG在设计上的共同之处是：先以由卷积层构成的模块充分抽取空间特征，再以由全连接层构成的模块来输出分类结果。
  - 其中，AlexNet和VGG对LeNet的改进主要在于如何对这两个模块加宽（增加通道数）和加深。  
  - NiN提出了另外一个思路，即串联多个由卷积层和“全连接”层构成的小网络来构建一个深层网络。
- 1\*1卷积
  - 一般来说，1\*1卷积的目的是在不改变image的size的情况下改变in/out channel（即feature_num），可以理解为在feature维度上进行卷积。
  - 在NiN Net中，1\*1卷积的目的是配合Global Average Pooling来替代全连接层，因为它有着更少的参数量。
- NiN结构
  - nin_block
    - conv-relu-conv-relu-conv-relu: 第一个conv控制feature_num，后面两个都是不改变feature_num的1\*1conv
  - 大体上是:
    - 3个：nin_block-pool：nin控制feature_num，pool控制image的size。
    - dropout: regularization
    - nin_block+global_avgpool: nin降维和分类，avgpool将image的size变为1\*1
- 定义了一个global average pooling类。前面的1\*1卷积只改动feature数而不改动image的size，因此这个类才是用来取代全连接层的关键部分：它的作用是将任意的image size变为(1,1)。
  - 好处是减少参数。
  - Intuitively, 假如最后一层的数据是10个6\*6的特征图，global average pooling会对每一张特征图计算所有像素点的均值，输出一个数据值，这样10个特征图就会输出10个数据点；将这些数据点组成一个1\*10的向量，就成为一个特征向量，可以送入softmax进行分类计算了。
  - 如果是全连接层，会将feature_num个feature进行拼接，然后用一个简单的FNN网络输出到10维度进行softmax。

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the project folder on the mounted Drive the working directory, so
# relative paths used later are resolved against it.
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/d2dl_pytorch')

In [3]:
# Import Packages
import torch
from torch import nn as nn
from torch import optim as optim
from torch.utils import data as Data

import torchvision
from torchvision import datasets
from torchvision import transforms

import numpy as np
import pandas as pd 
import time

import d2lzh_pytorch as d2dl

# Report the installed PyTorch version.
print(torch.__version__)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device on:', device)

1.7.0+cu101
device on: cuda


In [4]:
# Hyperparameters
batch_size = 256  # mini-batch size handed to the data loaders
num_epochs = 5  # number of passes over the training iterator
learning_rate = 0.001  # learning rate handed to the Adam optimizer

# Load Data
# Note: parameters without defaults (batch_size) must come before parameters with
# defaults; otherwise Python raises "non-default argument follows default argument".
def load_data_from_mnist(batch_size, resize=None, root=''):
    """Build train/test DataLoaders for MNIST data already stored under ``root``.

    The dataset is opened with ``download=False``, so it is expected to be
    present on disk already.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        resize: Optional target size passed to ``transforms.Resize``;
            no resizing is applied when falsy.
        root: Directory that holds the MNIST data.

    Returns:
        A ``(train_iterator, test_iterator)`` tuple of DataLoaders.
    """
    trans = []
    if resize:
        trans.append(transforms.Resize(resize))
    trans.append(transforms.ToTensor())
    transform = transforms.Compose(trans)

    train_data = torchvision.datasets.MNIST(root=root, train=True, transform=transform, download=False)
    test_data = torchvision.datasets.MNIST(root=root, train=False, transform=transform, download=False)
    train_iterator = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
    # Evaluation does not benefit from shuffling; keep the test order fixed,
    # matching load_data_fashion_mnist.
    test_iterator = Data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

    return train_iterator, test_iterator

def load_data_fashion_mnist(batch_size, resize=None, root=''):
    """Return ``(train_iter, test_iter)`` DataLoaders for Fashion-MNIST.

    The dataset is requested with ``download=True`` under ``root``. Images are
    optionally resized to ``resize`` and then converted to tensors. Only the
    training loader shuffles its samples.
    """
    steps = [torchvision.transforms.Resize(size=resize)] if resize else []
    steps.append(torchvision.transforms.ToTensor())
    pipeline = torchvision.transforms.Compose(steps)

    # Both splits share the same root, download flag and transform pipeline.
    shared = dict(root=root, download=True, transform=pipeline)
    train_set = torchvision.datasets.FashionMNIST(train=True, **shared)
    test_set = torchvision.datasets.FashionMNIST(train=False, **shared)

    make_loader = torch.utils.data.DataLoader
    train_iter = make_loader(train_set, batch_size=batch_size, shuffle=True, num_workers=4)
    test_iter = make_loader(test_set, batch_size=batch_size, shuffle=False, num_workers=4)
    return train_iter, test_iter

# Build the data loaders with images resized to 224.
# Fashion-MNIST alternative (disabled):
#train_iterator, test_iterator = load_data_fashion_mnist(batch_size,resize=224)
train_iterator, test_iterator = load_data_from_mnist(batch_size,resize=224)


In [9]:
# Define Model
class globalAvgPool(nn.Module):
    """Global average pooling: average every feature map down to one value."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # The pooling window spans every dimension after (batch, channel),
        # so each channel collapses to a single averaged value.
        window = x.shape[2:]
        return nn.functional.avg_pool2d(x, kernel_size=window)

class ninNet(nn.Module):
    """Network-in-Network classifier for 1-channel images with 10 output scores.

    Three NiN blocks, each followed by a 3x3 / stride-2 max pool, are followed
    by dropout, a final NiN block that maps to 10 channels, and global average
    pooling. ``forward`` flattens the pooled result to ``(batch, 10)``.
    """

    def __init__(self):
        super().__init__()
        stages = [
            self.nin_block(1, 96, k_size=11, std=4, pad=0),
            nn.MaxPool2d(3, 2),
            # 5x5 conv with padding 2: spatial size kept, channels 96 -> 256
            self.nin_block(96, 256, 5, 1, 2),
            nn.MaxPool2d(3, 2),  # spatial downsampling
            # 3x3 conv with padding 1: spatial size kept, channels 256 -> 384
            self.nin_block(256, 384, 3, 1, 1),
            nn.MaxPool2d(3, 2),  # spatial downsampling
            nn.Dropout(0.5),
            # spatial size kept, channels reduced to the 10 outputs
            self.nin_block(384, 10, 3, 1, 1),
            # average over the whole (img.h, img.w) window: (batch, 10, 1, 1)
            globalAvgPool(),
        ]
        self.nin_layer = nn.Sequential(*stages)

    def nin_block(self, c_in, c_out, k_size, std, pad):
        """Return one NiN block: a k_size conv followed by two 1x1 convs.

        Every convolution is followed by a ReLU. The first convolution maps
        ``c_in`` to ``c_out`` channels with the given kernel size, stride
        (``std``) and padding (``pad``); the two 1x1 convolutions keep
        ``c_out`` channels and stand in for fully connected layers, which
        makes the block a tiny network inside the larger one.
        """
        return nn.Sequential(
            nn.Conv2d(c_in, c_out, k_size, std, pad),
            nn.ReLU(),
            nn.Conv2d(c_out, c_out, 1),
            nn.ReLU(),
            nn.Conv2d(c_out, c_out, 1),
            nn.ReLU(),
        )

    def forward(self, x):
        pooled = self.nin_layer(x)
        # Flatten the 4-D result to 2-D: (batch, 10)
        return pooled.view(x.shape[0], -1)


# Instantiate the network and print its layer structure.
net = ninNet()
print(net)

# Cross-entropy loss, and Adam over all network parameters.
loss_func = nn.CrossEntropyLoss()
optimizor = optim.Adam(net.parameters(), lr=learning_rate)

ninNet(
  (nin_layer): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
      (1): ReLU()
      (2): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1))
      (3): ReLU()
      (4): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1))
      (5): ReLU()
    )
    (1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): Sequential(
      (0): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
      (1): ReLU()
      (2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
      (3): ReLU()
      (4): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
      (5): ReLU()
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1))
      (3): ReLU()
      (4): Conv2d(384, 384, kernel_size=(1, 1), 

In [10]:
# Train Model
def evaluate_model(net, test_iterator, device):
    """Return the classification accuracy (in percent) of ``net`` on ``test_iterator``.

    Args:
        net: Either an ``nn.Module`` or a plain callable that maps a batch
            ``X`` to per-class scores. A module is moved to ``device`` and
            evaluated in eval mode; a plain callable receives the batches
            as-is (no device transfer) and, if it accepts an ``is_training``
            argument, is called with ``is_training=False``.
        test_iterator: Iterable yielding ``(X, y)`` batches.
        device: Device the module and its batches are moved to.

    Returns:
        Percentage of correctly predicted samples, or ``0.0`` when the
        iterator yields no samples.
    """
    is_module = isinstance(net, nn.Module)
    if is_module:
        # Only modules provide .to(); plain callables are left untouched.
        net = net.to(device)
    print('testing on:', device)

    if is_module:
        # Remember the current mode so it can be restored afterwards instead
        # of unconditionally forcing the model back into training mode.
        was_training = net.training
        net.eval()  # eval mode will shut off dropout function
        takes_flag = False
    else:
        print('is this your self-defined nn module?? we are not considering GPU if so')
        code = getattr(net, '__code__', None)
        takes_flag = code is not None and 'is_training' in code.co_varnames

    correct, num_exp = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in test_iterator:
                if is_module:
                    y = y.to(device)
                    scores = net(X.to(device))
                elif takes_flag:
                    scores = net(X, is_training=False)
                else:
                    scores = net(X)
                correct += (scores.argmax(dim=1) == y).float().sum().item()
                num_exp += y.size(0)
    finally:
        if is_module:
            net.train(was_training)

    if num_exp == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return correct / num_exp * 100

def train_model(num_epochs, train_iterator, test_iterator, loss_func, optimizor, net, device):
    """Train ``net`` for ``num_epochs`` epochs, printing metrics after each one.

    Each epoch runs one optimisation step per ``(X, y)`` batch of
    ``train_iterator``, then evaluates on ``test_iterator`` and prints the mean
    batch loss, the training accuracy, the test accuracy and the elapsed time.
    Nothing is returned.
    """
    net = net.to(device)
    print('training on:', device)
    for epoch_idx in range(num_epochs):
        epoch_start = time.time()
        loss_sum = 0.0
        batch_count = 0
        hit_count = 0.0
        sample_count = 0

        for inputs, targets in train_iterator:
            inputs, targets = inputs.to(device), targets.to(device)

            logits = net(inputs)
            batch_loss = loss_func(logits, targets)
            optimizor.zero_grad()
            batch_loss.backward()
            optimizor.step()

            # Running statistics for this epoch's report.
            loss_sum += batch_loss.cpu().item()
            batch_count += 1
            hit_count += (logits.argmax(1) == targets).sum().cpu().item()
            sample_count += targets.size(0)

        test_acc = evaluate_model(net, test_iterator, device)
        mean_loss = loss_sum / batch_count
        train_acc = hit_count / sample_count * 100
        report = 'Epoch: {}, Average loss: {:.4f}, Average accuracy: {:.2f}%, Test Accuracy: {:.2f}%, time: {:.1f}sec'
        print(report.format(epoch_idx + 1, mean_loss, train_acc, test_acc, time.time() - epoch_start))

# Run training with the hyperparameters, data loaders, loss, optimizer, network and device set up above.
train_model(num_epochs,train_iterator,test_iterator,loss_func,optimizor,net,device)
        
# Prediction

training on: cuda
testing on: cuda
Epoch: 1, Average loss: 2.1091, Average accuracy: 19.04%, Test Accuracy: 46.99%, time: 44.8sec
testing on: cuda
Epoch: 2, Average loss: 0.8541, Average accuracy: 72.04%, Test Accuracy: 82.92%, time: 44.9sec
testing on: cuda
Epoch: 3, Average loss: 0.4695, Average accuracy: 86.28%, Test Accuracy: 91.83%, time: 45.2sec
testing on: cuda
Epoch: 4, Average loss: 0.2938, Average accuracy: 91.67%, Test Accuracy: 94.96%, time: 45.3sec
testing on: cuda
Epoch: 5, Average loss: 0.2136, Average accuracy: 93.92%, Test Accuracy: 96.01%, time: 44.8sec
