# ResNet with MNIST Dataset
`Author: YUAN Yanzhe`

- This notebook is a reproduction of the [ResNet paper](https://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
  - If you want to do parameter fine-tuning, setting hyperparameters on the entrance of the model is recommended.
    - e.g. def \_\_init\_\_(param) 
- The code runs on Google Colab, GPU mode

一些细节：
- ResNet的motivation：理论上，对神经网络添加新的layer应当使训练误差减少（至少不会增加）；但在实践中，添加过多的层后训练误差往往不降反升（退化问题）。
  - BN是一种思路，但实践中，添加BN看似并不能完全解决问题。
  - 是否能添加上一层（块）的输出到这一层（块）的输出上，这样能避免信息遗忘，添加一条通路也可以缓解梯度消失等问题。
  - 因为原网络解的空间只是新模型解的空间的子空间，如果我们能将新添加的层训练成恒等映射f(x)=x，新模型和原模型将同样有效。
- ResNet的结构：
  - 首先一个类似于googlenet第一层的结构
  - 然后是4个resnet block，每个block含有res_num个residual blocks
    - 对一个block，其包含的若干个residual block中，第一个是一个(c_in,c_out)且stride为2的residual block（控制特征数和image size），剩下的是(c_out,c_out)的residual block（提取特征）。
      - 一个residual block的输出：内部两个cnn的结构的输出，和输入在最后相加。
  - 然后是global average pool：将image size变为1，1
  - 然后是flatten+特征维度的fc：nn.Linear(～,10) 这里的feature_num是512（当然，可以更大）

- ResNet细节在代码中注释了。
  - ResNet的前两层跟之前介绍的GoogLeNet中的一样：在输出通道数为64、步幅为2的7×7卷积层后接步幅为2的3×3的最大池化层。不同之处在于ResNet每个卷积层后增加的批量归一化层。
  - ResNet包含若干个resnet_block，每个resnet_block包含指定num的residual_block，residual_block另定义了类，是一个残差思想的结构。
  - 不要忘记globalAvgPooling以及flatten layer
    - 前者起到fc作用，将image size变为1\*1
    - 后者将image维度去掉，只剩下(batch,feature_num)，用于linear层分类到softmax。

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Switch the working directory to the course folder on the mounted Drive so
# that relative paths used later resolve against this folder.
os.chdir('/content/drive/MyDrive/Colab Notebooks/d2dl_pytorch')

In [3]:
# Import Packages
import torch
from torch import nn as nn
from torch import optim as optim
from torch.utils import data as Data

import torchvision
from torchvision import datasets
from torchvision import transforms

import numpy as np
import pandas as pd 
import time

import d2lzh_pytorch as d2dl

# Log the installed PyTorch version.
print(torch.__version__)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device on:', device)

1.7.0+cu101
device on: cuda


In [11]:
# Hyperparameters
batch_size = 256  # samples per mini-batch, passed to the data loaders
num_epochs = 5  # number of passes over the training set
learning_rate = 0.001  # step size handed to the optimizer

num_classes = 10  # number of target classes; not referenced elsewhere in the visible code

# Load Data
# Note: parameters without a default (batch_size) must come before parameters with a default (resize, root).
def load_data_from_mnist(batch_size, resize=None, root=''):
    """Build DataLoaders over the MNIST training and test splits.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        resize: Optional size forwarded to ``transforms.Resize``. When falsy,
            no resize step is added to the transform pipeline.
        root: Dataset root directory passed to ``torchvision.datasets.MNIST``.
            The dataset is opened with ``download=False``, so the files must
            already be present under ``root``.

    Returns:
        A ``(train_iterator, test_iterator)`` tuple. The training loader is
        shuffled; the test loader iterates in a fixed order.
    """
    steps = []
    if resize:
        steps.append(transforms.Resize(resize))
    steps.append(transforms.ToTensor())
    transform = transforms.Compose(steps)

    train_data = torchvision.datasets.MNIST(root=root, train=True, transform=transform, download=False)
    test_data = torchvision.datasets.MNIST(root=root, train=False, transform=transform, download=False)

    train_iterator = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
    # Shuffling brings nothing to evaluation; a fixed order keeps test runs
    # reproducible and matches load_data_fashion_mnist.
    test_iterator = Data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

    return train_iterator, test_iterator

def load_data_fashion_mnist(batch_size, resize=None, root=''):
    """Build DataLoaders over the Fashion-MNIST training and test splits.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        resize: Optional size forwarded to ``torchvision.transforms.Resize``.
            When falsy, no resize step is added.
        root: Dataset root directory; both splits are opened with
            ``download=True``.

    Returns:
        A ``(train_iter, test_iter)`` tuple; only the training loader is
        shuffled.
    """
    tv_transforms = torchvision.transforms
    pipeline = [tv_transforms.Resize(size=resize)] if resize else []
    pipeline.append(tv_transforms.ToTensor())
    transform = tv_transforms.Compose(pipeline)

    # Training split first, then the test split.
    train_set, test_set = [
        torchvision.datasets.FashionMNIST(root=root, train=is_train, download=True, transform=transform)
        for is_train in (True, False)
    ]

    shared = dict(batch_size=batch_size, num_workers=4)
    train_iter = torch.utils.data.DataLoader(train_set, shuffle=True, **shared)
    test_iter = torch.utils.data.DataLoader(test_set, shuffle=False, **shared)

    return train_iter, test_iter

# Build the train/test loaders with resize=96. The Fashion-MNIST variant is kept
# below as a commented-out alternative; MNIST is the dataset actually used.
#train_iterator, test_iterator = load_data_fashion_mnist(batch_size,resize=96)
train_iterator, test_iterator = load_data_from_mnist(batch_size,resize=96)

# Define Model
class globalAvgPool(nn.Module):
    """Global average pooling layer.

    Averages the input over everything after its first two dimensions by
    using a pooling window equal to that trailing size, so a
    ``(batch, channels, H, W)`` input becomes ``(batch, channels, 1, 1)``.
    This makes the later flatten-and-classify step straightforward.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # The pooling window covers the whole feature map.
        window = x.shape[2:]
        return nn.functional.avg_pool2d(x, kernel_size=window)

class residualBlock(nn.Module):
    """Residual block computing ``relu(F(x) + shortcut(x))``.

    The main path ``F`` is conv3x3 -> BN -> ReLU -> conv3x3 -> BN. The first
    convolution maps ``c_in`` to ``c_out`` channels with stride ``std``; the
    second keeps both the channel count and the spatial size.

    The shortcut is either the input itself or, when ``is_11_cnn`` is true, a
    1x1 convolution (``c_in`` -> ``c_out``, stride ``std``) that reshapes the
    input so it can be added to the main-path output.

    Args:
        c_in: Number of input channels.
        c_out: Number of output channels.
        is_11_cnn: Whether to apply the 1x1 convolution on the shortcut.
        std: Stride of the first 3x3 convolution and of the 1x1 shortcut
            convolution.
    """

    def __init__(self, c_in, c_out, is_11_cnn=False, std=1):
        super().__init__()
        self.cnn_1 = nn.Conv2d(c_in, c_out, kernel_size=3, stride=std, padding=1)
        self.cnn_2 = nn.Conv2d(c_out, c_out, kernel_size=3, stride=1, padding=1)
        self.bn_1 = nn.BatchNorm2d(c_out)
        self.bn_2 = nn.BatchNorm2d(c_out)
        # Optional projection of the input onto the main path's output shape.
        self.cnn_3 = nn.Conv2d(c_in, c_out, kernel_size=1, stride=std) if is_11_cnn else None

    def forward(self, x):
        y = nn.functional.relu(self.bn_1(self.cnn_1(x)))
        y = self.bn_2(self.cnn_2(y))
        # Explicit None check: do not rely on the truthiness of a Module.
        if self.cnn_3 is not None:
            x = self.cnn_3(x)
        # The final ReLU is applied after the residual addition.
        return nn.functional.relu(y + x)

class resNet(nn.Module):
    """ResNet-style image classifier built from ``residualBlock`` stages.

    Layout:
      1. Stem (as in GoogLeNet, plus batch norm): 7x7 conv (stride 2) -> BN ->
         ReLU -> 3x3 max-pool (stride 2).
      2. Four stages of two residual blocks each, with 64, 128, 256 and 512
         output channels. In every stage the first residual block uses
         stride 2 and a 1x1 shortcut convolution; the remaining blocks keep
         channels and spatial size.
      3. Global average pooling, which reduces each feature map to 1x1.
      4. Flatten followed by a linear layer producing the class scores.

    Args:
        in_channels: Number of channels of the input images (default 1).
        num_classes: Number of output classes (default 10).
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()
        block_1 = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),  # addition compared with the GoogLeNet stem
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        block_2 = self.resnet_block(c_in=64, c_out=64, res_num=2, is_first=True)
        block_3 = self.resnet_block(64, 128, 2)
        block_4 = self.resnet_block(128, 256, 2)
        block_5 = self.resnet_block(256, 512, 2)
        avgpool = globalAvgPool()
        self.resNet_layer = nn.Sequential(block_1, block_2, block_3, block_4, block_5, avgpool)
        # After global average pooling the feature map is 1x1, so flattening
        # leaves the 512 channels of the last stage as the feature vector.
        self.fc_layer = nn.Linear(512, num_classes)

    def resnet_block(self, c_in, c_out, res_num, is_first=False):
        """Stack ``res_num`` residual blocks into one stage.

        The first block maps ``c_in`` to ``c_out`` channels with stride 2 and
        a 1x1 shortcut convolution; the others are ``c_out`` -> ``c_out``
        blocks with the default stride and no shortcut convolution.

        NOTE(review): ``is_first`` only enforces ``c_in == c_out``; the first
        stage still downsamples like every other stage.
        """
        if is_first:
            assert c_in == c_out, 'the first stage must keep the channel count'
        block = []
        for i in range(res_num):
            if i == 0:
                block.append(residualBlock(c_in, c_out, is_11_cnn=True, std=2))
            else:
                block.append(residualBlock(c_out, c_out))  # is_11_cnn=False, stride=1
        return nn.Sequential(*block)

    def forward(self, x):
        """Return the ``(batch, num_classes)`` class scores for a batch ``x``."""
        # Fixed: the feature extractor is stored as ``resNet_layer``.
        y = self.resNet_layer(x)
        y = self.fc_layer(y.view(x.shape[0], -1))  # flatten to (batch, features)
        return y

# Instantiate the network and print its layer structure.
net = resNet()
print(net)

# Cross-entropy loss on the network outputs, optimised with Adam over all
# network parameters.
loss_func = nn.CrossEntropyLoss()
optimizor = optim.Adam(net.parameters(), lr=learning_rate)



resNet(
  (block_1): Sequential(
    (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (block_2): Sequential(
    (0): residualBlock(
      (cnn_1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (cnn_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn_2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (cnn_3): Conv2d(64, 64, kernel_size=(1, 1), stride=(2, 2))
    )
    (1): residualBlock(
      (cnn_1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn_1): BatchNorm2d(64, eps=1e-05, mom

In [12]:
# Train Model
def evaluate_model(net, test_iterator, device):
    """Return the classification accuracy of ``net`` in percent.

    Args:
        net: Either an ``nn.Module`` or a plain Python function mapping a
            batch of inputs to per-class scores. A plain function may accept
            an ``is_training`` keyword, in which case it is called with
            ``is_training=False``.
        test_iterator: Iterable yielding ``(X, y)`` batches.
        device: Device on which an ``nn.Module`` is evaluated. It is not used
            for plain functions, which are called on the batches as given.

    Returns:
        Percentage of correctly classified examples, or ``0.0`` when the
        iterator yields no examples.
    """
    is_module = isinstance(net, nn.Module)
    if is_module:
        net = net.to(device)
        print('testing on:', device)
        # Remember the current mode so evaluation does not silently change it.
        was_training = net.training
        net.eval()  # eval mode disables dropout and uses BN running statistics
    else:
        print('is this your self-defined nn module?? we are not considering GPU if so')
        accepts_flag = 'is_training' in net.__code__.co_varnames

    correct, num_exp = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in test_iterator:
                if is_module:
                    pred = net(X.to(device)).argmax(1)
                    correct += (pred == y.to(device)).float().sum().cpu().item()
                elif accepts_flag:
                    correct += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    correct += (net(X).argmax(dim=1) == y).float().sum().item()
                num_exp += y.size(0)
    finally:
        if is_module:
            net.train(was_training)

    if num_exp == 0:
        return 0.0
    return correct / num_exp * 100

def train_model(num_epochs, train_iterator, test_iterator, loss_func, optimizor, net, device):
    """Train ``net`` and print one summary line per epoch.

    Args:
        num_epochs: Number of passes over ``train_iterator``.
        train_iterator: Iterable yielding ``(X, y)`` training batches.
        test_iterator: Iterable handed to ``evaluate_model`` after each epoch.
        loss_func: Callable computing the loss from ``(output, y)``.
        optimizor: Optimizer whose ``zero_grad``/``step`` drive the updates.
        net: Model to train; it is moved to ``device`` first.
        device: Device on which the batches and the model are placed.

    Each epoch line reports the mean batch loss, the training accuracy, the
    test accuracy and the elapsed time (which includes the evaluation pass).
    """
    net = net.to(device)
    print('training on:', device)

    for epoch_idx in range(num_epochs):
        epoch_start = time.time()
        loss_sum, batch_count = 0.0, 0
        hit_count, sample_count = 0.0, 0

        for inputs, labels in train_iterator:
            inputs, labels = inputs.to(device), labels.to(device)

            logits = net(inputs)
            batch_loss = loss_func(logits, labels)
            optimizor.zero_grad()
            batch_loss.backward()
            optimizor.step()

            # Running statistics for the epoch summary.
            loss_sum += batch_loss.cpu().item()
            batch_count += 1
            hit_count += (logits.argmax(1) == labels).sum().cpu().item()
            sample_count += labels.size(0)

        test_acc = evaluate_model(net, test_iterator, device)
        print('Epoch: {}, Average loss: {:.4f}, Average accuracy: {:.2f}%, Test Accuracy: {:.2f}%, time: {:.1f}sec'.format(
            epoch_idx + 1,
            loss_sum / batch_count,
            hit_count / sample_count * 100,
            test_acc,
            time.time() - epoch_start,
        ))

# Run the training loop with the hyperparameters, data loaders, loss,
# optimizer and model defined above.
train_model(num_epochs,train_iterator,test_iterator,loss_func,optimizor,net,device)
        
# Prediction

training on: cuda
testing on: cuda
Epoch: 1, Average loss: 0.1284, Average accuracy: 95.90%, Test Accuracy: 97.27%, time: 19.3sec
testing on: cuda
Epoch: 2, Average loss: 0.0433, Average accuracy: 98.69%, Test Accuracy: 98.51%, time: 19.3sec
testing on: cuda
Epoch: 3, Average loss: 0.0320, Average accuracy: 99.02%, Test Accuracy: 98.73%, time: 19.3sec
testing on: cuda
Epoch: 4, Average loss: 0.0270, Average accuracy: 99.17%, Test Accuracy: 98.97%, time: 19.3sec
testing on: cuda
Epoch: 5, Average loss: 0.0222, Average accuracy: 99.29%, Test Accuracy: 99.23%, time: 19.4sec
