# 0 DataLoader for FashionMNIST

In [1]:
import torchvision
import torchvision.transforms as transforms
import torch
from torch.utils.data import DataLoader 

def load_data_fashion_mnist(batch_size, resize=None, root="./data/FashionMNIST", num_workers=4):
    """Build the FashionMNIST training and test data loaders.

    Args:
        batch_size: Number of samples per mini-batch, used for both loaders.
        resize: Optional size passed to ``transforms.Resize``. When falsy
            (``None`` or ``0``) the images are not resized.
        root: Directory the dataset is downloaded to and read from.
        num_workers: Number of DataLoader worker processes; ``0`` loads the
            data in the main process.

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``DataLoader`` objects. The
        training loader is shuffled, the test loader is not.
    """
    # Optional resize first, then conversion to a tensor.
    steps = []
    if resize:
        steps.append(transforms.Resize(size=resize))
    steps.append(transforms.ToTensor())
    transform = transforms.Compose(steps)

    train_data = torchvision.datasets.FashionMNIST(root=root, train=True, transform=transform, download=True)
    test_data = torchvision.datasets.FashionMNIST(root=root, train=False, transform=transform, download=True)

    train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_iter, test_iter

# 1 AlexNet(simplified)
1. AlexNet uses ReLU instead of sigmoid
2. AlexNet uses Dropout

In [2]:
import time
import torch
from torch import nn, optim
import torchvision

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AlexNet(nn.Module):
    """Simplified AlexNet: five convolutional layers and a three-layer classifier.

    The layer sizes are laid out for 227x227 inputs (the shape comments below
    trace that case for a single-channel image). Other spatial sizes change
    the flattened feature length and will not match the first ``nn.Linear``.

    Args:
        in_channels: Number of channels of the input images (default 1).
        num_classes: Number of output scores per image (default 10).
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()
        self.conv = nn.Sequential(
            # in_channels*227*227 -> 96*55*55
            nn.Conv2d(in_channels=in_channels, out_channels=96, kernel_size=11, stride=4),
            nn.ReLU(),
            # 96*55*55 -> 96*27*27
            nn.MaxPool2d(kernel_size=3, stride=2),
            # 96*27*27 -> 256*27*27
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            # 256*27*27 -> 256*13*13
            nn.MaxPool2d(kernel_size=3, stride=2),
            # 256*13*13 -> 384*13*13
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # 384*13*13 -> 384*13*13
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # 384*13*13 -> 256*13*13
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # 256*13*13 -> 256*6*6
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.fc = nn.Sequential(
            nn.Linear(256*6*6, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes)
        )

    def forward(self, img):
        """Return the class scores for a batch of images ``img``."""
        feature = self.conv(img)
        # Flatten everything after the batch dimension before the classifier.
        output = self.fc(feature.reshape(img.shape[0], -1))
        return output

In [3]:
# Instantiate the network and print its layer structure.
net = AlexNet()
print(net)

AlexNet(
  (conv): Sequential(
    (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=9216, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=4096, out_features=4096, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (

In [4]:
# Resize the images to 227x227, the input size the AlexNet shape comments are written for.
batch_size = 128
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size, resize=227)

In [5]:
def evaluate_accuracy(data_iter, net, device=None):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: Iterable yielding ``(X, y)`` batches of inputs and labels.
        net: Model mapping a batch ``X`` to per-class scores; the prediction
            is the ``argmax`` over dimension 1.
        device: Device the batches are moved to. When ``None`` and ``net`` is
            an ``nn.Module`` with parameters, the device of its first
            parameter is used; otherwise the batches are left where they are.

    Returns:
        The fraction of correctly classified samples as a Python float.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    is_module = isinstance(net, torch.nn.Module)
    if device is None and is_module:
        # Use the device net is on (if it has any parameters at all)
        first_param = next(net.parameters(), None)
        if first_param is not None:
            device = first_param.device

    # Switch to eval mode once (disables Dropout) and remember the previous
    # mode, so a model that was already in eval mode is not silently put back
    # into training mode afterwards.
    was_training = net.training if is_module else None
    if is_module:
        net.eval()

    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if device is not None:
                    X, y = X.to(device), y.to(device)
                y_hat = net(X)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)
    return acc_sum / n

def train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    """Train ``net`` with cross-entropy loss and print metrics after every epoch.

    Args:
        net: Model to train; it is moved to ``device`` first.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        batch_size: Not used by this function; kept so existing calls keep working.
        optimizer: Optimizer that updates the parameters of ``net``.
        device: Device the model and every training batch are moved to.
        num_epochs: Number of passes over ``train_iter``.

    Prints, per epoch, the mean batch loss, the training accuracy, the test
    accuracy and the elapsed time.
    """
    net = net.to(device)
    print(" training on ", device)
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        # Make sure Dropout/BatchNorm are in training mode, whatever mode the
        # caller left the model in.
        net.train()
        train_l_sum, train_acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            # Accumulate the statistics as plain Python numbers rather than tensors
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))


In [6]:
# Learning rate, epoch count and Adam optimizer for the current `net`.
lr, num_epochs = 0.001, 2
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# Training is disabled here; uncomment the call to run it.
#train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

# 2 Network in Network(NiN)
Typically we extract features exploiting **spatial structure** via a sequence of convolution and pooling layers and then post-process the representations via FC layers. A careless use of FC layers might give up the spatial structure.

Alternatively, NiN offers an equivalent way to use FC layers earlier, at every pixel location, without giving up the spatial structure.

The NiN block consists of one (user-defined) convolutional layer followed by two 1*1 convolutional layers that act as per-pixel fully-connected layers with ReLU activations.

**If we treat each channel as one feature, a $1\times 1$ convolutional layer acts like an FC layer across channels, applied independently at every pixel.**

NiN reduces overfitting by avoiding FC layers with too many parameters.
## 2.1 NiN block

In [7]:
def nin_block(in_channels, out_channels, kernel_size, stride, padding):
    """Assemble one NiN block as an ``nn.Sequential``.

    The block is a convolution with the given kernel size, stride and padding,
    followed by two 1x1 convolutions that keep ``out_channels``; every
    convolution is followed by a ReLU.
    """
    spatial_conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
    layers = [spatial_conv, nn.ReLU()]
    # Two per-pixel (1x1) convolutions, each with its own ReLU.
    for _ in range(2):
        layers.append(nn.Conv2d(out_channels, out_channels, kernel_size=1))
        layers.append(nn.ReLU())
    return nn.Sequential(*layers)

## 2.2 NiN version of AlexNet
Instead of using FC layers at the end of our network, we use a global average pooling layer to reduce each channel to a single value, so the last NiN block outputs one channel per class.

In [8]:
from torch import nn
import torch.nn.functional as F

class GlobalAvgPool2d(nn.Module):
    """Average each channel over its whole spatial extent.

    input: b * c * h * w

    output: b * c * 1 * 1
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Pool with a window as large as the feature map itself,
        # i.e. everything after the (batch, channel) dimensions.
        spatial_size = tuple(x.shape[2:])
        return F.avg_pool2d(x, kernel_size=spatial_size)

class FlattenLayer(nn.Module):
    """Flatten every dimension after the first (batch) dimension.

    A tensor of shape ``(b, d1, d2, ...)`` becomes ``(b, d1 * d2 * ...)``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # reshape returns a view when possible and copies otherwise, so unlike
        # view it also accepts non-contiguous inputs (e.g. after a transpose).
        return x.reshape(x.shape[0], -1)

# NiN counterpart of AlexNet; the shape comments trace a 1*227*227 input.
net = nn.Sequential(
    # 1*227*227 -> 96*55*55
    nin_block(1, 96, kernel_size=11, stride=4, padding=0),
    # 96*55*55 -> 96*27*27
    nn.MaxPool2d(3, stride=2),
    # 96*27*27 -> 256*27*27
    nin_block(96, 256, kernel_size=5, stride=1, padding=2),
    # 256*27*27 -> 256*13*13
    nn.MaxPool2d(3, stride=2),
    # 256*13*13 -> 384*13*13
    nin_block(256, 384, kernel_size=3, stride=1, padding=1),
    # 384*13*13 -> 384*6*6
    nn.MaxPool2d(3, stride=2),
    nn.Dropout(p=0.5),
    # 384*6*6 -> 10*6*6 (one channel per class)
    nin_block(384, 10, kernel_size=3, stride=1, padding=1),
    # 10*6*6 -> 10*1*1
    GlobalAvgPool2d(),
    # 10*1*1 -> 10
    FlattenLayer(),
)

## 2.3 Output shape of NiN

In [9]:
# Feed a random 1*1*227*227 input through the network one child module at a
# time and print the output shape after each of them.
X = torch.rand(1, 1, 227, 227)
for name, blk in net.named_children():
    X = blk(X)
    print(name, "output shape: ", X.shape)

0 output shape:  torch.Size([1, 96, 55, 55])
1 output shape:  torch.Size([1, 96, 27, 27])
2 output shape:  torch.Size([1, 256, 27, 27])
3 output shape:  torch.Size([1, 256, 13, 13])
4 output shape:  torch.Size([1, 384, 13, 13])
5 output shape:  torch.Size([1, 384, 6, 6])
6 output shape:  torch.Size([1, 384, 6, 6])
7 output shape:  torch.Size([1, 10, 6, 6])
8 output shape:  torch.Size([1, 10, 1, 1])
9 output shape:  torch.Size([1, 10])


## 2.4 Training

In [10]:
# Hyperparameters and Adam optimizer for the current `net`.
batch_size = 128
lr, num_epochs = 0.001, 2
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# Images are resized to 227x227, the input size the shape comments of the network assume.
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size, resize=227)

# Training is disabled here; uncomment the call to run it.
#train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

# 3 Batch Normalization
A Batch Normalization layer should be placed between an FC/Conv layer and its activation function. For convolutional layers, each channel is normalized separately and has its own scale and shift parameters.

Assume the input and output of Batch Normalization layer as $\boldsymbol{x},\ \boldsymbol{y}$: $\boldsymbol{y} = BN(\boldsymbol{x})$

Process:
- Calculate mean and variance of a batch:
$$\boldsymbol{\mu}_\mathcal{B} \leftarrow \frac{1}{m}\sum_{i = 1}^{m} \boldsymbol{x}^{(i)},$$
$$\boldsymbol{\sigma}_\mathcal{B}^2 \leftarrow \frac{1}{m} \sum_{i=1}^{m}(\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B})^2,$$
- Normalization
$$\hat{\boldsymbol{x}}^{(i)} \leftarrow \frac{\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B}}{\sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}},$$
- scale and shift
$${\boldsymbol{y}}^{(i)} \leftarrow \boldsymbol{\gamma} \odot \hat{\boldsymbol{x}}^{(i)} + \boldsymbol{\beta}.$$

$\boldsymbol{\gamma}$ and $\boldsymbol{\beta}$ are learnable parameters


In [11]:
# LeNet with Batch Normalization
import time
import torch
from torch import nn, optim
import torchvision

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FlattenLayer(nn.Module):
    """Flatten every dimension after the first (batch) dimension.

    A tensor of shape ``(b, d1, d2, ...)`` becomes ``(b, d1 * d2 * ...)``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # reshape returns a view when possible and copies otherwise, so unlike
        # view it also accepts non-contiguous inputs (e.g. after a transpose).
        return x.reshape(x.shape[0], -1)

# LeNet with a batch-normalization layer between every conv/linear layer
# (except the output layer) and its sigmoid activation.
net = nn.Sequential(
    # 1*28*28 -> 6*24*24
    nn.Conv2d(1, 6, kernel_size=5),
    # shape unchanged: 6*24*24
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    # 6*24*24 -> 6*12*12
    nn.MaxPool2d(2, stride=2),
    # 6*12*12 -> 16*8*8
    nn.Conv2d(6, 16, kernel_size=5),
    # shape unchanged: 16*8*8
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    # 16*8*8 -> 16*4*4
    nn.MaxPool2d(2, stride=2),
    # 16*4*4 -> 256
    FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    nn.BatchNorm1d(num_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.BatchNorm1d(num_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)


In [12]:
# No resize here: the shape comments of the LeNet above assume 28x28 inputs.
batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size)

# Learning rate, epoch count and Adam optimizer for the current `net`.
lr, num_epochs = 0.001, 2
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# Training is disabled here; uncomment the call to run it.
#train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

# 4 Residual Networks(ResNet)
Instead of directly learning the mapping $f(x)$, we learn the **residual mapping** $f(x)-x$, which is easier to learn.

Note: $x$ and $f(x)$ have to be in the same shape. The only exception is when $x$ and $f(x)$ have different amount of channels. Then we need to use 1*1 convolutional layer to adjust the channel amount of $x$.

## 4.1 ResNet Block
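We define a Residual Layer as

x->

Conv->BN->ReLU->

Conv->BN

->f(x)-x

The shortcut is then added back and a final ReLU is applied: the output is ReLU((f(x)-x) + x). When the shape of x has to be adjusted (different number of channels or stride), the shortcut is 1*1 Conv(x) instead of x.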

In [13]:
import torch.nn.functional as F

class ResidualLayer(nn.Module):
    """Residual layer: two 3x3 conv + batch-norm stages plus a shortcut connection.

    ``forward`` computes ``relu(shortcut(x) + bn2(conv2(relu(bn1(conv1(x))))))``.

    Args:
        in_channels: Number of channels of the input.
        out_channels: Number of channels of the output.
        use_1x1conv: When True, the shortcut passes ``x`` through a 1x1
            convolution (with the same stride) so that its channel count and
            spatial size match the main branch. When False, ``x`` is added
            unchanged, which requires ``in_channels == out_channels`` and
            ``stride == 1``.
        stride: Stride of the first 3x3 convolution and of the 1x1 shortcut.
    """

    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super().__init__()
        # h*h->([h-1]/s+1) * ([h-1]/s+1)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        # spatial size unchanged (3x3 kernel, padding 1, stride 1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            # h*h->([h-1]/s+1) * ([h-1]/s+1), same as conv1
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Return ``relu(shortcut(x) + residual(x))``."""
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        # Explicit None check instead of relying on the truthiness of a module.
        if self.conv3 is not None:
            x = self.conv3(x)
        # y is the learned residual f(x) - x; adding the shortcut gives f(x)
        return F.relu(x + y)

## 4.2 ResNet Model
Our Model consists of several ResNet Modules and each ResNet Module consists of several (2 by default) ResNet Layers

In [14]:
# First part: 7x7 stride-2 convolution, batch norm, ReLU and 3x3 stride-2 max pooling
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

In [15]:
def resnet_block(in_channels, out_channels, num_residuals=2):
    """Stack ``num_residuals`` residual layers into one ``nn.Sequential``.

    The first layer maps ``in_channels`` to ``out_channels`` with stride 2 and
    a 1x1-convolution shortcut; every following layer keeps ``out_channels``.
    """
    layers = [
        ResidualLayer(in_channels, out_channels, use_1x1conv=True, stride=2)
        if index == 0
        else ResidualLayer(out_channels, out_channels)
        for index in range(num_residuals)
    ]
    return nn.Sequential(*layers)

# Second part: Residual Model
# Four residual blocks of two layers each; the channel count goes
# 64 -> 64 -> 128 -> 256 -> 512, and every block starts with a stride-2 layer.
net.add_module("resnet_block1", resnet_block(64, 64, 2))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))


In [16]:
class GlobalAvgPool2d(nn.Module):
    """Average each channel over its whole spatial extent.

    input: b * c * h * w

    output: b * c * 1 * 1
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Pool with a window as large as the feature map itself,
        # i.e. everything after the (batch, channel) dimensions.
        spatial_size = tuple(x.shape[2:])
        return F.avg_pool2d(x, kernel_size=spatial_size)

class FlattenLayer(nn.Module):
    """Flatten every dimension after the first (batch) dimension.

    A tensor of shape ``(b, d1, d2, ...)`` becomes ``(b, d1 * d2 * ...)``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # reshape returns a view when possible and copies otherwise, so unlike
        # view it also accepts non-contiguous inputs (e.g. after a transpose).
        return x.reshape(x.shape[0], -1)
# Third part: global average Pooling and FC
# The last residual block outputs 512 channels, so after pooling and
# flattening the linear layer maps 512 features to 10 class scores.
net.add_module("global_avg_pool", GlobalAvgPool2d()) 
net.add_module("fc", nn.Sequential(FlattenLayer(), nn.Linear(512, 10))) 

In [17]:
# Feed a random 1*1*227*227 input through the network one child module at a
# time and print the output shape after each of them.
X = torch.rand((1, 1, 227, 227))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

0  output shape:	 torch.Size([1, 64, 114, 114])
1  output shape:	 torch.Size([1, 64, 114, 114])
2  output shape:	 torch.Size([1, 64, 114, 114])
3  output shape:	 torch.Size([1, 64, 57, 57])
resnet_block1  output shape:	 torch.Size([1, 64, 29, 29])
resnet_block2  output shape:	 torch.Size([1, 128, 15, 15])
resnet_block3  output shape:	 torch.Size([1, 256, 8, 8])
resnet_block4  output shape:	 torch.Size([1, 512, 4, 4])
global_avg_pool  output shape:	 torch.Size([1, 512, 1, 1])
fc  output shape:	 torch.Size([1, 10])


# 5 Densely Connected Networks (DenseNet)
Instead of adding the input and the output of a layer as in ResNet, DenseNet **concatenates** them along the channel dimension to form the input of the next layer.

We define a ConvBlock as the combination of BN, ReLU and Conv.

A Dense block can be depicted as

x -> ConvBlock -> \[x, ConvBlock(x)\] -> ...

## 5.1 Dense Block



In [18]:
def ConvBlock(in_channels, out_channels):
    """Return a BatchNorm -> ReLU -> 3x3 convolution (padding 1) stack.

    The result is an ``nn.Sequential`` mapping ``in_channels`` to
    ``out_channels`` while keeping the spatial size.
    """
    layers = (
        nn.BatchNorm2d(num_features=in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1),
    )
    return nn.Sequential(*layers)

class DenseBlock(nn.Module):
    """Chain of ``num_convs`` ConvBlocks whose outputs are concatenated to their inputs.

    Every ConvBlock produces ``out_channels`` new channels, which are appended
    to its input along the channel dimension before being fed to the next
    ConvBlock. The channel count of the final result is stored in
    ``self.out_channels``.
    """

    def __init__(self, num_convs, in_channels, out_channels):
        super().__init__()
        # The i-th ConvBlock receives the block input plus the outputs of the
        # i ConvBlocks before it. A ModuleList is used because forward()
        # needs a custom loop rather than a plain sequential pass.
        conv_blocks = [
            ConvBlock(in_channels + index * out_channels, out_channels)
            for index in range(num_convs)
        ]
        self.net = nn.ModuleList(conv_blocks)
        # Channel count after the last concatenation
        self.out_channels = in_channels + num_convs * out_channels

    def forward(self, x):
        features = x
        for conv_block in self.net:
            new_features = conv_block(features)
            # dim=1 is the channel dimension
            features = torch.cat((features, new_features), dim=1)
        return features

In [19]:
# Dense block with 2 ConvBlocks, 3 input channels and 10 channels added per ConvBlock.
blk = DenseBlock(2, 3, 10)
x = torch.rand(4, 3, 8, 8)
y = blk(x)
y.shape # 3 + 2 * 10 = 23

torch.Size([4, 23, 8, 8])

## 5.2 Transition Block
A Transition Block limits the accumulation of channels by reducing their number with a 1*1 convolutional layer; it also halves the height and width with average pooling.

In [20]:
def TransitionBlock(in_channels, out_channels):
    """Return a BatchNorm -> ReLU -> 1x1 convolution -> 2x2 average pooling stack.

    The 1x1 convolution maps ``in_channels`` to ``out_channels``; the
    stride-2 average pooling then halves the height and width.
    """
    layers = [
        nn.BatchNorm2d(num_features=in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
        # fixed 2x2 window, unlike the self-defined GlobalAvgPool2d
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    return nn.Sequential(*layers)

# Apply a transition block to `y`, the 23-channel DenseBlock output computed in the earlier cell.
blk = TransitionBlock(23, 10)
blk(y).shape

torch.Size([4, 10, 4, 4])

## 5.3 DenseNet Model

In [21]:
# First Part: 7x7 stride-2 convolution, batch norm, ReLU and 3x3 stride-2 max pooling
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

In [22]:
# Second Part
# num_channels: the current number of channels (initially the 64 output channels of the first part)
# growth_rate: number of channels every ConvBlock inside a dense block adds
num_channels, growth_rate = 64, 32  
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    # NOTE(review): the module name is spelled "DenseBlosk" (sic); it appears in the printed layer names.
    net.add_module("DenseBlosk_%d" % i, DB)
    num_channels = DB.out_channels
    # After every dense block except the last, add a transition block that halves the number of channels
    if i != len(num_convs_in_dense_blocks) - 1:
        net.add_module("TransitionBlock_%d" % i, TransitionBlock(num_channels, num_channels // 2))
        num_channels = num_channels // 2

In [23]:
class GlobalAvgPool2d(nn.Module):
    """Average each channel over its whole spatial extent.

    input: b * c * h * w

    output: b * c * 1 * 1
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Pool with a window as large as the feature map itself,
        # i.e. everything after the (batch, channel) dimensions.
        spatial_size = tuple(x.shape[2:])
        return F.avg_pool2d(x, kernel_size=spatial_size)

class FlattenLayer(nn.Module):
    """Flatten every dimension after the first (batch) dimension.

    A tensor of shape ``(b, d1, d2, ...)`` becomes ``(b, d1 * d2 * ...)``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # reshape returns a view when possible and copies otherwise, so unlike
        # view it also accepts non-contiguous inputs (e.g. after a transpose).
        return x.reshape(x.shape[0], -1)
# Third Part:
# Final BN + ReLU on the output of the last dense block, then global average
# pooling and a linear layer mapping `num_channels` features to 10 class scores.
net.add_module("BN", nn.BatchNorm2d(num_channels))
net.add_module("ReLU", nn.ReLU())
net.add_module("AvgPool", GlobalAvgPool2d()) 
net.add_module("FC", nn.Sequential(FlattenLayer(), nn.Linear(num_channels, 10)))  

In [24]:
# Feed a random 1*1*96*96 input through the network one child module at a
# time and print the output shape after each of them.
X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

0  output shape:	 torch.Size([1, 64, 48, 48])
1  output shape:	 torch.Size([1, 64, 48, 48])
2  output shape:	 torch.Size([1, 64, 48, 48])
3  output shape:	 torch.Size([1, 64, 24, 24])
DenseBlosk_0  output shape:	 torch.Size([1, 192, 24, 24])
TransitionBlock_0  output shape:	 torch.Size([1, 96, 12, 12])
DenseBlosk_1  output shape:	 torch.Size([1, 224, 12, 12])
TransitionBlock_1  output shape:	 torch.Size([1, 112, 6, 6])
DenseBlosk_2  output shape:	 torch.Size([1, 240, 6, 6])
TransitionBlock_2  output shape:	 torch.Size([1, 120, 3, 3])
DenseBlosk_3  output shape:	 torch.Size([1, 248, 3, 3])
BN  output shape:	 torch.Size([1, 248, 3, 3])
ReLU  output shape:	 torch.Size([1, 248, 3, 3])
AvgPool  output shape:	 torch.Size([1, 248, 1, 1])
FC  output shape:	 torch.Size([1, 10])
