### Densely Connected Networks (DenseNet)

![The main difference between ResNet (left) and DenseNet (right) in cross-layer connections: use of addition and use of concatenation. ](http://www.d2l.ai/_images/densenet.svg)

$$\mathbf{x} \to \left[\mathbf{x}, f_1(\mathbf{x}), f_2(\mathbf{x}, f_1(\mathbf{x})), f_3(\mathbf{x}, f_1(\mathbf{x}), f_2(\mathbf{x}, f_1(\mathbf{x})), \ldots\right]$$

![Dense connections in DenseNet](http://www.d2l.ai/_images/DenseNetDense.svg)

### Dense Blocks

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time

A dense block consists of multiple `conv_block` units, each using the same number of output channels. In the forward computation, however, we concatenate the input and output of each block on the channel dimension.

In [2]:
def conv_block(input_channels, num_channels):
    blk = nn.Sequential(nn.BatchNorm2d(input_channels), 
            nn.ReLU(),
            nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1))
    return blk
    
class DenseBlock(nn.Module):
    def __init__(self, num_convs, input_channels, num_channels):
        super(DenseBlock, self).__init__()
        layer = []
        for i in range(num_convs):
            layer.append(conv_block((num_channels * i + input_channels), num_channels))
        self.net = nn.Sequential(*layer)

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            # Concatenate the input and output of each block on the channel dimension.
            X = torch.cat((X, Y), dim=1)  
        return X

Testing it with data.

In [3]:
blk = DenseBlock(2, 3, 10)
X = torch.randn(4, 3, 8, 8)
Y = blk(X)
Y.shape

torch.Size([4, 23, 8, 8])

### Transition Layers to reduce dimensionality

In [4]:
def transition_block(input_channels, num_channels):
    blk = nn.Sequential(
            nn.BatchNorm2d(input_channels), 
            nn.ReLU(),
            nn.Conv2d(input_channels, num_channels, kernel_size=1),
            nn.AvgPool2d(kernel_size=2, stride=2)
            )
    return blk

In [5]:
blk = transition_block(23, 10)
print(Y.shape)
print(blk(Y).shape)

torch.Size([4, 23, 8, 8])
torch.Size([4, 10, 4, 4])


### DenseNet Model

In [6]:
class flatten(torch.nn.Module):
    def forward(self, x):
        return x.view(x.shape[0], -1)

net_1 = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

4 dense blocks with a transition layer in between.

In [7]:
num_channels, growth_rate = 64, 32  # Num_channels: the current number of channels.
num_convs_in_dense_blocks = [4, 4, 4, 4]

net_2 = []
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    net_2.append(DenseBlock(num_convs, num_channels, growth_rate))
    # This is the number of output channels in the previous dense block.
    num_channels += num_convs * growth_rate
    # A transition layer that haves the number of channels is added between the dense blocks.
    if i != len(num_convs_in_dense_blocks) - 1:
        net_2.append(transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2
net_2 = nn.Sequential(*net_2)

### Last stage

Similar to ResNet, a global pooling layer and fully connected layer are connected at the end to produce the output.

In [8]:
net = nn.Sequential(
        net_1,
        net_2,
        nn.BatchNorm2d(num_channels), 
        nn.ReLU(), 
        nn.AdaptiveMaxPool2d((1,1)),
        flatten(),
        nn.Linear(num_channels, 10)
        )

### Data Acquisition and Training

Since we are using a deeper network here we only use 96x96 images for speed. 

In [9]:
def evaluate_accuracy(data_iter, net):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = 0,0
    for (imgs, labels) in data_iter:
        # send data to the GPU if cuda is availabel
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        net.eval()
        with torch.no_grad():
            labels = labels.long()
            acc_sum += torch.sum((torch.argmax(net(imgs), dim=1) == labels)).float()
            n += labels.shape[0]
    return acc_sum.item()/n

def weights_init(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)

if torch.cuda.is_available():
    print('Training using GPU.')
    net.cuda()
else:
    print('Training using CPU.')

#Initialize network parameters.
net.apply(weights_init)

lr, num_epochs, batch_size = 0.1, 5, 128
optimizer = torch.optim.SGD(net.parameters(), lr=lr)

# Apply resize to 96*96 at trasfrom
transform = transforms.Compose([transforms.Resize(96),
                                transforms.ToTensor()
                                ]) 
mnist_trainset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)
# Loading training set and test set using DataLoader.
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=batch_size,
    shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=batch_size,
    shuffle=True, num_workers=0)

criterion = nn.CrossEntropyLoss()

from tqdm import tqdm
for epoch in range(num_epochs):
    net.train() # Switch to training mode
    n, start = 0, time.time()
    train_l_sum = torch.tensor([0.0], dtype=torch.float32)
    train_acc_sum = torch.tensor([0.0], dtype=torch.float32)
    train_iter = iter(train_loader)
    # for _, (X, y) in tqdm(enumerate(train_iter)):
    for X, y in train_iter:
        optimizer.zero_grad()
        if torch.cuda.is_available():
            X = X.cuda()
            y = y.cuda()
            train_l_sum = train_l_sum.cuda()
            train_acc_sum = train_acc_sum.cuda()
        y_hat = net(X)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            y = y.long()
            train_l_sum += loss.float()
            train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == y))).float()
            n += y.shape[0]

    test_acc = evaluate_accuracy(iter(test_loader), net) 
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'\
        % (epoch + 1, train_l_sum/n, train_acc_sum/n, test_acc, time.time() - start))

Training using GPU.
epoch 1, loss 0.0047, train acc 0.801, test acc 0.877, time 33.0 sec
epoch 2, loss 0.0025, train acc 0.882, test acc 0.883, time 33.6 sec
epoch 3, loss 0.0021, train acc 0.902, test acc 0.832, time 33.9 sec
epoch 4, loss 0.0018, train acc 0.913, test acc 0.892, time 33.7 sec
epoch 5, loss 0.0016, train acc 0.922, test acc 0.912, time 33.8 sec
