### Residual Networks (ResNet)

![Residual block](http://www.d2l.ai/_images/residual-block.svg)

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time

In [2]:
class Residual(nn.Module): # This class is part of the d2l package
    def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        self.relu = nn.ReLU()

    def forward(self, X):
        Y = self.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return self.relu(Y + X)

Networks

![Left: regular ResNet block; Right: ResNet block with 1x1 convolution](http://www.d2l.ai/_images/ResNetBlock.svg) 


In [3]:
blk = Residual(3, 3)
X = torch.randn(size=(4, 3, 6, 6))
blk(X).shape

torch.Size([4, 3, 6, 6])

We also have the option to halve the output height and width while increasing the number of output channels.

In [4]:
blk = Residual(3, 6, use_1x1conv=True, strides=2)
blk(X).shape

torch.Size([4, 6, 3, 3])

### ResNet Model Stage 1

ResNet and GoogLeNet are quite similar on the initial layers.

In [5]:
res1 = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.BatchNorm2d(64), 
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

We also need a ResNet block.

In [6]:
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(input_channels, num_channels, use_1x1conv=True, strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
    return nn.Sequential(*blk)

Then, we add all the residual blocks to ResNet. Here, two residual blocks are used for each module.

Finally, just like GoogLeNet, we add a global average pooling layer, followed by the fully connected layer output.

In [7]:
class flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

net = nn.Sequential(
        res1,
        resnet_block(64, 64, 2, first_block=True),
        resnet_block(64, 128, 2),
        resnet_block(128, 256, 2),
        resnet_block(256, 512, 2),
        nn.AdaptiveMaxPool2d((1,1)),
        flatten(),
        nn.Linear(512, 10)
        )

### Full ResNet-18

![ResNet 18](http://www.d2l.ai/_images/ResNetFull.svg)

In [8]:
# print(net)
X = torch.randn(size=(1, 1, 96, 96))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 64, 24, 24])
Sequential output shape:	 torch.Size([1, 64, 24, 24])
Sequential output shape:	 torch.Size([1, 128, 12, 12])
Sequential output shape:	 torch.Size([1, 256, 6, 6])
Sequential output shape:	 torch.Size([1, 512, 3, 3])
AdaptiveMaxPool2d output shape:	 torch.Size([1, 512, 1, 1])
flatten output shape:	 torch.Size([1, 512])
Linear output shape:	 torch.Size([1, 10])


## Data Acquisition and Training

We train ResNet on the Fashion-MNIST data set, just like before. The only thing that has changed is the learning rate that decreased again, due to the more complex architecture.

In [9]:
def evaluate_accuracy(data_iter, net):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = 0,0
    for (imgs, labels) in data_iter:
        # send data to the GPU if cuda is availabel
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            labels = labels.cuda()
        net.eval()
        with torch.no_grad():
            labels = labels.long()
            acc_sum += torch.sum((torch.argmax(net(imgs), dim=1) == labels)).float()
            n += labels.shape[0]
    return acc_sum.item()/n
def weights_init(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)

if torch.cuda.is_available():
    print('Training using GPU.')
    net.cuda()
else:
    print('Training using CPU.')

#Initialize network parameters.
net.apply(weights_init)

lr, num_epochs, batch_size = 0.05, 5, 256
optimizer = torch.optim.SGD(net.parameters(), lr=lr)

# Apply resize to 96*96 at trasfrom
transform = transforms.Compose([transforms.Resize(96),
                                transforms.ToTensor()
                                ]) 
mnist_trainset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)
# Loading training set and test set using DataLoader.
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=batch_size,
    shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=batch_size,
    shuffle=True, num_workers=0)

criterion = nn.CrossEntropyLoss()

from tqdm import tqdm
for epoch in range(num_epochs):
    net.train() # Switch to training mode
    n, start = 0, time.time()
    train_l_sum = torch.tensor([0.0], dtype=torch.float32)
    train_acc_sum = torch.tensor([0.0], dtype=torch.float32)
    train_iter = iter(train_loader)
    # for _, (X, y) in tqdm(enumerate(train_iter)):
    for X, y in train_iter:
        optimizer.zero_grad()
        if torch.cuda.is_available():
            X = X.cuda()
            y = y.cuda()
            train_l_sum = train_l_sum.cuda()
            train_acc_sum = train_acc_sum.cuda()
        y_hat = net(X)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            y = y.long()
            train_l_sum += loss.float()
            train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == y))).float()
            n += y.shape[0]

    test_acc = evaluate_accuracy(iter(test_loader), net) 
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'\
        % (epoch + 1, train_l_sum/n, train_acc_sum/n, test_acc, time.time() - start))

Training using GPU.
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)



epoch 1, loss 0.0060, train acc 0.706, test acc 0.798, time 44.3 sec
epoch 2, loss 0.0014, train acc 0.862, test acc 0.793, time 45.7 sec
epoch 3, loss 0.0011, train acc 0.893, test acc 0.886, time 47.1 sec
epoch 4, loss 0.0009, train acc 0.912, test acc 0.861, time 46.9 sec
epoch 5, loss 0.0008, train acc 0.925, test acc 0.861, time 47.0 sec
