# 残差块

设理想映射为 $f(x)$。残差块不直接拟合 $f(x)$，而是拟合残差映射 $f(x) - x$（即 $f(x) \rightarrow f(x) - x$），后者在实际中往往更容易优化：若理想映射为恒等映射 $f(x) = x$，我们只需要将残差部分的权重（weight）和偏差（bias）学成0，使其输出为0即可。

实际中，当理想映射$f(x)$极接近于恒等映射时，残差映射也易于捕捉恒等映射的细微波动。

在残差块中，输入可以通过跨层的数据线路更快地向前传播。

In [1]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import d2lzh_pytorch as d2l

# Pick the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# This class is also kept in d2lzh_pytorch
class Residual(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers plus a shortcut.

    The main path is conv1 -> bn1 -> ReLU -> conv2 -> bn2. The block input
    (optionally passed through a 1x1 convolution) is added to the main-path
    output, and a final ReLU is applied to the sum.

    Args:
        in_channels: Number of channels of the input.
        out_channels: Number of channels produced by both 3x3 convolutions.
        use_1x1conv: When True, the shortcut applies a 1x1 convolution
            (``in_channels -> out_channels``, same ``stride``) to the input
            before the addition. When False, the input is added unchanged,
            so its shape must already match the main-path output.
        stride: Stride of the first 3x3 convolution and of the 1x1 shortcut
            convolution.
    """

    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super().__init__()
        # Submodules are created in a fixed order (conv1, conv2, conv3, bn1,
        # bn2) so parameter ordering and seeded initialisation stay stable.
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels,
                                   kernel_size=1, stride=stride)
        else:
            # No projection on the shortcut: the input is added as-is.
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        """Return ``relu(main_path(X) + shortcut(X))``."""
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Explicit None check instead of relying on the truthiness of a module.
        if self.conv3 is not None:
            X = self.conv3(X)
        return F.relu(Y + X)

In [5]:
# Check the input and output shapes: with matching channel counts and the
# default stride of 1, the recorded output below has the same shape as the input.
blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
blk(X).shape

torch.Size([4, 3, 6, 6])

In [6]:
# Use the 1x1 shortcut convolution with stride 2: per the recorded output below,
# channels go from 3 to 6 while height and width go from 6 to 3.
blk = Residual(3, 6, use_1x1conv=True, stride=2)
blk(X).shape

torch.Size([4, 6, 3, 3])

可以看出，在增加输出通道数的同时，输出的高和宽减半。

# ResNet
这里我们组织ResNet-18模型（更深层次的有ResNet-152）

ResNet的前两层与GoogLeNet中的一样：在输出通道数为64、步幅为2的 $7 \times 7$ 卷积层后接步幅为2的 $3 \times 3$ 最大池化层。不同之处在于ResNet在每个卷积层后增加了批量归一化层。

GoogLeNet在后面接了4个由Inception块组成的模块，ResNet则使用4个由残差块组成的模块，每个模块使用若干个输出通道数相同的残差块。

最后，加入全局平均池化层后接上全连接层输出。

In [9]:
# Stem of the network: 7x7 stride-2 convolution (1 -> 64 channels), batch
# normalisation, ReLU, then 3x3 stride-2 max pooling.
stem_layers = [
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
]
# Positional unpacking keeps the child names '0'..'3'.
net = nn.Sequential(*stem_layers)
net

Sequential(
  (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
)

In [10]:
# Note how the first stage is treated specially
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    """Build one ResNet stage as an ``nn.Sequential`` of ``Residual`` blocks.

    Args:
        in_channels: Channel count of the input to the stage.
        out_channels: Channel count produced by every block in the stage.
        num_residuals: Number of ``Residual`` blocks in the stage.
        first_block: True for the first stage. In that case the channel
            counts must be equal and every block is a plain
            ``Residual(out_channels, out_channels)``.

    Returns:
        An ``nn.Sequential`` containing ``num_residuals`` blocks. Unless
        ``first_block`` is set, the first block uses the 1x1 shortcut
        convolution with stride 2 to go from ``in_channels`` to
        ``out_channels``.

    Raises:
        AssertionError: If ``first_block`` is True and the channel counts differ.
    """
    # The first stage keeps the channel count of its input.
    assert not first_block or in_channels == out_channels

    def make_unit(index):
        # Only the leading block of a non-first stage changes channels/stride.
        if index == 0 and not first_block:
            return Residual(in_channels, out_channels, use_1x1conv=True, stride=2)
        return Residual(out_channels, out_channels)

    return nn.Sequential(*(make_unit(index) for index in range(num_residuals)))

In [11]:
# Register the four residual stages, two Residual blocks each. Only the first
# stage is built with first_block=True (64 -> 64 channels); each later stage
# doubles the channel count.
stage_specs = (
    ('resnet_block1', 64, 64, True),
    ('resnet_block2', 64, 128, False),
    ('resnet_block3', 128, 256, False),
    ('resnet_block4', 256, 512, False),
)
for stage_name, stage_in, stage_out, is_first in stage_specs:
    net.add_module(stage_name,
                   resnet_block(stage_in, stage_out, 2, first_block=is_first))

In [12]:
# Head of the network: a global average pooling layer followed by a fully
# connected output layer (flatten, then Linear 512 -> 10).
global_pool = d2l.GlobalAvgPool2d()
net.add_module('global_avg_pool', global_pool)
classifier_head = nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10))
net.add_module('fc', classifier_head)

In [13]:
# Feed a random single-channel input through the network one top-level child
# at a time and print the output shape after each child.
# NOTE(review): the width is 244 while the height is 224 - possibly a typo for
# 224. The recorded output below reflects 244, so confirm before changing it.
X = torch.rand((1, 1, 224, 244))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

0  output shape:	 torch.Size([1, 64, 112, 122])
1  output shape:	 torch.Size([1, 64, 112, 122])
2  output shape:	 torch.Size([1, 64, 112, 122])
3  output shape:	 torch.Size([1, 64, 56, 61])
resnet_block1  output shape:	 torch.Size([1, 64, 56, 61])
resnet_block2  output shape:	 torch.Size([1, 128, 28, 31])
resnet_block3  output shape:	 torch.Size([1, 256, 14, 16])
resnet_block4  output shape:	 torch.Size([1, 512, 7, 8])
global_avg_pool  output shape:	 torch.Size([1, 512, 1, 1])
fc  output shape:	 torch.Size([1, 10])
