## 2.1 RetinaNet的简介

&emsp;RetinaNet出自Tsung-Yi Lin、何恺明等人的论文《Focal Loss for Dense Object Detection》。这是一个诞生于SSD和YOLOv2之后、YOLOv3之前的模型。它的主要贡献在于：针对单阶段（one-stage）目标检测模型中前景（positives）与背景（negatives）类别不平衡的问题，提出了一种叫做Focal Loss的损失函数，用来降低大量easy negatives在标准交叉熵损失中所占的权重（相对地提高hard negatives所占的权重）。

&emsp;而RetinaNet则是为了验证Focal Loss的效果而提出的一种模型。

In [1]:
import torch.nn as nn
import torch
import math
import torch.utils.model_zoo as model_zoo
from torchvision.ops import nms
from retinanet.utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes
from retinanet.anchors import Anchors
from retinanet import losses

# Download URLs of the `.pth` checkpoint files, keyed by ResNet variant name.
# NOTE(review): presumably these are pretrained backbone weights meant to be
# fetched through `model_zoo`; the loading code is not part of this section.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}

## 2.2 RetinaNet的结构

<img src="jpnoteImages/retina_structure.png" width="700" align="bottom"/>
&emsp;RetinaNet的基本架构可以看成是残差网络ResNet+特征金字塔网络FPN，以特征金字塔不同的尺寸特征图作为输入，搭建两个用于分类和框回归的子网络。

- 分类子网络（Class Subnet）输出的特征图尺寸为（W,H,KA)，其中W、H为特征图宽高，KA为特征图通道，存放A个anchor各自的类别信息（K为类别数）。
- 框回归子网络（Box Subnet）输出的特征图尺寸为（W,H,4A)，其中4为确定一个框所需要的参数。

In [2]:
class PyramidFeatures(nn.Module):
    """Feature pyramid that turns backbone maps C3, C4, C5 into P3..P7.

    Args:
        C3_size: Number of channels of the C3 feature map.
        C4_size: Number of channels of the C4 feature map.
        C5_size: Number of channels of the C5 feature map.
        feature_size: Number of channels of every pyramid output (default 256).

    ``forward`` takes a sequence ``[C3, C4, C5]`` and returns the list
    ``[P3, P4, P5, P6, P7]``, each with ``feature_size`` channels.
    """

    def __init__(self, C3_size, C4_size, C5_size, feature_size=256):
        super(PyramidFeatures, self).__init__()

        # P5 branch: 1x1 lateral conv reduces C5 to `feature_size` channels,
        # a 3x3 stride-1 conv then produces the final P5.
        self.P5_1 = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0)
        # Retained so the module layout stays unchanged for existing users;
        # forward() resizes to the exact size of the lateral map instead.
        self.P5_upsampled = nn.Upsample(scale_factor=2, mode='nearest')
        self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)

        # P4 branch: 1x1 lateral conv on C4, summed with the upsampled P5
        # lateral, then a 3x3 stride-1 conv produces the final P4.
        self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0)
        # Retained for layout compatibility (see P5_upsampled).
        self.P4_upsampled = nn.Upsample(scale_factor=2, mode='nearest')
        self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)

        # P3 branch: 1x1 lateral conv on C3, summed with the upsampled P4
        # sum, then a 3x3 stride-1 conv produces the final P3.
        self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0)
        self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)

        # "P6 is obtained via a 3x3 stride-2 conv on C5": halves the spatial
        # size of C5 and is emitted directly as P6.
        self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1)

        # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on
        # P6": halves the spatial size once more.
        self.P7_1 = nn.ReLU()
        self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1)

    @staticmethod
    def _upsample_like(source, reference):
        """Nearest-neighbour resize of `source` to the height/width of `reference`."""
        target_size = tuple(reference.shape[-2:])
        return nn.functional.interpolate(source, size=target_size, mode='nearest')

    def forward(self, inputs):
        """Build the pyramid from ``inputs = [C3, C4, C5]``; C2 is not used.

        The top-down pathway resizes each coarser map to the exact spatial
        size of the next finer lateral map. When the finer map is exactly
        twice as large this equals x2 nearest upsampling; when it is not
        (odd-sized feature maps) the element-wise sum still lines up instead
        of raising a shape-mismatch error.
        """
        C3, C4, C5 = inputs

        P5_x = self.P5_1(C5)  # lateral connection at C5
        P5_upsampled_x = self._upsample_like(P5_x, C4)  # bring to C4's size
        P5_x = self.P5_2(P5_x)  # final P5

        P4_x = self.P4_1(C4)  # lateral connection at C4
        P4_x = P5_upsampled_x + P4_x  # merge with the top-down signal
        P4_upsampled_x = self._upsample_like(P4_x, C3)  # bring to C3's size
        P4_x = self.P4_2(P4_x)  # final P4

        P3_x = self.P3_1(C3)  # lateral connection at C3
        P3_x = P3_x + P4_upsampled_x  # merge with the top-down signal
        P3_x = self.P3_2(P3_x)  # final P3

        P6_x = self.P6(C5)  # P6 comes straight from C5

        P7_x = self.P7_1(P6_x)
        P7_x = self.P7_2(P7_x)  # ReLU followed by a stride-2 conv gives P7

        return [P3_x, P4_x, P5_x, P6_x, P7_x]

### 2.2.3 框回归子网络Box Subnet
&emsp;框回归子网络由四层卷积层和一层输出卷积层组成，每层卷积层的stride=1、kernel_size=3、padding=1，也就是说特征图通过该网络后，长宽大小不变，通道维变为4*num_anchors。

In [3]:
class RegressionModel(nn.Module):
    """Box-regression head applied to one pyramid level.

    Four 3x3 conv + ReLU stages keep the spatial size, then a final 3x3 conv
    emits ``num_anchors * 4`` channels.

    Args:
        num_features_in: Number of channels of the incoming feature map.
        num_anchors: Number of anchors per spatial location (default 9).
        feature_size: Number of channels of the hidden conv layers (default 256).
    """

    _NUM_HIDDEN = 4  # number of conv + ReLU stages before the output conv

    def __init__(self, num_features_in, num_anchors=9, feature_size=256):
        super().__init__()

        # Register conv1/act1 ... conv4/act4 in that order; only the first
        # conv sees `num_features_in` input channels.
        channels_in = num_features_in
        for stage in range(1, self._NUM_HIDDEN + 1):
            setattr(self, f'conv{stage}',
                    nn.Conv2d(channels_in, feature_size, kernel_size=3, padding=1))
            setattr(self, f'act{stage}', nn.ReLU())
            channels_in = feature_size

        # Output conv: 4 values per anchor at every location.
        self.output = nn.Conv2d(feature_size, num_anchors * 4, kernel_size=3, padding=1)

    def forward(self, x):
        """Return a tensor of shape ``(batch, -1, 4)`` with one row of 4 values per anchor."""
        hidden = x
        for stage in range(1, self._NUM_HIDDEN + 1):
            conv = getattr(self, f'conv{stage}')
            act = getattr(self, f'act{stage}')
            hidden = act(conv(hidden))

        raw = self.output(hidden)

        # raw is B x C x H x W with C = 4 * num_anchors; move channels last so
        # that the 4 values of one anchor are adjacent before flattening.
        channels_last = raw.permute(0, 2, 3, 1).contiguous()
        batch = channels_last.shape[0]
        return channels_last.view(batch, -1, 4)

### 2.2.4 分类子网络Class Subnet
&emsp;和框回归子网络类似，分类子网络也是通过了四层卷积层，特征图的长宽保持不变，通道扩展为类别数 * anchor数，用于存放所有基于anchor的检测框的分类信息。

In [5]:
class ClassificationModel(nn.Module):
    """Classification head applied to one pyramid level.

    Four 3x3 conv + ReLU stages keep the spatial size, then a final 3x3 conv
    followed by a sigmoid emits ``num_anchors * num_classes`` scores in (0, 1).

    Args:
        num_features_in: Number of channels of the incoming feature map.
        num_anchors: Number of anchors per spatial location (default 9).
        num_classes: Number of object classes (default 80).
        prior: Accepted for interface compatibility; not used in this class.
        feature_size: Number of channels of the hidden conv layers (default 256).
    """

    def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256):
        super().__init__()

        def conv3x3(channels_in, channels_out):
            # All convs in this head share the same 3x3 / stride-1 / pad-1 geometry.
            return nn.Conv2d(channels_in, channels_out, kernel_size=3, padding=1)

        self.num_classes = num_classes
        self.num_anchors = num_anchors

        # Four identical hidden stages, mirroring the box-regression head.
        self.conv1 = conv3x3(num_features_in, feature_size)
        self.act1 = nn.ReLU()

        self.conv2 = conv3x3(feature_size, feature_size)
        self.act2 = nn.ReLU()

        self.conv3 = conv3x3(feature_size, feature_size)
        self.act3 = nn.ReLU()

        self.conv4 = conv3x3(feature_size, feature_size)
        self.act4 = nn.ReLU()

        # Output conv: one score per (anchor, class) pair at every location,
        # squashed into the 0..1 range by the sigmoid.
        self.output = conv3x3(feature_size, num_anchors * num_classes)
        self.output_act = nn.Sigmoid()

    def forward(self, x):
        """Return per-anchor class scores of shape ``(batch, -1, num_classes)``."""
        hidden_stages = (
            (self.conv1, self.act1),
            (self.conv2, self.act2),
            (self.conv3, self.act3),
            (self.conv4, self.act4),
        )
        hidden = x
        for conv, act in hidden_stages:
            hidden = act(conv(hidden))

        scores = self.output_act(self.output(hidden))

        # scores is B x C x H x W with C = num_anchors * num_classes;
        # move the channel axis to the end.
        channels_last = scores.permute(0, 2, 3, 1)
        batch, rows, cols, _ = channels_last.shape

        # Split the channel axis into separate (anchor, class) axes.
        per_anchor = channels_last.view(batch, rows, cols, self.num_anchors, self.num_classes)

        # Flatten locations and anchors into a single axis.
        return per_anchor.contiguous().view(x.shape[0], -1, self.num_classes)

### 2.2.5 整体模型的定义

<img src="jpnoteImages/res_structure.jpg" width="700" align="bottom"/>

In [3]:
class ResNet(nn.Module):
    """Detector built from a ResNet backbone, an FPN and two prediction heads.

    Args:
        num_classes: Number of object classes handed to the classification head.
        block: Residual block class; must be ``BasicBlock`` or ``Bottleneck``.
        layers: Indexable of four ints, the number of blocks in layer1..layer4.

    Raises:
        ValueError: If ``block`` is neither ``BasicBlock`` nor ``Bottleneck``.
    """

    def __init__(self, num_classes, block, layers):
        super().__init__()
        # Running input-channel count, updated by _make_layer as stages are added.
        self.inplanes = 64

        # Backbone stem followed by the four residual stages.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # The last conv of a BasicBlock is `conv2`, of a Bottleneck `conv3`;
        # its out_channels gives the channel count each stage feeds to the FPN.
        if block == BasicBlock:
            final_conv_name = 'conv2'
        elif block == Bottleneck:
            final_conv_name = 'conv3'
        else:
            raise ValueError(f"Block type {block} not understood")

        stage_depth_pairs = (
            (self.layer2, layers[1]),
            (self.layer3, layers[2]),
            (self.layer4, layers[3]),
        )
        fpn_sizes = [
            getattr(stage[depth - 1], final_conv_name).out_channels
            for stage, depth in stage_depth_pairs
        ]

        # Feature pyramid sized from the channel counts gathered above.
        self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2])

        self.regressionModel = RegressionModel(256)  # box-regression head
        self.classificationModel = ClassificationModel(256, num_classes=num_classes)  # classification head

        self.anchors = Anchors()  # anchor generator

        self.regressBoxes = BBoxTransform()  # presumably turns anchors + regression into boxes; not used in the visible code

        self.clipBoxes = ClipBoxes()  # presumably clips boxes to the image; not used in the visible code

        self.focalLoss = losses.FocalLoss()  # training loss

        # Re-initialise every conv and batch-norm layer in the model.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

        prior = 0.01

        # Override the output layers of both heads: zero weights everywhere,
        # and a classification bias chosen so the initial sigmoid output
        # equals `prior`.
        cls_output = self.classificationModel.output
        cls_output.weight.data.fill_(0)
        cls_output.bias.data.fill_(-math.log((1.0 - prior) / prior))

        box_output = self.regressionModel.output
        box_output.weight.data.fill_(0)
        box_output.bias.data.fill_(0)

        # NOTE(review): a later call to .train() presumably switches these
        # layers back to training mode; confirm the training script
        # re-freezes them.
        self.freeze_bn()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one residual stage of `blocks` blocks and update ``self.inplanes``."""
        out_planes = planes * block.expansion

        # A projection shortcut is needed whenever the first block changes the
        # spatial size or the channel count.
        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            shortcut = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

        stage = [block(self.inplanes, planes, stride, shortcut)]
        self.inplanes = out_planes
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stage)

    def freeze_bn(self):
        """Put every BatchNorm2d layer of the model into eval mode."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.eval()

    def forward(self, inputs):
        """Run the detector.

        In training mode ``inputs`` is a pair ``(img_batch, annotations)`` and
        the result of the focal loss is returned. In eval mode ``inputs`` is
        the image batch alone.

        NOTE(review): the code visible here returns nothing (None) in eval
        mode; an inference branch appears to be missing.
        """
        if self.training:  # the input structure differs between train and eval
            img_batch, annotations = inputs
        else:
            img_batch = inputs

        # Backbone
        stem = self.maxpool(self.relu(self.bn1(self.conv1(img_batch))))
        c2 = self.layer1(stem)
        c3 = self.layer2(c2)
        c4 = self.layer3(c3)
        c5 = self.layer4(c4)

        # Feature pyramid
        pyramid = self.fpn([c3, c4, c5])

        # Box-regression head on every level, concatenated along the anchor axis
        box_outputs = [self.regressionModel(level) for level in pyramid]
        regression = torch.cat(box_outputs, dim=1)

        # Classification head on every level, concatenated along the anchor axis
        class_outputs = [self.classificationModel(level) for level in pyramid]
        classification = torch.cat(class_outputs, dim=1)

        # Anchors for this image batch
        anchors = self.anchors(img_batch)

        if not self.training:
            return None

        return self.focalLoss(classification, regression, anchors, annotations)

NameError: name 'nn' is not defined