In [1]:
import numpy as np
import torch
import torch.nn as nn

## 3.1 先验框Anchors

&emsp;在RetinaNet中，先验框Anchors的设定遵循着以下几个条件：

- 从P3到P7,先验框的大小呈金字塔式的增加，从$32^2$到$512^2$（对应`class Anchors()`的`__init__`部分）;
- 先验框的长宽比有$\{1:2,1:1,2:1\}$这三种（对应`class Anchors()`的`__init__`部分）;
- 为了让先验框能更好地覆盖所有情况，金字塔每一层的先验框都会进行一个缩放，放大比例分别有$\{2^0,2^{(1/3)},2^{(2/3)}\}$（对应`class Anchors()`的`__init__`部分）;
- 总的来说，每一层都对应有3x3=9个先验框;
- 对于每一个先验框，它对应一个长度为数据类别的one-hot向量和一个长度为4的框回归的向量;
- 当IoU大于等于0.5时，该先验框会被对应到一个ground truth对象;当IoU小于0.4时，则被定义为背景;当IoU大于等于0.4小于0.5时，则在训练中被忽略。

In [2]:
class Anchors(nn.Module):
    """Build the anchor boxes of every feature-pyramid level for an input batch.

    Args:
        pyramid_levels: pyramid level indices; defaults to ``[3, 4, 5, 6, 7]``.
        strides: one stride per level; defaults to ``2 ** level``.
        sizes: one base anchor size per level; defaults to ``2 ** (level + 2)``.
        ratios: aspect ratios forwarded to ``generate_anchors``; defaults to
            ``[0.5, 1, 2]``.
        scales: scale multipliers forwarded to ``generate_anchors``; defaults to
            ``[2**0, 2**(1/3), 2**(2/3)]``.
    """

    def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None):
        super(Anchors, self).__init__()

        # Each attribute is always assigned: previously a caller-supplied value
        # was silently dropped, leaving the attribute undefined and making
        # forward() (or even the next default below) raise AttributeError.
        self.pyramid_levels = [3, 4, 5, 6, 7] if pyramid_levels is None else pyramid_levels
        # Default stride of a level is 2 ** level.
        self.strides = [2 ** x for x in self.pyramid_levels] if strides is None else strides
        # Default base size of a level is 2 ** (level + 2), i.e. 32 for level 3.
        self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] if sizes is None else sizes
        # Three aspect ratios by default.
        self.ratios = np.array([0.5, 1, 2]) if ratios is None else ratios
        # Three scale multipliers by default.
        if scales is None:
            self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
        else:
            self.scales = scales

    def forward(self, image):
        """Return all anchors as a float32 tensor with a leading axis of size 1.

        Args:
            image: input batch; only ``image.shape[2:]`` is read
                (presumably (height, width) of an NCHW batch - confirm with caller).

        Returns:
            Tensor of shape (1, total_anchors, 4). It is placed on the device of
            ``image`` when ``image`` is a tensor; otherwise on CUDA if available,
            else on CPU.
        """
        image_shape = np.array(image.shape[2:])
        # Ceil-divide the image size by 2 ** level to get each level's grid size.
        image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels]

        # Start from an empty (0, 4) array so an empty level list still yields (1, 0, 4).
        per_level = [np.zeros((0, 4), dtype=np.float32)]
        for idx in range(len(self.pyramid_levels)):
            # Base anchors (one per ratio/scale combination) for this level.
            anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales)
            # `shift` is defined outside this block; presumably it replicates the
            # base anchors over the level's grid using the stride - confirm.
            per_level.append(shift(image_shapes[idx], self.strides[idx], anchors))

        all_anchors = np.expand_dims(np.concatenate(per_level, axis=0), axis=0)
        anchors_tensor = torch.from_numpy(all_anchors.astype(np.float32))

        if torch.is_tensor(image):
            # Follow the input so CPU inference also works on a machine with a GPU.
            return anchors_tensor.to(image.device)
        if torch.cuda.is_available():
            return anchors_tensor.cuda()
        return anchors_tensor

In [3]:
def generate_anchors(base_size=16, ratios=None, scales=None):
    """
    Enumerate reference anchor boxes for every (aspect ratio, scale) pair.

    Each anchor is centred on the origin and returned as (x1, y1, x2, y2).
    Rows are ordered ratio-major: all scales of the first ratio, then all
    scales of the second ratio, and so on. The result is a float64 array of
    shape (len(ratios) * len(scales), 4).
    """
    if ratios is None:
        ratios = np.array([0.5, 1, 2])
    if scales is None:
        scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])

    n_ratios = len(ratios)
    n_scales = len(scales)

    # Side length of the square reference box of each anchor: the scale list
    # repeated once per ratio, multiplied by base_size.
    side = np.asarray(base_size * np.tile(scales, n_ratios), dtype=np.float64)
    # Ratio belonging to each row: every ratio repeated once per scale.
    ratio_per_anchor = np.repeat(ratios, n_scales)

    # Keep the area of the square box while changing its aspect ratio:
    # width = sqrt(area / ratio), height = width * ratio.
    widths = np.sqrt(side * side / ratio_per_anchor)
    heights = widths * ratio_per_anchor

    half_w = widths * 0.5
    half_h = heights * 0.5

    # Convert (centre = 0, width, height) into corner form (x1, y1, x2, y2).
    boxes = np.zeros((n_ratios * n_scales, 4))
    boxes[:, 0] = 0.0 - half_w
    boxes[:, 1] = 0.0 - half_h
    boxes[:, 2] = widths - half_w
    boxes[:, 3] = heights - half_h
    return boxes

## 3.2 Focal Loss & 框回归子网络的Loss
&emsp;Focal Loss仅用于分类子网络上，而整体的loss还包括框回归子网络的loss。

### 3.2.1 Focal Loss
&emsp;Focal Loss是交叉熵损失的改进版本，一个二分类交叉熵可以表示为：
$$CE(p,y)=
\begin{cases}
-\log{(p)}& \text{if}{\quad}y=1\\
-\log{(1-p)}& \text{otherwise}
\end{cases}$$
&emsp;上面公式可以简写成：
$$CE(p,y)=CE(p_t)=-\log{(p_t)}$$
&emsp;其中：
$$p_t=
\begin{cases}
p & \text{if}{\quad}y=1\\
1-p & \text{otherwise}
\end{cases}$$
#### $\alpha$：解决正负样本不平衡
&emsp;平衡交叉熵的提出是为了解决正负样本不平衡的问题。它的原理很简单，为正负样本分配不同的权重$\alpha\in{[0,1]}$：当$y=1$时取$\alpha$，当$y=0$时取$1-\alpha$。我们使用和$p_t$类似的方法将上面$\alpha$的情况表示为$\alpha_t$，即:
$$\alpha_t=
\begin{cases}
\alpha & \text{if}{\quad}y=1\\
1-\alpha & \text{otherwise}
\end{cases}$$
&emsp;那么这个$\alpha\text{-balanced}$交叉熵损失可以表示为
$$CE(p_t)=-\alpha_t\log{(p_t)}$$
#### $\gamma$：解决难易样本不平衡
&emsp;FL中$\gamma$的引入是为了解决难易样本不平衡的问题的。图2是FL中example预测概率和loss值之间的关系。其中蓝色曲线是交叉熵（$\gamma=0$时Focal Loss退化为交叉熵损失）的曲线。
<img src="jpnoteImages/gamma.png" width="350" align="bottom"/>
&emsp;从曲线中我们可以看出，对于一些well-classified examples (easy examples)，虽然它们单个example的loss可以收敛到很小，但是由于它们的数量过于庞大，会把一些hard example的loss覆盖掉，导致求和之后它们依然会支配整个批次样本的收敛方向。

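&emsp;一个非常简单的策略是继续缩小easy examples的训练比重。作者的思路很简单，给每个example的loss乘以$(1-p_t)^{\gamma}$。因为easy example的score $p_t$往往接近1，那么$(1-p_t)^{\gamma}$值会比较小，因此easy example得到了抑制，相对的hard example得到了放大，例如图中$\gamma>0$的那四条曲线。

&emsp;将$\alpha_t$与$(1-p_t)^{\gamma}$结合起来，就得到最终的Focal Loss：
$$FL(p_t)=-\alpha_t(1-p_t)^{\gamma}\log{(p_t)}$$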

### 3.2.2 框回归子网络的Loss
&emsp;框回归子网络的Loss中prediction（regression）与ground truth之间的关系与SSD的部分基本一致，比较有区别的地方在于loss的定义形式为如下分段函数的形式（从代码中获得，论文中未详细提及）
$$
\begin{aligned}
\text{diff}&=|\text{targets}-\text{pred}| \\
\text{loss}&=
\begin{cases}
0.5\times 9\times\text{diff}^2 & \text{if}{\quad}\text{diff}\le\frac{1}{9}\\
\text{diff}-\frac{0.5}{9} & \text{otherwise}
\end{cases}
\end{aligned}
$$
&emsp;两个loss对应的代码如下所示

In [4]:
def calc_iou(a, b):
    """Pairwise intersection-over-union between two sets of boxes.

    Both inputs are 2-D tensors whose first four columns are read as
    (x1, y1, x2, y2). The result has one row per box in ``a`` and one column
    per box in ``b``.
    """
    # Column vectors for `a` so that they broadcast against the rows of `b`.
    a_x1, a_y1, a_x2, a_y2 = (a[:, k].unsqueeze(1) for k in range(4))

    area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    # Overlap extent along each axis; negative extents mean no overlap.
    overlap_w = (torch.min(a_x2, b[:, 2]) - torch.max(a_x1, b[:, 0])).clamp(min=0)
    overlap_h = (torch.min(a_y2, b[:, 3]) - torch.max(a_y1, b[:, 1])).clamp(min=0)
    overlap = overlap_w * overlap_h

    area_a = (a_x2 - a_x1) * (a_y2 - a_y1)
    # Floor the union at 1e-8 so degenerate boxes never divide by zero.
    union = (area_a + area_b - overlap).clamp(min=1e-8)

    return overlap / union

class FocalLoss(nn.Module):
    """Focal classification loss plus smooth-L1 box-regression loss.

    Every helper tensor is created on the device of ``classifications`` instead
    of on "CUDA if available", so the loss works for CPU inputs on a GPU
    machine and never mixes CPU and CUDA tensors when the per-image losses are
    stacked.
    """

    def forward(self, classifications, regressions, anchors, annotations):
        """Compute the batch-mean classification and regression losses.

        Args:
            classifications: indexed as [image, anchor, class]; values are
                clamped to [1e-4, 1 - 1e-4] and used as probabilities.
            regressions: indexed as [image, anchor, 4]; compared against
                (dx, dy, dw, dh) targets.
            anchors: indexed as [0, anchor, 4]; only the first entry is used,
                read as (x1, y1, x2, y2).
            annotations: indexed as [image, box, 5] with columns
                (x1, y1, x2, y2, class index); rows whose class index is -1
                are discarded.

        Returns:
            Tuple ``(classification_loss, regression_loss)``; each is a tensor
            of shape (1,) holding the mean over the images of the batch.
        """
        alpha = 0.25  # weight of positive targets; negatives get 1 - alpha
        gamma = 2.0   # focusing exponent
        device = classifications.device
        batch_size = classifications.shape[0]
        classification_losses = []
        regression_losses = []

        anchor = anchors[0, :, :]

        # Anchors from corner form (x1, y1, x2, y2) to (centre, width, height).
        anchor_widths = anchor[:, 2] - anchor[:, 0]
        anchor_heights = anchor[:, 3] - anchor[:, 1]
        anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths
        anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights

        for j in range(batch_size):
            classification = classifications[j, :, :]
            regression = regressions[j, :, :]

            # Keep only real boxes; a class index of -1 marks a row to ignore.
            bbox_annotation = annotations[j, :, :]
            bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]

            # Keep probabilities away from 0 and 1 so the logarithms stay finite.
            classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)

            if bbox_annotation.shape[0] == 0:
                # No ground truth: every anchor is a negative for every class,
                # and there is nothing to regress. This branch sums the loss
                # without normalising it.
                alpha_factor = torch.full(classification.shape, 1.0 - alpha, device=device)
                focal_weight = alpha_factor * torch.pow(classification, gamma)
                bce = -(torch.log(1.0 - classification))
                cls_loss = focal_weight * bce
                classification_losses.append(cls_loss.sum())
                # Created on `device` so the final torch.stack never mixes devices.
                regression_losses.append(torch.zeros((), device=device))
                continue

            # IoU of every anchor with every ground-truth box: (num_anchors, num_annotations).
            IoU = calc_iou(anchor, bbox_annotation[:, :4])

            # Best-matching ground-truth box (value and index) for each anchor.
            IoU_max, IoU_argmax = torch.max(IoU, dim=1)

            # Classification targets, (num_anchors, num_classes): -1 = ignore.
            targets = torch.full(classification.shape, -1.0, device=device)

            # IoU < 0.4 -> background (all zeros).
            targets[torch.lt(IoU_max, 0.4), :] = 0

            # IoU >= 0.5 -> positive anchor.
            positive_indices = torch.ge(IoU_max, 0.5)
            num_positive_anchors = positive_indices.sum()

            # Ground-truth row assigned to every anchor.
            assigned_annotations = bbox_annotation[IoU_argmax, :]

            # One-hot encode the class (column 4) of the assigned box for positives.
            targets[positive_indices, :] = 0
            targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1

            is_positive_target = torch.eq(targets, 1.)
            alpha_pos = torch.full(targets.shape, alpha, device=device)
            # alpha for positive targets, 1 - alpha otherwise.
            alpha_factor = torch.where(is_positive_target, alpha_pos, 1. - alpha_pos)
            # (1 - p) for positive targets, p otherwise, raised to gamma.
            focal_weight = torch.where(is_positive_target, 1. - classification, classification)
            focal_weight = alpha_factor * torch.pow(focal_weight, gamma)

            # Plain binary cross-entropy per (anchor, class) entry.
            bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))

            cls_loss = focal_weight * bce

            # Entries still marked -1 (0.4 <= IoU < 0.5) contribute nothing.
            cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros_like(cls_loss))

            # Normalise by the number of positive anchors (at least 1).
            classification_losses.append(
                cls_loss.sum() / torch.clamp(num_positive_anchors.float(), min=1.0)
            )

            if num_positive_anchors > 0:
                assigned_annotations = assigned_annotations[positive_indices, :]

                anchor_widths_pi = anchor_widths[positive_indices]
                anchor_heights_pi = anchor_heights[positive_indices]
                anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
                anchor_ctr_y_pi = anchor_ctr_y[positive_indices]

                # Ground truth from corner form to (centre, width, height).
                gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0]
                gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
                gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths
                gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights

                # Width/height of at least 1 keeps the logarithms below finite.
                gt_widths = torch.clamp(gt_widths, min=1)
                gt_heights = torch.clamp(gt_heights, min=1)

                # Offsets of the ground truth relative to its anchor.
                targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
                targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
                targets_dw = torch.log(gt_widths / anchor_widths_pi)
                targets_dh = torch.log(gt_heights / anchor_heights_pi)

                regression_targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)).t()

                # Scale the targets; BBoxTransform in this file multiplies the
                # predictions by the same constants (its default std) when decoding.
                regression_targets = regression_targets / torch.tensor(
                    [[0.1, 0.1, 0.2, 0.2]], device=device
                )

                regression_diff = torch.abs(regression_targets - regression[positive_indices, :])

                # Smooth L1: quadratic up to 1/9, linear beyond.
                regression_loss = torch.where(
                    torch.le(regression_diff, 1.0 / 9.0),
                    0.5 * 9.0 * torch.pow(regression_diff, 2),
                    regression_diff - 0.5 / 9.0
                )
                regression_losses.append(regression_loss.mean())
            else:
                regression_losses.append(torch.zeros((), device=device))

        return (
            torch.stack(classification_losses).mean(dim=0, keepdim=True),
            torch.stack(regression_losses).mean(dim=0, keepdim=True),
        )

## 3.3 其他功能性函数

In [5]:
class BBoxTransform(nn.Module):
    """Decode regression deltas into (x1, y1, x2, y2) boxes.

    The deltas are first de-normalised as ``delta * std + mean``. With the
    default ``std`` of (0.1, 0.1, 0.2, 0.2) this undoes the division by
    (0.1, 0.1, 0.2, 0.2) applied to the regression targets in ``FocalLoss``.
    The result is then applied to the reference boxes (centre shift scaled by
    box size, exponential scaling of width and height).

    Args:
        mean: four offsets added to the scaled deltas; defaults to zeros.
        std: four factors multiplied with the deltas; defaults to
            (0.1, 0.1, 0.2, 0.2).
    """

    def __init__(self, mean=None, std=None):
        super(BBoxTransform, self).__init__()
        # Defaults are created on CUDA when available (as before); forward()
        # moves them to the device of its inputs when that differs.
        default_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if mean is None:
            self.mean = torch.tensor([0.0, 0.0, 0.0, 0.0], dtype=torch.float32, device=default_device)
        else:
            self.mean = mean
        if std is None:
            self.std = torch.tensor([0.1, 0.1, 0.2, 0.2], dtype=torch.float32, device=default_device)
        else:
            self.std = std

    def forward(self, boxes, deltas):
        """Apply ``deltas`` to ``boxes``.

        Args:
            boxes: indexed as [batch, box, 4], read as (x1, y1, x2, y2).
            deltas: indexed as [batch, box, 4], read as (dx, dy, dw, dh).

        Returns:
            Tensor of decoded boxes stacked along dim 2 as (x1, y1, x2, y2).
        """
        # mean/std are plain attributes (not buffers), so they do not follow
        # module.to(); align them with the inputs to avoid a device mismatch.
        mean = torch.as_tensor(self.mean, device=deltas.device)
        std = torch.as_tensor(self.std, device=deltas.device)

        # Reference boxes from corner form to (centre, width, height).
        widths = boxes[:, :, 2] - boxes[:, :, 0]
        heights = boxes[:, :, 3] - boxes[:, :, 1]
        ctr_x = boxes[:, :, 0] + 0.5 * widths
        ctr_y = boxes[:, :, 1] + 0.5 * heights

        # De-normalise each delta component.
        dx = deltas[:, :, 0] * std[0] + mean[0]
        dy = deltas[:, :, 1] * std[1] + mean[1]
        dw = deltas[:, :, 2] * std[2] + mean[2]
        dh = deltas[:, :, 3] * std[3] + mean[3]

        # Centre moves proportionally to the box size; size scales exponentially.
        pred_ctr_x = ctr_x + dx * widths
        pred_ctr_y = ctr_y + dy * heights
        pred_w = torch.exp(dw) * widths
        pred_h = torch.exp(dh) * heights

        # Back to corner form.
        pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w
        pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h
        pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w
        pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h

        pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2)

        return pred_boxes