<a href="https://colab.research.google.com/github/Kanghee-Lee/Yolact_Pytorch/blob/master/YOLACT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# Positions in the tuple returned by the backbone's forward pass; a later cell
# picks the feature maps at these positions and hands them to the FPN.
backbone_selected_layers=[1, 2, 3]


############### Backbone ###############
class Bottleneck(nn.Module):
    '''
    ResNet bottleneck residual block (1x1 -> 3x3 -> 1x1 convolutions).

    The same class serves as both block flavours:
      * Identity block      - ``downsample`` is None, the input is added back unchanged.
      * Convolutional block - ``downsample`` (conv2D + BatchNorm) projects the input
        so that it matches the shape of the main branch before the addition.

    ResNet's structure and parameters are described at
    https://ganghee-lee.tistory.com/41?category=863370

    Args:
        inplanes: number of channels of the incoming feature map.
        planes: channel width of the two inner convolutions; the block emits
            ``planes * expansion`` channels.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input before the residual add.
        norm_layer: factory called with a channel count to build each normalisation layer.
    '''
    expansion = 4

    # Only conv2 has kernel size 3, the others are 1x1.
    def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d):
        super().__init__()
        # Derive the output width from the class attribute (instead of a literal 4)
        # so that it always agrees with callers that compute planes * block.expansion.
        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = norm_layer(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = norm_layer(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return relu(main_branch(x) + shortcut(x))."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # Project the shortcut only when a downsample module was supplied.
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

In [0]:
class ResNetBackbone(nn.Module):
    '''
    ResNet feature extractor assembled from Bottleneck blocks.

    A stem (7x7 conv, norm, ReLU, 3x3 max-pool) is followed by four stages built
    with ``make_layer``. ``forward`` returns the output of every stage.

    Args:
        layers: number of blocks per stage; entries 0-3 are read.
        block: block class; must expose an ``expansion`` attribute.
        norm_layer: factory called with a channel count to build each normalisation layer.
    '''
    def __init__(self, layers, block=Bottleneck, norm_layer=nn.BatchNorm2d):
        super().__init__()

        self.num_base_layers = len(layers)
        self.layers = nn.ModuleList()
        self.channels = []          # output channel count of every stage, in order
        self.norm_layer = norm_layer
        self.inplanes = 64          # running input width for the next stage
        # stage 1 : (3, 550, 550) -> (64, 138, 138)
        # conv1 has stride 2, so it halves the input size.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(64)
        self.relu = nn.ReLU(inplace=True)
        # The max-pool halves the size again, so stage 1 shrinks the input by 4 overall.
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.make_layer(block, 64, layers[0])              # stage 2
        self.make_layer(block, 128, layers[1], stride=2)   # stage 3
        self.make_layer(block, 256, layers[2], stride=2)   # stage 4
        self.make_layer(block, 512, layers[3], stride=2)   # stage 5

        # Plain Python list (not registered as a sub-module) of every Conv2d created so far.
        self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)]

    def make_layer(self, block, planes, blocks, needsConv=True, stride=1):
        """
        Append one stage made of ``blocks`` blocks to ``self.layers``.

        The first block receives ``stride`` and, when ``needsConv`` is true, a
        conv2D + norm projection on its shortcut (the "convolutional block").
        The remaining blocks are identity blocks with stride 1. The stage's
        output width ``planes * block.expansion`` is recorded in ``self.channels``
        and becomes ``self.inplanes`` for the next stage.
        """
        downsample = None

        # Stage 2 is built with stride 1, so its convolutional block only changes the
        # channel count; the later stages use stride 2 and also halve the spatial size.
        if needsConv:
            downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes * block.expansion,
                                                 kernel_size=1, stride=stride, bias=False),
                                       self.norm_layer(planes * block.expansion))
        # Convolutional block.
        layers = [block(self.inplanes, planes, stride, downsample, self.norm_layer)]
        self.inplanes = planes * block.expansion

        # Identity blocks.
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer))

        layer = nn.Sequential(*layers)

        self.channels.append(planes * block.expansion)
        self.layers.append(layer)

    def forward(self, x):
        """ Returns a tuple with the output of every stage in ``self.layers``. """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.maxpool(x)

        outs = []
        for layer in self.layers:
            x = layer(x)
            outs.append(x)

        return tuple(outs)

    def init_backbone(self, path):
        """
        Initializes the backbone weights for training from the checkpoint at ``path``.

        Keys of the form ``layerN...`` are renamed to ``layers.(N-1)...`` so that
        they match this class's ModuleList before being loaded non-strictly.
        """
        # Load onto the CPU so that a checkpoint saved from a GPU can also be read on
        # a CPU-only machine; load_state_dict copies the values into the existing
        # parameters, whatever device those live on.
        state_dict = torch.load(path, map_location='cpu')

        # Replace layer1 -> layers.0 etc. (only a single digit after 'layer' is parsed).
        keys = list(state_dict)
        for key in keys:
            if key.startswith('layer'):
                idx = int(key[5])
                new_key = 'layers.' + str(idx - 1) + key[6:]
                state_dict[new_key] = state_dict.pop(key)

        # Note: strict=False silently ignores missing and unexpected keys. Triple check this.
        self.load_state_dict(state_dict, strict=False)

    def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck):
        """ Add a downsample layer to the backbone as per what SSD does. """
        self.make_layer(block, conv_channels // block.expansion, blocks=depth, stride=downsample)

def construct_backbone(cfg_backbone=ResNetBackbone, layers=(3, 4, 23, 3), selected_layers=(1, 2, 3)):
    """
    Build a backbone and make sure it has enough stages for ``selected_layers``.

    Args:
        cfg_backbone: backbone class, called with the list of per-stage block counts.
        layers: number of blocks per stage; the default (3, 4, 23, 3) is ResNet-101.
        selected_layers: indices of the stage outputs that will be consumed
            downstream; extra stages are appended with ``add_layer()`` until the
            largest index exists.

    Returns:
        The constructed backbone module.
    """
    backbone = cfg_backbone(list(layers))

    # Add downsampling layers until we reach the number we need
    num_layers = max(selected_layers) + 1

    while len(backbone.layers) < num_layers:
        backbone.add_layer()

    return backbone

from torchvision import models
from torchsummary import summary

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # PyTorch v0.4.0
# Print a per-layer summary of the backbone for a single 3x550x550 input.
model = construct_backbone().to(device)
summary(model, (3, 550, 550))
# NOTE(review): `input` shadows the Python builtin of the same name.
input=torch.randn(1, 3, 550, 550)
# A second, freshly initialised backbone is built here (it is not moved to `device`)
# and applied to the random batch, so `backbone` holds the tuple of stage outputs,
# not the module itself.
backbone=construct_backbone()(input)

# The Korean label below reads "number of backbone output features".
print('backbone output feature 개수 :', len(backbone))
print('C2 output shape : ', backbone[0].shape)
print('C3 output shape : ', backbone[1].shape)
print('C4 output shape : ', backbone[2].shape)
print('C5 output shape : ', backbone[3].shape)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 275, 275]           9,408
       BatchNorm2d-2         [-1, 64, 275, 275]             128
              ReLU-3         [-1, 64, 275, 275]               0
         MaxPool2d-4         [-1, 64, 138, 138]               0
            Conv2d-5         [-1, 64, 138, 138]           4,096
       BatchNorm2d-6         [-1, 64, 138, 138]             128
              ReLU-7         [-1, 64, 138, 138]               0
            Conv2d-8         [-1, 64, 138, 138]          36,864
       BatchNorm2d-9         [-1, 64, 138, 138]             128
             ReLU-10         [-1, 64, 138, 138]               0
           Conv2d-11        [-1, 256, 138, 138]          16,384
      BatchNorm2d-12        [-1, 256, 138, 138]             512
           Conv2d-13        [-1, 256, 138, 138]          16,384
      BatchNorm2d-14        [-1, 256, 1

In [0]:
############### Feature Pyramid Network ###############
class FPN(nn.Module):
    """
    Feature pyramid built on top of the selected backbone feature maps.

    This variant differs from the FPN introduced in
    https://arxiv.org/pdf/1612.03144.pdf.

    Sub-modules (all with 256 output channels):
      * last_layers       - one 1x1 conv per backbone map, stored deepest-first
                            (for in_channels=[512, 1024, 2048]: 2048, 1024, 512 inputs).
      * final_layers      - one 3x3 conv (padding 1) per pyramid level, applied after
                            the upsample-and-add step to smooth the merged map.
      * downsample_layers - ``num_downsample`` 3x3 stride-2 convs that extend the
                            pyramid beyond its last level (P6 and P7).
    """

    def __init__(self, in_channels):
        '''
        in_channels: channel count of every incoming feature map, shallowest first,
                     e.g. [512, 1024, 2048].
        '''
        super().__init__()
        self.num_downsample = 2
        self.in_channels = in_channels

        # Lateral 1x1 convolutions, deepest backbone map first.
        laterals = [nn.Conv2d(channels, 256, kernel_size=1)
                    for channels in reversed(self.in_channels)]
        self.last_layers = nn.ModuleList(laterals)

        # 3x3 smoothing convolutions, one per pyramid level.
        smoothers = [nn.Conv2d(256, 256, kernel_size=3, padding=1)
                     for _ in self.in_channels]
        self.final_layers = nn.ModuleList(smoothers)

        # Stride-2 convolutions producing the extra, coarser levels.
        reducers = [nn.Conv2d(256, 256, kernel_size=3, padding=1, stride=2)
                    for _ in range(self.num_downsample)]
        self.downsample_layers = nn.ModuleList(reducers)

    def forward(self, backbone_outs):
        '''
        backbone_outs: feature maps ordered shallowest to deepest, e.g.
                       [[n, 512, 69, 69], [n, 1024, 35, 35], [n, 2048, 18, 18]]
                       (C2 has already been dropped, so three maps arrive here).

        Returns a list with one 256-channel map per input followed by
        ``num_downsample`` additional, progressively smaller maps.
        '''
        num_levels = len(backbone_outs)
        running = torch.zeros(1, device=backbone_outs[0].device)
        pyramid = [running] * num_levels

        # The conv layers are stored in reverse order while inputs and outputs keep
        # their natural order, so the k-th layer (counting from 1) serves level
        # num_levels - k.
        for step, lateral in enumerate(self.last_layers, start=1):
            level = num_levels - step
            feature = backbone_outs[level]
            if level < num_levels - 1:
                # Bring the coarser running map up to this level's resolution.
                height, width = feature.size()[2:]
                running = F.interpolate(running, size=(height, width), mode='bilinear',
                                        align_corners=False)
            running = running + lateral(feature)
            pyramid[level] = running

        for step, smoother in enumerate(self.final_layers, start=1):
            level = num_levels - step
            pyramid[level] = F.relu(smoother(pyramid[level]))

        # Each extra level is computed from the current last entry of the list.
        for reducer in self.downsample_layers:
            pyramid.append(reducer(pyramid[-1]))

        return pyramid
# Collect the backbone feature maps listed in backbone_selected_layers as FPN inputs.
backbone_outs=[]
for i in backbone_selected_layers :
    backbone_outs.append(backbone[i])

# The list gives the channel count of each selected feature map.
fpn = FPN([512, 1024, 2048])
fpn_outs=fpn(backbone_outs)
# The Korean label below reads "number of FPN output features".
print('FPN output feature 개수 :', len(fpn_outs))
print('P3 output shape : ', fpn_outs[0].shape)
print('P4 output shape : ', fpn_outs[1].shape)
print('P5 output shape : ', fpn_outs[2].shape)
print('P6 output shape : ', fpn_outs[3].shape)
print('P7 output shape : ', fpn_outs[4].shape)

FPN output feature 개수 : 5
P3 output shape :  torch.Size([1, 256, 69, 69])
P4 output shape :  torch.Size([1, 256, 35, 35])
P5 output shape :  torch.Size([1, 256, 18, 18])
P6 output shape :  torch.Size([1, 256, 9, 9])
P7 output shape :  torch.Size([1, 256, 5, 5])


In [0]:
############### Proto Network ###############
'''
Use P3 which is deepest feature map and has highest resolution
'''
# Layer specification read by Protonet. Each entry is
# (out_channels, kernel_size, extra keyword arguments):
#   * entries 0, 1, 2, 4 and 5 describe convolutions;
#   * entry 3 has no channel count and a negative "kernel size": Protonet negates
#     it and uses the result (2) as the bilinear up-sampling scale factor.
mask_proto_net = [(256, 3, {'padding': 1}), (256, 3, {'padding': 1}), (256, 3, {'padding': 1}),
                  (None, -2, {}), (256, 3, {'padding': 1}), (32, 1, {})]

class Protonet(nn.Module) :
    """
    Prototype-mask branch: three convs, a bilinear up-sampling step, two more convs.

    Args:
        mask_proto_net: six-entry layer specification. Entries 0, 1, 2, 4 and 5 are
            ``(out_channels, kernel_size, conv_kwargs)`` tuples; entry 3 is
            ``(None, -scale_factor, interpolate_kwargs)`` for the up-sampling step.

    The first convolution expects ``self.inplanes`` (256) input channels. Every later
    convolution takes the previous convolution's output width as its input width,
    so specifications whose layer widths differ from 256 are wired correctly too.
    """
    def __init__(self, mask_proto_net) :
        super().__init__()

        self.inplanes=256
        self.mask_proto_net = mask_proto_net

        # Build the convolutions in order, chaining the channel counts
        # (the up-sampling entry at index 3 does not change the channel count).
        convs = []
        in_channels = self.inplanes
        for spec_idx in (0, 1, 2, 4, 5):
            spec = mask_proto_net[spec_idx]
            convs.append(nn.Conv2d(in_channels, spec[0], kernel_size=spec[1], **spec[2]))
            in_channels = spec[0]
        self.conv1, self.conv2, self.conv3, self.conv4, self.conv5 = convs
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return the prototype maps for ``x``; no activation follows the last conv."""
        out = self.conv1(x)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.relu(out)
        # The spec stores the scale factor negated, hence the minus sign.
        out = F.interpolate(out, scale_factor = -self.mask_proto_net[3][1], mode='bilinear', align_corners=False, **self.mask_proto_net[3][2])
        out = self.relu(out)
        out = self.conv4(out)
        out = self.relu(out)
        out = self.conv5(out)

        return out

# Feed the first FPN output (printed earlier as P3) through a freshly initialised
# Protonet and show the raw prototype tensor and its shape.
proto_out=Protonet(mask_proto_net)(fpn_outs[0])
print(proto_out)
print('-'*50)
print('Proto net output shape : ', proto_out.shape)



tensor([[[[ 0.0395,  0.0383,  0.0426,  ...,  0.0419,  0.0401,  0.0425],
          [ 0.0403,  0.0362,  0.0417,  ...,  0.0394,  0.0360,  0.0411],
          [ 0.0415,  0.0369,  0.0400,  ...,  0.0424,  0.0399,  0.0429],
          ...,
          [ 0.0405,  0.0404,  0.0444,  ...,  0.0406,  0.0408,  0.0414],
          [ 0.0405,  0.0413,  0.0433,  ...,  0.0394,  0.0400,  0.0381],
          [ 0.0388,  0.0386,  0.0365,  ...,  0.0365,  0.0365,  0.0363]],

         [[ 0.0315,  0.0320,  0.0326,  ...,  0.0331,  0.0317,  0.0288],
          [ 0.0345,  0.0293,  0.0281,  ...,  0.0315,  0.0292,  0.0286],
          [ 0.0368,  0.0326,  0.0332,  ...,  0.0340,  0.0318,  0.0309],
          ...,
          [ 0.0449,  0.0434,  0.0444,  ...,  0.0381,  0.0375,  0.0356],
          [ 0.0440,  0.0452,  0.0460,  ...,  0.0369,  0.0365,  0.0356],
          [ 0.0391,  0.0424,  0.0442,  ...,  0.0366,  0.0374,  0.0340]],

         [[-0.0486, -0.0499, -0.0515,  ..., -0.0495, -0.0501, -0.0513],
          [-0.0465, -0.0454, -

In [0]:
# proto_out : [n, 32, 138, 138]
# The number of mask coefficients per anchor equals the channel count of the
# prototype tensor (dimension 1 of proto_out).
coef_dim=proto_out.shape[1]
# NOTE(review): PredictionModule hard-codes the same value (81) instead of reading this global.
num_classes=81
# This line used the annotation form `name: value`, which only annotates and never
# binds the name, so `aspect_ratios` was undefined. It is a real assignment now.
# Presumably these are the three anchor aspect ratios that correspond to
# num_priors = 3 in PredictionModule - TODO confirm.
aspect_ratios = [1, 1 / 2, 2]
class PredictionModule(nn.Module):
    """
    Prediction head applied to one pyramid level.

    A shared 3x3 conv + ReLU is followed by three parallel 3x3 convs that predict,
    for each of the ``num_priors`` anchors at every pixel: 4 box values,
    ``num_classes`` class scores and ``coef_dim`` mask coefficients.

    Args:
        in_channels: channel count of the incoming feature map.
        coef_dim: number of mask coefficients predicted per anchor.
    """
    def __init__(self, in_channels, coef_dim):
        super().__init__()

        self.num_classes = 81
        self.coef_dim = coef_dim
        self.num_priors = 3            # number of anchor boxes for each pixel of the feature map

        # Honour the in_channels argument (it was previously ignored in favour of a
        # hard-coded 256, which made any other input width fail).
        self.upfeature = nn.Conv2d(in_channels, 256, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        out_channels = 256
        self.bbox_layer = nn.Conv2d(out_channels, self.num_priors * 4, kernel_size=3, padding=1)
        self.conf_layer = nn.Conv2d(out_channels, self.num_priors * self.num_classes, kernel_size=3, padding=1)
        self.mask_layer = nn.Conv2d(out_channels, self.num_priors * self.coef_dim, kernel_size=3, padding=1)

    def forward(self, x):
        """
        Return a dict with keys 'box', 'class' and 'coef'.

        For an input of shape [n, C, H, W] the values have shapes
        [n, H*W*num_priors, 4], [n, H*W*num_priors, num_classes] and
        [n, H*W*num_priors, coef_dim]; the coefficients are passed through tanh.
        Two shape messages are printed on every call.
        """
        x = self.upfeature(x)
        x = self.relu(x)
        conf = self.conf_layer(x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.num_classes)
        bbox = self.bbox_layer(x).permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, 4)
        # Run the mask layer once and reuse the result for both the printout and the
        # reshaped coefficients (it used to be evaluated twice).
        coef_test = self.mask_layer(x)
        print('mask layer output shape : ', coef_test.shape)
        # e.g. [n, 96, 69, 69] -> [n, 69*69*3, 32]: move channels last, then split
        # them into (anchor, coefficient).
        coef = coef_test.permute(0, 2, 3, 1).contiguous().view(x.size(0), -1, self.coef_dim)
        print('Changed shape : ', coef.shape)
        coef = torch.tanh(coef)

        return {'box': bbox, 'class': conf, 'coef': coef}
# Wrap a single prediction head in a ModuleList and print its output for the
# first FPN level.
prediction_layers = nn.ModuleList()
prediction_layers.append(PredictionModule(in_channels=256, coef_dim=coef_dim))
print(prediction_layers[0](fpn_outs[0]))

# Apply that same head to every FPN level and gather the per-level results by key.
predictions = {'box': [], 'class': [], 'coef': []}
for i in range(len(fpn_outs)) :
    p=prediction_layers[0](fpn_outs[i])
    for key, value in p.items() :
        predictions[key].append(value)
print(predictions.keys())


mask layer output shape :  torch.Size([1, 96, 69, 69])
Changed shape :  torch.Size([1, 14283, 32])
{'box': tensor([[[-0.0679,  0.1455,  0.0270,  0.1836],
         [-0.1596, -0.0462, -0.0232, -0.2538],
         [-0.0641, -0.1218, -0.1471, -0.0621],
         ...,
         [ 0.0878,  0.2053,  0.0869,  0.1902],
         [ 0.2527,  0.0430,  0.1544,  0.1631],
         [ 0.0999,  0.0977,  0.1058,  0.1289]]], grad_fn=<ViewBackward>), 'class': tensor([[[-0.0593,  0.0791, -0.0490,  ...,  0.1516, -0.1004, -0.0774],
         [-0.0414, -0.1331, -0.1768,  ...,  0.1356, -0.0410,  0.0604],
         [ 0.0413,  0.1187,  0.0259,  ...,  0.0632,  0.0308, -0.1532],
         ...,
         [-0.0274, -0.0330, -0.1076,  ..., -0.1525, -0.0576, -0.0219],
         [-0.0523, -0.0306,  0.1327,  ..., -0.0299, -0.0412,  0.0042],
         [-0.1187,  0.1677,  0.0395,  ...,  0.1190, -0.0270, -0.0976]]],
       grad_fn=<ViewBackward>), 'coef': tensor([[[-0.1319,  0.0452,  0.0812,  ...,  0.0077, -0.0303,  0.0068],
        

In [0]:
def jaccard(box_a, box_b, iscrowd:bool=False):
    """
    Pairwise overlap ratio between two sets of boxes.

    Boxes are read as four values per box whose columns 0/1 are subtracted from
    columns 2/3 to obtain width and height (presumably x1, y1, x2, y2 corners).

    box_a: [A, 4] or [n, A, 4]
    box_b: [B, 4] or [n, B, 4]
    iscrowd: when true the intersection is divided by the area of box_a instead
             of by the union.

    Returns a tensor of shape [A, B] for 2-D input, otherwise [n, A, B].
    """
    batched = box_a.dim() != 2
    if not batched:
        # Add a leading batch dimension so that the code below handles one layout only.
        box_a = box_a.unsqueeze(0)
        box_b = box_b.unsqueeze(0)

    overlap = intersect(box_a, box_b)

    width_a = box_a[:, :, 2] - box_a[:, :, 0]
    height_a = box_a[:, :, 3] - box_a[:, :, 1]
    area_a = (width_a * height_a).unsqueeze(2).expand_as(overlap)  # [n, A, B]

    width_b = box_b[:, :, 2] - box_b[:, :, 0]
    height_b = box_b[:, :, 3] - box_b[:, :, 1]
    area_b = (width_b * height_b).unsqueeze(1).expand_as(overlap)  # [n, A, B]

    union = area_a + area_b - overlap

    if iscrowd:
        ratio = overlap / area_a
    else:
        ratio = overlap / union

    if batched:
        return ratio
    return ratio.squeeze(0)
def intersect(box_a, box_b):
    """
    Pairwise intersection area between two batched sets of boxes.

    box_a: (tensor) bounding boxes, Shape: [n, A, 4].
    box_b: (tensor) bounding boxes, Shape: [n, B, 4].

    Both corner tensors are broadcast to [n, A, B, 2]
      box_a: [n, A, 2] -> [n, A, 1, 2] -> [n, A, B, 2]
      box_b: [n, B, 2] -> [n, 1, B, 2] -> [n, A, B, 2]
    so that every box of box_a is paired with every box of box_b.

    Returns the intersection areas, Shape: [n, A, B] (0 where boxes do not overlap).
    """
    batch, count_a, count_b = box_a.size(0), box_a.size(1), box_b.size(1)
    pair_shape = (batch, count_a, count_b, 2)

    # Corners stored in columns 2: (upper corners) and :2 (lower corners).
    upper_a = box_a[:, :, 2:].unsqueeze(2).expand(*pair_shape)
    upper_b = box_b[:, :, 2:].unsqueeze(1).expand(*pair_shape)
    lower_a = box_a[:, :, :2].unsqueeze(2).expand(*pair_shape)
    lower_b = box_b[:, :, :2].unsqueeze(1).expand(*pair_shape)

    # The overlap rectangle spans from the larger lower corner to the smaller upper one.
    overlap_upper = torch.min(upper_a, upper_b)
    overlap_lower = torch.max(lower_a, lower_b)

    # Negative extents mean "no overlap" and are clamped to zero.
    extent = torch.clamp(overlap_upper - overlap_lower, min=0)
    return extent[:, :, :, 0] * extent[:, :, :, 1]


In [0]:
# Module-level settings read by fast_nms:
#   conf_thresh        - minimum score; only applied when second_threshold=True
#   max_num_detections - cap on the number of detections returned across all classes
conf_thresh=0.05
max_num_detections=100
def fast_nms(boxes, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    """
    Matrix-based non-maximum suppression with intermediate values printed for study.

    boxes:  [num_dets, 4] box coordinates shared by all classes.
    scores: [num_classes, num_dets] score of every box for every class.
    iou_threshold: a box is dropped when its IoU with any higher-scoring box of the
                   same class exceeds this value.
    top_k: number of highest-scoring boxes per class that are considered.
    second_threshold: additionally drop boxes whose score is not above the
                      module-level ``conf_thresh``.

    Returns (boxes, classes, scores) of the surviving detections, sorted by score
    and truncated to the module-level ``max_num_detections``.
    """
    divider = '-'*80

    # Rank the detections of every class by score and keep the best top_k.
    ranked_scores, order = scores.sort(1, descending=True)
    order = order[:, :top_k].contiguous()
    ranked_scores = ranked_scores[:, :top_k]

    num_classes, num_dets = order.size()
    print(divider)
    print('sorted index : ', order)

    # Reorder the boxes to follow the per-class ranking.
    ranked_boxes = boxes[order.view(-1), :].view(num_classes, num_dets, 4)
    print('<sorted boxes>')
    print(ranked_boxes)

    overlap = jaccard(ranked_boxes, ranked_boxes)
    print(divider)

    # Keep only the strict upper triangle: entry (i, j) survives when box i ranks
    # above box j, so the column-wise maximum is the largest IoU of each box with
    # any higher-scoring box.
    overlap.triu_(diagonal=1)
    print(overlap)
    max_overlap = overlap.max(dim=1)[0]

    # Now just filter out the ones higher than the threshold
    keep = (max_overlap <= iou_threshold)
    print('keep: ', keep)
    if second_threshold:
        keep *= (ranked_scores > conf_thresh)

    # Assign each kept detection to its corresponding class
    class_ids = torch.arange(num_classes, device=ranked_boxes.device)[:, None].expand_as(keep)
    class_ids = class_ids[keep]
    kept_boxes = ranked_boxes[keep]
    kept_scores = ranked_scores[keep]

    # Only keep the max_num_detections highest scores across all classes
    kept_scores, final_order = kept_scores.sort(0, descending=True)
    final_order = final_order[:max_num_detections]
    kept_scores = kept_scores[:max_num_detections]

    return kept_boxes[final_order], class_ids[final_order], kept_scores
# Toy example for fast_nms: one class (a single row of scores) and four boxes.
score=torch.FloatTensor([[3.12, 1.213, 4., 5.23]])
print('<score>')
print(score)
# One row of four coordinates per box.
temp=[[0., 0., 5., 5.], [1., 1., 5., 5.], [1., 1., 2., 2.], [4., 4., 7., 7.]]
#temp=list(np.arange(i, i+4,) for i in range(4))
temp=torch.FloatTensor(temp)
print('')
print('<box coordinate>')
print(temp)
# b, c, s = kept boxes, their class indices and their scores.
b, c, s=fast_nms(temp, score)
print('*'*80)
print(b, c, s)

<score>
tensor([[3.1200, 1.2130, 4.0000, 5.2300]])

<box coordinate>
tensor([[0., 0., 5., 5.],
        [1., 1., 5., 5.],
        [1., 1., 2., 2.],
        [4., 4., 7., 7.]])
--------------------------------------------------------------------------------
sorted index :  tensor([[3, 2, 0, 1]])
<sorted boxes>
tensor([[[4., 4., 7., 7.],
         [1., 1., 2., 2.],
         [0., 0., 5., 5.],
         [1., 1., 5., 5.]]])
--------------------------------------------------------------------------------
tensor([[[0.0000, 0.0000, 0.0303, 0.0417],
         [0.0000, 0.0000, 0.0400, 0.0625],
         [0.0000, 0.0000, 0.0000, 0.6400],
         [0.0000, 0.0000, 0.0000, 0.0000]]])
keep:  tensor([[ True,  True,  True, False]])
********************************************************************************
tensor([[4., 4., 7., 7.],
        [1., 1., 2., 2.],
        [0., 0., 5., 5.]]) tensor([0, 0, 0]) tensor([5.2300, 4.0000, 3.1200])


In [0]:
# Scratch cell: prints a plain row sum next to the max-shifted term
# log(sum(exp(b - max(b)))), which is the expression ohem_conf_loss builds on.
import torch
a=torch.randn(2, 3)
b=a.view(-1, 2)     # reinterpret the 2x3 tensor as 3 rows of 2 values
print(b)
m=b.max()           # global maximum over all entries
print(b.sum(1))
print(torch.log(torch.sum(torch.exp(b-m), 1)))

tensor([[ 0.2916,  1.1499],
        [-0.8576, -0.3178],
        [ 0.3299,  1.0939]])
tensor([ 1.4416, -1.1754,  1.4239])
tensor([ 0.3534, -1.0085,  0.3264])


In [0]:
def ohem_conf_loss(self, class_p, conf_gt, positive_bool):
    """
    Classification loss with online hard example mining.

    class_p:       class scores, [n, num_anchors, self.num_classes]
                   (the author's notes give [n, 19248, 81]).
    conf_gt:       target class per anchor; negative entries are treated as
                   neutral and excluded.
    positive_bool: boolean mask of the positive anchors, [n, num_anchors].

    Reads ``self.num_classes`` and ``self.negpos_ratio`` and scales the summed
    cross-entropy by ``cfg.conf_alpha`` (``cfg`` is defined outside this function).
    """
    batch_size = class_p.size(0)
    flat_scores = class_p.view(-1, self.num_classes)  # [n*num_anchors, num_classes]

    # Per-anchor log-sum-exp minus the column-0 score, computed with the global
    # maximum subtracted before exponentiating and added back afterwards.
    peak = flat_scores.data.max()
    shifted_sum = torch.sum(torch.exp(flat_scores - peak), 1)
    hardness = torch.log(shifted_sum) + peak - flat_scores[:, 0]     # [n*num_anchors]

    # Hard negative mining: positives and neutrals must never be ranked as negatives.
    neutral = conf_gt < 0
    hardness = hardness.view(batch_size, -1)  # (n, num_anchors)
    hardness[positive_bool] = 0
    hardness[neutral] = 0

    # Sorting the sort indices yields the rank of every anchor in the descending order.
    _, order = hardness.sort(1, descending=True)
    _, rank = order.sort(1)

    pos_count = positive_bool.long().sum(1, keepdim=True)
    neg_count = torch.clamp(self.negpos_ratio * pos_count, max=positive_bool.size(1) - 1)

    # The neg_count best-ranked anchors of every image become the hard negatives.
    negative_bool = rank < neg_count.expand_as(rank)
    negative_bool[positive_bool] = 0    # filter out pos boxes
    negative_bool[neutral] = 0          # filter out neutrals

    # Confidence loss over the positives plus the mined negatives.
    chosen = positive_bool + negative_bool
    chosen_scores = class_p[chosen].view(-1, self.num_classes)
    chosen_targets = conf_gt[chosen]

    summed_loss = F.cross_entropy(chosen_scores, chosen_targets, reduction='sum')

    return cfg.conf_alpha * summed_loss

In [0]:
def bbox_loss(pos_box_p, pos_offsets):
    """
    Box regression loss: summed smooth-L1 distance between ``pos_box_p`` and
    ``pos_offsets``, scaled by ``cfg.bbox_alpha`` (``cfg`` is defined outside
    this function).
    """
    summed = F.smooth_l1_loss(pos_box_p, pos_offsets, reduction='sum')
    return summed * cfg.bbox_alpha

In [0]:
def lincomb_mask_loss(positive_bool, prior_max_index, coef_p, proto_p, mask_gt, prior_max_box):
        """
        Mask loss for masks assembled as a linear combination of prototypes.

        For every image, the masks of the positive anchors are built as
        sigmoid(prototypes @ coefficients), cropped with ``crop`` and compared to
        the down-sampled ground-truth masks with binary cross-entropy. Each mask's
        loss is divided by two values taken from ``center_size`` of its box
        (columns 2 and 3).

        Args (shapes are the original author's notes - TODO confirm against the caller):
            positive_bool:   boolean mask of positive anchors, indexed [image, anchor].
            prior_max_index: per anchor, the index of its ground-truth object.
            coef_p:          predicted mask coefficients, (n, 19248, 32).
            proto_p:         prototype maps; sizes 1 and 2 are used as height and width (138).
            mask_gt:         per image, the ground-truth masks of its objects.
            prior_max_box:   per anchor, the box passed to ``crop`` / ``center_size``.

        Returns the summed loss scaled by cfg.mask_alpha / proto_h / proto_w. If no
        image has a positive anchor the result is the plain number 0.

        ``cfg``, ``crop`` and ``center_size`` are defined outside this function.
        """
        proto_h = proto_p.size(1)  # 138
        proto_w = proto_p.size(2)  # 138

        loss_m = 0
        for i in range(coef_p.size(0)):  # coef_p.shape: (n, 19248, 32)
            with torch.no_grad():
                # Down-sample the ground-truth masks to the size of 'proto_p'.
                downsampled_masks = F.interpolate(mask_gt[i].unsqueeze(0), (proto_h, proto_w), mode='bilinear',
                                                  align_corners=False).squeeze(0)
                downsampled_masks = downsampled_masks.permute(1, 2, 0).contiguous()  # (138, 138, num_objects)
                # Bilinear down-sampling produces fractional values, so binarize again.
                downsampled_masks = downsampled_masks.gt(0.5).float()

            # Keep only the entries that belong to positive anchors; anchors that are
            # not positive do not contribute to the mask loss.
            pos_prior_index = prior_max_index[i, positive_bool[i]]  # pos_prior_index.shape: [num_positives]
            pos_prior_box = prior_max_box[i, positive_bool[i]]
            pos_coef = coef_p[i, positive_bool[i]]

            # Nothing to train on for this image.
            if pos_prior_index.size(0) == 0:
                continue

            # If there are more positives than cfg.masks_to_train, train on a random subset.
            old_num_pos = pos_coef.size(0)
            if old_num_pos > cfg.masks_to_train:
                perm = torch.randperm(pos_coef.size(0))
                select = perm[:cfg.masks_to_train]
                pos_coef = pos_coef[select]
                pos_prior_index = pos_prior_index[select]
                pos_prior_box = pos_prior_box[select]

            num_pos = pos_coef.size(0)

            # Pick, for every positive anchor, the mask of its object out of
            # downsampled_masks, which holds the masks of all objects.
            pos_mask_gt = downsampled_masks[:, :, pos_prior_index]

            # Mask assembly by linear combination.
            # '@' is matrix multiplication: (138, 138, 32) @ (32, num_pos).
            # pos_coef : [num_pos, 32] / proto_p[i] : [138, 138, 32]
            mask_p = torch.sigmoid(proto_p[i] @ pos_coef.t())  # mask_p.shape: (138, 138, num_pos)
            mask_p = crop(mask_p, pos_prior_box)  # pos_prior_box.shape: (num_pos, 4)

            mask_loss = F.binary_cross_entropy(torch.clamp(mask_p, 0, 1), pos_mask_gt, reduction='none')
            # Normalize the mask loss to emulate roi pooling's effect on loss:
            # sum the per-pixel loss of every mask, then divide by columns 2 and 3 of
            # center_size(box) (presumably the box width and height - TODO confirm).
            pos_get_csize = center_size(pos_prior_box)
            mask_loss = mask_loss.sum(dim=(0, 1)) / pos_get_csize[:, 2] / pos_get_csize[:, 3]

            # Scale back up when only a subset of the positives was used.
            if old_num_pos > num_pos:
                mask_loss *= old_num_pos / num_pos

            loss_m += torch.sum(mask_loss)

        loss_m *= cfg.mask_alpha / proto_h / proto_w

        return loss_m


In [0]:
    def semantic_segmentation_loss(segmentation_p, mask_gt, class_gt):
        """
        Auxiliary semantic-segmentation loss.

        segmentation_p: per-class segmentation logits, (n, num_classes, h, w); the
                        author's notes give (n, 80, 69, 69), i.e. without background.
        mask_gt:        per image, the ground-truth masks of its objects.
        class_gt:       per image, the class index of every object.

        For every image a per-class target is built as the pixel-wise maximum of the
        down-sampled, binarized masks of all objects of that class. The summed
        BCE-with-logits is divided by h and w and scaled by ``cfg.semantic_alpha``
        (``cfg`` is defined outside this function).
        """
        batch_size, _, mask_h, mask_w = segmentation_p.size()
        total = 0

        for sample_idx in range(batch_size):
            logits = segmentation_p[sample_idx]
            labels = class_gt[sample_idx]

            # Target construction needs no gradients.
            with torch.no_grad():
                resized = F.interpolate(mask_gt[sample_idx].unsqueeze(0), (mask_h, mask_w),
                                        mode='bilinear', align_corners=False).squeeze(0)
                binary_masks = resized.gt(0.5).float()  # (num_objects, h, w)

                # Merge the object masks into one channel per class.
                target = torch.zeros_like(logits, requires_grad=False)
                for obj_idx in range(binary_masks.size(0)):
                    class_idx = labels[obj_idx]
                    target[class_idx] = torch.max(target[class_idx], binary_masks[obj_idx])

            total += F.binary_cross_entropy_with_logits(logits, target, reduction='sum')

        # Normalize by the number of pixels per map.
        return total / mask_h / mask_w * cfg.semantic_alpha