# 02_SSD_model_forward.ipynb

### Create SSD network model and its forward function.

### The SSD model is composed of four modules:
###       (1) VGG, (2) extra, (3) loc, and (4) conf modules.


This includes Non-Maximum Suppression (NMS).

In [1]:
# import package
from math import sqrt
from itertools import product

import pandas as pd
import torch
from torch.autograd import Function
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

###  Implement VGG-module

In [2]:
# create the VGG module (35 layers, indexed 0-34)
def make_vgg():
    """Build the VGG part of the SSD network.

    Returns
    -------
    nn.ModuleList
        35 modules (indices 0-34): the conv/ReLU/max-pool stack described by
        ``cfg`` followed by pool5, the dilated conv6 and the 1x1 conv7,
        the last two each followed by a ReLU.
    """
    # An integer is the number of output feature maps of a 3x3 conv (+ ReLU).
    # 'M' is a 2x2 max pooling; 'MC' is the same pooling with ceil_mode=True,
    # i.e. the output size is rounded up instead of down.
    cfg = [64, 64, 'M', 128, 128, 'M', 256, 256,
           256, 'MC', 512, 512, 512, 'M', 512, 512, 512]

    modules = []
    channels = 3  # the input image has three colour channels

    for spec in cfg:
        if spec in ('M', 'MC'):
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2,
                                        ceil_mode=(spec == 'MC')))
            continue
        modules.append(nn.Conv2d(channels, spec, kernel_size=3, padding=1))
        modules.append(nn.ReLU(inplace=True))
        channels = spec

    # Tail of the module: stride-1 pooling, dilated 3x3 conv, then 1x1 conv.
    modules.extend([
        nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
        nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6),
        nn.ReLU(inplace=True),
        nn.Conv2d(1024, 1024, kernel_size=1),
        nn.ReLU(inplace=True),
    ])
    return nn.ModuleList(modules)


# Sanity check: build the VGG module and print its layer listing.
vgg_test = make_vgg()
print(vgg_test)


ModuleList(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
  (17): Conv2d(256, 512, kernel_siz

### Implement extra module

In [3]:
# create 8 layered extra module
def make_extras():
    """Build the 8-layer 'extras' module that follows the VGG module.

    The module alternates 1x1 and 3x3 convolutions. No activation is stored
    here; only the ``nn.Conv2d`` layers are returned.

    Returns
    -------
    nn.ModuleList
        Eight ``nn.Conv2d`` layers, the first taking 1024 input channels.
    """
    # (output feature maps, Conv2d keyword arguments) for each layer, in order.
    specs = [
        (256, {'kernel_size': 1}),
        (512, {'kernel_size': 3, 'stride': 2, 'padding': 1}),
        (128, {'kernel_size': 1}),
        (256, {'kernel_size': 3, 'stride': 2, 'padding': 1}),
        (128, {'kernel_size': 1}),
        (256, {'kernel_size': 3}),
        (128, {'kernel_size': 1}),
        (256, {'kernel_size': 3}),
    ]

    convs = []
    channels = 1024  # number of channels coming out of the VGG module
    for out_channels, conv_kwargs in specs:
        convs.append(nn.Conv2d(channels, out_channels, **conv_kwargs))
        channels = out_channels

    return nn.ModuleList(convs)


# Sanity check: build the extras module and print its layer listing.
extras_test = make_extras()
print(extras_test)


ModuleList(
  (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (2): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
  (3): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (4): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (5): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (6): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (7): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
)


### Implementation of the (3) loc and (4) conf modules

In [4]:
# create loc_layers: output the offset of the default box
# create conf_layers: output the confidence for each class of the default box


def make_loc_conf(num_classes=21, bbox_aspect_num=(4, 6, 6, 6, 4, 4)):
    """Build the 'loc' and 'conf' prediction heads, one pair per source.

    Each head is a 3x3 convolution (padding 1) applied to one of the six
    source feature maps.

    Parameters
    ----------
    num_classes : int
        Number of classes predicted per default box.
    bbox_aspect_num : sequence of int
        Number of default boxes per feature-map cell for each of the six
        sources. Only the first six entries are read.

    Returns
    -------
    (nn.ModuleList, nn.ModuleList)
        ``loc_layers`` (``4 * n`` output channels: one offset quadruple per
        default box) and ``conf_layers`` (``num_classes * n`` output
        channels: one score per class per default box).
    """
    # Input channels of source1..source6: the conv4_3 output (512), the final
    # VGG layer (1024) and the four outputs taken from the extras module.
    source_channels = (512, 1024, 512, 256, 256, 256)

    loc_layers = []
    conf_layers = []

    # The loc/conf layers are created interleaved, source by source, so the
    # order in which random initial weights are drawn is unchanged.
    for k, channels in enumerate(source_channels):
        num_dbox = bbox_aspect_num[k]
        loc_layers.append(
            nn.Conv2d(channels, num_dbox * 4, kernel_size=3, padding=1))
        conf_layers.append(
            nn.Conv2d(channels, num_dbox * num_classes,
                      kernel_size=3, padding=1))

    return nn.ModuleList(loc_layers), nn.ModuleList(conf_layers)


# Sanity check: build the loc and conf heads and print both layer listings.
loc_test, conf_test = make_loc_conf()
print(loc_test)
print(conf_test)


ModuleList(
  (0): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
ModuleList(
  (0): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)


### Implementation of the L2-Norm layer

In [5]:
# L2Norm layer after the conv4_3 
# This layer received 512x38x38 input
# This normalizes the different statistical properties of the feature map for each channel.
# See detail in the class slide

class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a trainable per-channel scale.

    Every spatial position of the input is divided by the L2 norm taken over
    the channel dimension, then each channel is multiplied by its own
    trainable weight.

    Parameters
    ----------
    input_channels : int
        Number of channels of the input feature map.
    scale : number
        Initial value of every per-channel weight.
    """

    def __init__(self, input_channels=512, scale=20):
        super().__init__()
        self.weight = nn.Parameter(torch.Tensor(input_channels))
        self.scale = scale  # initial value of the weights
        self.reset_parameters()
        self.eps = 1e-10  # added to the norm to avoid dividing by zero

    def reset_parameters(self):
        '''Fill every per-channel weight with ``self.scale``.'''
        init.constant_(self.weight, self.scale)

    def forward(self, x):
        '''Normalise ``x`` along dim 1 and rescale it channel by channel.

        For each spatial position the root of the sum of squares over the
        channels is computed; ``x`` is divided by it and then multiplied by
        the trainable weights. The output has the same size as ``x``.
        '''
        # (batch, 1, H, W): one norm per spatial position.
        channel_norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        normalized = x / channel_norm

        # Reshape the weights from (C,) to (1, C, 1, 1) so that they
        # broadcast over the batch and spatial dimensions.
        return self.weight.view(1, -1, 1, 1) * normalized


### Implementation of the Default Box (DBox)

In [6]:
# Class for outputting the default box
class DBox(object):
    """Generator of the default boxes (DBox) described by a config dict.

    ``cfg`` must provide the keys 'input_size', 'feature_maps', 'steps',
    'min_sizes', 'max_sizes' and 'aspect_ratios'; every list holds one entry
    per source feature map.
    """

    def __init__(self, cfg):
        super().__init__()

        self.image_size = cfg['input_size']  # side length of the input image
        self.feature_maps = cfg['feature_maps']  # side length of each source map
        self.num_priors = len(cfg["feature_maps"])  # number of sources
        self.steps = cfg['steps']  # image length covered by one feature-map cell
        self.min_sizes = cfg['min_sizes']  # side length of the small square DBox
        self.max_sizes = cfg['max_sizes']  # combined with min_sizes for the large square DBox
        self.aspect_ratios = cfg['aspect_ratios']  # ratios of the rectangular DBox

    def make_dbox_list(self):
        '''Create every default box as rows of [cx, cy, width, height].

        All values are relative to the image size and clamped to [0, 1].
        With the SSD300 settings the result is torch.Size([8732, 4]).
        '''
        flat = []  # flattened [cx, cy, w, h, cx, cy, w, h, ...]

        for k, size in enumerate(self.feature_maps):
            # product(..., repeat=2) yields (0,0), (0,1), ..., (size-1, size-1)
            for row, col in product(range(size), repeat=2):
                # effective number of cells along one side of the image
                f_k = self.image_size / self.steps[k]

                # centre of the cell, normalised by the image size
                cx = (col + 0.5) / f_k
                cy = (row + 0.5) / f_k

                # small square box
                s_k = self.min_sizes[k] / self.image_size
                flat.extend([cx, cy, s_k, s_k])

                # large square box: geometric mean of the min and max sizes
                s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size))
                flat.extend([cx, cy, s_k_prime, s_k_prime])

                # one wide and one tall rectangle per aspect ratio
                for ar in self.aspect_ratios[k]:
                    root = sqrt(ar)
                    flat.extend([cx, cy, s_k * root, s_k / root])
                    flat.extend([cx, cy, s_k / root, s_k * root])

        boxes = torch.Tensor(flat).view(-1, 4)

        # Clamp in place so that no value falls outside [0, 1].
        boxes.clamp_(max=1, min=0)

        return boxes


In [7]:
# Sanity check: generate the default boxes for the SSD300 configuration.

# SSD300 network settings
ssd_cfg = {
    'num_classes': 21,  # total number of classes (20 objects + background)
    'input_size': 300,  # side length of the input image
    'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # number of DBox per feature-map cell, for each source
    'feature_maps': [38, 19, 10, 5, 3, 1],  # side length of the feature map of each source
    'steps': [8, 16, 32, 64, 100, 300],  # image size (side length) supported by one node on the feature map
    'min_sizes': [30, 60, 111, 162, 213, 264],  # side length of the small square DBox (input-image units)
    'max_sizes': [60, 111, 162, 213, 264, 315],  # combined with min_sizes to size the large square DBox
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],  # aspect ratios of the rectangular DBox, per source
}

# create the DBox list from the settings above
dbox = DBox(ssd_cfg)
dbox_list = dbox.make_dbox_list()

# show the DBox list as a table; each row is [cx, cy, width, height]
pd.DataFrame(dbox_list.numpy())


Unnamed: 0,0,1,2,3
0,0.013333,0.013333,0.100000,0.100000
1,0.013333,0.013333,0.141421,0.141421
2,0.013333,0.013333,0.141421,0.070711
3,0.013333,0.013333,0.070711,0.141421
4,0.040000,0.013333,0.100000,0.100000
...,...,...,...,...
8727,0.833333,0.833333,0.502046,1.000000
8728,0.500000,0.500000,0.880000,0.880000
8729,0.500000,0.500000,0.961249,0.961249
8730,0.500000,0.500000,1.000000,0.622254


### Implementation of the SSD class (architecture only)
###   The full implementation follows below.

In [8]:
# Definition of SSD class
class SSD(nn.Module):
    """SSD network, architecture only (no ``forward`` is defined here).

    Parameters
    ----------
    phase : str
        'inference' additionally attaches a ``Detect`` instance; any other
        value builds the network without it.
    cfg : dict
        Network settings. Reads 'num_classes' and 'bbox_aspect_num' here and
        passes the whole dict on to ``DBox``.
    """

    def __init__(self, phase, cfg):
        super().__init__()

        self.phase = phase
        self.num_classes = cfg["num_classes"]

        # The four sub-networks plus the L2Norm layer. The construction order
        # is kept because it fixes the order of the registered sub-modules.
        self.vgg = make_vgg()
        self.extras = make_extras()
        self.L2Norm = L2Norm()
        self.loc, self.conf = make_loc_conf(
            self.num_classes, cfg["bbox_aspect_num"])

        # Default boxes, stored as a plain tensor attribute.
        self.dbox_list = DBox(cfg).make_dbox_list()

        # Post-processing is only attached for inference.
        if phase == 'inference':
            self.detect = Detect()


# Sanity check: build the SSD model in training phase and print its structure.
ssd_test = SSD(phase="train", cfg=ssd_cfg)
print(ssd_test)


SSD(
  (vgg): ModuleList(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, cei

### Implementation of decode()

In [9]:
# Function to convert DBox to BBox using offset information

def decode(loc, dbox_list):
    """
    Turn default boxes into bounding boxes by applying predicted offsets.

    Parameters
    ----------
    loc:  [8732,4]
        offsets estimated by the SSD model
        [Δcx, Δcy, Δwidth, Δheight]
    dbox_list: [8732,4]
        default boxes
        [cx, cy, width, height]

    Returns
    -------
    boxes : [8732,4]
        bounding boxes as [xmin, ymin, xmax, ymax]
    """
    dbox_center = dbox_list[:, :2]
    dbox_size = dbox_list[:, 2:]

    # Centre offsets are scaled by 0.1 and by the DBox size;
    # size offsets are scaled by 0.2 and applied through exp().
    center = dbox_center + loc[:, :2] * 0.1 * dbox_size
    size = dbox_size * torch.exp(loc[:, 2:] * 0.2)

    # Convert [cx, cy, width, height] to the two corners.
    top_left = center - size / 2      # (xmin, ymin)
    bottom_right = size + top_left    # (xmax, ymax)

    return torch.cat((top_left, bottom_right), dim=1)


### Implementation of Non-Maximum Suppression (NMS)

In [10]:
# Non-Maximum Suppression (NMS) 


def nm_suppression(boxes, scores, overlap=0.45, top_k=200):
    """
    Non-Maximum Suppression: drop boxes that overlap a better-scored box.

    Boxes are visited from the highest score downwards. Each visited box is
    kept, and every remaining box whose IoU with it exceeds ``overlap`` is
    discarded.

    Parameters
    ----------
    boxes : [N, 4]
        Candidate boxes as [xmin, ymin, xmax, ymax].
    scores : [N]
        Confidence score of each candidate box.
    overlap : float
        IoU threshold; boxes with IoU > ``overlap`` against a kept box
        are removed.
    top_k : int
        Only the ``top_k`` highest-scored candidates are considered.

    Returns
    -------
    keep : LongTensor [N]
        The first ``count`` entries are the indices (into ``boxes`` /
        ``scores``) of the kept boxes in descending score order; the
        remaining entries are 0.
    count : int
        Number of boxes that passed NMS.
    """
    count = 0
    keep = torch.zeros(scores.size(0), dtype=torch.long, device=scores.device)

    # NMS only selects indices, so the geometry is computed on a detached
    # view. This also makes the function usable when `boxes` requires grad.
    coords = boxes.detach()
    x1 = coords[:, 0]  # xmin
    y1 = coords[:, 1]  # ymin
    x2 = coords[:, 2]  # xmax
    y2 = coords[:, 3]  # ymax
    area = (x2 - x1) * (y2 - y1)

    # Ascending sort: the best remaining candidate is always idx[-1].
    _, idx = scores.sort(0)
    idx = idx[-top_k:]

    while idx.numel() > 0:
        i = idx[-1]  # index of the highest-scored remaining box ("ref-box")
        keep[count] = i
        count += 1

        # The last remaining box has nothing left to be compared with.
        if idx.size(0) == 1:
            break

        idx = idx[:-1]  # remove the ref-box from the candidates

        # Corners of the intersection between the ref-box and each candidate.
        # Plain indexing allocates fresh tensors every iteration instead of
        # resizing `out=` buffers.
        inter_x1 = torch.clamp(x1[idx], min=x1[i])
        inter_y1 = torch.clamp(y1[idx], min=y1[i])
        inter_x2 = torch.clamp(x2[idx], max=x2[i])
        inter_y2 = torch.clamp(y2[idx], max=y2[i])

        # A negative width or height means no overlap: clamp it to zero.
        inter_w = torch.clamp(inter_x2 - inter_x1, min=0.0)
        inter_h = torch.clamp(inter_y2 - inter_y1, min=0.0)
        inter = inter_w * inter_h

        # IoU = intersect(a, b) / (area(a) + area(b) - intersect(a, b))
        union = (area[idx] - inter) + area[i]
        iou = inter / union

        # Keep only the candidates whose overlap is within the threshold.
        idx = idx[iou.le(overlap)]

    return keep, count


### Implementation of Detect()

In [11]:
# From 'conf' and 'loc' from SSD, output BBOx after removing overlapping


class Detect(nn.Module):
    """Turn raw SSD outputs ('loc', 'conf') into final detections.

    This is pure post-processing: softmax over the class scores, decoding of
    the default boxes, confidence thresholding and per-class Non-Maximum
    Suppression. It is an ``nn.Module`` so that an instance can be called
    directly; a legacy ``autograd.Function`` with ``__init__`` and a
    non-static ``forward`` cannot be called in current PyTorch.
    It has no trainable parameters.

    Parameters
    ----------
    conf_thresh : float
        Only boxes with a class confidence greater than this are considered.
    top_k : int
        Maximum number of boxes per class passed to and kept by NMS.
    nms_thresh : float
        IoU threshold above which two boxes are treated as overlapping.
    """

    def __init__(self, conf_thresh=0.01, top_k=200, nms_thresh=0.45):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)  # normalises 'conf' over the classes
        self.conf_thresh = conf_thresh
        self.top_k = top_k
        self.nms_thresh = nms_thresh

    @torch.no_grad()  # post-processing only, no gradient is needed
    def forward(self, loc_data, conf_data, dbox_list):
        """
        Forward processing.

        Parameters
        ----------
        loc_data:  [batch_num, 8732, 4]
            offset info
        conf_data: [batch_num, 8732, num_classes]
            raw confidence scores of each DBox
        dbox_list: [8732, 4]
            DBox location info [cx, cy, width, height]

        Returns
        -------
        output : torch.Size([batch_num, num_classes, top_k, 5])
            For each image and class, up to top_k rows of
            [conf, xmin, ymin, xmax, ymax] in descending confidence order.
            Unused rows, and the whole background class 0, stay zero.
            The tensor is allocated on the CPU with the default dtype.
        """
        num_batch = loc_data.size(0)     # mini-batch size
        num_classes = conf_data.size(2)  # number of classes

        # Normalise the confidence scores over the class dimension.
        conf_data = self.softmax(conf_data)

        output = torch.zeros(num_batch, num_classes, self.top_k, 5)

        # [batch_num, 8732, num_classes] -> [batch_num, num_classes, 8732]
        conf_preds = conf_data.transpose(2, 1)

        # The default boxes may live on another device than the predictions.
        dbox_list = dbox_list.to(loc_data.device)

        for i in range(num_batch):
            # 1. Bounding boxes [xmin, ymin, xmax, ymax] from 'loc' and DBox.
            decoded_boxes = decode(loc_data[i], dbox_list)
            conf_scores = conf_preds[i]  # [num_classes, 8732]

            # Class 0 is the background and is skipped.
            for cl in range(1, num_classes):
                # 2. Keep only the boxes whose confidence exceeds the threshold.
                c_mask = conf_scores[cl].gt(self.conf_thresh)  # [8732], bool
                scores = conf_scores[cl][c_mask]

                # Nothing exceeds the threshold for this class.
                if scores.nelement() == 0:
                    continue

                # Row-wise boolean indexing: [number of selected boxes, 4]
                boxes = decoded_boxes[c_mask]

                # 3. Remove highly overlapping boxes.
                #    ids: kept indices in descending confidence order
                #    count: number of kept boxes
                ids, count = nm_suppression(
                    boxes, scores, self.nms_thresh, self.top_k)
                kept = ids[:count]

                output[i, cl, :count] = torch.cat(
                    (scores[kept].unsqueeze(1), boxes[kept]), 1)

        return output


### Implementation of the SSD class (full)

In [12]:
# definition of SSD class


class SSD(nn.Module):
    """Complete SSD network: VGG, extras, loc and conf modules plus forward.

    Parameters
    ----------
    phase : str
        'inference' attaches a ``Detect`` instance and makes ``forward``
        return its result; any other value makes ``forward`` return the
        raw ``(loc, conf, dbox_list)`` tuple.
    cfg : dict
        Network settings. Reads 'num_classes' and 'bbox_aspect_num' here and
        passes the whole dict on to ``DBox``.
    """

    def __init__(self, phase, cfg):
        super().__init__()

        self.phase = phase
        self.num_classes = cfg["num_classes"]

        # The four sub-networks plus the L2Norm layer.
        self.vgg = make_vgg()
        self.extras = make_extras()
        self.L2Norm = L2Norm()
        self.loc, self.conf = make_loc_conf(
            self.num_classes, cfg["bbox_aspect_num"])

        # Default boxes, stored as a plain tensor attribute.
        self.dbox_list = DBox(cfg).make_dbox_list()

        # Post-processing is only attached for inference.
        if phase == 'inference':
            self.detect = Detect()

    def forward(self, x):
        """Run the network on a batch of images ``x``.

        Returns ``(loc, conf, dbox_list)`` with loc of size
        [batch_num, number of DBox, 4] and conf of size
        [batch_num, number of DBox, num_classes]; in the 'inference' phase
        the result of ``self.detect`` on these three values is returned
        instead.
        """
        sources = []  # source1-source6: the inputs of the loc/conf heads

        # Walk through the VGG module once. The output of vgg[22] (the ReLU
        # after conv4_3) goes through L2Norm and becomes source1; the output
        # of the final VGG layer becomes source2.
        for index, layer in enumerate(self.vgg):
            x = layer(x)
            if index == 22:
                sources.append(self.L2Norm(x))
        sources.append(x)

        # Extras: every layer is conv -> ReLU, and the output of every
        # second layer is collected as source3-source6.
        for index, conv in enumerate(self.extras):
            x = F.relu(conv(x), inplace=True)
            if index % 2 == 1:
                sources.append(x)

        # One 3x3 conv per source for 'loc' and one for 'conf'.
        # The head output is [batch_num, channels, height, width]; since the
        # channel count differs between sources, that dimension is moved to
        # the end: [batch_num, height, width, channels].
        # contiguous() lays the permuted elements out sequentially in
        # memory, which the later view() calls require.
        loc_maps = []
        conf_maps = []
        for feature, loc_head, conf_head in zip(sources, self.loc, self.conf):
            loc_maps.append(loc_head(feature).permute(0, 2, 3, 1).contiguous())
            conf_maps.append(conf_head(feature).permute(0, 2, 3, 1).contiguous())

        # Flatten each map per image and concatenate the sources.
        # With the SSD300 settings:
        #   loc:  torch.Size([batch_num, 34928])
        #   conf: torch.Size([batch_num, 183372])
        loc = torch.cat([m.view(m.size(0), -1) for m in loc_maps], 1)
        conf = torch.cat([m.view(m.size(0), -1) for m in conf_maps], 1)

        # Regroup per default box.
        #   loc:  torch.Size([batch_num, 8732, 4])
        #   conf: torch.Size([batch_num, 8732, 21])
        loc = loc.view(loc.size(0), -1, 4)
        conf = conf.view(conf.size(0), -1, self.num_classes)

        if self.phase == "inference":
            # result size: torch.Size([batch_num, 21, 200, 5])
            return self.detect(loc, conf, self.dbox_list)

        # training: hand the raw predictions and the default boxes back
        return (loc, conf, self.dbox_list)
            # return size: (loc, conf, dbox_list)


