In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

from utils import *

%load_ext autoreload

# Feature Extraction

Set the input image (H, W) = (800, 800)
Use the VGG16 to extrcat features -> (50, 50)

**bbox and anchor -> y1, x1, y2, x2**

In [2]:
image = torch.zeros((1, 3, 800, 800)).float()
image_size = (800, 800)

# bbox -> y1, x1, y2, x2
bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]])
labels = torch.LongTensor([6, 8])

sub_sample = 16

## Use the first 30 layers of VGG16 to exact features. 

Because there are 4 maxpooling layers, (H, W) -> (H//16, W//16)

13 conv layers, 13 ReLU layers, 4 Maxpooling layers -> 30 layers

In [3]:
vgg16 = torchvision.models.vgg16(pretrained=True)
req_features = vgg16.features[:30]
print(req_features)
output_map = req_features(image)
print(output_map.shape)

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(

# Target Anchor Box

## Assign template anchor boxes with origin (0, 0)

We will use `anchor_scales(feature map)` of **8, 16, 32**, `ratio` of **0.5, 1, 2** and `sub sampling` of **16** (Since we have pooled our image from 800 px to 50px). Now every pixel in the output feature map maps to corresponding 16 * 16 pixels in the image.

anchor_template = (9, 4)

In [4]:
anchor_scale = [8, 16, 32]
ratio = [0.5, 1, 2] # H/W

len_anchor_scale = len(anchor_scale)
len_ratio = len(ratio)
len_anchor_template = len_anchor_scale * len_ratio
anchor_template = np.zeros((9, 4))

for idx, scale in enumerate(anchor_scale):
    h = scale * np.sqrt(ratio) * sub_sample
    w = scale / np.sqrt(ratio) * sub_sample
    y1 = -h/2
    x1 = -w/2
    y2 = h/2
    x2 = w/2
    anchor_template[idx*len_ratio:(idx+1)*len_ratio, 0] = y1
    anchor_template[idx*len_ratio:(idx+1)*len_ratio, 1] = x1
    anchor_template[idx*len_ratio:(idx+1)*len_ratio, 2] = y2
    anchor_template[idx*len_ratio:(idx+1)*len_ratio, 3] = x2

print(anchor_template)

[[ -45.254834    -90.50966799   45.254834     90.50966799]
 [ -64.          -64.           64.           64.        ]
 [ -90.50966799  -45.254834     90.50966799   45.254834  ]
 [ -90.50966799 -181.01933598   90.50966799  181.01933598]
 [-128.         -128.          128.          128.        ]
 [-181.01933598  -90.50966799  181.01933598   90.50966799]
 [-181.01933598 -362.03867197  181.01933598  362.03867197]
 [-256.         -256.          256.          256.        ]
 [-362.03867197 -181.01933598  362.03867197  181.01933598]]


In [5]:
np.sqrt(ratio)

array([0.70710678, 1.        , 1.41421356])

## Generate anchor at all the feature map location

### Generate center coors for all the feature map pixels
ctr -> (50, 50, 2)

In [6]:
feature_map_size = (50, 50)
# The first center coors is (8, 8)
ctr_y = np.arange(8, 800, 16)
ctr_x = np.arange(8, 800, 16)

In [7]:
ctr = np.zeros((*feature_map_size, 2))
for idx, y in enumerate(ctr_y):
    ctr[idx, :, 0] = y
    ctr[idx, :, 1] = ctr_x
print(ctr.shape)

(50, 50, 2)


### Add center coors and previous anchor boxes coors

anchors -> (50, 50, 9, 4)

In [8]:
anchors = np.zeros((*feature_map_size, 9, 4))

for idx_y in range(feature_map_size[0]):
    for idx_x in range(feature_map_size[1]):
        anchors[idx_y, idx_x] = (ctr[idx_y, idx_x] + anchor_template.reshape(-1, 2, 2)).reshape(-1, 4)
print(anchors.shape)

(50, 50, 9, 4)


In [9]:
anchors = anchors.reshape(-1, 4)
print(anchors.shape)

(22500, 4)


## Assign labels and location of objects to each and every anchor

### Find the index of all valid anchor boxes

valid anchor boxes -> inside the image

In [10]:
valid_index = np.where((anchors[:, 0] >= 0)
                      &(anchors[:, 1] >= 0)
                      &(anchors[:, 2] <= 800)
                      &(anchors[:, 3] <= 800))[0]
print(valid_index.shape)

(8940,)


We assign a **positive label to two kind of anchors** a) The anchor/anchors with the highest Intersection-over-Union(IoU) overlap with a ground-truth-box or b) An anchor that has an IoU overlap higher than 0.7 with ground-truth box.


**Note that single ground-truth object may assign positive labels to multiple anchors.**


c) We assign a **negative label** to a non-positive anchor if its IoU ratio is lower than 0.3 for all ground-truth boxes. 

d) Anchors that are **neither positive nor negitive** do not contribute to the training objective.

### create an empty label array with inside_index shape and fill with -1. 

In [11]:
valid_labels = np.empty((valid_index.shape[0],), dtype=np.int32)
valid_labels.fill(-1)

### Create valid anchors

In [12]:
valid_anchors = anchors[valid_index]

In [13]:
print(valid_anchors.shape)
print(bbox.shape)

(8940, 4)
torch.Size([2, 4])


### Generate ious matrix between valid_anchors and bbox

In [14]:
ious = bbox_iou(valid_anchors, bbox.numpy())

### Label pos and neg anchors

Considering the scenarios of a and b, we need to find two things here
- the highest iou for each gt_box and its corresponding anchor box
- the highest iou for each anchor box and its corresponding ground truth box

#### Set `pos_iou_thres` and `neg_iou_thres` to set anchors

In [15]:
pos_iou_thres = 0.7
neg_iou_thred = 0.3

anchor_max_iou = np.amax(ious, axis=1)
pos_iou_anchor_label = np.where(anchor_max_iou >= pos_iou_thres)[0]
neg_iou_anchor_label = np.where(anchor_max_iou < neg_iou_thred)[0]
valid_labels[pos_iou_anchor_label] = 1
valid_labels[neg_iou_anchor_label] = 0

#### Assign labels to the anchors with highest iou with box

In [16]:
gt_max_iou = np.amax(ious, axis=0)
gt_max_iou_anchor_label = np.where(ious == gt_max_iou)[0]
print(gt_max_iou_anchor_label)
valid_labels[gt_max_iou_anchor_label] = 1

[2264 2271 4097 4105 4113 4121 4371 4379 4387 4395 4645 4653 4661 4669
 4919 4927 4935 4943]


### Sample positive and negtive anchors

> we randomly sample 256 anchors in an image to compute the loss function of a mini-batch, where the sampled positive and negative anchors have a ratio of up to 1:1. If there are fewer than 128 positive samples in an image, we pad the mini-batch with negitive ones.

In [17]:
n_sample_anchors = 256
pos_ratio = 0.5

total_n_pos = len(np.where(valid_labels == 1)[0])
n_pos_sample = n_sample_anchors*pos_ratio if total_n_pos > n_sample_anchors*pos_ratio else total_n_pos
n_neg_sample = n_sample_anchors - n_pos_sample

In [18]:
pos_index = np.where(valid_labels == 1)[0]
if len(pos_index) > n_sample_anchors*pos_ratio:
    disable_index = np.random.choice(pos_index, size=len(pos_index)-n_pos_sample, replace=False)
    valid_labels[disable_index] = -1

neg_index = np.where(valid_labels == 0)[0]
disable_index = np.random.choice(neg_index, size=len(neg_index) - n_neg_sample, replace=False)
valid_labels[disable_index] = -1

### Assigning locations to anchor boxes

```
t_{x} = (x - x_{a})/w_{a}
t_{y} = (y - y_{a})/h_{a}
t_{w} = log(w/ w_a)
t_{h} = log(h/ h_a)
```
**x, y , w, h** are the groud truth box center co-ordinates which has maxmimum iou with corresponding anchor, width and height. **x_a, y_a, h_a and w_a** and anchor boxes center cooridinates, width and height.

In [19]:
# Each anchor corresponds to a box

argmax_iou = np.argmax(ious, axis=1)
max_iou_box = bbox[argmax_iou].numpy()
print(max_iou_box.shape)
print(valid_anchors.shape)

(8940, 4)
(8940, 4)


In [20]:
anchor_loc_format_target = format_loc(valid_anchors, max_iou_box)
print(anchor_loc_format_target.shape)

(8940, 4)


### Final labels and locations

In [21]:
anchor_target_labels = np.empty((len(anchors),), dtype=np.int32)
anchor_target_format_locations = np.zeros((len(anchors), 4), dtype=np.float32)

anchor_target_labels.fill(-1)
anchor_target_labels[valid_index] = valid_labels

anchor_target_format_locations[valid_index] = anchor_loc_format_target

print(anchor_target_labels.shape)
print(anchor_target_format_locations.shape)

(22500,)
(22500, 4)


# Region Proposal Network

To generate region proposals, we **slide a small network over the convolutional feature map output** that we obtained in the feature extraction module. This small network takes as input an n x n spatial window of the input convolutional feature map. Each sliding window is mapped to a lower-dimensional feature [512 features]. **This feature is fed into two sibling fully connected layers**
- A box regrression layer
- A box classification layer

In [22]:
mid_channel = 512
in_channel = 512
n_anchor = 9

conv1 = nn.Conv2d(in_channel, mid_channel, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channel, n_anchor*4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channel, n_anchor*2, 1, 1, 0)

## Initialize weights and bias
The paper tells that they initialized these layers with zero mean and 0.01 standard deviation for weights and zeros for base.

In [23]:
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()

reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()

cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
x = conv1(output_map)
anchor_pred_format_locations = reg_layer(x)
anchor_pred_scores = cls_layer(x)

print(anchor_pred_format_locations.shape)
print(anchor_pred_scores.shape)

torch.Size([1, 36, 50, 50])
torch.Size([1, 18, 50, 50])


## Reformat the output

Lets **reformat these a bit and make it align with our anchor targets we designed previously**. We will also find the objectness scores for each anchor box, as this is used to for proposal layer which we will discuss in the next section

In [25]:
anchor_pred_format_locations = anchor_pred_format_locations.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(anchor_pred_format_locations.shape)

anchor_pred_scores = anchor_pred_scores.permute(0, 2, 3, 1).contiguous().view(1, -1, 2)
print(anchor_pred_scores.shape)

objectness_pred_scores = anchor_pred_scores[:, :, 1]
print(objectness_pred_scores.shape)

torch.Size([1, 22500, 4])
torch.Size([1, 22500, 2])
torch.Size([1, 22500])


## RPN Loss

![](./RPN-loss.png)

where `p_{i}` is the predicted class label and `p_{i}^*` is the actual class score. `t_{i}` and `t_{i}^*` are the predicted co-oridinates and actual co-ordinates. The ground-truth label `p_{i}^*` is 1 if the the anchor is positive and 0 if the anchor is negative. We will see how this is done in Pytorch.

In [26]:
print(anchor_target_labels.shape)
print(anchor_target_format_locations.shape)
print(anchor_pred_scores.shape)
print(anchor_pred_format_locations.shape)

(22500,)
(22500, 4)
torch.Size([1, 22500, 2])
torch.Size([1, 22500, 4])


In [27]:
gt_rpn_format_locs = torch.from_numpy(anchor_target_format_locations)
gt_rpn_scores = torch.from_numpy(anchor_target_labels)

rpn_format_locs = anchor_pred_format_locations[0]
rpn_scores = anchor_pred_scores[0]

### cls loss
pred_cls_scores and anchor_labels are the predited objectness score and actual objectness score of the RPN network. We will use the following loss functions for Regression and classification respectively.
For classification we use cross-entropy loss
![](./Cross_entropy-loss.png)
Cross Entropy Loss

In [28]:
rpn_cls_loss = F.cross_entropy(rpn_scores, gt_rpn_scores.long(), ignore_index=-1)
print(rpn_cls_loss)

tensor(0.6935, grad_fn=<NllLossBackward>)


### reg loss for positive target anchors
For Regression we use smooth L1 loss as defined in the Fast RCNN paper,
![](Smooth-L1-loss.png)
Smooth L1 Loss
They used L1 loss instead of L2 loss because the values of predicted regression head of RPN are not bounded. **Regression loss is also applied to the bounding boxes which have positive label**

In [29]:
mask = gt_rpn_scores > 0
mask_target_format_locs = gt_rpn_format_locs[mask]
mask_pred_format_locs = rpn_format_locs[mask]

print(mask_target_format_locs.shape)
print(mask_pred_format_locs.shape)

torch.Size([18, 4])
torch.Size([18, 4])


In [30]:
x = torch.abs(mask_target_format_locs - mask_pred_format_locs)
rpn_loc_loss = ((x<0.5).float()*(x**2)*0.5 + (x>0.5).float()*(x-0.5)).sum()
print(rpn_loc_loss)

tensor(1.1740, grad_fn=<SumBackward0>)


### RPN total loss

In [31]:
rpn_lambda = 10
N_reg = mask.float().sum()

rpn_loss = rpn_cls_loss + rpn_lambda / N_reg * rpn_loc_loss
print(rpn_loss)

tensor(1.3457, grad_fn=<AddBackward0>)


# Generating proposals to feed Fast R-CNN network

The proposal function will take the following parameters
- Weather training_mode or testing mode
- nms_thresh
- n_train_pre_nms — number of bboxes before nms during training
- n_train_post_nms — number of bboxes after nms during training
- n_test_pre_nms — number of bboxes before nms during testing
- n_test_post_nms — number of bboxes after nms during testing
- min_size — minimum height of the object required to create a proposal.

The Faster R_CNN says, RPN proposals highly overlap with each other. To reduced redundancy, we adopt non-maximum supression (NMS) on the proposal regions based on their cls scores. We fix the IoU threshold for NMS at 0.7, which leaves us about 2000 proposal regions per image. After an ablation study, the authors show that NMS does not harm the ultimate detection accuracy, but substantially reduces the number of proposals. After NMS, we use the top-N ranked proposal regions for detection. In the following we training Fast R-CNN using 2000 RPN proposals. During testing they evaluate only 300 proposals, they have tested this with various numbers and obtained this.

In [32]:
nms_thresh = 0.7
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

- convert the loc predictions from the rpn network to bbox [y1, x1, y2, x2] format.
- clip the predicted boxes to the image
- Remove predicted boxes with either height or width < threshold (min_size).
- Sort all (proposal, score) pairs by score from highest to lowest.
- Take top pre_nms_topN (e.g. 12000 while training and 300 while testing).
- Apply nms threshold > 0.7
- Take top pos_nms_topN (e.g. 2000 while training and 300 while testing)

## convert the loc predictions from the rpn net to [y1, x1, y2, x2] format


In [33]:
print(anchors.shape)
print(anchor_pred_format_locations.shape)

(22500, 4)
torch.Size([1, 22500, 4])


In [34]:
rois = deformat_loc(anchors=anchors, formatted_base_anchor=anchor_pred_format_locations[0].data.numpy())
print(rois.shape)
print(rois)

(22500, 4)
[[ -37.56205856  -83.65124834   55.51502551   96.9647187 ]
 [ -59.50866938  -56.68875009   64.91222143   72.23375052]
 [ -81.40298363  -41.99777969   96.39533509   49.35743635]
 ...
 [ 610.35422226  414.3952291   979.0893042  1163.98340092]
 [ 538.20066833  564.81064224 1041.29725647 1063.15491104]
 [ 432.48094419  606.7697889  1166.24708388  973.39356325]]


## clip the predicted boxes to the image

In [35]:
rois[:, 0:4:2] = np.clip(rois[:, 0:4:2], a_min=0, a_max=image_size[0])
rois[:, 1:4:2] = np.clip(rois[:, 1:4:2], a_min=0, a_max=image_size[1])
print(rois)

[[  0.           0.          55.51502551  96.9647187 ]
 [  0.           0.          64.91222143  72.23375052]
 [  0.           0.          96.39533509  49.35743635]
 ...
 [610.35422226 414.3952291  800.         800.        ]
 [538.20066833 564.81064224 800.         800.        ]
 [432.48094419 606.7697889  800.         800.        ]]


## Remove predicted boxes with either height or width < threshold

In [36]:
h = rois[:, 2] - rois[:, 0]
w = rois[:, 3] - rois[:, 1]

valid_index = np.where((h>min_size)&(w>min_size))[0]
valid_rois = rois[valid_index]
valid_scores = objectness_pred_scores[0][valid_index].data.numpy()

print(valid_rois.shape)
print(valid_scores.shape)

(22500, 4)
(22500,)


## sort all pairs by score from highest to lowest

In [37]:
valid_score_order = valid_scores.ravel().argsort()[::-1]
print(valid_score_order)

[433  16   1 ... 913 432   9]


## Take top pre_nms_topN

In [38]:
pre_train_valid_score_order = valid_score_order[:n_train_pre_nms]
pre_train_valid_rois = valid_rois[pre_train_valid_score_order]
pre_train_valid_scores = valid_scores[pre_train_valid_score_order]

print(pre_train_valid_rois.shape)
print(pre_train_valid_scores.shape)
print(pre_train_valid_score_order.shape)

(12000, 4)
(12000,)
(12000,)


## Apply NMS threshold

- Take all the roi boxes [roi_array]
- Find the areas of all the boxes [roi_area]
- Take the indexes of order the probability score in descending order [order_array]
keep = []
while order_array.size > 0:
  - take the first element in order_array and append that to keep  
  - Find the area with all other boxes
  - Find the index of all the boxes which have high overlap with this box
  - Remove them from order array
  - Iterate this till we get the order_size to zero (while loop)
- Ouput the keep variable which tells what indexes to consider.

In [39]:
keep_index = nms(rois=pre_train_valid_rois, scores=pre_train_valid_scores, nms_thresh=nms_thresh)
post_train_valid_rois = pre_train_valid_rois[keep_index][:n_train_post_nms]
post_train_valid_scores = pre_train_valid_scores[keep_index][:n_train_post_nms]
print(post_train_valid_rois.shape)
print(post_train_valid_scores.shape)

(2000, 4)
(2000,)


# Proposal targets

The Fast R-CNN network takes the region proposals (obtained from proposal layer in previous section), ground truth boxes and their respective labels as inputs. It will take the following parameters
- n_sample: Number of samples to sample from roi, The default value is 128.
- pos_ratio: the number of positive examples out of the n_samples. The default values is 0.25.
- pos_iou_thesh: The minimum overlap of region proposal with any groundtruth object to consider it as positive label.
- [neg_iou_threshold_lo, neg_iou_threshold_hi] : [0.0, 0.5], The overlap value bounding required to consider a region proposal as negitive [background object].

**This Step I think is used in the training process. In test step, we just use the proposal after NMS as the input of Fast RCNN network.**

In [40]:
n_sample = 128
pos_ratio = 0.25
pos_iou_thresh = 0.5
neg_iou_thresh_hi = 0.5
neg_iou_thresh_lo = 0.0

In [41]:
print(post_train_valid_rois.shape)
print(post_train_valid_scores.shape)

(2000, 4)
(2000,)


## Find the iou of each ground truth object with the region proposals

same as 2.3.4

In [42]:
ious = bbox_iou(post_train_valid_rois, bbox)
print(ious.shape)

(2000, 2)


## Assign labels for each proposal

same as 2.3.5

In [43]:
bbox_assignments = ious.argmax(axis=1)
roi_max_ious = ious.max(axis=1)
roi_target_labels = labels[bbox_assignments]
print(roi_target_labels.shape)

torch.Size([2000])


## Sample pos and neg samples

same as 2.3.6

- Select the foreground rois as per the pos_iou_thesh. We also want only n_sample x pos_ratio (128 x 0.25 = 32) foreground samples. So incase if we get less than 32 positive samples we will leave it as it is, Incase if we get more than 32 foreground samples, we will sample 32 samples from the positive samples. This is done using the following code.

In [44]:
total_n_pos = len(np.where(roi_max_ious >= pos_iou_thresh)[0])
n_pos_sample = n_sample*pos_ratio if total_n_pos > n_sample*pos_ratio else total_n_pos
n_neg_sample = n_sample - n_pos_sample

print(n_pos_sample)
print(n_neg_sample)

10
118


In [45]:
pos_index = np.where(roi_max_ious >= pos_iou_thresh)[0]
pos_index = np.random.choice(pos_index, size=n_pos_sample, replace=False)

neg_index = np.where((roi_max_ious < neg_iou_thresh_hi) & (roi_max_ious > neg_iou_thresh_lo))[0]
neg_index = np.random.choice(neg_index, size=n_neg_sample, replace=False)

print(pos_index.shape)
print(neg_index.shape)

(10,)
(118,)


## Gather positve samples index and negitive samples index, their respective labels and region proposals

In [46]:
keep_index = np.append(pos_index, neg_index)
post_sample_target_labels = roi_target_labels[keep_index].data.numpy()
post_sample_target_labels[len(pos_index):] = 0
post_sample_rois = post_train_valid_rois[keep_index]

## Pick the ground truth objects for these sample_roi and later parameterize

same as 2.3.7

In [47]:
post_sample_bbox = bbox[bbox_assignments[keep_index]]
post_sample_format_rois = format_loc(anchors=post_sample_rois, base_anchors=post_sample_bbox.data.numpy())
print(post_sample_format_rois.shape)

(128, 4)


# Fast R-CNN

Fast R-CNN used ROI pooling to extract features for each and every proposal suggested by selective search (Fast RCNN) or Region Proposal network (RPN in Faster R- CNN). 

Region of interest pooling (also known as RoI pooling) purpose is to perform max pooling on inputs of non-uniform sizes to obtain fixed-size feature maps (e.g. 7×7). This layer takes two inputs

- A fixed-size feature map obtained from a deep convolutional network with several convolutions and max-pooling layers
- An Nx5 matrix of representing a list of regions of interest, where N is the number of RoIs. The first column represents the image index and the remaining four are the co-ordinates of the top left and bottom right corners of the region.

What does the RoI pooling actually do? For every region of interest from the input list, it takes a section of the input feature map that corresponds to it and scales it to some pre-defined size (e.g., 7×7). The scaling is done by:
- Dividing the region proposal into equal-sized sections (the number of which is the same as the dimension of the output)
- Finding the largest value in each section
- Copying these max values to the output buffer

In [48]:
rois = torch.from_numpy(post_sample_rois).float()
print(rois.shape)
# roi_indices = torch.zeros((len(rois),1), dtype=torch.float32)
# print(rois.shape, roi_indices.shape)

# indices_and_rois = torch.cat([roi_indices, rois], dim=1)
# print(indices_and_rois.shape)

torch.Size([128, 4])


Now we need to pass this array to the roi_pooling layer. We will briefly discuss the workings of it here. The sudo code is as follows

- Multiply the dimensions of rois with the sub_sampling ratio (16 in this case)
- Empty output Tensor
- Take each roi
    - subset the **feature map** based on the roi dimension
    - Apply AdaptiveMaxPool2d to this subset Tensor.
    - Add the outputs to the output Tensor
- Empty output Tensor goes to the network

We will define the size to be 7 x 7 and define adaptive_max_pool

## ROI Pooling layer

In [49]:
size = (7, 7)
adaptive_max_pool = nn.AdaptiveMaxPool2d(size)

# correspond to feature map
rois.mul_(1/16.0)
rois = rois.long()

In [50]:
output = []
num_rois = len(rois)
for roi in rois:
    roi_feature = output_map[..., roi[0]:roi[2]+1, roi[1]:roi[3]+1]
    output.append(adaptive_max_pool(roi_feature))
output = torch.cat(output, 0)
print(output.shape)

torch.Size([128, 512, 7, 7])


In [51]:
output_ROI_pooling = output.view(output.size(0), -1)
print(output_ROI_pooling.shape)

torch.Size([128, 25088])


## Loc and Cls layer

In [52]:
roi_head = nn.Sequential(nn.Linear(25088, 4096),
                        nn.Linear(4096, 4096))

cls_loc = nn.Linear(4096, 21*4)
cls_loc.weight.data.normal_(0, 0.01)
cls_loc.bias.data.zero_()

cls_score = nn.Linear(4096, 21)
cls_score.weight.data.normal_(0, 0.01)
cls_score.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [53]:
x = roi_head(output_ROI_pooling)
roi_cls_loc = cls_loc(x)
roi_cls_score = cls_score(x)

print(roi_cls_loc.shape, roi_cls_score.shape)

torch.Size([128, 84]) torch.Size([128, 21])


## Fast R-CNN loss

- predicted

In [54]:
print(roi_cls_loc.shape)
print(roi_cls_score.shape)

torch.Size([128, 84])
torch.Size([128, 21])


- target

In [55]:
print(post_sample_format_rois.shape)
print(post_sample_target_labels.shape)

gt_roi_cls_loc = torch.from_numpy(post_sample_format_rois).float()
gt_roi_cls_label = torch.from_numpy(post_sample_target_labels).long()

(128, 4)
(128,)


### cls loss

In [57]:
roi_cls_loss = F.cross_entropy(roi_cls_score, gt_roi_cls_label)
print(roi_cls_loss)

tensor(3.0664, grad_fn=<NllLossBackward>)


### reg loss

In [58]:
num_roi = roi_cls_loc.size(0)
roi_cls_loc = roi_cls_loc.view(-1, 21, 4)
roi_cls_loc = roi_cls_loc[torch.arange(num_roi), gt_roi_cls_label]
print(roi_cls_loc.shape)

torch.Size([128, 4])


In [59]:
mask = gt_roi_cls_label>0
mask_loc_pred = roi_cls_loc[mask]
mask_loc_target = gt_roi_cls_loc[mask]

print(mask_loc_pred.shape)
print(mask_loc_target.shape)

torch.Size([10, 4])
torch.Size([10, 4])


In [60]:
x = torch.abs(mask_loc_pred-mask_loc_target)
roi_loc_loss = ((x<0.5).float()*x**2*0.5 + (x>0.5).float()*(x-0.5)).sum()
print(roi_loc_loss)

tensor(0.6413, grad_fn=<SumBackward0>)


### total loss

In [61]:
roi_lambda = 10
N_reg = (gt_roi_cls_label>0).float().sum()
roi_loss = roi_cls_loss + roi_lambda / N_reg * roi_loc_loss
print(roi_loss)

tensor(3.7076, grad_fn=<AddBackward0>)


# Total loss

In [62]:
total_loss = rpn_loss + roi_loss