# Feature Extraction <br>
The VGG16 network is used as a feature extractor. <br>

In [1]:
import torch
import torch.nn as nn
image = torch.zeros((1, 3, 800, 800)).float()

bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]]) # [y1, x1, y2, x2] format
labels = torch.LongTensor([6, 8]) # 0 represents background
sub_sample = 16

1. Create a dummy image and set the volatile to be False.

In [2]:
import torchvision
dummy_img = torch.zeros((1, 3, 800, 800)).float()
print(dummy_img)
#Out: torch.Size([1, 3, 800, 800])

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]])


2. List all the layers of the VGG16.

In [3]:
model = torchvision.models.vgg16(pretrained=True)
model_features = list(model.features)
print(model_features) # length is 15

[Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), ReLU(inplace), Conv2d(512, 512, kern

3. Pass the image through the layers and check where you are getting this size.

In [4]:
req_features = []
k = dummy_img.clone()
for fe in model_features:
    k = fe(k)
    if k.size()[2] < 800//16:
        break
    req_features.append(fe)
    out_channels = k.size()[1]
print(len(req_features)) #30
print(out_channels) # 512

30
512


4. Convert this list into a Sequential module.

In [5]:
faster_rcnn_fe_extractor = nn.Sequential(*req_features)

print(image.size())
out_map = faster_rcnn_fe_extractor(image)
print(out_map.size())

torch.Size([1, 3, 800, 800])
torch.Size([1, 512, 50, 50])


# Anchor Boxes

1. Begin with a an empty array filled with zero values.

In [6]:
import numpy as np
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), dtype=np.float32)

print(anchor_base)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


2. Fill these values with corresponding y1, x1, y2, x2 at each anchor_scale and ratios. 

In [7]:
ctr_y = sub_sample / 2.
ctr_x = sub_sample / 2.

print(ctr_y, ctr_x)

for i in range(len(ratios)):
    for j in range(len(anchor_scales)):
        h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
        w = sub_sample * anchor_scales[j] * np.sqrt(1./ ratios[i])

        index = i * len(anchor_scales) + j

        anchor_base[index, 0] = ctr_y - h / 2.
        anchor_base[index, 1] = ctr_x - w / 2.
        anchor_base[index, 2] = ctr_y + h / 2.
        anchor_base[index, 3] = ctr_x + w / 2.

8.0 8.0


3. Generate Anchor Center at all the feature map location.

In [8]:
fe_size = (800//16)
ctr_x = np.arange(16, (fe_size+1) * 16, 16)
ctr_y = np.arange(16, (fe_size+1) * 16, 16)

index = 0
ctr = np.zeros((len(ctr_x)*len(ctr_x), 2))
for x in range(len(ctr_x)):
    for y in range(len(ctr_y)):
        ctr[index, 1] = ctr_x[x] - 8
        ctr[index, 0] = ctr_y[y] - 8
        index +=1
print('Number of Anchor Centers = ', len(ctr_x)*len(ctr_x))

Number of Anchor Centers =  2500


4. Generate Anchor Boxes for each Anchor Center.

In [9]:
anchors = np.zeros((fe_size * fe_size * 9, 4))
index = 0
for c in ctr:
    ctr_y, ctr_x = c
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1./ ratios[i])
            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1
        
print(anchors.shape)

(22500, 4)


# Label Anchors 
<br>
Positive labels - anchors with highest iou with ground truth and anchors with iou higher than 0.7 with ground truth. <br>
Negative label - anchors with iou is lower than 0.3 for all ground truth boxes. <br>
No Label - not used during training. <br>

1. Example

In [10]:
bbox = np.asarray([[20, 30, 400, 500], [300, 400, 500, 600]], dtype=np.float32) # [y1, x1, y2, x2] format
labels = np.asarray([6, 8], dtype=np.int8) # 0 represents background

2. Find the indices of valid anchor boxes

In [11]:
inside_index = np.where(
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= 800) &
        (anchors[:, 3] <= 800)
    )[0]
print(inside_index.shape)

(8940,)


3. Create an empty label array of shape inside_index and fill with -1

In [12]:
label = np.empty((len(inside_index), ), dtype=np.int32)
label.fill(-1)
print(label.shape)

(8940,)


4. Create array with valid anchor boxes

In [13]:
valid_anchors = anchors[inside_index]
print(valid_anchors.shape)

(8940, 4)


5. Calculate iou with each ground truth object

In [14]:
ious = np.empty((len(valid_anchors), 2), dtype=np.float32)
ious.fill(0)
print(bbox)
for num1, i in enumerate(valid_anchors):
    ya1, xa1, ya2, xa2 = i  
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2- yb1) * (xb2 - xb1)
        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])
        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * \
(inter_x2 - inter_x1)
            iou = iter_area / \
(anchor_area+ box_area - iter_area)            
        else:
            iou = 0.
        ious[num1, num2] = iou
print(ious.shape)

[[ 20.  30. 400. 500.]
 [300. 400. 500. 600.]]
(8940, 2)


6. Calculate the highest iou for each ground truth box and its corresponding anchor box

In [15]:
gt_argmax_ious = ious.argmax(axis=0)
print(gt_argmax_ious)
gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
print(gt_max_ious)

[2262 5620]
[0.68130493 0.61035156]


7. Calculate the highest iou for each anchor box and its corresponding ground truth box

In [16]:
argmax_ious = ious.argmax(axis=1)
print(argmax_ious.shape)
print(argmax_ious)
max_ious = ious[np.arange(len(inside_index)), argmax_ious]
print(max_ious)

(8940,)
[0 0 0 ... 0 0 0]
[0.06811669 0.07083762 0.07083762 ... 0.         0.         0.        ]


8. Find anchor boxes with this mach_ious

In [17]:
gt_argmax_ious = np.where(ious == gt_max_ious)[0]
print(gt_argmax_ious)

[2262 2508 5620 5628 5636 5644 5866 5874 5882 5890 6112 6120 6128 6136
 6358 6366 6374 6382]


9. Assign negative labels (0) 

In [18]:
pos_iou_threshold  = 0.7
neg_iou_threshold = 0.3
label[max_ious < neg_iou_threshold] = 0

10. Assign positive labels (1)

In [19]:
label[gt_argmax_ious] = 1
label[max_ious >= pos_iou_threshold] = 1

# Train RPN
<br>
Each mini-batch arises from a single image that contains many positive and negitive example anchors, but this will bias towards negitive samples as they are dominate. Instead, we randomly sample 256 anchors in an image to compute the loss function of a mini-batch, where the sampled positive and negative anchors have a ratio of up to 1:1. If there are fewer than 128 positive samples in an image, we pad the mini-batch with negitive ones.

1. Define training variables

In [20]:
pos_ratio = 0.5
n_sample = 256
n_pos = pos_ratio * n_sample

2. Randomly sample positive and negative examples for minibatch

In [21]:
pos_index = np.where(label == 1)[0]
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
    label[disable_index] = -1
    
n_neg = n_sample * np.sum(label == 1)
neg_index = np.where(label == 0)[0]
if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace = False)
    label[disable_index] = -1

3. For each anchor box, find the groundtruth object which has max_iou

In [22]:
max_iou_bbox = bbox[argmax_ious]
print(max_iou_bbox)

[[ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 ...
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]]


4. Convert the y1, x1, y2, x2 format of valid anchor boxes and associated ground truth boxes with max iou to ctr_y, ctr_x , h, w format.

In [23]:
height = valid_anchors[:, 2] - valid_anchors[:, 0]
width = valid_anchors[:, 3] - valid_anchors[:, 1]
ctr_y = valid_anchors[:, 0] + 0.5 * height
ctr_x = valid_anchors[:, 1] + 0.5 * width
base_height = max_iou_bbox[:, 2] - max_iou_bbox[:, 0]
base_width = max_iou_bbox[:, 3] - max_iou_bbox[:, 1]
base_ctr_y = max_iou_bbox[:, 0] + 0.5 * base_height
base_ctr_x = max_iou_bbox[:, 1] + 0.5 * base_width

5. Use the above formulas to find the locations

In [24]:
eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)
anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(anchor_locs)

[[ 0.5855728   2.30914558  0.7415674   1.64727602]
 [ 0.49718446  2.30914558  0.7415674   1.64727602]
 [ 0.40879611  2.30914558  0.7415674   1.64727602]
 ...
 [-2.50801936 -5.29225232  0.7415674   1.64727602]
 [-2.59640771 -5.29225232  0.7415674   1.64727602]
 [-2.68479606 -5.29225232  0.7415674   1.64727602]]


6. Map anchor_locs to the original anchors using the inside_index variable. Fill the unvalid anchor boxes labels with -1 (ignore) and locations with 0

In [25]:
# Final Labels
anchor_labels = np.empty((len(anchors),), dtype=label.dtype)
anchor_labels.fill(-1)
anchor_labels[inside_index] = label
print(anchor_labels.shape)

# Final Locations
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[inside_index, :] = anchor_locs
print(anchor_locations.shape)


(22500,)
(22500, 4)


# Region Proposal Network
<br>
Faster_R-CNN is the first work to demonstrate generating region proposals using deep learning.

1. Implement this Architecture using n x n convolutional layer followed by two sibiling 1 x 1 convolutional layers

In [26]:
import torch.nn as nn
mid_channels = 512
in_channels = 512 # depends on the output feature map. in vgg 16 it is equal to 512
n_anchor = 9 # Number of anchors at each location
conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor *4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor *2, 1, 1, 0) ## I will be going to use softmax here. you can equally use sigmoid if u replace 2 with 1.

2. Initialize weights with 0 mean and 0.01 standard deviation

In [27]:
# conv sliding layer
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()
# Regression layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()
# classification layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

3. Send the outputs from feature extraction to the network and predict locations of objects with repect to the anchor and the objectness score assoiciated with it.

In [28]:
x = conv1(out_map) # out_map is obtained in section 1
pred_anchor_locs = reg_layer(x)
pred_cls_scores = cls_layer(x)
print(pred_cls_scores.shape, pred_anchor_locs.shape)

torch.Size([1, 18, 50, 50]) torch.Size([1, 36, 50, 50])


4. Reformat these to align with anchor targets and find objectness scores foe each anchor box. 

In [29]:
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)

pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
print(pred_cls_scores.shape)

objectness_score = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)

pred_cls_scores  = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)


torch.Size([1, 22500, 4])
torch.Size([1, 50, 50, 18])
torch.Size([1, 22500])
torch.Size([1, 22500, 2])


# Generating Proposals to feed to Fast RCNN Network
<br>
The Faster R_CNN says, RPN proposals highly overlap with each other. To reduced redundancy, we adopt non-maximum supression (NMS) on the proposal regions based on their cls scores. We fix the IoU threshold for NMS at 0.7, which leaves us about 2000 proposal regions per image. After an ablation study, the authors show that NMS does not harm the ultimate detection accuracy, but substantially reduces the number of proposals. After NMS, we use the top-N ranked proposal regions for detection. In the following we training Fast R-CNN using 2000 RPN proposals. During testing they evaluate only 300 proposals, they have tested this with various numbers and obtained this.



1. Define parameters

In [30]:
nms_thresh = 0.7
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

2. Convert anchors format from y1, x1, y2, x2 to ctr_x, ctr_y, h, w


In [31]:
anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

3. Convert predictions locs using above formulas. before that convert the pred_anchor_locs and objectness_score to numpy array

In [32]:
pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
objectness_score_numpy = objectness_score[0].data.numpy()
dy = pred_anchor_locs_numpy[:, 0::4]
dx = pred_anchor_locs_numpy[:, 1::4]
dh = pred_anchor_locs_numpy[:, 2::4]
dw = pred_anchor_locs_numpy[:, 3::4]
ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

4. Convert [ctr_x, ctr_y, h, w] to [y1, x1, y2, x2] format

In [33]:
roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=pred_anchor_locs_numpy.dtype)
roi[:, 0::4] = ctr_y - 0.5 * h
roi[:, 1::4] = ctr_x - 0.5 * w
roi[:, 2::4] = ctr_y + 0.5 * h
roi[:, 3::4] = ctr_x + 0.5 * w
print(roi)

[[ -37.62494   -86.46493    56.954086   97.103065]
 [ -86.02383  -175.29193    99.87668   190.44334 ]
 [-182.82507  -374.5908    180.3802    347.58655 ]
 ...
 [ 699.35626   745.096     881.7243    838.27606 ]
 [ 606.1716    702.7576    975.08966   881.8441  ]
 [ 421.9329    616.1573   1160.1234    985.53595 ]]


5. Clip the predicted boxes to the image

In [34]:
img_size = (800, 800) #Image size
roi[:, slice(0, 4, 2)] = np.clip(
            roi[:, slice(0, 4, 2)], 0, img_size[0])
roi[:, slice(1, 4, 2)] = np.clip(
    roi[:, slice(1, 4, 2)], 0, img_size[1])
print(roi)

[[  0.         0.        56.954086  97.103065]
 [  0.         0.        99.87668  190.44334 ]
 [  0.         0.       180.3802   347.58655 ]
 ...
 [699.35626  745.096    800.       800.      ]
 [606.1716   702.7576   800.       800.      ]
 [421.9329   616.1573   800.       800.      ]]


6. Remove predicted boxes with either height or width < threshold

In [35]:
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :]
score = objectness_score_numpy[keep]
print(score.shape)

(22500,)


7. Sort all (proposal, score) pairs by score from highest to lowest.

In [36]:
order = score.ravel().argsort()[::-1]
print(order)

[ 15 438  24 ...  36  18 432]


8. Take top pre_nms_topN (e.g. 12000 while training and 300 while testing)

In [37]:
order = order[:n_train_pre_nms]
roi = roi[order, :]
print(roi.shape)
print(roi)

(12000, 4)
[[  0.         0.       110.082085  54.580402]
 [682.35077    0.       800.        53.140636]
 [  0.         0.       127.03026   53.721226]
 ...
 [ 41.77056  646.89075  169.2557   775.26556 ]
 [649.7592   630.9143   777.1611   759.20685 ]
 [649.7854   646.88855  777.2428   775.25543 ]]


9. Apply non-maximum supression threshold > 0.7 

In [38]:
y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]
area = (x2 - x1 + 1) * (y2 - y1 + 1)
order = score.argsort()[::-1]
keep = []
while (order.size > 0):
    i = order[0]
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (areas[i] + areas[order[1:]] - inter)
    inds = np.where(ovr <= thresh)[0]
    order = order[inds + 1]
keep = keep[:n_train_post_nms] # while training/testing , use accordingly
roi = roi[keep] # the final region proposals

IndexError: index 22059 is out of bounds for axis 0 with size 12000