In [1]:
import numpy as np
import tensorflow as tf

## Anchor Generation Layer

In [2]:
def _extract_cor(anchor):
    """
    Return width, height, x center, and y center for an anchor (window).
    """
    # anchor is 1X4
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5*(w-1)
    return w,h,x_ctr,y_ctr

In [3]:
def _make_anchors(ws,hs,x_ctr,y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center (x_ctr,y_ctr),
    Return a set of anchor boxes in (w, h, x_ctr, y_ctr) format.
    """
    # ws,hs,x_ctr,y_ctr are numpy arrays
    w = ws[:,np.newaxis] # [1,4] -> [4,1] i.e it generate array of one more dimension
    # print('w in mkanchors: ',w)
    h = hs[:,np.newaxis] # [1,4] -> [4,1]
    # print('h in mkanchors: ',h)
    anchors = np.hstack((x_ctr - 0.5*(w-1),
                                            y_ctr - 0.5*(h-1),
                                            x_ctr + 0.5*(w-1),
                                            y_ctr + 0.5*(h-1))) # horizontal stack
    # print('anchors in mkanchors: ',anchors)
    return anchors

In [4]:
def _ratio_enum(anchor,ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    """
    
    w,h,x_ctr,y_ctr = _extract_cor(anchor)
    # print(w,h,x_ctr,y_ctr)
    size = w*h
    # print('size='+str(size))
    size_ratios = size/ratios
    # print('size_ratios='+str(size_ratios))
    ws = np.round(np.sqrt(size_ratios))
    # print('ws='+str(ws))
    hs = np.round(ws*ratios)
    # print('hs='+str(hs))
    anchors = _make_anchors(ws,hs,x_ctr,y_ctr)
    # print('anchors in ratio_enu='+str(anchors))
    return anchors

In [5]:
def _scale_enum(anchor,scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _extract_cor(anchor)
    # print("anchor in scle_enum="+str(anchor))
    ws = w*scales
    hs = h*scales
    anchors = _make_anchors(ws,hs,x_ctr,y_ctr)
    # print("Final anchors in scale_enum="+str(anchors))
    return anchors

In [6]:
#  It generates 9 anchor boxes from a base anchor box

def generate_anchors(base_size=16,ratios=[0.5,1,2],scales=np.array([8,16,32])):
    base_anchor = np.array([1,1,base_size,base_size]) - 1
    # print(base_anchor)
    ratio_anchors = _ratio_enum(base_anchor,ratios)
    # print(ratio_anchors.shape)
    anchors_list=[]
    for i in range(ratio_anchors.shape[0]):
        anc = _scale_enum(ratio_anchors[i,:],scales)
        anchors_list.append(anc)
    anchors = np.vstack(anchors_list)
    return anchors

In [7]:
generate_anchors()

array([[ -84. ,  -34.5,   99. ,   60.5],
       [-176. ,  -82.5,  191. ,  108.5],
       [-360. , -178.5,  375. ,  204.5],
       [ -56. ,  -56. ,   71. ,   71. ],
       [-120. , -120. ,  135. ,  135. ],
       [-248. , -248. ,  263. ,  263. ],
       [ -36. ,  -85.5,   51. ,   89.5],
       [ -80. , -173.5,   95. ,  177.5],
       [-168. , -349.5,  183. ,  353.5]])

In [8]:
# create uniformly spaced grid with spacing equal to stride

def generate_anchors_pre_tf(height, width, feat_stride=16, anchor_scales=(8,16,32), anchor_ratios=(0.5,1,2)):
    """
    A wrapper function to generate anchors given different scales and
    ratios.
    """

    shift_x = tf.range(width) * feat_stride # [0,16,32,48] width
    shift_y = tf.range(height) * feat_stride # [0,16,32,48] height
    shift_x, shift_y = tf.meshgrid(shift_x, shift_y) # meshgrid cols, rows , meshgrid generates a grid of points in ND space
    # meshgrid enumerate shift_x row wise and shift_y col wise
    shift_x = tf.reshape(shift_x, shape=(-1,)) # reshape to 1D
    shift_y = tf.reshape(shift_y, shape=(-1,))
    shifts = tf.stack((shift_x, shift_y, shift_x, shift_y), axis=1) # vertical stack by row
    K = tf.multiply(width, height)
    shifts = tf.transpose(tf.reshape(shifts,shape=[1,K,4]),perm=[1,0,2]) # reshaping into Kx1x4

    anchors = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales)) # basic 9 anchor boxes of shape (9,4)
    A = anchors.shape[0] # 9
    anchor_constants = tf.constant(anchors.reshape((1, A, 4)), dtype=tf.int32) # reshape to 1x9x4
    
    length = K*A
    anchors_tf = tf.reshape(tf.add(anchor_constants,shifts),shape=[length,4]) # add shift to anchors element wise
    return tf.cast(anchors_tf,tf.float32),length

In [9]:
tensor_anchors, length = generate_anchors_pre_tf(height=600//16,width=800//16)
print("type of tensor_anchors: ",type(tensor_anchors))
print("tensor_anchors shape",tensor_anchors.shape)
print("length="+str(length))

type of tensor_anchors:  <class 'tensorflow.python.framework.ops.EagerTensor'>
tensor_anchors shape (16650, 4)
length=tf.Tensor(16650, shape=(), dtype=int32)


## For Bounding box regression coefficients

In [10]:
import numpy as np
import tensorflow as tf

In [11]:
# calculating bounding box regression coefficients
def bbox_transform(original_rois,gt_rois):
    original_widths = original_rois[:,2] - original_rois[:,0] + 1.0
    original_heights = original_rois[:,3] - original_rois[:,1] + 1.0
    original_ctr_x = original_rois[:,0] + 0.5 * original_widths
    original_ctr_y = original_rois[:,1] + 0.5 * original_heights

    gt_widths = gt_rois[:,2] - gt_rois[:,0] + 1.0
    gt_heights = gt_rois[:,3] - gt_rois[:,1] + 1.0
    gt_ctr_x = gt_rois[:,0] + 0.5 * gt_widths
    gt_ctr_y = gt_rois[:,1] + 0.5 * gt_heights

    targets_dx = (gt_ctr_x - original_ctr_x) / original_widths
    targets_dy = (gt_ctr_y - original_ctr_y) / original_heights
    targets_dw = np.log(gt_widths / original_widths)
    targets_dh = np.log(gt_heights / original_heights)

    targets = np.vstack((targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
    return targets

In [12]:
bbox_transform(np.array([[-1,-1,15,15],[-1,-1,31,31]]),np.array([[0,0,16,16],[0,0,32,32]]))

array([[0.05882353, 0.05882353, 0.        , 0.        ],
       [0.03030303, 0.03030303, 0.        , 0.        ]])

In [13]:
def bbox_transform_inv_tf(boxes, deltas):
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
    
    boxes = tf.cast(boxes, deltas.dtype)
    Original_widths = boxes[:, 2] - boxes[:, 0] + 1.0
    Original_heights = boxes[:, 3] - boxes[:, 1] + 1.0
    Original_ctr_x = boxes[:, 0] + 0.5 * Original_widths
    Original_ctr_y = boxes[:, 1] + 0.5 * Original_heights

    targets_dx = deltas[:, 0::4]
    targets_dy = deltas[:, 1::4]
    targets_dw = deltas[:, 2::4]
    targets_dh = deltas[:, 3::4]

    pred_ctr_x = tf.add(tf.multiply(targets_dx, Original_widths), Original_ctr_x)
    pred_ctr_y = tf.add(tf.multiply(targets_dy, Original_heights), Original_ctr_y)
    pred_w = tf.multiply(tf.exp(targets_dw), Original_widths)
    pred_h = tf.multiply(tf.exp(targets_dh), Original_heights)

    pred_boxes0 = tf.subtract(pred_ctr_x,pred_w*0.5)
    pred_boxes1 = tf.subtract(pred_ctr_y,pred_h*0.5)
    pred_boxes2 = tf.add(pred_ctr_x,pred_w*0.5)
    pred_boxes3 = tf.add(pred_ctr_y,pred_h*0.5)

    predicted_boxes = tf.stack([pred_boxes0,pred_boxes1,pred_boxes2,pred_boxes3],axis=1)
    return predicted_boxes

In [14]:
def clip_boxes_tf(boxes, im_info):
    """
    Clip boxes to image boundaries.
    boxes: [N, 4* num_classes]
    im_info: [image_height, image_width, scale_ratios]
    """
    # x1 >= 0
    boxes[:, 0::4] = tf.maximum(tf.minimum(boxes[:, 0::4], im_info[1] - 1), 0)
    # y1 >= 0
    boxes[:, 1::4] = tf.maximum(tf.minimum(boxes[:, 1::4], im_info[0] - 1), 0)
    # x2 < im_info[1]
    boxes[:, 2::4] = tf.maximum(tf.minimum(boxes[:, 2::4], im_info[1] - 1), 0)
    # y2 < im_info[0]
    boxes[:, 3::4] = tf.maximum(tf.minimum(boxes[:, 3::4], im_info[0] - 1), 0)
    return boxes

## NMS - Non-Maximum Suppression

## Head Network

In [15]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model

In [16]:
head_network = VGG16(weights='imagenet', include_top=False, input_shape=(600,800,3))

In [17]:
# top 18 layers of vgg16 for head network
head_network = Model(inputs=head_network.input, outputs=head_network.get_layer('block5_conv3').output)

In [18]:
head_network.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 600, 800, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 600, 800, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 600, 800, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 300, 400, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 300, 400, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 300, 400, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 150, 200, 128)     0     

## Proposal Layer

In [19]:
import tensorflow as tf

In [20]:
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, anchors, num_anchors):
    pre_nms_topN = 12000
    post_nms_topN = 2000
    nms_thresh = 0.7

    scores = rpn_cls_prob[:, :, :, num_anchors:]
    scores = tf.reshape(scores, shape=(-1,))
    rpn_bbox_pred = tf.reshape(rpn_bbox_pred, shape=(-1, 4))

    proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred)
    proposals = clip_boxes_tf(proposals, im_info[:2])

    indices = tf.image.non_max_suppression(proposals, scores, max_output_size=post_nms_topN, iou_threshold=nms_thresh)
    boxes = tf.gather(proposals, indices)
    boxes = tf.to_float(boxes)
    scores = tf.gather(scores, indices)
    scores = tf.reshape(scores, shape=(-1, 1))

    batch_inds = tf.zeros((tf.shape(indices)[0],1), dtype=tf.float32)
    rois = tf.concat([batch_inds, boxes], axis=1)

    return rois, scores

### RPN network

In [21]:
import tf_slim as slim

initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)


In [22]:
def _region_proposal(net_conv,is_training,initializer):
    rpn = slim.conv2d(net_conv,512,[3,3],trainable=is_training,weights_initializer=initializer,scope='rpn_conv/3x3')
    rpn_cls_score = slim.conv2d(rpn,num_anchors*2,[1,1],trainable=is_training,weights_initializer=initializer,
                                padding='VALID',activation_fn=None,scope='rpn_cls_score')
    rpn_bbox_pred = slim.conv2d(rpn,*4,[1,1],trainable=is_training,weights_initializer=initializer,
                                padding='VALID',activation_fn=None,scope='rpn_bbox_pred')
    rpn_cls_prob = tf.nn.softmax(rpn_cls_score)
    rpn_cls_prob = tf.reshape(rpn_cls_prob,[-1,2])
    rpn_bbox_pred = tf.reshape(rpn_bbox_pred,[-1,4])
    return rpn_cls_prob,rpn_bbox_pred

### Anchor Target Layer

In [23]:
import numpy as np
import numpy.random as npr

In [24]:
def bbox_overlaps(boxes,query_boxes):
    """
    Parameters:
    ----------
    boxes: numpy array (N,4)
    query_boxes: numpy array (K,4)
    Returns:
    -------
    overlaps: numpy array (N,K)
    """
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    overlaps = np.zeros((N,K),dtype=np.float)
    for k in range(K):
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    ua = float((boxes[n, 2] - boxes[n, 0] + 1) *
                           (boxes[n, 3] - boxes[n, 1] + 1) + box_area - iw * ih)
                    overlaps[n, k] = iw * ih / ua
    return overlaps

In [25]:
## call bbox_overlaps
boxes = np.array([[0,0,1,1],[1,1,2,2],[3,3,4,4]])
query_boxes = np.array([[0,0,1,1],[1,1,2,2],[3,3,4,4],[0,0,1,1]])
overlaps = bbox_overlaps(boxes,query_boxes)
print(overlaps)
argmax_overlaps = overlaps.argmax(axis=0)
print(argmax_overlaps)

[[1.         0.14285714 0.         1.        ]
 [0.14285714 1.         0.         0.14285714]
 [0.         0.         1.         0.        ]]
[0 1 2 0]


In [26]:
RPN_NEGATIVE_OVERLAP = 0.3
RPN_POSITIVE_OVERLAP = 0.7
RPN_BATCHSIZE = 256
RPN_FG_FRACTION = 0.5
RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)

In [27]:
def anchor_target_layer(rpn_cls_score,gt_boxes,im_info,_feat_stride,all_anchors,num_anchors):
    A = num_anchors
    total_anchors = all_anchors.shape[0]
    K = total_anchors / num_anchors

    # allow boxes to sit over the edge by a small amount
    _allowed_border = 0

    # map of shape (..., H, W)
    height, width = rpn_cls_score.shape[1:3]

    # only keep anchors inside the image
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
    )[0]

    # keep only inside anchors
    anchors = all_anchors[inds_inside, :]

    # label: 1 is positive, 0 is negative, -1 is dont care
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)

    # overlaps between the anchors and the gt boxes
    # overlaps (ex, gt)
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float),
        np.ascontiguousarray(gt_boxes, dtype=np.float))
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                                 np.arange(overlaps.shape[1])]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    labels[max_overlaps < RPN_NEGATIVE_OVERLAP] = 0
    labels[gt_argmax_overlaps] = 1
    labels[max_overlaps >= RPN_POSITIVE_OVERLAP] = 1

    # subsample positive labels if we have too many
    num_fg = int(RPN_FG_FRACTION * RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1
    
    # subsample negative labels if we have too many
    num_bg = RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1
    
    bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(RPN_BBOX_INSIDE_WEIGHTS)

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    
    bbox_outside_weights[labels == 1, :] = 1.0
    bbox_outside_weights[labels == 0, :] = 1.0

    # map up to original set of anchors
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    rpn_labels = labels

    # bbox_targets
    bbox_targets = bbox_targets.reshape((1, height, width, A * 4))
    
    rpn_bbox_targets = bbox_targets

    # bbox_inside_weights
    bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights

    # bbox_outside_weights
    bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights

In [28]:
def _unmap(data, count, inds, fill=0):
    """ Unmap a subset of item (data) back to the original set of items (of size count) """
    if len(data.shape) == 1:
        ret = np.empty((count,), dtype=np.float32)
        ret.fill(fill)
        ret[inds] = data
    else:
        ret = np.empty((count,) + data.shape[1:], dtype=np.float32)
        ret.fill(fill)
        ret[inds, :] = data
    return ret

In [29]:
def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image."""

    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)

In [30]:
# call _compute_targets()
anchors = np.array([[0, 0, 10, 10],[0, 0, 20, 20],[0, 0, 30, 30]])
gt_boxes = np.array([[0, 0, 10, 10, 1],[0, 0, 20, 20, 1],[0, 0, 30, 30, 1]])
_compute_targets(anchors, gt_boxes)

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]], dtype=float32)

### Proposal Target Layer

In [31]:
BATCH_SIZE = 128
FG_FRACTION = 0.25
BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
BG_THRESH_HI = 0.5
BG_THRESH_LO = 0.1
FG_THRESH = 0.5

In [32]:
def proposal_target_layer(rpn_rois,rpn_score,gt_boxes,_num_classes):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.
    """

    # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
    all_rois = rpn_rois
    all_scores = rpn_score

    # Include ground-truth boxes in the set of candidate rois
    zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
    all_rois = np.vstack(
        (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
    )
    all_scores = np.vstack((all_scores, zeros))

    num_images = 1
    rois_per_image = BATCH_SIZE / num_images
    fg_rois_per_image = np.round(FG_FRACTION * rois_per_image)

    # Sample rois with classification labels and bounding box regression targets
    labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
        all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, _num_classes)
    
    rois = rois.reshape(-1, 5)
    roi_scores = roi_scores.reshape(-1)
    labels = labels.reshape(-1, 1)
    bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
    bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
    bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

    return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights

In [33]:
def _get_bbox_regression_labels(bbox_target_data,num_classes):
    """Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th)

    This function expands those targets into the 4-of-4*K representation used
    by the network (i.e. only one class has non-zero targets).

    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """

    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    inds = np.where(clss > 0)[0]
    for ind in inds:
        cls = clss[ind]
        start = int(4 * cls)
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = BBOX_INSIDE_WEIGHTS
    return bbox_targets, bbox_inside_weights

In [34]:
def _compute_targets_PTL(ex_rois,gt_rois,labels):
    """Compute bounding-box regression targets for an image."""

    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    targets = bbox_transform(ex_rois, gt_rois).astype(np.float32, copy=False)
    targets = np.hstack((labels[:, np.newaxis], targets)).astype(np.float32, copy=False)

    return targets

In [35]:
# call _get_bbox_regression_labels function
bbox_target_data = np.array([[1,0,0,10,10],[2,0,0,20,20],[2,0,0,30,30]])
num_classes = 4
_get_bbox_regression_labels(bbox_target_data,num_classes)

(array([[ 0.,  0.,  0.,  0.,  0.,  0., 10., 10.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 20., 20.,  0.,
          0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 30., 30.,  0.,
          0.,  0.,  0.]], dtype=float32),
 array([[0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0.]],
       dtype=float32))

In [36]:
def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """ Generate a random sample of RoIs comprising foreground and background examples
    """
    # overlaps: (rois x gt_boxes)
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:,1:5],dtype=np.float),
        np.ascontiguousarray(gt_boxes[:,:4],dtype=np.float))
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= FG_THRESH)[0]
    # Select background RoIs as those within [BG_THRESH_LO,GB_THRESH_HI)
    bg_inds = np.where((max_overlaps < BG_THRESH_HI) &
                          (max_overlaps >= BG_THRESH_LO))[0]

    # Ensure that a fixed number of regions are sampled
    if fg_inds.size > 0 and bg_inds.size > 0:
        fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
        fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
        bg_rois_per_image = rois_per_image - fg_rois_per_image
        to_replace = bg_inds.size < bg_rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
    elif fg_inds > 0:
        to_replace = fg_inds.size < rois_per_image
        fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = rois_per_image
    elif bg_inds > 0:
        to_repace = bg_inds.size < rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = 0
    
    # the indices that we are selecting (both fg and bg)
    keep_inds = np.append(fg_inds,bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[int(fg_rois_per_image):] = 0
    rois = all_rois[keep_inds]
    rois_scores = all_scores[keep_inds]

    bbox_target_data = _compute_targets_PTL(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4],labels)
    
    bbox_targets, bbox_inside_weights = _get_bbox_regression_labels(bbox_target_data,num_classes)

    return labels, rois, rois_scores, bbox_targets, bbox_inside_weights

### Crop Pooling Layer
will implement this layer directly in Network.

### Visualization

In [37]:
from six.moves import range
import PIL.Image as Image
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
import PIL.ImageColor as ImageColor

In [38]:
STANDARD_COLORS = [
    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
    'WhiteSmoke', 'Yellow', 'YellowGreen'
]

NUM_COLORS = len(STANDARD_COLORS)

try:
    FONT = ImageFont.truetype('arial.ttf', 24)
except IOError:
    FONT = ImageFont.load_default()

In [39]:
def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, font, color='black', thickness=4):
    draw = ImageDraw.Draw(image)
    (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
    draw.line([(left, top), (left, bottom), (right, bottom),
                (right, top), (left, top)], width=thickness, fill=color)
    text_bottom = bottom
    # Reverse list and print from bottom to top.
    text_width, text_height = font.getsize(display_str)
    margin = np.ceil(0.05 * text_height)
    draw.rectangle([(left, text_bottom - text_height - 2 * margin), (left + text_width, text_bottom)], fill=color)
    draw.text((left + margin, text_bottom - text_height - margin), display_str, fill='black', font=font)
    return image

In [40]:
def draw_bounding_boxes(image,gt_boxes,im_info):
    num_boxes = gt_boxes.shape[0]
    gt_boxes_new = gt_boxes.copy()
    gt_boxes_new[:,:4] = np.round(gt_boxes_new[:,:4]/copy() / im_info[2])
    disp_image = Image.fromarray(np.uint8(image[0]))

    for i in range(num_boxes):
        this_class = int(gt_boxes_new[i,4])
        disp_image = _draw_single_box(disp_image,gt_boxes_new[i,0],gt_boxes_new[i,1],gt_boxes_new[i,2],gt_boxes_new[i,3],'N%02d-C%02d'%(i,this_class),FONT,color=STANDARD_COLORS[this_class%NUM_COLORS])
        image[0,:] = np.array(disp_image)
        return image

### Network

In [41]:
class Network(object):
    def __init__(self):
        self._predictions = {}
        self._losses = {}
        self._anchor_targets = {}
        self._proposal_targets={}
        self._layers = {}
        self._gt_image = None
        self._act_summaries = []
        self._score_summaries = {}
        self._train_summaries = []
        self._event_summaries = {}
        self._variables_to_fix = {}

    def _add_gt_image(self):
        # add back mean
        image - self._image + np.array([[[102.9801, 115.9465, 122.7717]]])
        # BGR to RGB
        resize = tf.image.resize(resized,self._im_info[:2],method=tf.image.ResizeMethod.BILINEAR)
        self._gt_image = tf.reverse(resize,axis=[-1])
    
    def _add_gt_image_summary(self):
        # use a customized visualization function to visualize the boxes
        if self._gt_image is None:
            self._add_gt_image()
        image = tf.py_func(draw_bounding_boxes,
                            [self._gt_image, self._gt_boxes, self._im_info],
                            tf.float32, name="gt_boxes")
        return tf.summary.image('GROUND_TRUTH', image)

    def _add_act_summary(self,tensor):
        tf.summary.histogram('ACT/' + tensor.op.name + '/activations', tensor)
        tf.summary.scalar('ACT/' + tensor.op.name + '/zero_fraction',tf.nn.zero_fraction(tensor))
    
    def _add_score_summary(self,key,tensor):
        tf.summary.histogram('SCORE/' + tensor.op.name + '/' + key + '/scores', tensor)
    
    def _add_train_summary(self,var):
        tf.summary.histogram('TRAIN/' + var.op.name, var)
    
    def _reshape_layer(self,bottom,num_dim, name):
        input_shape = tf.shape(bottom)
        with tf.variable_scope(name) as scope:
            # change the channel to the caffe format
            to_caffe = tf.transpose(bottom, [0, 3, 1, 2]) # NCHW where N is batch size, C is channel, H is height, W is width
            # then force it to have channel 2
            reshaped = tf.reshape(to_caffe, tf.concat(axis=0, values =[[1, num_dim, -1], [input_shape[2]]]))
            # then swap the channel back
            to_tf = tf.transpose(reshaped, [0, 2, 3, 1])
            return to_tf

    def _softmax_layer(self,bottom,name):
        if name.startswith('rpn_cls_prob_reshape'):
            input_shape = tf.shape(bottom)
            bottom_reshaped = tf.reshape(bottom, [-1, input_shape[-1]])
            reshaped_score = tf.nn.softmax(bottom_reshaped, name=name)
            return tf.reshape(reshaped_score, input_shape)
        return tf.nn.softmax(bottom, name=name)

    def _proposal_layer(self, rpn_cls_prob, rpn_bbox_pred, name):
        with tf.name_scope(name) as scope:
            rois, rpn_scores = proposal_layer(rpn_cls_prob, rpn_bbox_pred, self.im_info, self._feat_stride, self._anchors, self._num_anchors)
            rois.set_shape([None, 5])
            rpn_scores.set_shape([None, 1])
        return rois, rpn_scores

    def _crop_pool_layer(self, bottom, rois, name):
        with tf.name_scope(name) as scope:
            batch_ids = tf.squeeze(tf.slice(rois, [0,0], [-1,1], name="batch_id"), [1])
            # Get the normalized coordinates of bounding boxes
            bottom_shape = tf.shape(bottom)
            height = (tf.cast(bottom_shape[1], tf.float32) - 1.) * np.float32(self._feat_stride[0])
            width = (tf.cast(bottom_shape[2], tf.float32) - 1.) * np.float32(self._feat_stride[0])
            x1 = tf.slice(rois, [0,1], [-1,1], name="x1") / width
            y1 = tf.slice(rois, [0,2], [-1,1], name="y1") / height
            x2 = tf.slice(rois, [0,3], [-1,1], name="x2") / width
            y2 = tf.slice(rois, [0,4], [-1,1], name="y2") / height
            bboxes = tf.stop_gradient(tf.concat([y1,x1,y2,x2], axis=1))
            pre_pool_size = 7 * 2
            crops = tf.image.crop_and_resize(bottom, bboxes, tf.cast(batch_ids,tf.int32), [pre_pool_size, pre_pool_size], name="crops")
        return slim.max_pool2d(crops, [2, 2], padding='SAME')

        def _anchor_target_layer(self, rpn_cls_score, name):
            with tf.name_scope(name) as scope:
                rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = tf.py_func(_anchor_target_layer,
                [rpn_cls_score, self._gt_boxes, self._im_info, self._feat_stride, self._anchors, self._num_anchors],
                [tf.float32, tf.float32, tf.float32, tf.float32], name = "anchor_target")

                rpn_labels.set_shape([1, 1, None, None])
                rpn_bbox_targets.set_shape([1, None, None, self._num_anchors * 4])
                rpn_bbox_inside_weights.set_shape([1, None, None, self._num_anchors*4])
                rpn_bbox_outside_weights.set_shape([1, None, None, self._num_anchors*4])

                rpn_labels = tf.cast(rpn_labels,tf.int32, name = 'to_int32')
                self._anchor_targets['rpn_labels'] = rpn_labels
                self._anchor_targets['rpn_bbox_targets'] = rpn_bbox_targets
                self._anchor_targets['rpn_bbox_inside_weights'] = rpn_bbox_inside_weights
                self._anchor_targets['rpn_bbox_outside_weights'] = rpn_bbox_outside_weights

                self._score_summaries.update(self._anchor_targets)
            return rpn_labels

        def _proposal_target_layer(self, rois, roi_scores, name):
            with tf.name_scope(name) as scope:
                rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = tf.py_func(proposal_target_layer,
                [rois, roi_scores, self._gt_boxes, self._num_classes],
                [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32],
                name = "proposal_target")

                rois.set_shape([128,5])
                roi_scores.set_shape([128])
                labels.set_shape([128,1])
                bbox_targets.set_shape([128,self._num_classes*4])
                bbox_inside_weights.set_shape([128,self._num_classes*4])
                bbox_outside_weights.set_shape([128,self._num_classes*4])

                self._proposal_targets['rois'] = rois
                self._proposal_targets['labels'] = tf.cast(labels,tf.int32)
                self._proposal_targets['bbox_targets'] = bbox_targets
                self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights
                self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights

                self._score_summaries.update(self._proposal_targets)
            return rois, roi_scores
        
        def _anchor_component(self):
            with tf.name_scope('ANCHOR_'+self.tag) as scope:
                # just to get the shape right
                height = tf.cast((tf.math.ceil(self.im_info[0] / np.float32(self._feat_stride[0]))),tf.int32)
                width = tf.cast((tf.math.ceil(self.im_info[1] / np.float32(self._feat_stride[0]))),tf.int32)
                anchors, anchor_length = generate_anchors_pre_tf(height, width, self._feat_stride, self._anchor_scales, self._anchor_ratios)
                anchors.set_shape([None, 4])
                anchor_length.set_shape([])
                self._anchors = anchors
                self._anchor_length = anchor_length
        
        def _build_network(self,is_training=True):
            initializer = tf.random.truncated_normal(0.0, 0.01)
            initializer_bbox = tf.random.truncated_normal(0.0, 0.001)

            new_conv = self._image_to_head(is_training)
            with tf.name_scope(self._scope, self._scope):
                self._anchor_component()
                rois = self._region_proposal(net_conv,is_training,initializer)
                pool5 = self._crop_pool_layer(net_conv, rois, "pool5")
            
            fc7 = self._head_to_tail(pool5, is_training)
            with tf.name_scope(self._scope, self._scope):
                cls_prob, bbox_pred = self._region_classification(fc7, is_training, initializer, initializer_bbox)
            
            self._score_summaries.update(self._predictions)

            return rois, cls_prob, bbox_pred
        
        def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=0.1, dim=[1]):
            sigma_2 = sigma**2
            box_diff = bbox_pred - bbox_targets
            in_box_diff = bbox_inside_weights * box_diff
            abs_in_box_diff = tf.math.abs(in_box_diff)
            smoothL1_sign = tf.stop_gradient(tf.cast(tf.less(abs_in_box_diff,1.0 / sigma_2),tf.float32))
            in_loss_box = tf.math.pow(in_box_diff,2) * (sigma_2 / 2.) * smoothL1_sign + (abs_in_box_diff - (0.5 / sigma_2)) * (1.0 - smoothL1_sign)
            out_loss_box = bbox_outside_weights * in_loss_box
            loss_box = tf.reduce_mean(tf.reduce_sum(out_loss_box, axis=dim))
            return loss_box

        def _region_proposal(self, net_conv, is_training, initializer):
            rpn = slim.conv2d(net_conv, 512, [3,3], trainable=is_training, weights_initializer=initializer, scope="rpn_conv/3x3")
            self._act_summaries.append(rpn)
            rpn_cls_score = slim.conv2d(rpn, self._num_anchors*2, [1,1], trainable=is_training, weights_initializer=initializer,padding='VALID', activation_fn=None, scope='rpn_cls_score')
            rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
            rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
            rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1, name = "rpn_cls_pred")
            rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors*2, 'rpn_cls_prob')
            rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors*4, [1,1], trainable=is_training, weights_initializer=initializer, padding='VALID', activation_fn=None, scope='rpn_bbox_pred')
            
            if is_training:
                rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
                rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
                # Try to have a deterministic order for the computing graph, for reproducibility
                with tf.control_dependencies([rpn_labels]):
                    rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
            
            self._predictions["rpn_cls_score"] = rpn_cls_score
            self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
            self._predictions["rpn_cls_prob"] = rpn_cls_prob
            self._predictions["rpn_cls_pred"] = rpn_cls_pred
            self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
            self._predictions["rois"] = rois

            return rois

        def _region_classification(self, fc7, is_training, initializer, initializer_bbox):
            cls_score = slim.fully_connected(fc7, self._num_classes, weights_initializer= initializer, trainable= is_training, activation_fn=None, scope='cls_score')
            cls_prob = self._softmax_layer(cls_score, "cls_prob")
            cls_pred = tf.argmax(cls_score, axis=1, name="cls_pred")
            bbox_pred = slim.fully_connected(fc7, self._num_classes*4, weights_initializer= initializer_bbox, trainable= is_training, activation_fn=None, scope='bbox_pred')
            self._predictions['cls_score'] = cls_score
            self._predictions['cls_pred'] = cls_pred
            self._predictions['cls_prob'] = cls_prob
            self._predictions['bbox_pred'] = bbox_pred

            return cls_prob, bbox_pred
        
        def _image_to_head(self,is_training,resuse=None):
            raise NotImplementedError
        def _head_to_tail(self,pool5,is_training,reuse=None):
            raise NotImplementedError
        

        def create_architecture(self, mode, num_classes, tag=None, anchor_scales = (8,16,32), _anchor_ratios = (0.5,1,2)):
            self._image = tf.compat.v1.placeholder(tf.float32, shape=[1, None, None, 3])
            self._im_info = tf.compat.v1.placeholder(tf.float32, shape=[3])
            self._gt_boxes = tf.compat.v1.placeholder(tf.float32, shape=[None, 5])
            self._tag = tag

            self._num_classes = num_classes
            self._mode = mode
            self._anchor_scales = anchor_scales
            self._num_scales = len(anchor_scales)

            self._anchor_ratios = _anchor_ratios
            self._num_ratios = len(anchor_ratios)

            self._num_anchors = self._num_scales * self._num_ratios

            training = mode == 'TRAIN'

            assert tag!=None

            # handle most of the regularizers here
            weights_regularizer = tf.keras.regularizers.l2(0.0001)
            biases_regularizer = tf.compat.v1.no_regularizer

            with arg_scope([slim.conv2d, slim.conv2d_in_plane, slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected], weights_regularizer = weights_regularizer, biases_regularizer= biases_regularizer, biases_initializer = tf.constant_initializer(0.0)):
                rois, cls_prob, bbox_pred = self._build_network(training)
            layers_to_output = {'rois': rois}

            for var in tf.Variable(trainable=True):
                self._train_summaries.append(var)
            
            self._add_losses()
            layers_to_output.update(self._losses)

            val_summaries = []
            with tf.device("/cpu:0"):
                val_summaries.append(self._add_gt_image_summary())
                for key, var in self._event_summaries.items():
                    val_summaries.append(tf.summary.scalar(key, var))
                for key, var in self._score_summaries.items():
                    self._add_score_summary(key, var)
                for var in self._act_summaries:
                    self._add_act_summary(var)
                for var in self._train_summaries:
                    self._add_train_summary(var)
            
            self._summary_op = tf.summary.merge_all()
            self._summary_op_val = tf.summary.merge(val_summaries)

            layers_to_output.update(self._predictions)
            return layers_to_output

        def train_step(self,sess,blobs,train_op):
            feed_dict={self._image: blobs['data'], self._im_info: blobs['im_info'], self._gt_boxes: blobs['gt_boxes']}
            rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, _ = sess.run([self._losses["rpn_cross_entropy"], self._losses['rpn_loss_box'], self._losses['cross_entropy'], self._losses['loss_box'], self._losses['total_loss'], train_op], feed_dict=feed_dict)
            return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss


        def train_step_with_summary(self, sess, blobs, train_op):
            feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'], self._gt_boxes: blobs['gt_boxes']}
            rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary, _ = sess.run([self._losses['rpn_cross_entropy'], self._losses['rpn_loss_box'], self._losses['cross_entropy'], self._losses['loss_box'], self._losses['total_loss'], self._summary_op, train_op], feed_dict=feed_dict)
            return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary

### VGG16 Network

In [42]:
import tensorflow as tf
import tf_slim as slim
from tf_slim import losses
from tf_slim import arg_scope
import numpy as np

In [43]:
class vgg16(Network):
    def __init__(self):
        Network.__init__(self)
        self._feat_stride = [16, ]
        self._feat_compress = [1. / float(self._feat_stride[0]), ]
        self._scope = 'vgg_19'
    
    def _image_to_head(self, is_training, reuse=None):
        with tf.name_scope(self._scope, self._scope, reuse=reuse):
            net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], trainable=False, scope='conv1')
            net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3,3], trainable=False, scope='conv2')
            net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3,3], trainable=is_training, scope='conv3')
            net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], trainable=is_training, scope='conv4')
            net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], trainable=is_training, scope='conv5')
        
        self._act_summaries.append(net)
        self.layers['head'] = net

        return net

    def _head_to_tail(self,pool5,is_training, reuse=None):
        with tf.name_scope(self._scope, self._scope, reuse=reuse):
            pool5_flat = slim.flatten(pool5, scope='flatten')
            fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6')
            if is_training:
                fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6')
            fc7 = slim.fully_connected(fc6, 4096, scope='fc7')
            if is_training:
                fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7')
        return fc7;

    def get_variables_to_restore(self, variables, var_keep_dic):
        variables_to_restore = []

        for v in variables:
            if v.name == (self._scope + '/fc6/weights:0') or v.name == (self._scope + '/fc7/weights:0'):
                self._variables_to_fix[v.name] = v
                continue

            if v.name == (self._scope + 'conv1/conv1_1/weights:0'):
                self._variables_to_fix[v.name] = v
                continue

            if v.name.split(':')[0] in var_keep_dic:
                print('Variables restored: %s' % v.name)
                variables_to_restore.append(v)
        return variables_to_restore

    def fix_variables(self,sess, pretrained_model):
        print('Fix VGG16 layers..')
        with tf.name_scope('Fix_VGG16') as scope:
            with tf.device("/cpu:0"):
                fc6_conv = tf.compat.v1.get_variable("fc6_conv", [7, 7, 512, 4096], trainable=False)
                fc7_conv = tf.compat.v1.get_variable("fc7_conv", [1, 1, 4096, 4096], trainable=False)
                conv1_rgb = tf.compat.v1.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False)
                restorer_fc = tf.compat.v1.train.Saver({self._scope + "/fc6/weights": fc6_conv,
                                                        self._scope + "/fc7/weights": fc7_conv,
                                                        self._scope + "/conv1/conv1_1/weights": conv1_rgb})
                restorer_fc.restore(sess, pretrained_model)
                sess.run(tf.compat.v1.assign(self._variables_to_fix[self._scope + '/fc6/weights:0'], tf.reshape(fc6_conv,
                                                                                                                 self._variables_to_fix[self._scope+'/fc6/weights:0'].get_shape())))
                sess.run(tf.compat.v1.assign(self._variables_to_fix[self._scope + '/fc7/weights:0'], tf.reshape(fc7_conv,
                                                                                                                    self._variables_to_fix[self._scope+'/fc7/weights:0'].get_shape())))
                sess.run(tf.compat.v1.assign(self._variables_to_fix[self._scope + '/conv1/conv1_1/weights:0'], tf.reverse(conv1_rgb, [2])))                                                                                                    

### roidb

In [44]:
def prepare_roidb(imdb):
    """Enrich the imdb's roidb by adding some derived quantities that
    are useful for training. This function precomputes the maximum
    overlap, taken over ground-truth boxes, between each ROI and
    each ground-truth box. The class with maximum overlap is also
    recorded.
    """
    roidb = imdb.roidb
    if not (imdb.name.startswith('coco')):
        sizes = [PIL.Image.open(imdb.image_path_at(i)).size
                    for i in range(imdb.num_images)]
    for i in range(len(imdb.image_index)):
        roidb[i]['image'] = imdb.image_path_at(i)
        if not (imdb.name.startswith('coco')):
            roidb[i]['width'] = sizes[i][0]
            roidb[i]['height'] = sizes[i][1]
        # need gt_overlaps as a dense array for argmax
        gt_overlaps = roidb[i]['gt_overlaps'].toarray()
        # max overlap with gt over classes (columns)
        max_overlaps = gt_overlaps.max(axis=1)
        # gt class that had the max overlap
        max_classes = gt_overlaps.argmax(axis=1)
        roidb[i]['max_classes'] = max_classes
        roidb[i]['max_overlaps'] = max_overlaps
        # sanity checks
        # max overlap of 0 => class should be zero (background)
        zero_inds = np.where(max_overlaps == 0)[0]
        assert all(max_classes[zero_inds] == 0)
        # max overlap > 0 => class should not be zero (must be a fg class)
        nonzero_inds = np.where(max_overlaps > 0)[0]
        assert all(max_classes[nonzero_inds] != 0)

### blobs

In [45]:
import cv2

In [46]:
def _im_list_to_blob(ims):
    """ Convert a list of images into a network input.
    Assumes images are already prepare (mean substracted, BGR order, ....).
    """

    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    num_images = len(ims)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),dtype=np.float32)
    for i in range(num_images):
        im = ims[i]
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    return blob

    def prep_im_for_blob(im, pixel_means, target_size, max_size):
        """ mean subtact and scale an image for use in a blob"""
        im = im.astype(np.float32, copy=False)
        im -= pixel_means
        im_shape = im.shape
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])
        im_scale = float(target_size) / float(im_size_min)
        # Prevent the biggest axis from being more than MAX_SIZE
        if np.round(im_scale * im_size_max) > max_size:
            im_scale = float(max_size) / float(im_size_max)
        im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        return im, im_scale

### mini-batch blobs for training

In [47]:
SCALES = (600,)
pixel_means = np.array([[[102.9801, 115.9465, 122.7717]]])

In [48]:
def get_minibatch(roidb,num_classes):
    """Given a roidb, construct a minibatch sampled from it"""
    num_images = len(roidb)
    # Sample random scales to use for each image in this batch
    random_scale_inds = npr.randint(0,high=len(SCALES),size=num_images)
    assert(128%num_images == 0), 'num_images ({}) must divide 128'.format(num_images)

    # Get the input image blob, formatted for caffe
    im_blob, im_scale = _get_image_blob(roidb, random_scale_inds)
    blobs = {'data':im_blob}
    assert(len(im_scale) == 1), 'Single batch only'
    assert(len(roidb) == 1), 'Single batch only'

    if USE_ALL_GT:
        gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
    else:
        gt_inds = np.where(roidb[0]['gt_classes'] != 0 & np.all(roidb[0]['gt_overlaps'].toarray() > -1.0, axis=1))[0]
    gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
    gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scale[0]
    gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
    blobs['gt_boxes'] = gt_boxes
    blobs['im_info'] = np.array(
        [im_blob.shape[1], im_blob.shape[2], im_scale[0]],
        dtype=np.float32)
    return blobs

def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    im_scales = []
    for i in range(num_images):
        im = cv2.imread(roidb[i]['image'])
        if roidb[i]['flipped']:
            im = im[:, ::-1, :]
        target_size = SCALES[scale_inds[i]]
        im, im_scale = prep_im_for_blob(im, pixel_means, target_size, 1000)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images
    blob = _im_list_to_blob(processed_ims)
    return blob, im_scales

### RoIDataLayer

In [49]:
class RoIDataLayer(object):
    """
    It implements a caffe Python Layer.
    """
    def __init__(self, roidb, num_classes, random=False):
        """
        set the roidb to be used by this layer during training
        """
        self._roidb = roidb
        self._num_classes = num_classes
        self._random = random
        self._shuffle_roidb_inds()
    
    def _shuffle_roidb_inds(self):
        """Randomly permute the training roidb"""
        if self._random:
            st0 = np.random.get_state()
            millis = int(round(time.time()*1000)) % 4294967295
            np.random.seed(millis)
        self._perm = np.random.permutation(np.arange(len(self._roidb)))
        if self._random:
            np.random.set_state(st0)
        
        self._cur = 0
    
    def _get_next_minibatch_inds(self):
        """Return the roidb indices for the next minibatch."""
        if self._cur + 1 >= len(self._roidb):
            self._shuffle_roidb_inds()
        
        db_inds = self._perm[self._cur:self._cur + 1]
        self._cur += 1
        return db_inds
    
    def _get_next_minibatch(self):
        """Return the blobs to be used for the next minibatch.
        """
        db_inds = self._get_next_minibatch_inds()
        minibatch_db = [self._roidb[i] for i in db_inds]
        return get_minibatch(minibatch_db, self._num_classes)
    
    def forward(self):
        """Get blobs and copy them into this layer's top blob vector."""
        blobs = self._get_next_minibatch()
        return blobs

### Timer

In [50]:
import time

class Timer(object):
    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
    def tic(self):
        self.start_time = time.time()
    
    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        if average:
            return self.average_time
        else:
            return self.diff

### trainval - SolveWrapper

In [51]:
from tensorflow.python import pywrap_tensorflow
import sys
import os
import glob
import time
import pickle

In [52]:
class SolveWrapper(object):
    """
    A wrapper class for the training purpose
    """
    def __init__(self,sess,network,imdb,roidb,valroidb,output_dir,tbdir,pretrained_model=None):
        self.net = network
        self.imdb = imdb
        self.roidb = roidb
        self.valroidb = valroidb
        self.output_dir = output_dir
        self.tbdir = tbdir
        self.tbvaldir = tbdir + '_val'
        if not os.path.exists(self.tbvaldir):
            os.makedirs(self.tbvaldir)
        self.pretrained_model = pretrained_model

    def filter_roidb(roidb):
        """Remove roidb entries that have no usable RoIs"""
        def is_valid(entry):
            overlaps = entry['max_overlaps']
            fg_inds = np.where(overlaps>=0.5)[0]
            bg_inds = np.where((overlaps<0.5) & (overlaps>0.1))[0]
            valid = len(fg_inds)>0 or len(bg_inds)>0
            return valid
        
        num = len(roidb)
        filtered_roidb = [entry for entry in roidb if is_valid(entry)]
        num_after = len(filtered_roidb)
        print('Filtered {} roidb entries: {} -> {}'.format(num-num_after,num,num_after))
        return filtered_roidb

    def snapshot(self,sess, iter):
        net = self.net

        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # Store the model snapshot
        filename = 'VGG16_faster_rcnn_iter_{:d}'.format(iter) + '.ckpt'
        filename = os.path.join(self.output_dir, filename)
        self.saver.save(sess, filename)
        print('Wrote snapshot to: {:s}'.format(filename))

        # Also store some meta information, random state, etc.
        nfilename = 'VGG16_faster_rcnn_iter_{:d}'.format(iter) + '.pkl'
        nfilename = os.path.join(self.output_dir, nfilename)
        # current state of numpy random
        st0 = np.random.get_state()
        # current position in the database
        cur = self.data_layer._cur
        # current shuffled indexes of the database
        perm = self.data_layer._perm
        # current position in the validation database
        cur_val = self.data_layer_val._cur
        # current shuffled indexes of the validation database
        perm_val = self.data_layer_val._perm

        # Dump the meta info
        with open(nfilename, 'wb') as fid:
            pickle.dump(st0, fid, pickle.HIGHEST_PROTOCOL)
            pickle.dump(cur, fid, pickle.HIGHEST_PROTOCOL)
            pickle.dump(perm, fid, pickle.HIGHEST_PROTOCOL)
            pickle.dump(cur_val, fid, pickle.HIGHEST_PROTOCOL)
            pickle.dump(perm_val, fid, pickle.HIGHEST_PROTOCOL)
            pickle.dump(iter, fid, pickle.HIGHEST_PROTOCOL)
        
        return filename, nfilename

    def construct_graph(self,sess):
        with sess.graph.as_default():
            # Set the random seed for tensorflow
            tf.keras.utils.set_random_seed(3)
            # Build the main computation graph
            layers = self.net_create_architecture(sess, 'TRAIN', self.imdb.num_classes, tag='default',anchor_scales = [8, 16, 32], anchor_ratios = [0.5, 1, 2])
            # Define the loss
            loss = layers['total_loss']
            lr = tf.Variable(0.001, trainable=False)
            self.optimizer = tf.compat.v1.train.MomentumOptimizer(lr=lr, momentum=0.9)

            # Compute the gradients with regard to the loss
            gvs = self.optimizer.compute_gradients(loss)
            final_gvs = []
            with tf.name_scope('Gradient_Mult') as scope:
                for grad, var in gvs:
                    scale=1
                    if '/biases' in var.name:
                        scale *=2
                    if not np.allclose(scale, 1.0):
                        grad = tf.multiply(grad, scale)
                    final_gvs.append((grad, var))
            train_op = self.optimizer.apply_gradients(final_gvs)

            # we will handle the snapshots ourselves
            self.saver = tf.compat.v1.train.Saver(max_to_keep=100000)
            # Write the train and validation information to tensorboard
            self.writer = tf.compat.v1.summary.FileWriter(self.tbdir, sess.graph)
            self.valwriter = tf.compat.v1.summary.FileWriter(self.tbvaldir)
        
        return lr, train_op

    def get_variables_in_checkpoint_file(self, file_name):
        try:
            reader = pywrap_tensorflow.NewCheckpointReader(file_name)
            var_to_shape_map = reader.get_variable_to_shape_map()
            return var_to_shape_map
        except Exception as e:
            print(str(e))
            if "corrupted compressed block contents" in str(e):
                print("It's likely that your checkpoint file has been compressed "
                      "with SNAPPY.")

    
    def find_previous(self):
        sfiles = os.path.join(self.output_dir, 'vgg16_faster_rcnn'+ '_iter_*.ckpt.meta')
        sfiles = glob.glob(sfiles)
        sfiles.sort(key=os.path.getmtime)
        # Get the snapshot name in TensorFlow
        redfiles = []
        for stepsize in [30000]:
            redfiles.append(os.path.join(self.output_dir, 'vgg16_faster_rcnn_' + '_iter_{:d}.ckpt.meta'.format(stepsize+1)))
        sfiles = [ss.replace('.meta', '') for ss in sfiles if ss not in redfiles]

        nfiles = os.path.join(self.output_dir, 'vgg16_faster_rcnn' + '_iter_*.pkl')
        nfiles = glob.glob(nfiles)
        nfiles.sort(key=os.path.getmtime)
        redfiles = [redfile.replace('.ckpt.meta', '.pkl') for redfile in redfiles]
        nfiles = [nn for nn in nfiles if nn not in redfiles]

        lsf = len(sfiles)
        assert len(nfiles) == lsf

        return lsf, nfiles, sfiles

    def from_snapshot(self, sess, sfile, nfile):
        print('Restoring model snapshots from {:s}'.format(sfile))
        self.saver.restore(sess, sfile)
        print('Restored.')
        with open(nfile, 'rb') as fid:
            st0 = pickle.load(fid)
            cur = pickle.load(fid)
            perm = pickle.load(fid)
            cur_val = pickle.load(fid)
            perm_val = pickle.load(fid)
            last_snapshot_iter = pickle.load(fid)

            np.random.set_state(st0)
            self.data_layer._cur = cur
            self.data_layer._perm = perm
            self.data_layer_val._cur = cur_val
            self.data_layer_val._perm = perm_val

        return last_snapshot_iter 

    def initialize(self,sess):
        np_paths = []
        ss_paths = []

        # Initialize the variables or restore them from the snapshot
        print('Loading initial model weights from {:s}'.format(self.pretrained_model))
        variables = tf.compat.v1.global_variables()
        # Initialize all variables first
        sess.run(tf.compat.v1.variables_initializer(variables, name='init'))
        var_keep_dic = self.get_variables_in_checkpoint_file(self.pretrained_model)
        # Get the variables to restore, ignorizing the variables to fix
        variables_to_restore = self.net.get_variables_to_restore(variables, var_keep_dic)

        restorer = tf.compat.v1.train.Saver(variables_to_restore)
        restorer.restore(sess, self.pretrained_model)
        print('Loaded.')
        # Need to fix the variables before loading, so that the RGB weights are changed to BGR
        # For VGG16 it also changes the convolutional weights fc6 and fc7 to
        # fully connected weights
        self.net.fix_variables(sess, self.pretrained_model)
        print('Fixed.')
        last_snapshot_iter = 0
        rate = 0.001
        stepsizes = 30000

        return rate, last_snapshot_iter, stepsizes, np_paths, ss_paths
    
    def restore(self, sess, sfile, nfile):
        np_path = [nfile]
        ss_path = [sfile]
        # Restore model from snapshots
        last_snapshot_iter = self.from_snapshot(sess, sfile, nfile)
        # Set the learning rate
        rate = 0.001
        stepsize=[]
        for stepsize in [30000]:
            if last_snapshot_iter > stepsize:
                rate *= 0.1
            else:
                stepsizes.append(stepsize)

    def remove_snapshot(self, np_paths, ss_paths):
        to_remove = len(np_paths) - 5
        for c in range(to_remove):
            nfile = np_paths[0]
            os.remove(str(nfile))
            np_paths.remove(nfile)
        
        to_remove = len(ss_paths) - 5
        for c in range(to_remove):
            sfile = ss_paths[0]
            # To make the code compatible to earlier versions of Tensorflow,
            # where the naming tradition for checkpoints are different
            if os.path.exists(str(sfile)):
                os.remove(str(sfile))
            else:
                os.remove(str(sfile + '.data-00000-of-00001'))
                os.remove(str(sfile + '.index'))
            sfile_meta = sfile + '.meta'
            os.remove(str(sfile_meta))
            ss_paths.remove(sfile)

    def get_training_roidb(imdb):
        """Returns a roidb (Region of Interest database) for use in training."""
        print('Appending horizontally-flipped training examples...')
        imdb.append_flipped_images()
        print('done')
        print('Preparing training data...')
        rdl_roidb.prepare_roidb(imdb)
        print('done')
        return imdb.roidb

    def train_model(self, sess, max_iters):
        # Build data layers for both training and validation set
        self.data_layer = RoIDataLayer(self.roidb, self.imdb.num_classes)
        self.data_layer_val = RoIDataLayer(self.valroidb, self.imdb.num_classes, random=True)

        # Construct the computation graph
        lr, train_op = self.construct_graph()

        # Find previous snapshots if there is any to restore from
        lsf, nfiles, sfiles = self.find_previous()

        # Initialize the variables or restore them from the last snapshot
        if lsf == 0:
            rate, last_snapshot_iter, stepsizes, np_path, ss_paths = self.initialize(sess);
        else:
            rate, last_snapshot_iter, stepsizes, np_path, ss_paths = self.restore(sess, str(lsf-1))
        timer = Timer();
        iter = last_snapshot_iter + 1
        last_summary_time = time.time()

        stepsizes.append(max_iters)
        stepsizes.reverse()
        next_stepsize = stepsizes.pop()
        while iters < max_iters+1:
            if iter == next_stepsize_1:
                self.snapshot(sess, iter)
                rate *= 0.1
                sess.run(tf.assign(lr, rate))
                next_stepsize = stepsizes.pop()
            
            timer.tic()
            blobs = self.data_layer.forward()

            now = time.time()
            if iter == 1 or now-last_summary_time > 180:
                rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, total_loss, summary = self.net.train_step_with_summary(sess, blobs, train_op)
                self.writer.add_summary(summary, float(iter))
                # Also check the summary on the validation set
                blobs_val = self.data_layer_val.forward()
                summary_val = self.net.get_summary(sess, blobs_val)
                self.valwriter.add_summary(summary_val, float(iter))
                last_summary_time = now
            else:
                rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, total_loss = self.net.train_step(sess, blobs, train_op)
            timer.toc()

            # Display training information
            if iter % 10 == 0:
                print('iter: %d / %d, total loss: %.6f\n >>> rpn_loss_cls: %.6f\n '
                      '>>> rpn_loss_box: %.6f\n >>> loss_cls: %.6f\n >>> loss_box: %.6f\n >>> lr: %f' % \
                      (iter, max_iters, total_loss, rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, lr.eval()))
                print('speed: {:.3f}s / iter'.format(timer.average_time))
            
            # Snapshotting
            if iter % 5000 == 0:
                last_snapshot_iter = iter
                ss_path , np_path = self.snapshot(sess, iter)
                np_paths.append(np_path)
                ss_paths.append(ss_path)

                if len(np_paths) > 5:
                    self.remove_snapshot(np_paths, ss_paths)
            
            iter += 1

            if last_snapshot_iter != iter-1:
                self.snapshot(sess,iter-1)
            
            self.writer.close()
            self.valwriter.close()


    def train_net(network, imdb, roidb, valroidb, output_dir, tb_dir, pretrained_model=None, max_iters=40000):
        roidb = filter_roidb(roidb)
        valroidb = filter_roidb(valroidb)

        tfconfig = tf.compat.v1.ConfigProto(allow_soft_placement=True)
        tfconfig.gpu_options.allow_growth = True

        with tf.compat.v1.Session(config=tfconfig) as sess:
            sw = SolveWrapper(sess,network,imdb,roidb,valroidb,output_dir,tb_dir,pretrained_model)
            print('Solving')
            sw.train_model(sess,max_iters)
            print('done solving')
        

### IMDB Class

In [53]:
import os
import PIL
import numpy as np
import scipy.sparse

In [54]:
class imdb(object):
    """Image Databas"""

    def __init__(self,name,classes=None):
        self._name = name
        self._num_classes=0
        if not classes:
            self._classes = []
        else:
            self._classes = classes
        self._image_index = []
        self._obj_proposer= 'gt'
        self._roidb = None
        self._roidb_handler = self.default_roidb
        # Use this dict for storing dataset specific config options
        self.config = {}

    @property
    def name(self):
        return self._name
    
    @property
    def num_classes(self):
        return len(self._classes)
    
    @property
    def classes(self):
        return self._classes

    @property
    def image_index(self):
        return self._image_index

    @property
    def roidb_handler(self):
        return self._roidb_handler

    @roidb_handler.setter
    def roidb_handler(self,val):
        self._roidb_handler = val
    
    def set_proposal_method(self,method):
        method = eval('self.'+method+'_roidb')
        self.roidb_handler = method
    
    @property
    def roidb(self):
        # A roidb is a list of dictionaries, each with the following keys:
        #   boxes
        #   gt_overlaps
        #   gt_classes
        #   flipped
        if self._roidb is not None:
            return self._roidb
        self._roidb = self.roidb_handler()
        return self._roidb

    @property
    def cache_path(self):
        cache_path = os.path.abspath(os.path.join(r'C:\Users\kc510\Documents\Projects\Projects_MLOps\Project_Faster_RCNN\data', 'cache'))
        if not os.path.exists(cache_path):
            os.makedirs(cache_path)
        return cache_path
    
    @property
    def num_images(self):
        return len(self.image_index)

    def image_path_at(self, i):
        raise NotImplementedError
    
    def default_roidb(self):
        raise NotImplementedError
    
    def evaluate_detections(self, all_boxes, output_dir=None):
        """
        all_boxes is a list of length number-of-classes.
        Each list element is a list of length number-of-images.
        Each of those list elements is either an empty list []
        or a numpy array of detection.
        all_boxes[class][image] = [] or np.array of shape #dets x 5
        """
        raise NotImplementedError
    
    def _get_width(self):
        return [PIL.Image.open(self.image_path_at(i)).size[0] for i in range(self.num_images)]
    
    def append_flipped_images(self):
        num_images = self.num_images
        widths = self._get_width()
        for i in range(num_images):
            boxes = self.roidb[i]['boxes'].copy()
            oldx1 = boxes[:, 0].copy()
            oldx2 = boxes[:, 2].copy()
            boxes[:, 0] = widths[i] - oldx2 - 1
            boxes[:, 2] = widths[i] - oldx1 - 1
            assert (boxes[:, 2] >= boxes[:, 0]).all()
            entry = {'boxes': boxes,
                     'gt_overlaps': self.roidb[i]['gt_overlaps'],
                     'gt_classes': self.roidb[i]['gt_classes'],
                     'flipped': True}
            self.roidb.append(entry)
        self._image_index = self._image_index * 2
    
    def evaluate_recall(self, candidate_boxes=None, thresholds=None, area='all', limit=None):
        """ Evaluate detection proposal recall metrics 
        Return:
            results: dictionary of results with keys
                'ar': average recall
                'recalls' : vector recalls at each IoU overlap threshold
                'thresholds' : vector of IoU overlap thresholds
                'gt_overlaps' : vector of all ground-truth overlaps
        """
        # Record max overlap value for each gt box
        # Return vector of overlap values
        areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3, '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
        area_ranges = [[0**2, 1e5**2], [0**2, 32**2], [32**2, 96**2], [96**2, 128**2], [128**2, 256**2], [256**2, 512**2], [512**2, 1e5**2]]
        assert area in areas, 'unknown area range: {}'.format(area)
        area_range = area_ranges[areas[area]]
        gt_overlaps = np.zeros(0)
        num_pos = 0
        for i in range(self.num_images):
            # Checking for max_overlaps == 1 avoids including crowd annotations
            # (...pretty hacking :/)
            max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1)
            gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) & (max_gt_overlaps == 1))[0]
            gt_boxes = self.roidb[i]['boxes'][gt_inds, :]
            gt_areas = self.roidb[i]['seg_areas'][gt_inds]
            valid_gt_inds = np.where((gt_areas >= area_range[0]) & (gt_areas <= area_range[1]))[0]
            gt_boxes = gt_boxes[valid_gt_inds, :]
            num_pos += len(valid_gt_inds)
            
            if candidate_boxes is None:
                # If candidate_boxes is not supplied, the default is to use the
                # non-ground-truth boxes from this roidb
                non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0]
                boxes = self.roidb[i]['boxes'][non_gt_inds, :]
            else:
                boxes = candidate_boxes[i]
            if boxes.shape[0] == 0:
                continue
            if limit is not None and boxes.shape[0] > limit:
                boxes = boxes[:limit, :]
            
            overlaps = bbox_overlaps(boxes.astype(np.float), gt_boxes.astype(np.float))

            _gt_overlaps = np.zeros((gt_boxes.shape[0]))
            for j in range(gt_boxes.shape[0]):
                # find which proposal box maximally covers each gt box
                argmax_overlaps = overlaps.argmax(axis=0)
                # and get the iou amount of coverage for each gt box
                max_overlaps = overlaps.max(axis=0)
                # find which gt box is 'best' covered (i.e. 'best' = most iou)
                gt_ind = max_overlaps.argmax()
                gt_ovr = max_overlaps.max()
                assert(gt_ovr >= 0)
                # find the proposal box that covers the best covered gt box
                box_ind = argmax_overlaps[gt_ind]
                # record the iou coverage of this gt box
                _gt_overlaps[j] = overlaps[box_ind, gt_ind]
                assert(_gt_overlaps[j] == gt_ovr)
                # mark the proposal box and the gt box as used
                overlaps[box_ind, :] = -1
                overlaps[:, gt_ind] = -1
            # append recorded iou coverage level
            gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))

        gt_overlaps = np.sort(gt_overlaps)
        if thresholds is None:
            step = 0.05
            thresholds = np.arange(0.5, 0.95 + 1e-5, step)
        recalls = np.zeros_like(thresholds)
        # compute recall for each iou threshold
        for i, t in enumerate(thresholds):
            recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
        # ar = 2 * np.trapz(recalls, thresholds)
        ar = recalls.mean()
        return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, 'gt_overlaps': gt_overlaps}

    def create_roidb_from_box_list(self,box_list,gt_roidb):
        assert len(box_list) == self.num_images, 'Number of boxes must match number of ground-truth images'
        roidb = []
        for i in range(self.num_images):
            boxes = box_list[i]
            num_boxes = boxes.shape[0]
            overlaps = np.zeros((num_boxes,self.num_classes),dtype=np.float32)

            if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
                gt_boxes = gt_roidb[i]['boxes']
                gt_classes = gt_roidb[i]['gt_classes']
                gt_overlaps = bbox_overlaps(boxes.astype(np.float),gt_boxes.astype(np.float))
                argmaxes = gt_overlaps.argmax(axis=1)
                maxes = gt_overlaps.max(axis=1)
                I = np.where(maxes>0)[0]
                overlaps[I,gt_classes[argmaxes[I]]] = maxes[I]
            
            overlaps = scipy.sparse.csr_matrix(overlaps)
            roidb.append({'boxes':boxes,'gt_classes':np.zeros((num_boxes,),dtype=np.int32),'gt_overlaps':overlaps,'flipped':False})
        return roidb

    @staticmethod
    def merge_roidbs(a,b):
        assert len(a) == len(b)
        for i in range(len(a)):
            a[i]['boxes'] = np.vstack((a[i]['boxes'],b[i]['boxes']))
            a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'],b[i]['gt_classes']))
            a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'],b[i]['gt_overlaps']])
        return a
    
    def competition_mode(self, on):
        """Turn competition mode on or off."""
        pass

### ds_utils

In [55]:
import numpy as np

In [56]:
def unique_boxes(boxes, scale=1.0):
    """ return indices of unique boxes """
    v = np.array([1, 1e3, 1e6, 1e9])
    hashes = np.round(boxes*scale).dot(v)
    _, index = np.unique(hashes, return_index=True)
    return np.sort(index)

def xywh_to_xyxy(boxes):
    """ Convert [x y w h] box format to [x1 y1 x2 y2] format """
    return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1))

def xyxy_to_xywh(boxes):
    """ Convert [x1 y1 x2 y2] box format to [x y w h] format """
    return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1))

def validate_boxes(boxes,width=0,height=0):
    """Check that a set of boxes is valid."""
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    assert (x1 >= 0).all()
    assert (y1 >= 0).all()
    assert (x2 >= x1).all()
    assert (y2 >= y1).all()
    assert (x2 < width).all()
    assert (y2 < height).all()

def filter_small_boxes(boxes,min_size):
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    keep = np.where((w >= min_size) & (h > min_size))[0]
    return keep

### COCO Database

In [57]:
import json
import uuid
from pycocotools.coco import COCO
from pycocotools import mask as COCOmask
from pycocotools.cocoeval import COCOeval
import os
import os.path as osp

In [58]:
class coco(imdb):
    def __init__(self,image_set,year):
        imdb.__init__(self,'coco_'+str(year)+'_'+str(image_set))
        # COCO specific config options
        self.config = {'use_salt': True,
                          'cleanup': True}
        # name, paths
        self._year = year
        self._image_set = image_set
        self._data_path = osp.join(r'C:\Users\kc510\Documents\Projects\Projects_MLOps\Project_Faster_RCNN\data', 'coco')
        # load COCO API, classes, class <-> id mappings
        self._COCO = COCO(self._get_ann_file())
        cats = self._COCO.loadCats(self._COCO.getCatIds())
        self._classes = tuple(['__background__'] + [c['name'] for c in cats])
        self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes)))))
        self._class_to_coco_cat_id = dict(list(zip([c['name'] for c in cats],
                                                    self._COCO.getCatIds())))
        self._image_index = self._load_image_set_index()
        # Default to roidb handler
        self.set_proposal_method('gt')
        self.competition_mode(False)

        # Some image sets are "views" (i.e. subsets) into others.
        # For example, minival2014 is a random 5000 image subset of val2014.
        # This mapping tells us where the view's images and proposals come from.
        self._view_map = {
            'minival2014': 'val2014',  # 5k val2014 subset
            'valminusminival2014': 'val2014',  # val2014 \setminus minival2014
            'test-dev2015': 'test2015',
        }
        coco_name = image_set + year  # e.g., "val2014"
        self._data_name = (self._view_map[coco_name]
                            if coco_name in self._view_map
                            else coco_name)
        # Dataset splits that have ground-truth annotations (test splits
        # do not have gt annotations)
        self._gt_splits = ('train', 'val', 'minival')

    def _get_ann_file(self):
        prefix = 'instances' if self._image_set.find('test') == -1 \
                              else 'image_info'
        return osp.join(self._data_path, 'annotations',
                        prefix + '_' + self._image_set + self._year + '.json')
    
    def _load_image_set_index(self):
        """
        Load image ids.
        """
        image_ids = self._COCO.getImgIds()
        return image_ids
    
    def _get_widths(self):
        anns = self._COCO.loadImgs(self._image_index)
        widths = [ann['width'] for ann in anns]
        return widths
    
    def image_path_at(self, i):
        """
        Return the absolute path to image i in the image sequence.
        """
        return self.image_path_from_index(self._image_index[i])
    
    def image_path_from_index(self, index):
        """
        Construct an image path from the image's "index" identifier.
        """
        # Example image path for index=119993:
        #   images/train2014/COCO_train2014_000000119993.jpg
        file_name = ('COCO_' + self._data_name + '_' +
                     str(index).zfill(12) + '.jpg')
        image_path = osp.join(self._data_path, 'images',
                              self._data_name, file_name)
        assert osp.exists(image_path), \
                'Path does not exist: {}'.format(image_path)
        return image_path
    
    def gt_roidb(self):
        """
        Return the database of ground-truth regions of interest.
        This function loads/saves from/to a cache file to speed up future calls.
        """
        cache_file = osp.join(self.cache_path, self.name + '_gt_roidb.pkl')
        if osp.exists(cache_file):
            with open(cache_file, 'rb') as fid:
                roidb = pickle.load(fid)
            print('{} gt roidb loaded from {}'.format(self.name, cache_file))
            return roidb

        gt_roidb = [self._load_coco_annotation(index)
                    for index in self._image_index]

        with open(cache_file, 'wb') as fid:
            pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
        print('wrote gt roidb to {}'.format(cache_file))
        return gt_roidb
    
    def _load_coco_annotation(self, index):
        """
        Loads COCO bounding-box instance annotations. Crowd instances are
        handled by marking their overlaps (with all categories) to -1. This
        overlap value means that crowd "instances" are excluded from training.
        """
        im_ann = self._COCO.loadImgs(index)[0]
        width = im_ann['width']
        height = im_ann['height']

        annIds = self._COCO.getAnnIds(imgIds=index, iscrowd=None)
        objs = self._COCO.loadAnns(annIds)
        # Sanitize bboxes -- some are invalid
        valid_objs = []
        for obj in objs:
            x1 = np.max((0, obj['bbox'][0]))
            y1 = np.max((0, obj['bbox'][1]))
            x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1))))
            y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1))))
            if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
                obj['clean_bbox'] = [x1, y1, x2, y2]
                valid_objs.append(obj)
        objs = valid_objs
        num_objs = len(objs)

        boxes = np.zeros((num_objs, 4), dtype=np.uint16)
        gt_classes = np.zeros((num_objs), dtype=np.int32)
        overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
        seg_areas = np.zeros((num_objs), dtype=np.float32)

        # Lookup table to map from COCO category ids to our internal class
        # indices
        coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls],
                                            self._class_to_ind[cls])
                                            for cls in self._classes[1:]])
        
        for ix, obj in enumerate(objs):
            cls = coco_cat_id_to_class_ind[obj['category_id']]
            boxes[ix, :] = obj['clean_bbox']
            gt_classes[ix] = cls
            seg_areas[ix] = obj['area']
            if obj['iscrowd']:
                # Set overlap to -1 for all classes for crowd objects
                # so they will be excluded during training
                overlaps[ix, :] = -1.0
            else:
                overlaps[ix, cls] = 1.0

        ds_utils.validate_boxes(boxes, width=width, height=height)
        overlaps = scipy.sparse.csr_matrix(overlaps)
        return {'width': width,
                'height': height,
                'boxes': boxes,
                'gt_classes': gt_classes,
                'gt_overlaps': overlaps,
                'flipped': False,
                'seg_areas': seg_areas}
    
    def _get_widths(self):
        return [r['width'] for r in self.roidb]
    
    def append_flipped_images(self):
        num_images = self.num_images
        widths = self._get_widths()
        for i in range(num_images):
            boxes = self.roidb[i]['boxes'].copy()
            oldx1 = boxes[:, 0].copy()
            oldx2 = boxes[:, 2].copy()
            boxes[:, 0] = widths[i] - oldx2 - 1
            boxes[:, 2] = widths[i] - oldx1 - 1
            assert (boxes[:, 2] >= boxes[:, 0]).all()
            entry = {'width': widths[i],
                     'height': self.roidb[i]['height'],
                     'boxes': boxes,
                     'gt_classes': self.roidb[i]['gt_classes'],
                     'gt_overlaps': self.roidb[i]['gt_overlaps'],
                     'flipped': True,
                     'seg_areas': self.roidb[i]['seg_areas']}

            self.roidb.append(entry)
        self._image_index = self._image_index * 2

    def _get_box_file(self, index):
        # first 14 chars / first 22 chars / all chars + .mat
        # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat
        file_name = ('COCO_' + self._data_name +
                     '_' + str(index).zfill(12) + '.mat')
        return osp.join(file_name[:14], file_name[:22], file_name)

    def _print_detection_eval_metrics(self, coco_eval):
        IoU_lo_thresh = 0.5
        IoU_hi_thresh = 0.95
        def _get_thr_ind(coco_eval, thr):
            ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) &
                           (coco_eval.params.iouThrs < thr + 1e-5))[0][0]
            iou_thr = coco_eval.params.iouThrs[ind]
            assert np.isclose(iou_thr, thr)
            return ind

        ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh)
        ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh)
        # precision has dims (iou, recall, cls, area range, max dets)
        # area range index 0: all area ranges
        # max dets index 2: 100 per image
        precision = \
            coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2]
        ap_default = np.mean(precision[precision > -1])
        print(('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] '
               '~~~~').format(IoU_lo_thresh, IoU_hi_thresh))
        print('{:.1f}'.format(100 * ap_default))
        for cls_ind, cls in enumerate(self.classes):
            if cls == '__background__':
                continue
            # minus 1 because of __background__
            precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2]
            ap = np.mean(precision[precision > -1])
            print('{:.1f}'.format(100 * ap))

        print('~~~~ Summary metrics ~~~~')
        coco_eval.summarize()
    
    def _do_detection_eval(self, res_file, output_dir):
        ann_type = 'bbox'
        coco_dt = self._COCO.loadRes(res_file)
        coco_eval = COCOeval(self._COCO, coco_dt)
        coco_eval.params.useSegm = (ann_type == 'segm')
        coco_eval.evaluate()
        coco_eval.accumulate()
        self._print_detection_eval_metrics(coco_eval)
        eval_file = osp.join(output_dir, 'detection_results.pkl')
        with open(eval_file, 'wb') as fid:
            pickle.dump(coco_eval, fid, pickle.HIGHEST_PROTOCOL)
        print('Wrote COCO eval results to: {}'.format(eval_file))
    
    def _coco_results_one_category(self, boxes, cat_id):
        results = []
        for im_ind, index in enumerate(self.image_index):
            dets = boxes[im_ind].astype(np.float)
            if dets == []:
                continue
            scores = dets[:, -1]
            xs = dets[:, 0]
            ys = dets[:, 1]
            ws = dets[:, 2] - xs + 1
            hs = dets[:, 3] - ys + 1
            results.extend(
              [{'image_id' : index,
                'category_id' : cat_id,
                'bbox' : [xs[k], ys[k], ws[k], hs[k]],
                'score' : scores[k]} for k in range(dets.shape[0])])
        return results
    
    def _write_coco_results_file(self, all_boxes, res_file):
        # [{"image_id": 42,
        #   "category_id": 18,
        #   "bbox": [258.15,41.29,348.26,243.78],
        #   "score": 0.236}, ...]
        results = []
        for cls_ind, cls in enumerate(self.classes):
            if cls == '__background__':
                continue
            print('Collecting {} results ({:d}/{:d})'.format(cls, cls_ind,
                                                          self.num_classes - 1))
            coco_cat_id = self._class_to_coco_cat_id[cls]
            results.extend(self._coco_results_one_category(all_boxes[cls_ind],
                                                           coco_cat_id))
            '''
            if cls_ind ==30:
                res_f = res_file+ '_' + cls + '.json'
                print('Writing results json to {}'.format(res_f))
                with open(res_f, 'w') as fid:
                    json.dump(results, fid)
            '''
        #res_f = res_file+ '_' + 'all' + '.json'
        res_f = res_file
        print('Writing results json to {}'.format(res_f))
        with open(res_f, 'w') as fid:
            json.dump(results, fid)
    
    def evaluate_detections(self, all_boxes, output_dir):
        res_file = osp.join(output_dir, ('detections_' +
                                         self._image_set +
                                         self._year +
                                         '_results'))
        if self.config['use_salt']:
            res_file += '_{}'.format(str(uuid.uuid4()))
        res_file += '.json'
        self._write_coco_results_file(all_boxes, res_file)
        # Only do evaluation on non-test sets
        if self._image_set.find('test') == -1:
            self._do_detection_eval(res_file, output_dir)
        # Optionally cleanup results json file
        if self.config['cleanup']:
            os.remove(res_file)
    
    def competition_mode(self, on):
        if on:
            self.config['use_salt'] = False
            self.config['cleanup'] = False
        else:
            self.config['use_salt'] = True
            self.config['cleanup'] = True

In [59]:
def get_output_dir(imdb, weights_filename):
    """
    Return the directory where experimental artifacts are placed.
    If the directory does not exist, it is created.
    A canonical path is built using the name from an imdb and a network
    (if not None).
    """
    outdir = osp.abspath(osp.join(r'C:\Users\kc510\Documents\Projects\Projects_MLOps\Project_Faster_RCNN\saved_models', 'output','default', imdb.name))
    if weights_filename is not None:
        outdir = osp.join(outdir, weights_filename)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    return outdir

def get_output_tb_dir(imdb,weights_filename):
    """
    Return the directory where tensorflow summaries are placed.
    If the directory does not exist, it is created.
    A canonical path is built using the name from an imdb and a network
    (if not None).
    """
    outdir = osp.abspath(osp.join(r'C:\Users\kc510\Documents\Projects\Projects_MLOps\Project_Faster_RCNN\saved_models', 'tensorboard', 'default', imdb.name))
    if weights_filename is not None:
        outdir = osp.join(outdir, weights_filename)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    return outdir

### Dataset Factory


In [60]:
__sets = {}
import numpy as np

for year in ['2014']:
    for split in ['train', 'val', 'minival', 'valminusminival', 'trainval']:
        name = 'coco_{}_{}'.format(year, split)
        __sets[name] = (lambda split=split, year=year: coco(split, year))

# Set up coco_2015_<split>
for year in ['2015']:
  for split in ['test', 'test-dev']:
    name = 'coco_{}_{}'.format(year, split)
    __sets[name] = (lambda split=split, year=year: coco(split, year))

def get_imdb(name):
    """Get an imdb (image database) by name."""
    if name not in __sets:
        raise KeyError('Unknown dataset: {}'.format(name))
    return __sets[name]()


def list_imdbs():
    """List all registered imdbs."""
    return list(__sets.keys())

### Final Training

In [61]:
import pprint
import argparse
import sys

In [62]:
def combined_roidb(imdb_names):
    """
    Combine multiple roidbs
    """

    def get_roidb(imdb_name):
        imdb = get_imdb(imdb_name)
        print('Loaded dataset `{:s}` for training'.format(imdb.name))
        imdb.set_proposal_method('gt')
        print('Set proposal method: {:s}'.format('gt'))
        roidb = SolveWrapper.get_training_roidb(imdb)
        return roidb
    
    roidbs = [get_imdb(s) for s in imdb_names.split('+')]
    roidb = roidbs[0]
    if len(roidbs) > 1:
        for r in roidbs[1:]:
            roidb.extend(r)
        tmp = get_imdb(imdb_names.split('+')[1])
        imdb = imdb(imdb_names, tmp.classes)
    else:
        imdb = get_imdb(imdb_names)
    return imdb, roidb

np.random.seed(3)

# train set
imdb, roidb = combined_roidb('coco_2014_train')
output_dir = get_output_dir(imdb, 'default')
print('Output will be saved to `{:s}`'.format(output_dir))
tb_dir = get_output_tb_dir(imdb, 'default')
print('TensorFlow summaries will be saved to `{:s}`'.format(tb_dir))

orgflip = True
_, valroidb = combined_roidb('coco_2014_val')
print('{:d} validation roidb entries'.format(len(valroidb)))

# load network
net = vgg16()

SolveWrapper.train_net(net, imdb, roidb, valroidb, output_dir, tb_dir, pretrained_model=r'C:\Users\kc510\Documents\Projects\Projects_MLOps\Project_Faster_RCNN\notebooks\data\imagenet_weights\vgg16.ckpt', max_iters=40000)

loading annotations into memory...
Done (t=23.86s)
creating index...
index created!
loading annotations into memory...
Done (t=37.25s)
creating index...
index created!
Output will be saved to `C:\Users\kc510\Documents\Projects\Projects_MLOps\Project_Faster_RCNN\saved_models\output\default\coco_2014_train\default`
TensorFlow summaries will be saved to `C:\Users\kc510\Documents\Projects\Projects_MLOps\Project_Faster_RCNN\saved_models\tensorboard\default\coco_2014_train\default`
