In [1]:
import numpy as np
import tensorflow as tf

In [2]:
#[num_roi,(y1,x1,y2,x2)]

boxes = tf.Variable([[0.2, 0.4, 0.5 ,0.3],
                   [0.6, 0.4, 0.9 ,0.8],
                   [0.2, 0.1, 0.2 ,0.7],
                   [0.9, 0.5, 0.7 ,0.4],
                   [0.4, 0.3, 0.6 ,0.1]])
boxes

<tf.Variable 'Variable:0' shape=(5, 4) dtype=float32, numpy=
array([[0.2, 0.4, 0.5, 0.3],
       [0.6, 0.4, 0.9, 0.8],
       [0.2, 0.1, 0.2, 0.7],
       [0.9, 0.5, 0.7, 0.4],
       [0.4, 0.3, 0.6, 0.1]], dtype=float32)>

In [3]:
boxes = tf.random.uniform(shape=[10,4])
boxes

<tf.Tensor: shape=(10, 4), dtype=float32, numpy=
array([[0.764972  , 0.6240908 , 0.8765495 , 0.77177787],
       [0.55580485, 0.39900005, 0.21462071, 0.680037  ],
       [0.16597521, 0.38515997, 0.68101656, 0.82549787],
       [0.6916344 , 0.04733384, 0.4897381 , 0.09849119],
       [0.6792551 , 0.9308542 , 0.7672365 , 0.98983943],
       [0.12855697, 0.28885508, 0.32190168, 0.6146263 ],
       [0.47772086, 0.32520795, 0.31930137, 0.694883  ],
       [0.07872629, 0.46880054, 0.8449452 , 0.26058745],
       [0.7380661 , 0.9288217 , 0.7589034 , 0.48207843],
       [0.3175006 , 0.9213325 , 0.90953636, 0.4007938 ]], dtype=float32)>

In [4]:
# Assign each ROI to a level in the pyramid based on the ROI area.
y1, x1, y2, x2 = tf.split(boxes, 4, axis=1)
h = y2 - y1
w = x2 - x1
print(h)

tf.Tensor(
[[ 0.11157751]
 [-0.34118414]
 [ 0.51504135]
 [-0.20189631]
 [ 0.08798134]
 [ 0.19334471]
 [-0.15841949]
 [ 0.7662189 ]
 [ 0.02083731]
 [ 0.5920358 ]], shape=(10, 1), dtype=float32)


In [5]:
def log2_graph(x):
    return tf.math.log(x) / tf.math.log(2.0)

In [6]:
# Equation 1 in the Feature Pyramid Networks paper. Account for
# the fact that our coordinates are normalized here.
# e.g. a 224x224 ROI (in pixels) maps to P4
image_area = tf.Variable([1024. * 1024.])
roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
print("roi_level: ",roi_level)

roi_level:  tf.Tensor(
[[3]
 [4]
 [5]
 [4]
 [2]
 [4]
 [4]
 [4]
 [4]
 [4]], shape=(10, 1), dtype=int32)


In [9]:
# Loop through levels and apply ROI pooling to each. P2 to P5.
pooled = []
box_to_level = []
for i, level in enumerate(range(2, 6)):
    ix = tf.where(tf.equal(roi_level, level))
    print("ix: ",ix)
    
    level_boxes = tf.gather_nd(boxes, ix)
    print("level_boxes: ",level_boxes)

    # Box indicies for crop_and_resize.
    box_indices = tf.cast(ix[:, 0], tf.int32)
    print("box_indices: ",box_indices)

    # Keep track of which box is mapped to which level
    box_to_level.append(ix)
    print("box_to_level: ",box_to_level)

    # Stop gradient propogation to ROI proposals
    level_boxes = tf.stop_gradient(level_boxes)
    box_indices = tf.stop_gradient(box_indices)

    # Crop and Resize
    # From Mask R-CNN paper: "We sample four regular locations, so
    # that we can evaluate either max or average pooling. In fact,
    # interpolating only a single value at each bin center (without
    # pooling) is nearly as effective."
    #
    # Here we use the simplified approach of a single value per bin,
    # which is how it's done in tf.crop_and_resize()
    # Result: [batch * num_boxes, pool_height, pool_width, channels]
    # pooled.append(tf.image.crop_and_resize(feature_maps[i], level_boxes, box_indices, self.pool_shape,method="bilinear"))
    #                                            image,       boxes,       box_indices, crop_size

ix:  tf.Tensor([[4 0]], shape=(1, 2), dtype=int64)
level_boxes:  tf.Tensor([0.6792551], shape=(1,), dtype=float32)
box_indices:  tf.Tensor([4], shape=(1,), dtype=int32)
box_to_level:  [<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[4, 0]], dtype=int64)>]
ix:  tf.Tensor([[0 0]], shape=(1, 2), dtype=int64)
level_boxes:  tf.Tensor([0.764972], shape=(1,), dtype=float32)
box_indices:  tf.Tensor([0], shape=(1,), dtype=int32)
box_to_level:  [<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[4, 0]], dtype=int64)>, <tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[0, 0]], dtype=int64)>]
ix:  tf.Tensor(
[[1 0]
 [3 0]
 [5 0]
 [6 0]
 [7 0]
 [8 0]
 [9 0]], shape=(7, 2), dtype=int64)
level_boxes:  tf.Tensor(
[0.55580485 0.6916344  0.12855697 0.47772086 0.07872629 0.7380661
 0.3175006 ], shape=(7,), dtype=float32)
box_indices:  tf.Tensor([1 3 5 6 7 8 9], shape=(7,), dtype=int32)
box_to_level:  [<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[4, 0]], dtype=int64)>, <tf.Tensor: shape=(