In [32]:
import numpy as np
import tensorflow as tf
# An InteractiveSession installs itself as the default session on construction,
# which is what lets the cells below call `tensor.eval()` with no session argument.
sess = tf.InteractiveSession()

In [274]:
# --- Tensor layout ------------------------------------------------------------
GRID_DIM = 7        # side length of the square cell grid
NUM_BOXES = 2       # box predictors per grid cell
BATCH_SIZE = 16     # batch size; the cell-offset grid is tiled this many times

# --- Loss weights -------------------------------------------------------------
COORD_SCALE = 5.0       # scales the coordinate mask (xy and wh terms)
OBJECT_SCALE = 1.0      # scales the confidence mask where an object is present
NO_OBJECT_SCALE = 0.5   # scales the confidence mask where no object is present
CLASS_SCALE = 1.0       # scales the class mask
# Per-class weights, gathered by class index when the class mask is built.
CLASS_WEIGHTS = np.array([1.5, 0.7, 0.5], dtype=np.float32)

# --- Misc ---------------------------------------------------------------------
DEBUG = True

In [257]:
# Synthetic fixture for checking the loss by hand.
# Last axis: [x, y, w, h, objectness, class_0, class_1, class_2]
# (4 box coordinates + 1 objectness + 3 class scores = 8 channels).
# The leading axes come from the config constants so the fixture always matches
# `cell_grid`, which is tiled with the same BATCH_SIZE / GRID_DIM / NUM_BOXES.
# The hard-coded cell indices below require GRID_DIM >= 4.
fixture_shape = (BATCH_SIZE, GRID_DIM, GRID_DIM, NUM_BOXES, 8)

# Ground truth: every example has one object, owned by box 0 of cell (3, 3),
# centred at (3.5, 3.5) in grid units, size 0.3 x 0.4, class 0.
y_true_np = np.zeros(fixture_shape)
y_true_np[:, 3, 3, 0, :] = [3.5, 3.5, 0.3, 0.4, 1., 1., 0, 0]
y_true = tf.convert_to_tensor(y_true_np, dtype=np.float32)

# Predictions: xy are offsets that get the cell index added later, and w/h are
# stored as square roots (they are squared when the prediction is decoded).
y_pred_np = np.zeros(fixture_shape)
# Examples 0-3: exact match in the correct cell.
y_pred_np[:4, 3, 3, 0, :] = [0.5, 0.5, np.sqrt(0.3), np.sqrt(0.4), 1., 1., 0, 0]
# Examples 4-7: a box predicted from the neighbouring cell (3, 2).
y_pred_np[4:8, 3, 2, 0, :] = [0.5, 0.95, np.sqrt(0.4), np.sqrt(0.55), 0.75, 0.6, 0.4, 0]
# Remaining examples: correct cell, but shifted and oversized.
y_pred_np[8:, 3, 3, 0, :] = [0.1, 0.1, np.sqrt(0.37), np.sqrt(0.58), 0.8, 0.7, 0.2, 0.1]
y_pred = tf.convert_to_tensor(y_pred_np, dtype=np.float32)

In [258]:
# Shape of the four leading axes of y_true (batch, grid axis 1, grid axis 2, box).
# The placeholder masks are created with this shape.
mask_shape = tf.shape(y_true)[:4]

# Grid of per-cell offsets, added to the predicted xy to obtain positions in grid units.
# NOTE(review): the original comment named the grid axes "width, height", but the
# x offset built here equals the index along axis 2 and the y offset the index along
# axis 1. Identical for the symmetric test data; confirm the intended layout.
flat_col_index = tf.tile(tf.range(GRID_DIM), [GRID_DIM])  # 0..GRID_DIM-1, repeated GRID_DIM times
cell_x = tf.to_float(tf.reshape(flat_col_index, (1, GRID_DIM, GRID_DIM, 1, 1)))
cell_y = tf.transpose(cell_x, (0, 2, 1, 3, 4))  # swap the two grid axes
cell_xy = tf.concat([cell_x, cell_y], -1)       # last axis: (x offset, y offset)
cell_grid = tf.tile(cell_xy, [BATCH_SIZE, 1, 1, NUM_BOXES, 1])

In [259]:
'''
Init masks
'''
# Four independent all-zero placeholders with the (batch, grid, grid, box) shape,
# created in the order: coordinates, class, negative confidence, positive confidence.
coord_mask, class_mask, conf_mask_neg, conf_mask_pos = (
    tf.zeros(mask_shape) for _ in range(4)
)

# Scalar variable kept around for debugging.
total_recall = tf.Variable(0.0)

In [260]:
"""
Load prediction
"""
### box centres: channels 0:2 plus the cell offset give xy in grid units
pred_xy_offset = y_pred[..., 0:2]
pred_box_xy = pred_xy_offset + cell_grid

### box sizes: channels 2:4 hold the square roots of w and h, so square them
sqrt_pred_box_wh = y_pred[..., 2:4]
pred_box_wh = tf.square(sqrt_pred_box_wh)

### confidence: squash the raw channel 4 into (0, 1)
pred_conf_raw = y_pred[..., 4]
pred_box_conf = tf.sigmoid(pred_conf_raw)

### class scores: every channel from 5 onwards, left unnormalised
pred_box_class = y_pred[..., 5:]

In [261]:
"""
Load ground truth
"""
### Box centres (channels 0:2) and sizes (channels 2:4), read as-is from y_true.
### No cell offset is added, and they are compared directly with the offset-adjusted
### predictions, so both are expressed in grid-cell units.
true_box_xy, true_box_wh = y_true[..., 0:2], y_true[..., 2:4]

In [262]:
def _box_extent(center_xy, size_wh):
    """Return (half_size, min_corner, max_corner) for boxes given as centre and size."""
    half = size_wh / 2.
    return half, center_xy - half, center_xy + half


### IoU between each predictor's box and the ground-truth box in the same slot;
### it becomes the confidence target where an object is present, else 0.
true_wh_half, true_mins, true_maxes = _box_extent(true_box_xy, true_box_wh)
pred_wh_half, pred_mins, pred_maxes = _box_extent(pred_box_xy, pred_box_wh)

# Overlap rectangle, clamped at zero when the boxes do not intersect.
intersect_mins = tf.maximum(pred_mins, true_mins)
intersect_maxes = tf.minimum(pred_maxes, true_maxes)
intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
overlap_w, overlap_h = intersect_wh[..., 0], intersect_wh[..., 1]
intersect_areas = overlap_w * overlap_h

true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]
union_areas = pred_areas + true_areas - intersect_areas

# Where the union is (near) zero, skip the ratio and keep the near-zero union itself.
union_is_degenerate = tf.less(tf.abs(union_areas), 1e-4)
iou_scores = tf.where(union_is_degenerate, union_areas, tf.truediv(intersect_areas, union_areas))
true_box_conf = iou_scores * y_true[..., 4]

### class index of each ground-truth box (argmax over the class channels)
true_box_class = tf.argmax(y_true[..., 5:], -1)

In [263]:
# Inspect the IoU intermediates for example 0, cell (3, 3), box 0.
probe = (0, 3, 3, 0)
iou_intermediates = (
    true_box_xy,
    pred_mins, true_mins, intersect_mins,
    pred_maxes, true_maxes, intersect_maxes,
    intersect_wh, intersect_areas,
    true_areas, pred_areas, union_areas,
    iou_scores,
)
for tensor in iou_intermediates:
    print(tensor.eval()[probe])

# Both boxes of the same cell.
print(iou_scores.eval()[0, 3, 3])
# Prints the symbolic tensor (name, shape, dtype), not its values.
print(true_box_class)

[3.5 3.5]
[3.35 3.3 ]
[3.35 3.3 ]
[3.35 3.3 ]
[3.65 3.7 ]
[3.65 3.7 ]
[3.65 3.7 ]
[0.3000002 0.4000001]
0.1200001
0.120000005
0.120000005
0.11999991
1.0000017
[1.0000017 0.       ]
Tensor("ArgMax_4:0", shape=(16, 7, 7, 2), dtype=int64)


In [271]:
"""
Determine the masks
"""
# NOTE: this cell rebinds several names from the per-slot IoU cell above
# (true_wh_half, true_mins/maxes, pred_wh_half, pred_mins/maxes, intersect_*,
# true_areas, pred_areas, union_areas, iou_scores). Here they carry an extra axis
# enumerating every ground-truth slot of the example.

# A predictor in a cell without an object is penalised as background only when its
# best IoU with any ground-truth box of the example is below this value.
IOU_NEG_THRESHOLD = 0.6

### coordinate mask: simply the position of the ground truth boxes (the predictors)
coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE

### confidence mask: penalize predictors + penalize boxes with low IOU
# Flatten all ground-truth slots of each example onto axis 4 so that every predicted
# box can be compared against every ground-truth box through broadcasting.
just_boxes = tf.reshape(y_true[..., 0:4], (BATCH_SIZE, 1, 1, 1, GRID_DIM * GRID_DIM * NUM_BOXES, 4))
true_xy = just_boxes[..., 0:2]
true_wh = just_boxes[..., 2:4]

true_wh_half = true_wh / 2.
true_mins    = true_xy - true_wh_half
true_maxes   = true_xy + true_wh_half

# Give the predictions a singleton axis 4 to broadcast against the slot axis.
pred_xy = tf.expand_dims(pred_box_xy, 4)
pred_wh = tf.expand_dims(pred_box_wh, 4)

pred_wh_half = pred_wh / 2.
pred_mins    = pred_xy - pred_wh_half
pred_maxes   = pred_xy + pred_wh_half

intersect_mins  = tf.maximum(pred_mins,  true_mins)
intersect_maxes = tf.minimum(pred_maxes, true_maxes)
intersect_wh    = tf.maximum(intersect_maxes - intersect_mins, 0.)
intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

true_areas = true_wh[..., 0] * true_wh[..., 1]
pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

union_areas = pred_areas + true_areas - intersect_areas
# Empty ground-truth slots (all zeros) paired with zero-area predictions give a
# union of 0, and 0/0 would put NaN into the scores fed to reduce_max. Treat a
# (near-)zero union as IoU 0, using the same 1e-4 tolerance as the per-slot IoU.
iou_scores = tf.where(
    tf.less(tf.abs(union_areas), 1e-4),
    tf.zeros_like(union_areas),
    tf.truediv(intersect_areas, union_areas),
)

# Best overlap of each predicted box with any ground-truth slot of its example.
best_ious = tf.reduce_max(iou_scores, axis=4)
conf_mask_neg = tf.to_float(best_ious < IOU_NEG_THRESHOLD) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE
conf_mask_pos = y_true[..., 4] * OBJECT_SCALE

### class mask: the position of the ground truth boxes, weighted per class
class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE


In [272]:
# Coordinate mask for both boxes of example 0, cell (3, 3).
print(coord_mask.eval()[0, 3, 3])
print()

# Shapes of the broadcast pairwise-IoU intermediates.
for tensor in (true_xy, true_wh, true_maxes, pred_xy, pred_wh, iou_scores, best_ious):
    print(tensor.eval().shape)
print(best_ious.eval()[0, 3, 3])

# There is no `conf_mask` in this notebook (only conf_mask_neg / conf_mask_pos), so
# evaluating it raised NameError on a fresh kernel. Both have the
# (batch, grid, grid, box) shape; the negative mask is inspected here.
print(conf_mask_neg.eval().shape)
print(true_box_xy.eval().shape)
print(class_mask.eval().shape)
print(class_mask.eval()[0, 3, 3])


[[5.]
 [0.]]

(16, 1, 1, 1, 98, 2)
(16, 1, 1, 1, 98, 2)
(16, 1, 1, 1, 98, 2)
(16, 7, 7, 2, 1, 2)
(16, 7, 7, 2, 1, 2)
(16, 7, 7, 2, 98)
(16, 7, 7, 2)
[1.0000017 0.       ]
(16, 7, 7, 2)
(16, 7, 7, 2, 2)
(16, 7, 7, 2)
[1.5 0. ]


In [288]:
"""
Finalize the loss
"""
def _active_count(mask):
    """Count the strictly positive entries of `mask`, returned as a float scalar."""
    return tf.reduce_sum(tf.to_float(mask > 0.0))


def _masked_half_mse(target, prediction, mask, count):
    """Mask-weighted sum of squared errors, divided by (count + 1e-6), then halved."""
    return tf.reduce_sum(tf.square(target - prediction) * mask) / (count + 1e-6) / 2.


# Number of active entries per mask; each loss term is normalised by its own count.
nb_coord_box = _active_count(coord_mask)
nb_conf_box_neg = _active_count(conf_mask_neg)
nb_conf_box_pos = _active_count(conf_mask_pos)
nb_class_box = _active_count(class_mask)

# NOTE(review): the wh term compares true w/h with the squared prediction, so the
# error is measured in plain w/h space rather than square-root space. Confirm intent.
loss_xy = _masked_half_mse(true_box_xy, pred_box_xy, coord_mask, nb_coord_box)
loss_wh = _masked_half_mse(true_box_wh, pred_box_wh, coord_mask, nb_coord_box)
loss_conf_neg = _masked_half_mse(true_box_conf, pred_box_conf, conf_mask_neg, nb_conf_box_neg)
loss_conf_pos = _masked_half_mse(true_box_conf, pred_box_conf, conf_mask_pos, nb_conf_box_pos)

# Classification: per-box softmax cross-entropy on the raw class scores, weighted by
# the class mask and normalised by the number of active boxes (not halved).
class_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
loss_class = tf.reduce_sum(class_xent * class_mask) / (nb_class_box + 1e-6)

loss = loss_xy + loss_wh + loss_conf_pos + loss_conf_neg + loss_class

# Report each term, then the total.
for term in (loss_xy, loss_wh, loss_conf_pos, loss_conf_neg, loss_class, loss):
    print(term.eval())

0.7125001
0.20287497
0.15930748
0.06263614
1.1947335
2.3320522
