# Installation

In [1]:
%%capture
# Install third-party packages used by this notebook (output is hidden by %%capture).
# GPUtil is imported in the import cell below.
# NOTE(review): `dropbox` is not imported in any visible cell - confirm it is still needed.
!pip install dropbox
!pip install GPUtil
# !pip install tensorflow_data_validation

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from matplotlib import pyplot as plt
from tensorflow.keras import layers, optimizers, metrics, losses
from tensorflow.keras.utils import plot_model

from IPython.display import display
from os import path
from pathlib import Path
from time import time
from tqdm import tqdm
from functools import reduce
from shutil import make_archive, unpack_archive, rmtree
from GPUtil import showUtilization as gpu_usage

# Datasets

In [3]:
%%capture
# Load the TFDS 'voc' dataset: the 'train' and 'validation' splits plus the
# dataset metadata (ds_info), which later cells use for class names and counts.
# as_supervised=False keeps each example as a feature dict (read below via
# item['image'] and item['objects']).
# NOTE(review): try_autocache=False presumably disables TFDS auto-caching - confirm.
(train_ds, val_ds), ds_info = tfds.load(
    'voc',
    with_info=True,
    split=['train', 'validation'],
    as_supervised=False,
    read_config=tfds.ReadConfig(try_autocache=False)
)

# tf.print(ds_info.features['objects']['bbox'].encode_example({'xmax': .1, 'ymax': .2, 'xmin': .5, 'ymin': .2}))
# tf.print(ds_info.features['objects']['bbox'].encode_example([.2]))

# train_ds = train_ds.map(lambda item: (item['description'], item['label']))
# ds_info

In [4]:
# Geometry of the detector.
IMG_SIZE = 224   # Side length (pixels) that every image is resized to
GRID_SIZE = 7    # The image is divided into GRID_SIZE x GRID_SIZE cells
N_ANCHORS = 3    # Anchor boxes per grid cell
MAX_BOXES = 10

# Anchor box (width, height) pairs, as fractions of the image, taken from
# https://fairyonice.github.io/Part_1_Object_Detection_with_Yolo_for_VOC_2014_data_anchor_box_clustering.html
_ANCHORS_WH = tf.constant([
    [0.08285376, 0.13705531],
    [0.20850361, 0.39420716],
    [0.80552421, 0.77665105]
])
# The rest of the notebook works in (height, width) order, so flip the last axis.
ANCHORS = _ANCHORS_WH[..., ::-1]

def prep_boxes(images, boxes, labels, box_counts):
    """Converts corner boxes into pixel-space centers and sizes.

    Args:
        images: Passed through unchanged.
        boxes: Tensor whose last axis holds four values, read here as
            (x1, y1, x2, y2).
        labels: Passed through unchanged.
        box_counts: Passed through unchanged.

    Returns:
        (images, centers, sizes, labels, box_counts) where `centers` is
        (cy, cx) and `sizes` is (sy, sx), both scaled by IMG_SIZE.
    """
    # NOTE(review): this reads the last axis as (x1, y1, x2, y2); verify against
    # the caller, since other cells document boxes as (TOP, LEFT, BOTTOM, RIGHT).
    x_min, y_min, x_max, y_max = tf.split(boxes, num_or_size_splits=boxes.shape[-1], axis=-1)

    # Box centers in pixels, stored as (cy, cx)
    centers = tf.concat(
        [((y_min + y_max)*IMG_SIZE)/2, ((x_min + x_max)*IMG_SIZE)/2],
        axis=-1,
    )

    # Box extents in pixels, stored as (sy, sx)
    sizes = tf.concat(
        [(y_max - y_min)*IMG_SIZE, (x_max - x_min)*IMG_SIZE],
        axis=-1,
    )

    return images, centers, sizes, labels, box_counts

def resize(image, boxes):
    """Resizes an image to IMG_SIZE x IMG_SIZE and returns it with its boxes.

    Args:
        image: Image tensor to resize.
        boxes: Box attributes; returned untouched.

    Returns:
        (image, boxes) with the image bilinearly resized and cast to uint8.
    """
    resized = tf.image.resize(image, [IMG_SIZE, IMG_SIZE], method='bilinear') # For downsizing
    resized = tf.cast(resized, tf.uint8)

    # TODO: Resize boxes based on image resize
    # NOTE(review): other cells scale these boxes by the image size / grid size,
    # which suggests they are normalized and need no rescaling - confirm.
    return (resized, boxes)

def to_corners(boxes):
    """Converts center-format boxes to corner format.

    Arguments:
        boxes: TensorSpec([N_BOXES, 4], tf.float32)
        Each row is (CY, CX, BH, BW): the box center followed by the box
        height and width.

    Returns:
        boxes: [N_BOXES, 4]
        Each row is (TOP, LEFT, BOTTOM, RIGHT).
    """
    centers = boxes[..., :2]
    half_extents = boxes[..., 2:] / 2.0
    return tf.concat([centers - half_extents, centers + half_extents], axis=-1)

def box_ious(box, candidates):
    """Intersection-over-union of a single box against several candidates.
    Adopted from: https://keras.io/examples/vision/retinanet/#preprocessing-data

    Args:
        box: TensorSpec([4], tf.float32)
        Corner format (TOP, LEFT, BOTTOM, RIGHT).

        candidates: TensorSpec([N_CANDIDATES, 4], tf.float32)
        Center format (CY, CX, H, W).
    Returns:
        ious: TensorSpec([N_CANDIDATES], tf.float32), clipped to [0, 1].
    """
    corners = to_corners(candidates)

    # Overlap rectangle: innermost top-left and bottom-right corners
    overlap_tl = tf.maximum(box[None, :2], corners[:, :2])
    overlap_br = tf.minimum(box[None, 2:], corners[:, 2:])
    overlap_hw = tf.math.maximum(0.0, overlap_br - overlap_tl)
    overlap_area = overlap_hw[:, 0]*overlap_hw[:, 1]

    # Areas of the query box (repeated once per candidate) and of the candidates
    box_area = (box[2]-box[0])*(box[3]-box[1])
    box_areas = tf.tile([box_area], [candidates.shape[0]])
    candidate_areas = candidates[:, 2]*candidates[:, 3]

    # Floor the union to avoid dividing by zero
    union_area = tf.math.maximum(box_areas + candidate_areas - overlap_area, 1e-8)

    return tf.clip_by_value(overlap_area/union_area, 0.0, 1.0)

def pick_anchors(image, boxes):
    """Assigns every ground-truth box to a grid cell and to its best anchor.

    Args:
        image: TensorSpec([BS, H, W, C], tf.uint8)
        boxes: TensorSpec([BS, BOX_COUNT, 5], tf.float32)
        The values in the box tensor are (TOP, LEFT, BOTTOM, RIGHT, CLASS_ID),
        i.e. (ymin, xmin, ymax, xmax, class), which is the order the TFDS
        bounding-box feature documents.
    Returns:
        image: TensorSpec([BS, H, W, C], tf.uint8)
        boxes: TensorSpec([BS, BOX_COUNT, 8], tf.float32)
        The values in the boxes tensor are (GY, GX, ANCHOR_ID, CY, CX, BH, BW, CLASS_ID).
        GY, GX are the offsets of the grid which contains the center of the box.
        CY, CX are the coordinates of the center of the box.
        BH, BW are the height and the width of the box respectively.
    """
    # Extract box dimensions. The first four columns are (TOP, LEFT, BOTTOM, RIGHT);
    # unpacking them as (left, top, right, bottom) would transpose every y/x
    # quantity computed below and mismatch the (height, width) anchors.
    top, left, bottom, right = boxes[..., 0], boxes[..., 1], boxes[..., 2], boxes[..., 3]
    cy, cx = (top+bottom)/2, (left+right)/2
    h, w = (bottom-top), (right-left)
    gy, gx = tf.math.floor(cy*GRID_SIZE), tf.math.floor(cx*GRID_SIZE)

    def box_fn(box):
        # box is (GY, GX, TOP, LEFT, BOTTOM, RIGHT)
        gcyx = (box[:2]+0.5)/GRID_SIZE # Coordinates of the grid center.
        replicated_gcyx = tf.tile([gcyx], [len(ANCHORS), 1])
        # One (CY, CX, H, W) candidate per anchor, centered on the grid cell
        anchor_boxes = tf.concat([replicated_gcyx, ANCHORS], axis=-1)

        scores = box_ious(box[2:], anchor_boxes)
        anchor_id = tf.math.argmax(scores)

        return anchor_id

    def batch_fn(t):
        # Pick an anchor for every box of one image
        return tf.map_fn(box_fn, t, fn_output_signature=tf.TensorSpec((), tf.int64))

    # Boxes with grid coordinates and assigned anchors
    cid = boxes[..., 4]
    corner_boxes = tf.stack([gy, gx, top, left, bottom, right], axis=-1)
    anchors = tf.map_fn(batch_fn, corner_boxes, fn_output_signature=tf.RaggedTensorSpec([None], tf.int64))
    anchors = tf.cast(anchors, dtype=tf.float32)
    boxes = tf.stack([gy, gx, anchors, cy, cx, h, w, cid], axis=-1)

    return image, boxes

def dsitem_to_tuple(item):
    """Flattens a dataset example into an (image, box_attributes) pair.

    Args:
        item: Example dict providing item['image'], item['objects']['bbox']
            and item['objects']['label'].

    Returns:
        (image, box_attrs) where each row of box_attrs is the four bbox values
        followed by the class id cast to float32.
    """
    objects = item['objects']

    # Labels become a float column so they can be concatenated with the boxes
    class_column = tf.cast(objects['label'][:, None], dtype=tf.float32)

    return item['image'], tf.concat([objects['bbox'], class_column], axis=-1)

BATCH_SIZE = 28

# Images have a fixed size after `resize`; the number of boxes per image varies,
# so batches are built with ragged batching.
dense_to_ragged_fn = tf.data.experimental.dense_to_ragged_batch(BATCH_SIZE)
box_ds = (
    train_ds
    .map(dsitem_to_tuple)
    .map(resize)
    .shuffle(2000, reshuffle_each_iteration=True)
    .apply(dense_to_ragged_fn)
    .map(pick_anchors)
)

# Pull a sample batch for the cells below (the first batch is skipped).
itr = iter(box_ds)
next(itr)
images, boxes = next(itr)

In [5]:
def rgb_to_hex(rgb):
    """Formats an RGB triple as a '#rrggbb' hex color string.

    Args:
        rgb: Any sequence of three integers (tuple, list, array row, ...).

    Returns:
        The color as a lowercase '#rrggbb' string.
    """
    # %-formatting only unpacks real tuples, so normalise other sequences first.
    return '#%02x%02x%02x' % tuple(rgb)

def bbox_to_rect(bbox, shape, color):
    """Convert bounding box to matplotlib format.

    Args:
        bbox: Four normalized values in (TOP, LEFT, BOTTOM, RIGHT) order, i.e.
            (ymin, xmin, ymax, xmax) as documented for the TFDS bounding-box
            feature.
        shape: (height, width) of the image the box belongs to, in pixels.
        color: Edge color of the rectangle.

    Returns:
        An unfilled `plt.Rectangle` in pixel coordinates.
    """
    h, w = shape
    # y values scale with the image height and x values with its width.
    # Reading the box as (left, top, right, bottom) draws it transposed.
    top, bottom = float(bbox[0])*h, float(bbox[2])*h
    left, right = float(bbox[1])*w, float(bbox[3])*w

    rect = plt.Rectangle(
        xy=(left, top), width=right-left, height=bottom-top,
        fill=False, edgecolor=color, linewidth=2)
    return rect

# Class metadata taken from the dataset info
N_CLASSES = ds_info.features['labels'].num_classes
LABELS = ds_info.features['labels'].names

# One random color per class (each channel drawn from 64..254)
CLASS_COLORS = [
    rgb_to_hex(tuple(channels.tolist()))
    for channels in np.random.choice(range(64, 255), size=[N_CLASSES, 3])
]

# Show the first training example; boxes are drawn on top of it below
itr = iter(train_ds)
item = next(itr)

fig = plt.imshow(item['image'])

def draw_boxes(image, fig):
    """Draws the labelled bounding boxes of the current example on a figure.

    Args:
        image: Image whose first two shape entries give the box scaling.
        fig: Object returned by `plt.imshow`; patches and text are added to
            `fig.axes`.

    Note: the boxes and labels are read from the notebook-level `item`, not
    from `image`.
    """
    objs = item['objects']
    image_hw = image.shape[:2]

    for bbox, label in zip(objs['bbox'], objs['label']):
        color = CLASS_COLORS[label]
        rect = bbox_to_rect(bbox, image_hw, color)
        fig.axes.add_patch(rect)
        # Class name anchored at the rectangle's xy corner, on a colored background
        fig.axes.text(*rect.get_xy(), LABELS[label], ha='left', va='top', bbox=dict(ec='none', fc=color))

# Overlay the ground-truth boxes on the sample image
draw_boxes(item['image'], fig)

# Class-id -> class-name table, followed by the raw annotations of the sample
print(list(enumerate(LABELS)))
item['objects']['label'], item['image'].shape, item['objects']['bbox']

# Loss Function

In [9]:
# NOTE(review): a function with this name and an identical body is already
# defined in the dataset cell; this definition shadows it. Consider keeping one copy.
def to_corners(boxes):
    """Converts boxes from center format to corner format.

    Arguments:
        boxes: [N_BOXES, 4] float32
        [CY, CX, H, W]

    Returns:
        boxes: [N_BOXES, 4]
        [TOP, LEFT, BOTTOM, RIGHT]
    """
    centers = boxes[..., :2]
    half_extents = boxes[..., 2:] / 2.0
    return tf.concat([centers - half_extents, centers + half_extents], axis=-1)

# NOTE(review): a function with this name and an identical body is already
# defined in the dataset cell; this definition shadows it. Consider keeping one copy.
def box_ious(box, candidates):
    """Intersection-over-union of a single box against several candidates.
    Adopted from: https://keras.io/examples/vision/retinanet/#preprocessing-data

    Args:
        box: [4] float32
        [TOP, LEFT, BOTTOM, RIGHT]

        candidates: [N_ANCHORS, 4] float32
        [CY, CX, H, W]
    Returns:
        ious: [N_ANCHORS] float32, clipped to [0, 1]
    """
    corners = to_corners(candidates)

    # Overlap rectangle: innermost top-left and bottom-right corners
    overlap_tl = tf.maximum(box[None, :2], corners[:, :2])
    overlap_br = tf.minimum(box[None, 2:], corners[:, 2:])
    overlap_hw = tf.math.maximum(0.0, overlap_br - overlap_tl)
    overlap_area = overlap_hw[:, 0]*overlap_hw[:, 1]

    # Areas of the query box (repeated once per candidate) and of the candidates
    box_area = (box[2]-box[0])*(box[3]-box[1])
    box_areas = tf.tile([box_area], [candidates.shape[0]])
    candidate_areas = candidates[:, 2]*candidates[:, 3]

    # Floor the union to avoid dividing by zero
    union_area = tf.math.maximum(box_areas + candidate_areas - overlap_area, 1e-8)

    return tf.clip_by_value(overlap_area/union_area, 0.0, 1.0)

class YoloLoss(losses.Loss):
    """YOLO-style detection loss over a GRID_SIZE x GRID_SIZE x N_ANCHORS grid.

    The loss is the sum of four terms, divided by the batch size:
      * center loss: binary cross-entropy on the in-cell (y, x) offsets,
        weighted by `l_coord`, for cells that own a ground-truth box;
      * size loss: squared error between the raw (h, w) outputs and
        log(true_hw / anchor_hw), for cells that own a ground-truth box;
      * class loss: per-class binary cross-entropy, weighted by 1 for cells
        that own a box and by `l_noobj` for the others;
      * confidence loss: binary cross-entropy on the objectness score, weighted
        by 1 for cells that own a box and by `l_noobj` for empty cells whose
        prediction overlaps no ground-truth box of the same image with an IoU
        of `iou_threshold` or more.
    """

    def __init__(self, N_CLASSES, ANCHORS, GRID_SIZE, l_coord=5, l_noobj=.5, iou_threshold=0.5):
        """
        Args:
            N_CLASSES: The number of object classes
            ANCHORS: A list of a pair of (height, width) of anchor boxes.
            GRID_SIZE: Number of grid cells along each side of the model output.
            l_coord: Weight of the box-center loss.
            l_noobj: Weight of the loss terms of cells without a ground-truth box.
            iou_threshold: Empty cells whose best IoU with a ground-truth box
                reaches this value are excluded from the confidence loss.
        """
        super().__init__(name='yolo_loss')
        self.N_CLASSES = N_CLASSES
        self.ANCHORS = tf.convert_to_tensor(ANCHORS, dtype=tf.float32)
        self.N_ANCHORS = int(self.ANCHORS.shape[0])
        self.GRID_SIZE = GRID_SIZE
        # Kept for backward compatibility; the loss terms use the element-wise
        # helper below because this object reduces everything to one scalar.
        self.bce = losses.BinaryCrossentropy()
        self.l_coord = l_coord
        self.l_noobj = l_noobj
        self.iou_threshold = iou_threshold

    @staticmethod
    def _elementwise_bce(y_true, y_pred):
        """Binary cross-entropy with the same shape as its inputs.

        `losses.binary_crossentropy` averages over the last axis, so a trailing
        axis of size one is added to keep one value per element.
        """
        return losses.binary_crossentropy(y_true[..., None], y_pred[..., None])

    def _cell_offsets(self, dtype):
        """Returns the (GY, GX) index of every grid cell, shaped [1, G, G, 1, 2]."""
        cell_ids = tf.range(self.GRID_SIZE)
        gy, gx = tf.meshgrid(cell_ids, cell_ids, indexing='ij')
        offsets = tf.cast(tf.stack([gy, gx], axis=-1), dtype)
        return offsets[None, :, :, None, :]

    def reorder_indices_nd(self, indices_nd, shape, return_argsort=False):
        """Sorts N-d indices in row-major order.

        Args:
            indices_nd: [N, RANK] integer indices.
            shape: The RANK dimension sizes the indices refer to.
            return_argsort: Also return the permutation that sorts the input.

        Returns:
            (sorted N-d indices, sorted flat indices, argsort or None)
        """
        shape = tf.cast(shape, dtype=indices_nd.dtype)
        # Row-major stride of every dimension
        nd_offsets = tf.math.cumprod(shape, exclusive=True, reverse=True)
        indices = tf.math.reduce_sum(nd_offsets*indices_nd, axis=-1)
        argsort = tf.argsort(indices) if return_argsort else None
        indices = tf.sort(indices)
        # Unravel the sorted flat indices back into N-d indices
        indices_nd = (indices[..., None]//nd_offsets)%shape

        return indices_nd, indices, argsort

    def true_boxes_to_grid(self, yt):
        """Scatters the per-image box lists into a dense prediction-shaped grid.

        Args:
            yt: [BATCH, BOX_COUNT, 8] boxes formatted as
                (GY, GX, ANCHOR_ID, CY, CX, BH, BW, CLASS_ID).

        Returns:
            [BATCH, GRID_SIZE, GRID_SIZE, N_ANCHORS, 8 + N_CLASSES] tensor whose
            last axis is (GY, GX, ANCHOR_ID, CY, CX, BH, BW, MASK, one-hot classes).
            MASK is 1 for slots that own a box; unassigned slots are all zeros.
        """
        grid_shape = [self.GRID_SIZE, self.GRID_SIZE, self.N_ANCHORS]

        def batch_fn(boxes):
            indices = tf.cast(boxes[..., :3], dtype=tf.int64)
            indices, indices1d, argsort = self.reorder_indices_nd(indices, grid_shape, return_argsort=True)
            boxes = tf.gather(boxes, argsort, axis=0)

            # Expand sparse class
            sparse_class = tf.cast(boxes[..., 7], dtype=tf.int64)
            oh_class = tf.one_hot(sparse_class, self.N_CLASSES)

            # Handle the case where multiple boxes are tracked by the same anchor
            _, segments = tf.unique(indices1d)
            indices = tf.math.segment_min(indices, segments)
            boxes_grid = tf.math.segment_min(boxes[..., :3], segments)
            boxes_yx = tf.math.segment_mean(boxes[..., 3:5], segments)
            boxes_hw = tf.math.segment_max(boxes[..., 5:7], segments)
            boxes_class = tf.math.segment_max(oh_class, segments)

            # Create a mask value to identify the true boxes in the grid
            mask = tf.ones_like(boxes_grid[..., 0], dtype=yt.dtype)

            # Reassemble the boxes
            boxes = tf.concat([boxes_grid, boxes_yx, boxes_hw, mask[..., None], boxes_class], axis=-1)

            def to_sparse(values):
                values = tf.reshape(values, [-1])
                return tf.SparseTensor(indices, values, grid_shape)

            # One sparse grid per box attribute, densified and moved to the last axis
            map_fn_output_signature = tf.SparseTensorSpec(grid_shape, dtype=tf.float32)
            st = tf.map_fn(to_sparse, tf.transpose(boxes, [1, 0]), fn_output_signature=map_fn_output_signature)
            grid_boxes = tf.transpose(tf.sparse.to_dense(st), [1, 2, 3, 0])

            return grid_boxes

        map_fn_output_signature = tf.TensorSpec(grid_shape + [yt.shape[-1]+self.N_CLASSES], tf.float32)
        grid_boxes = tf.map_fn(batch_fn, yt, fn_output_signature=map_fn_output_signature)

        return grid_boxes

    def yolo_head(self, m_output):
        """Splits the raw model output and squashes each part into its range.

        Returns:
            (yx, hw, confidence, classes): sigmoid in-cell centers, exponentiated
            size multipliers, sigmoid objectness and sigmoid class scores.
        """
        yx, hw, confidence, classes = tf.split(m_output, [2, 2, 1, self.N_CLASSES], axis=-1)

        # Normalize output
        yx = tf.nn.sigmoid(yx) # Box center relative (within) to the grid cell.
        hw = tf.math.exp(hw) # Box sizes relative to the anchor box sizes.
        confidence = tf.nn.sigmoid(confidence)
        classes = tf.nn.sigmoid(classes)

        return yx, hw, confidence, classes

    def box_ious(self, boxes, candidates):
        """Computes the IOUs of the boxes with the candidates.
        Adopted from: https://keras.io/examples/vision/retinanet/#preprocessing-data

        Args:
            boxes: [..., N, 4] float32
            [CY, CX, H, W]

            candidates: [M, 4] or [..., M, 4] float32 (leading axes must
            broadcast against those of `boxes`)
            [CY, CX, H, W]
        Returns:
            ious: [..., N, M] float32
        """
        boxes_corners = to_corners(boxes)
        candidate_corners = to_corners(candidates)

        # Compute intersection area
        tl = tf.maximum(boxes_corners[..., None, :2], candidate_corners[..., None, :, :2]) # Top-Left Intersection
        br = tf.minimum(boxes_corners[..., None, 2:], candidate_corners[..., None, :, 2:]) # Bottom-Right Intersection
        intersections = tf.math.maximum(0.0, br - tl)
        intersection_areas = intersections[..., 0]*intersections[..., 1]

        # Compute box, candidates and union areas (area = height * width)
        box_areas = boxes[..., 2]*boxes[..., 3]
        candidate_areas = candidates[..., 2]*candidates[..., 3]
        union_areas = box_areas[..., None] + candidate_areas[..., None, :] - intersection_areas

        # divide_no_nan yields 0 where both boxes are empty
        ious = tf.math.divide_no_nan(intersection_areas, union_areas)
        return tf.clip_by_value(ious, 0.0, 1.0)

    def low_confidence_prediction_mask(self, true_boxes, pred_boxes, mask):
        """Flags predictions that overlap no ground-truth box of their own image.

        Args:
            true_boxes: [BATCH, G, G, A, 4] dense (CY, CX, BH, BW) targets.
            pred_boxes: [BATCH, G, G, A, 4] predicted (CY, CX, BH, BW) boxes.
            mask: [BATCH, G, G, A, 1] marks the slots that own a true box.

        Returns:
            [BATCH, G, G, A, 1] tensor that is 1 where the best IoU with the
            image's true boxes is below `iou_threshold`, else 0.
        """
        batch = tf.shape(pred_boxes)[0]

        # Unassigned slots become zero-sized boxes, whose IoU with anything is 0.
        # Flattening per image keeps the comparison within each image.
        slot_mask = tf.cast(mask, true_boxes.dtype)
        true_flat = tf.reshape(true_boxes*slot_mask, [batch, -1, 4])
        pred_flat = tf.reshape(pred_boxes, [batch, -1, 4])

        ious = self.box_ious(pred_flat, true_flat)
        best_ious = tf.math.reduce_max(ious, axis=-1)
        best_ious = tf.reshape(best_ious, tf.shape(pred_boxes)[:-1])

        return tf.cast(tf.expand_dims(best_ious < self.iou_threshold, axis=-1), dtype=true_boxes.dtype)

    def call(self, yt, yp):
        """YOLO loss function

        Args:
            yt : TensorSpec([BATCH, BOX_COUNT, 8], tf.float32)
            The values in the boxes tensor are (GY, GX, ANCHOR_ID, CY, CX, BH, BW, CLASS_ID).
            GY, GX are the offsets of the grid which contains the center of the box.
            CY, CX are the coordinates of the center of the box.
            BH, BW are the height and the width of the box respectively.

            yp: TensorSpec([BATCH, GRID_SIZE, GRID_SIZE, N_ANCHORS, N_CLASSES+5])
            The values in the predicted bounding boxes tensor are
            (CY, CX, BH, BW, CONFIDENCE, CLASS_0, CLASS_1,...CLASS_(N_CLASSES-1))
            CY, CX are the box center coordinates within the grid cell.
            BH, BW are the box dimensions wrt the anchor box.
            confidence represents the certainty of box prediction.
            class_0,..class_(n_classes-1) are the predictions for each class.
        Returns:
            loss: scalar tf.float32, the summed loss divided by the batch size.
        """
        # Transform ragged boxes to grid
        yt = self.true_boxes_to_grid(yt)

        # Extract true dimensions of the boxes
        grid, _, true_yx, true_hw, mask, true_classes = tf.split(yt, [2, 1, 2, 2, 1, self.N_CLASSES], axis=-1)
        boolean_mask = tf.cast(mask, dtype=tf.bool)

        # Transform true box values to enable comparison with the model output
        rel_true_yx = true_yx*self.GRID_SIZE - grid # Grid cell relative coordinates
        # Box dimensions relative to the anchor; the floor keeps log() finite
        raw_true_hw = tf.math.log(tf.math.maximum(true_hw, 1e-8)/self.ANCHORS)
        raw_true_hw = tf.where(boolean_mask, raw_true_hw, tf.zeros_like(raw_true_hw))

        # Extract predictions
        logits_pred_hw = yp[..., 2:4] # Residual dimensions
        rel_pred_yx, rel_pred_hw, confidence, pred_classes = self.yolo_head(yp)
        # Every cell needs its own offset here: `grid` is zero in unassigned cells.
        cell_offsets = self._cell_offsets(rel_pred_yx.dtype)
        pred_yx, pred_hw = (rel_pred_yx + cell_offsets)/self.GRID_SIZE, rel_pred_hw*self.ANCHORS
        pred_boxes = tf.concat([pred_yx, pred_hw], axis=-1)

        # Empty cells whose prediction already overlaps a true box are not
        # penalised for their confidence
        low_confidence_mask = self.low_confidence_prediction_mask(yt[..., 3:7], pred_boxes, boolean_mask)

        # Compute per-element losses so that the masks select individual cells
        class_bce = self._elementwise_bce(true_classes, pred_classes)
        confidence_bce = self._elementwise_bce(mask, confidence)

        yx_loss = self.l_coord*mask*self._elementwise_bce(rel_true_yx, rel_pred_yx) # Loss of grid cell relative coordinates
        hw_loss = mask*((logits_pred_hw - raw_true_hw)**2)
        class_loss = mask*class_bce + self.l_noobj*(1-mask)*class_bce
        confidence_loss = mask*confidence_bce + self.l_noobj*low_confidence_mask*(1-mask)*confidence_bce

        # Total Losses
        yx_loss = tf.reduce_sum(yx_loss)
        hw_loss = tf.reduce_sum(hw_loss)
        class_loss = tf.reduce_sum(class_loss)
        confidence_loss = tf.reduce_sum(confidence_loss)

        # Compute total loss
        loss = (yx_loss + hw_loss + class_loss + confidence_loss) / tf.cast(tf.shape(yt)[0], tf.float32)

        return loss


# loss = YoloLoss(N_CLASSES, ANCHORS, GRID_SIZE)
# sample_image = images[0]
# preds = model(sample_image[None, ...])
# sample_boxes = tf.constant([[3, 3, 2, 1, 1, 3, 2, 5], [3, 3, 2, 0, 0, 2, 2, 5]], dtype=tf.float32)
# preds = model(images)
# loss_yx = loss.call(boxes, preds)
# loss_yx = loss.call(sample_boxes[None, ...], preds)

# loss_yx

# YOLO Model

In [10]:
def get_conv_block(num_channels, shape=(3,3), padding='same', **kwargs):
    """Builds the layers of a standard convolution block.

    Args:
        num_channels: Number of convolution filters.
        shape: Kernel size.
        padding: Padding mode of the convolution.
        **kwargs: Forwarded to `layers.Conv2D` (e.g. `strides`).

    Returns:
        A list [Conv2D, BatchNormalization, ReLU activation].
    """
    conv = layers.Conv2D(num_channels, shape, padding=padding, **kwargs)
    norm = layers.BatchNormalization()
    relu = layers.Activation('relu')
    return [conv, norm, relu]

def get_conv_dsc_block(num_channels, shape=(3,3), padding='same', **kwargs):
    """Builds the layers of a depthwise-separable convolution block.

    Takes the same arguments as `get_conv_block` so both builders are
    interchangeable in `create_model`.

    Args:
        num_channels: Number of output filters of the pointwise convolution.
        shape: Kernel size of the depthwise convolution.
        padding: Padding mode of the convolution.
        **kwargs: Forwarded to `layers.SeparableConv2D` (e.g. `strides`).

    Returns:
        A list [SeparableConv2D, BatchNormalization, ReLU activation].
    """
    return [
        layers.SeparableConv2D(num_channels, shape, padding=padding, **kwargs),
        layers.BatchNormalization(),
        layers.Activation('relu')
    ]

def get_conv_builder(layer_type='standard'):
    """Returns the block-building function for the requested convolution type.

    Args:
        layer_type: 'standard' for regular convolution blocks or 'dsc' for
            depthwise-separable convolution blocks.

    Returns:
        A callable with the signature of `get_conv_block`.

    Raises:
        ValueError: If `layer_type` is not one of the supported values.
    """
    if layer_type == 'standard':
        return get_conv_block
    if layer_type == 'dsc':
        return get_conv_dsc_block
    # `Error` is not a Python builtin; ValueError is the conventional exception here.
    raise ValueError(f'Invalid layer type: {layer_type}')

def create_model(conv_type='standard'):
    """Builds and compiles the detection model.

    Args:
        conv_type: Convolution block type understood by `get_conv_builder`.

    Returns:
        A compiled `tf.keras.Model` mapping an IMG_SIZE x IMG_SIZE x 3 image to
        a [GRID_SIZE, GRID_SIZE, N_ANCHORS, 5 + N_CLASSES] tensor of raw
        predictions, trained with `YoloLoss` and the Adam optimizer.
    """
    conv_builder = get_conv_builder(conv_type)

    # (channels, stride) of every convolution block, in order
    block_specs = [
        (32, 2), (64, 1),
        (128, 2), (128, 1),
        (256, 2), (256, 1),
        (512, 2), (512, 1), (512, 1), (512, 1), (512, 1), (512, 1),
        (1024, 2),
    ]

    stack = [tf.keras.Input(shape=(IMG_SIZE, IMG_SIZE, 3))]
    for channels, stride in block_specs:
        stack.extend(conv_builder(channels, strides=stride))
    # 1x1 convolution producing every anchor's box, confidence and class values
    stack.append(layers.Conv2D(N_ANCHORS*(5 + N_CLASSES), 1))

    features = tf.keras.Sequential(stack)

    # Raw Predictions
    raw_shape = [GRID_SIZE, GRID_SIZE, N_ANCHORS, 5 + N_CLASSES]
    output = layers.Reshape(target_shape=raw_shape)(features.outputs[0])

    # Model Creation
    model = tf.keras.Model(inputs=features.inputs, outputs=[output])
    loss = YoloLoss(N_CLASSES, ANCHORS, GRID_SIZE)
    model.compile(optimizer='adam', loss=loss.call)

    return model

# Build the detector with the default ('standard') convolution blocks and print its layers.
model = create_model()
model.summary()

# plot_model(model, show_shapes=True)

In [11]:
# Train on the batched (image, boxes) dataset for 20 epochs, then save the weights.
history = model.fit(box_ds, epochs=20)
model.save_weights('yolo_model/weights')