# Object Detection Minimal Example

The standard instructions are not reliable.  Follow these:

https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2.md

These instructions install a TensorFlow version that is compatible with the `models` repository.

Make sure your protoc is up to date.

You may have to add the models to the path, explicitly, as I've done below.

## Imports and Setup

In [1]:
import tensorflow as tf
print(tf.__version__)

import os 

import sys
sys.path.append("/media/jsaxon/brobdingnag/projects/urban_vision/models")
sys.path.append("/media/jsaxon/brobdingnag/projects/urban_vision/models/research")

import tqdm

import matplotlib
# import matplotlib.pyplot as plt

# import io
# import scipy.misc
import numpy as np
from six import BytesIO
from PIL import Image, ImageDraw, ImageFont

import tensorflow as tf

from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.builders import model_builder

import cv2

def cv2pil(x):
    """Convert an OpenCV image array (BGR channel order) into an RGB PIL image."""
    return Image.fromarray(cv2.cvtColor(x, cv2.COLOR_BGR2RGB))

def np2pil(x):
    """Wrap a numpy image array in a PIL image, leaving the channel order untouched."""
    return Image.fromarray(x)

%matplotlib inline

2.3.1


## Utilities

In [2]:
# @title Choose the model to use, then evaluate the cell.
# Maps a short display name to the model name used below to build the
# pipeline config file name (model_name + '.config').
MODELS = {'centernet_with_keypoints': 'centernet_hourglass104_512x512_kpts_coco17_tpu-32', 
          'centernet_without_keypoints': 'centernet_hourglass104_512x512_coco17_tpu-8'}

# The display name selects which entry of MODELS is used for the rest of the notebook.
model_display_name = 'centernet_without_keypoints' # @param ['centernet_with_keypoints', 'centernet_without_keypoints']
model_name = MODELS[model_display_name]

### Build a detection model and load pre-trained model weights

This sometimes takes a little while, please be patient!

In [3]:
# Download the checkpoint and put it into models/research/object_detection/test_data/

# !wget http://download.tensorflow.org/models/object_detection/tf2/20200711/centernet_hg104_512x512_coco17_tpu-8.tar.gz
# !tar -xf centernet_hg104_512x512_coco17_tpu-8.tar.gz
# !mv centernet_hg104_512x512_coco17_tpu-8/checkpoint models/research/object_detection/test_data/

In [4]:
# Both paths are relative, so they resolve against the notebook's current
# working directory (which must therefore contain the `models/` checkout).
pipeline_config = os.path.join('models/research/object_detection/configs/tf2/', model_name + '.config')
model_dir       = 'models/research/object_detection/test_data/checkpoint/'

# Load pipeline config and build a detection model
configs = config_util.get_configs_from_pipeline_file(pipeline_config)

# Only the 'model' section of the pipeline config is needed to build the
# network; it is built in inference mode (is_training=False).
model_config = configs['model']
detection_model = model_builder.build(model_config = model_config, is_training=False)

# Restore checkpoint
# The weights are read from '<model_dir>/ckpt-0'.
# NOTE(review): expect_partial() presumably silences warnings about checkpoint
# values that are not consumed at inference time -- confirm against the TF docs.
ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt.restore(os.path.join(model_dir, 'ckpt-0')).expect_partial()

def get_model_detection_function(model):
    """Return a tf.function that runs the full detection pipeline of `model`."""

    @tf.function
    def detect_fn(image):
        """Preprocess, predict and postprocess `image`.

        Returns a tuple of (postprocessed detections, raw prediction dict,
        the shapes reported by preprocessing flattened to one dimension).
        """
        preprocessed, true_shapes = model.preprocess(image)
        raw_predictions = model.predict(preprocessed, true_shapes)
        postprocessed = model.postprocess(raw_predictions, true_shapes)
        flat_shapes = tf.reshape(true_shapes, [-1])

        return postprocessed, raw_predictions, flat_shapes

    return detect_fn

detect_fn = get_model_detection_function(detection_model)

# Load label map data (for plotting).

Label maps map index numbers to category names, so that when our convolutional network predicts `5`, we know that this corresponds to `airplane`.  Here we use internal utility functions, but anything that returns a dictionary mapping integers to the appropriate string labels would be fine.

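Caveat: the upstream examples are unreliable on this point.  The code below compares the model's class indices against the label-map ids minus one, so the offset has to be handled explicitly (see `CATEGORY_OFFSET` below).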

In [5]:
# Relative path: resolved against the notebook's current working directory.
label_map_path = 'models/research/object_detection/data/mscoco_label_map.pbtxt'
# `label_map` is first bound to the object returned by load_labelmap and then
# rebound to the dictionary built from it; the rest of the notebook indexes
# that dictionary by category name (e.g. label_map["person"]) to get an id.
label_map = label_map_util.load_labelmap(label_map_path)
label_map = label_map_util.get_label_map_dict(label_map, use_display_name = True)

## Utility function to show detections through cv2.

In [6]:
# Difference between label-map ids and the class values returned by the model:
# the functions below subtract it from label_map ids before comparing them with
# detections['detection_classes'], and add it back when writing labels to a record.
CATEGORY_OFFSET = 1

In [7]:
# Box colours as OpenCV BGR tuples, assigned to `categories` by position
# (first category -> first colour, ...), cycling if there are more categories
# than colours.
color_cat = [(0, 0, 255), (0, 255, 255)]

def paint_detections(img, detections, categories = ["person", "bicycle"], thresh = 0.3):
    """Draw bounding boxes for the selected categories onto a copy of `img`.

    Parameters
    ----------
    img : numpy array
        Image in RGB channel order; it is not modified.
    detections : dict
        Must provide 'detection_scores', 'detection_classes' and
        'detection_boxes'; the first element ([0]) of each is used.  Boxes are
        read as (ymin, xmin, ymax, xmax) and scaled by the image height/width.
    categories : list of str
        Names (keys of `label_map`) of the categories to draw.
    thresh : float
        Only detections with a score strictly above this value are drawn.

    Returns
    -------
    numpy array
        The annotated image in BGR channel order.
    """

    # cvtColor allocates a new array, so the caller's image is left untouched.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    height, width, _ = img.shape

    # Convert to plain numpy so that masking and integer indexing behave
    # predictably whether the inputs are tensors or arrays.
    scores  = np.array(detections['detection_scores'] [0])
    classes = np.array(detections['detection_classes'][0]).astype(int)
    boxes   = np.array(detections['detection_boxes']  [0])

    # Model class id -> colour.  Keyed by class id (not by list position) so
    # that categories other than the first two label-map entries also work.
    colors = {label_map[name] - CATEGORY_OFFSET : color_cat[i % len(color_cat)]
              for i, name in enumerate(categories)}

    keep = np.isin(classes, list(colors)) & (scores > thresh)

    for box, cat in zip(boxes[keep], classes[keep]):

        ymin, xmin, ymax, xmax = box

        cv2.rectangle(img, (int(xmin * width), int(ymin * height)),
                           (int(xmax * width), int(ymax * height)),
                      colors[int(cat)], 2)

    return img



In [17]:
def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature holding an int64 list."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
    """Wrap a single bytes object in a tf.train.Feature holding a bytes list."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_list_feature(value):
    """Wrap a sequence of integers in a tf.train.Feature holding an int64 list."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def _bytes_list_feature(value):
    """Wrap a sequence of bytes objects in a tf.train.Feature holding a bytes list."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def _float_list_feature(value):
    """Wrap a sequence of floats in a tf.train.Feature holding a float list."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def create_tfrecord(img, detections, thresh = 0.3, 
                    categories = ["person", "bicycle"], 
                    video_name = "X", frame_id = 0):
    '''
    Build a tf.train.Example for one frame: the JPEG-encoded image plus the
    boxes and labels of the detections that belong to `categories` and score
    strictly above `thresh`.

    `detections` must provide 'detection_scores', 'detection_classes' and
    'detection_boxes'; the first element ([0]) of each is used, and boxes are
    read as (ymin, xmin, ymax, xmax).
    '''

    # Model class id -> category name, for the categories we keep.
    id_to_name = {}
    for name in categories:
        id_to_name[label_map[name] - CATEGORY_OFFSET] = name

    all_scores  = detections['detection_scores'][0]
    all_classes = detections['detection_classes'][0]
    all_boxes   = detections['detection_boxes'][0]

    selected = np.isin(all_classes, list(id_to_name)) & (all_scores > thresh)

    kept_boxes = np.array(all_boxes[selected]).astype(float)
    kept_ids   = np.array(all_classes[selected]).astype(int)

    # Names are looked up with the model's ids; the stored numeric labels are
    # shifted back by CATEGORY_OFFSET.
    names_encoded = [id_to_name[class_id].encode('utf-8') for class_id in kept_ids]
    label_ids     = list(kept_ids + CATEGORY_OFFSET)

    _, jpeg_buffer = cv2.imencode(".jpg", img.copy())

    feature = {
        'image/video'              : _bytes_feature(str.encode(video_name)),
        'image/height'             : _int64_feature(img.shape[0]),
        'image/width'              : _int64_feature(img.shape[1]),
        'image/source_id'          : _bytes_feature("{:05d}".format(frame_id).encode()),
        'image/encoded'            : _bytes_feature(jpeg_buffer.tobytes()),
        'image/format'             : _bytes_feature('jpg'.encode()),
        'image/object/bbox/xmin'   : _float_list_feature(list(kept_boxes[:, 1])),
        'image/object/bbox/xmax'   : _float_list_feature(list(kept_boxes[:, 3])),
        'image/object/bbox/ymin'   : _float_list_feature(list(kept_boxes[:, 0])),
        'image/object/bbox/ymax'   : _float_list_feature(list(kept_boxes[:, 2])),
        'image/object/class/label' : _int64_list_feature(label_ids),
        'image/object/class/text'  : _bytes_list_feature(names_encoded),
    }

    return tf.train.Example(features = tf.train.Features(feature = feature))


## And finally, the detections, from stream.

In [18]:
def tfrecords_from_stream(video, ouput, N = 100, NSKIP = 10, VAL = 5, show = False):
    """Run detection on sampled frames of a video and write them as TFRecords.

    Parameters
    ----------
    video : str
        Path of the video, passed to cv2.VideoCapture.
    ouput : str
        Output prefix; records go to `<ouput>_train.tfrecord` and
        `<ouput>_val.tfrecord`.
    N : int
        Maximum number of frames to process.
    NSKIP : int
        Number of frames read from the stream for each processed frame
        (only the last one read is used).
    VAL : int
        Every VAL-th processed frame (ix % VAL == 0) goes to the validation
        file, the others to the training file.  A falsy value sends every
        frame to the training file.
    show : bool
        If true, display each frame with its detections painted on.

    Returns
    -------
    The last tf.train.Example written, or None if no frame was processed
    (N == 0 or the stream yielded no usable frame).
    """

    # Defined up front so the return value exists even if the loop never runs.
    record = None

    cap = cv2.VideoCapture(video)

    try:
        # The writers are context managers: both files are closed even if
        # detection or encoding raises part-way through.
        with tf.io.TFRecordWriter(ouput + "_train.tfrecord") as training, \
             tf.io.TFRecordWriter(ouput + "_val.tfrecord")   as validation:

            # The very first frame is read and discarded before sampling starts.
            ret, cv_img = cap.read()

            for ix in tqdm.tqdm(range(N)):

                for _ in range(NSKIP): ret, cv_img = cap.read()

                # Stop cleanly when the stream is exhausted (or could not be
                # opened) instead of failing on a missing frame.
                if not ret or cv_img is None: break

                # OpenCV delivers BGR; the model input and paint_detections use RGB.
                np_img = cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)
                tf_img = tf.convert_to_tensor(np.expand_dims(np_img, 0), dtype=tf.float32)

                detections, _, _ = detect_fn(tf_img)

                record = create_tfrecord(cv_img, detections, thresh = 0.5, 
                                         video_name = video, frame_id = (ix+1) * NSKIP)

                is_validation = bool(VAL) and ix % VAL == 0

                if is_validation: validation.write(record.SerializeToString())
                else:             training  .write(record.SerializeToString())

                if show: 
                    img_det = paint_detections(np_img, detections, thresh = 0.5)
                    display(cv2pil(img_det))

    finally:
        cap.release()

    return record

In [19]:
# Absolute, machine-specific paths: adjust both to your own input video and
# output prefix before running this cell.
video  = "/media/jsaxon/brobdingnag/data/cv/vid/burnham/55/20200927_143601.MOV"
output = "/media/jsaxon/brobdingnag/data/cv/tf/burnham/55/20200927_143601"

# N = 1 processes a single frame; the returned record is inspected in the next cell.
record = tfrecords_from_stream(video, output, N = 1)

100%|██████████| 1/1 [00:04<00:00,  4.43s/it]


In [20]:
record

features {
  feature {
    key: "image/format"
    value {
      bytes_list {
        value: "jpg"
      }
    }
  }
  feature {
    key: "image/height"
    value {
      int64_list {
        value: 1520
      }
    }
  }
  feature {
    key: "image/object/bbox/xmax"
    value {
      float_list {
        value: 0.6568061709403992
        value: 0.42628544569015503
        value: 0.41199246048927307
        value: 0.6452797651290894
      }
    }
  }
  feature {
    key: "image/object/bbox/xmin"
    value {
      float_list {
        value: 0.5986146330833435
        value: 0.4073081612586975
        value: 0.39413192868232727
        value: 0.6103401184082031
      }
    }
  }
  feature {
    key: "image/object/bbox/ymax"
    value {
      float_list {
        value: 0.7727861404418945
        value: 0.30351051688194275
        value: 0.31569039821624756
        value: 0.8514447808265686
      }
    }
  }
  feature {
    key: "image/object/bbox/ymin"
    value {
      float_list {
   