# Convert Vatic Export Format to Tensorflow Format

This notebook takes a Vatic export in PASCAL VOC layout and writes TensorFlow record (TFRecord) files. It is based on the following script: https://github.com/tensorflow/models/blob/master/object_detection/create_pascal_tf_record.py


In [2]:
import hashlib
import io
import logging
import os

from lxml import etree
import tensorflow as tf
import PIL.Image

from object_detection.utils import dataset_util
from object_detection.utils import label_map_util

In [3]:
# File the label map text below is written to (relative path, so it is
# created in the current working directory).
LABEL_MAP_FILE ="labelmap.pbtxt"
# Label map text: one `item` per class, pairing an integer id with a name.
# NOTE(review): ids 7 and 8 both use the name 'green-grocery-bag'. The
# name -> id dict printed in the next cell contains only
# 'green-grocery-bag': 8, so id 7 can never be produced from a class name.
# Confirm whether one of the two entries should be a different class.
LABEL_MAP = """
item {
  id: 1
  name: 'plastic-crate'
}

item {
  id: 2
  name: 'egg-carton'
}

item {
  id: 3
  name: 'milk-carton'
}

item {
  id: 4
  name: 'human-head'
}

item {
  id: 5
  name: 'silver-cart'
}

item {
  id: 6
  name: 'table'
}

item {
  id: 7
  name: 'green-grocery-bag'
}

item {
  id: 8
  name: 'green-grocery-bag'
}

item {
  id: 9
  name: 'yellow-grocery-bag'
}

item {
  id: 10
  name: 'not-a-real-object'
}
"""

# Persist the label map so it can be loaded by path.
with open(LABEL_MAP_FILE, "w") as f:
    f.write(LABEL_MAP)

# Mapping from class name to integer id, used when building the tf.Examples.
label_map_dict = label_map_util.get_label_map_dict(LABEL_MAP_FILE)

In [4]:
label_map_dict

{'egg-carton': 2,
 'green-grocery-bag': 8,
 'human-head': 4,
 'milk-carton': 3,
 'not-a-real-object': 10,
 'plastic-crate': 1,
 'silver-cart': 5,
 'table': 6,
 'yellow-grocery-bag': 9}

In [45]:
%%sh
# Concatenate every *trainval.txt image-set file into a single all.txt;
# ImageSets/Main/all.txt is the example list that main() reads.
cat ~/model1/data/farmstead1/farmstead1-data.pascal/ImageSets/Main/*trainval.txt  \
    > ~/model1/data/farmstead1/farmstead1-data.pascal/ImageSets/Main/all.txt

In [63]:

def dict_to_tf_example(data,
                       dataset_directory,
                       label_map_dict,
                       ignore_difficult_instances=False,
                       image_subdirectory='JPEGImages'):
    """Convert an XML-derived annotation dict to a tf.Example proto.

    Bounding box coordinates are normalized by the width and height recorded
    in the annotation (data['size']), not by the decoded image's dimensions.

    Args:
      data: dict holding PASCAL XML fields for a single image (obtained by
        running dataset_util.recursive_parse_xml_to_dict).
      dataset_directory: Path to root directory holding PASCAL dataset.
      label_map_dict: A map from string label names to integer ids.
      ignore_difficult_instances: Whether to skip difficult instances in the
        dataset (default: False).
      image_subdirectory: String specifying subdirectory within the
        PASCAL dataset directory holding the actual image data.

    Returns:
      example: The converted tf.Example. An annotation without any objects
        yields an example whose per-object feature lists are empty.

    Raises:
      ValueError: if the image pointed to by data['filename'] is not a valid JPEG
      KeyError: if an object's name is missing from label_map_dict.
    """
    full_path = os.path.join(dataset_directory, image_subdirectory, data['filename'])

    with tf.gfile.GFile(full_path, 'rb') as fid:
        encoded_jpg = fid.read()

    # Decode only far enough to verify the container format.
    image = PIL.Image.open(io.BytesIO(encoded_jpg))
    if image.format != 'JPEG':
        raise ValueError('Image format not JPEG')

    key = hashlib.sha256(encoded_jpg).hexdigest()

    width = int(data['size']['width'])
    height = int(data['size']['height'])

    xmin = []
    ymin = []
    xmax = []
    ymax = []
    classes = []
    classes_text = []
    truncated = []
    poses = []
    difficult_obj = []

    # The 'object' key is only present when the annotation contains at least
    # one <object> element, so fall back to an empty list for frames that
    # have no labelled objects instead of raising KeyError.
    for obj in data.get('object', []):
        difficult = bool(int(obj['difficult']))

        if ignore_difficult_instances and difficult:
            continue

        difficult_obj.append(int(difficult))

        bndbox = obj['bndbox']
        xmin.append(float(bndbox['xmin']) / width)
        ymin.append(float(bndbox['ymin']) / height)
        xmax.append(float(bndbox['xmax']) / width)
        ymax.append(float(bndbox['ymax']) / height)

        classes_text.append(obj['name'].encode('utf8'))
        classes.append(label_map_dict[obj['name']])

        truncated.append(int(obj['truncated']))
        poses.append(obj['pose'].encode('utf8'))

    example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(
            data['filename'].encode('utf8')),
        'image/source_id': dataset_util.bytes_feature(
            data['filename'].encode('utf8')),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
        'image/object/difficult': dataset_util.int64_list_feature(difficult_obj),
        'image/object/truncated': dataset_util.int64_list_feature(truncated),
        'image/object/view': dataset_util.bytes_list_feature(poses),
    }))
    return example


def main(output_dir, data_dir, label_map_dict, eval_to_train_ratio=0.1):
    """Write train.tfrecords and eval.tfrecords from a PASCAL-style dataset.

    Assumes data dir looks like this

    dataDir/
        Annotations/
        JPEGImages/
        ImageSets/
            Main/
                all.txt

    Every example name listed in ImageSets/Main/all.txt is loaded from
    Annotations/<name>.xml, converted with dict_to_tf_example and written to
    exactly one of the two output files.

    Args:
      output_dir: Directory that receives eval.tfrecords and train.tfrecords.
      data_dir: Root of the PASCAL-style dataset described above.
      label_map_dict: A map from string label names to integer ids, passed
        through to dict_to_tf_example.
      eval_to_train_ratio: Desired number of eval examples per train example.
        Every k-th example goes to the eval file, where
        k = round((1 + ratio) / ratio); e.g. 0.1 sends every 11th example to
        eval, giving roughly one eval example per ten train examples. A value
        <= 0 sends every example to the train file.
    """
    annotations_dir = os.path.join(data_dir, "Annotations")
    examples_path = os.path.join(data_dir, 'ImageSets', 'Main', 'all.txt')
    examples_list = dataset_util.read_examples_list(examples_path)
    num_examples = len(examples_list)

    eval_path = os.path.join(output_dir, "eval.tfrecords")
    train_path = os.path.join(output_dir, "train.tfrecords")

    # The stride must depend on the ratio only. Scaling it by the dataset
    # size makes the eval split shrink as the dataset grows and produces a
    # zero stride (ZeroDivisionError in the modulo) on small datasets.
    if eval_to_train_ratio > 0:
        eval_interval = max(
            1, int(round((1.0 + eval_to_train_ratio) / eval_to_train_ratio)))
    else:
        eval_interval = None

    with tf.python_io.TFRecordWriter(eval_path) as eval_writer, \
         tf.python_io.TFRecordWriter(train_path) as train_writer:

        for idx, example in enumerate(examples_list):
            if idx % 500 == 0:
                print('On image %d of %d' % (idx, num_examples))

            path = os.path.join(annotations_dir, example + '.xml')

            with tf.gfile.GFile(path, 'r') as fid:
                xml_str = fid.read()

            xml = etree.fromstring(xml_str)
            data = dataset_util.recursive_parse_xml_to_dict(xml)['annotation']

            tf_example = dict_to_tf_example(data, data_dir, label_map_dict,
                                            ignore_difficult_instances=False)
            serialized = tf_example.SerializeToString()

            if eval_interval is not None and idx % eval_interval == 0:
                eval_writer.write(serialized)
            else:
                train_writer.write(serialized)

    print("Done")


# Run the conversion. All arguments are passed by keyword because Python
# rejects a positional argument placed after keyword arguments.
main(output_dir="/home/eli/model1/data/",
     data_dir="/home/eli/model1/data/farmstead1/farmstead1-data.pascal/",
     label_map_dict=label_map_dict)

On image 0 of 4525


  if not xml:


On image 500 of 4525
On image 1000 of 4525
On image 1500 of 4525
On image 2000 of 4525
On image 2500 of 4525
On image 3000 of 4525
On image 3500 of 4525
On image 4000 of 4525
On image 4500 of 4525
Done


In [5]:
ls /home/eli/model1/data

eval.tfrecords  farmstead1/  farmstead1-data.tgz  train.tfrecords  voc/


In [19]:
%%sh
# Download the faster_rcnn_resnet101_coco_11_06_2017 model archive into the
# model directory and unpack it there.
cd /home/eli/model1/models/model
wget http://storage.googleapis.com/download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_11_06_2017.tar.gz
tar -xvf faster_rcnn_resnet101_coco_11_06_2017.tar.gz

faster_rcnn_resnet101_coco_11_06_2017/
faster_rcnn_resnet101_coco_11_06_2017/model.ckpt.index
faster_rcnn_resnet101_coco_11_06_2017/model.ckpt.meta
faster_rcnn_resnet101_coco_11_06_2017/frozen_inference_graph.pb
faster_rcnn_resnet101_coco_11_06_2017/model.ckpt.data-00000-of-00001
faster_rcnn_resnet101_coco_11_06_2017/graph.pbtxt


--2017-08-29 00:53:54--  http://storage.googleapis.com/download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_11_06_2017.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.7.176, 2607:f8b0:4004:801::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.7.176|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 595490113 (568M) [application/x-tar]
Saving to: ‘faster_rcnn_resnet101_coco_11_06_2017.tar.gz’

     0K .......... .......... .......... .......... ..........  0% 43.0M 13s
    50K .......... .......... .......... .......... ..........  0% 70.6M 11s
   100K .......... .......... .......... .......... ..........  0%  146M 8s
   150K .......... .......... .......... .......... ..........  0%  118M 7s
   200K .......... .......... .......... .......... ..........  0%  148M 7s
   250K .......... .......... .......... .......... ..........  0%  169M 6s
   300K .......... .......... .......... .......

In [39]:
ls -l /home/eli/model1/models/model/faster_rcnn_resnet101_coco_11_06_2017/model.ckpt.data-00000-of-00001

total 659052
-rw-r----- 1 eli eli 196890839 Jun 12 00:58 frozen_inference_graph.pb
-rw-r----- 1 eli eli  20932156 Jun 12 00:58 graph.pbtxt
-rw-r----- 1 eli eli 445812832 Jun 12 01:00 model.ckpt.data-00000-of-00001
-rw-r----- 1 eli eli     40521 Jun 12 01:00 model.ckpt.index
-rw-r----- 1 eli eli  11175327 Jun 12 01:00 model.ckpt.meta


Set up config file. See doc here: https://github.com/KaliberLabs/models/blob/es-prototype/object_detection/g3doc/configuring_jobs.md

In [29]:
ls samples/configs/

faster_rcnn_inception_resnet_v2_atrous_pets.config
faster_rcnn_resnet101_pets.config
faster_rcnn_resnet101_voc07.config
faster_rcnn_resnet152_pets.config
faster_rcnn_resnet50_pets.config
rfcn_resnet101_pets.config
ssd_inception_v2_pets.config
ssd_mobilenet_v1_pets.config


In [32]:
!cat samples/configs/faster_rcnn_resnet101_voc07.config

# Faster R-CNN with Resnet-101 (v1), configured for Pascal VOC Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.

model {
  faster_rcnn {
    num_classes: 20
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 600
        max_dimension: 1024
      }
    }
    feature_extractor {
      type: 'faster_rcnn_resnet101'
      first_stage_features_stride: 16
    }
    first_stage_anchor_generator {
      grid_anchor_generator {
        scales: [0.25, 0.5, 1.0, 2.0]
        aspect_ratios: [0.5, 1.0, 2.0]
        height_stride: 16
        width_stride: 16
      }
    }
    first_stage_box_predictor_conv_hyperparams {
      op: CONV
      regularizer {
        l2_regularizer {
          weight: 0.0
        }
      }
  

In [45]:
%%bash
# Write the training pipeline config to /home/eli/model1/faster-rcnn.config.
# fine_tune_checkpoint must be the checkpoint prefix (model.ckpt), which
# covers the .index/.meta/.data-* files together; pointing it at a single
# .data-00000-of-00001 shard makes the restore fail.
echo '
model {
  faster_rcnn {
    num_classes: 10
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 600
        max_dimension: 1024
      }
    }
    feature_extractor {
      type: "faster_rcnn_resnet101"
      first_stage_features_stride: 16
    }
    first_stage_anchor_generator {
      grid_anchor_generator {
        scales: [0.25, 0.5, 1.0, 2.0]
        aspect_ratios: [0.5, 1.0, 2.0]
        height_stride: 16
        width_stride: 16
      }
    }
    first_stage_box_predictor_conv_hyperparams {
      op: CONV
      regularizer {
        l2_regularizer {
          weight: 0.0
        }
      }
      initializer {
        truncated_normal_initializer {
          stddev: 0.01
        }
      }
    }
    first_stage_nms_score_threshold: 0.0
    first_stage_nms_iou_threshold: 0.7
    first_stage_max_proposals: 300
    first_stage_localization_loss_weight: 2.0
    first_stage_objectness_loss_weight: 1.0
    initial_crop_size: 14
    maxpool_kernel_size: 2
    maxpool_stride: 2
    second_stage_box_predictor {
      mask_rcnn_box_predictor {
        use_dropout: false
        dropout_keep_probability: 1.0
        fc_hyperparams {
          op: FC
          regularizer {
            l2_regularizer {
              weight: 0.0
            }
          }
          initializer {
            variance_scaling_initializer {
              factor: 1.0
              uniform: true
              mode: FAN_AVG
            }
          }
        }
      }
    }
    second_stage_post_processing {
      batch_non_max_suppression {
        score_threshold: 0.0
        iou_threshold: 0.6
        max_detections_per_class: 100
        max_total_detections: 300
      }
      score_converter: SOFTMAX
    }
    second_stage_localization_loss_weight: 2.0
    second_stage_classification_loss_weight: 1.0
  }
}

train_config: {
  batch_size: 1
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.0001
          schedule {
            step: 0
            learning_rate: .0001
          }
          schedule {
            step: 500000
            learning_rate: .00001
          }
          schedule {
            step: 700000
            learning_rate: .000001
          }
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  gradient_clipping_by_norm: 10.0
  fine_tune_checkpoint: "/home/eli/model1/models/model/faster_rcnn_resnet101_coco_11_06_2017/model.ckpt"
  from_detection_checkpoint: true
  num_steps: 800000
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
}

train_input_reader: {
  tf_record_input_reader {
    input_path: "/home/eli/model1/data/train.tfrecords"
  }
  label_map_path: "labelmap.pbtxt"
}

eval_config: {
  num_examples: 4952
}

eval_input_reader: {
  tf_record_input_reader {
    input_path: "/home/eli/model1/data/eval.tfrecords"
  }
  label_map_path: "labelmap.pbtxt"
  shuffle: false
  num_readers: 1
}' > /home/eli/model1/faster-rcnn.config

In [47]:
cat /home/eli/model1/faster-rcnn.config


model {
  faster_rcnn {
    num_classes: 10
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 600
        max_dimension: 1024
      }
    }
    feature_extractor {
      type: "faster_rcnn_resnet101"
      first_stage_features_stride: 16
    }
    first_stage_anchor_generator {
      grid_anchor_generator {
        scales: [0.25, 0.5, 1.0, 2.0]
        aspect_ratios: [0.5, 1.0, 2.0]
        height_stride: 16
        width_stride: 16
      }
    }
    first_stage_box_predictor_conv_hyperparams {
      op: CONV
      regularizer {
        l2_regularizer {
          weight: 0.0
        }
      }
      initializer {
        truncated_normal_initializer {
          stddev: 0.01
        }
      }
    }
    first_stage_nms_score_threshold: 0.0
    first_stage_nms_iou_threshold: 0.7
    first_stage_max_proposals: 300
    first_stage_localization_loss_weight: 2.0
    first_stage_objectness_loss_weight: 1.0
    initial_crop_size