# How to train a tensorflow model with vatic output

## Exporting the data
ssh into the vatic server and export the data

    sudo su
    cd /vatic-videos
    export PYTHONPATH=/vatic:$PYTHONPATH
    turkic dump --pascal --merge farmstead1 -o dump1/farmstead1-data.pascal

*vatic needs to be on this branch to work https://github.com/cvondrick/vatic/pull/86*


## Convert Vatic Export Format to Tensorflow Format
The following is a script to take a vatic export and output tensorflow records. It uses this script as a starting point https://github.com/tensorflow/models/blob/master/object_detection/create_pascal_tf_record.py


In [1]:
import hashlib
import io
import logging
import os

from lxml import etree
import tensorflow as tf
import PIL.Image

from object_detection.utils import dataset_util
from object_detection.utils import label_map_util

This label map enumerates the categories of objects in our image.

In [12]:
LABEL_MAP_FILE ="labelmap.pbtxt"
LABEL_MAP = """
item {
  id: 1
  name: 'plastic-crate'
}

item {
  id: 2
  name: 'egg-carton'
}

item {
  id: 3
  name: 'milk-carton'
}

item {
  id: 4
  name: 'human-head'
}

item {
  id: 5
  name: 'silver-cart'
}

item {
  id: 6
  name: 'table'
}

item {
  id: 7
  name: 'green-grocery-bag'
}

item {
  id: 8
  name: 'green-grocery-bag'
}

item {
  id: 9
  name: 'yellow-grocery-bag'
}
"""

with open(LABEL_MAP_FILE, "w") as f:
    f.write(LABEL_MAP)

label_map_dict = label_map_util.get_label_map_dict(LABEL_MAP_FILE)

In [13]:
label_map_dict

{'egg-carton': 2,
 'green-grocery-bag': 8,
 'human-head': 4,
 'milk-carton': 3,
 'plastic-crate': 1,
 'silver-cart': 5,
 'table': 6,
 'yellow-grocery-bag': 9}

In [32]:
!echo $(pwd)/labelmap.pbtxt

/home/eli/models/object_detection/labelmap.pbtxt


Combine all the training and evaluation sets into one file.

In [33]:
%%sh
# 007531000001 is a bad record
cat ~/model1/data/farmstead1-data.pascal/ImageSets/Main/*trainval.txt \
    | grep -v 007531000001 > ~/model1/data/farmstead1-data.pascal/ImageSets/Main/all.txt

In [34]:

def dict_to_tf_example(data,
                       dataset_directory,
                       label_map_dict,
                       path,
                       ignore_difficult_instances=False,
                       image_subdirectory='JPEGImages'):
    """Convert XML derived dict to tf.Example proto.
    Notice that this function normalizes the bounding box coordinates provided
    by the raw data.
    Args:
      data: dict holding PASCAL XML fields for a single image (obtained by
        running dataset_util.recursive_parse_xml_to_dict)
      dataset_directory: Path to root directory holding PASCAL dataset
      label_map_dict: A map from string label names to integers ids.
      ignore_difficult_instances: Whether to skip difficult instances in the
        dataset  (default: False).
      image_subdirectory: String specifying subdirectory within the
        PASCAL dataset directory holding the actual image data.
    Returns:
      example: The converted tf.Example.
    Raises:
      ValueError: if the image pointed to by data['filename'] is not a valid JPEG
    """

    full_path = os.path.join(dataset_directory, image_subdirectory, data['filename'])

    with tf.gfile.GFile(full_path, 'rb') as fid:
        encoded_jpg = fid.read()
        encoded_jpg_io = io.BytesIO(encoded_jpg)

    image = PIL.Image.open(encoded_jpg_io)

    if image.format != 'JPEG':
        raise ValueError('Image format not JPEG')
    
    key = hashlib.sha256(encoded_jpg).hexdigest()

    width = int(data['size']['width'])
    height = int(data['size']['height'])

    xmin = []
    ymin = []
    xmax = []
    ymax = []
    classes = []
    classes_text = []
    truncated = []
    poses = []
    difficult_obj = []

    for obj in data['object']:
        difficult = bool(int(obj['difficult']))
    
        if ignore_difficult_instances and difficult:
            continue

        if "not-a-real-object" == obj["name"]:
            continue

        xmi = float(obj['bndbox']['xmin']) / width
        xma = float(obj['bndbox']['xmax']) / width
        ymi = float(obj['bndbox']['ymin']) / height
        yma = float(obj['bndbox']['ymax']) / height

        assert 0 <= xmi < xma <= 1, "invalid xmin: {} and xmax: {}".format(xmi, xma)
         
        assert 0 <= ymi < yma <= 1, "invalid ymin: {} and ymax: {}".format(ymi, yma)
         
        difficult_obj.append(int(difficult))
            
        xmin.append(xmi)
        ymin.append(ymi)       
        xmax.append(xma)
        ymax.append(yma)
        
        classes_text.append(obj['name'].encode('utf8'))
        
        classes.append(label_map_dict[obj['name']])
        
        truncated.append(int(obj['truncated']))
        poses.append(obj['pose'].encode('utf8'))

    example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(
            data['filename'].encode('utf8')),
        'image/source_id': dataset_util.bytes_feature(
            data['filename'].encode('utf8')),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
        'image/object/difficult': dataset_util.int64_list_feature(difficult_obj),
        'image/object/truncated': dataset_util.int64_list_feature(truncated),
        'image/object/view': dataset_util.bytes_list_feature(poses),
    }))
    return example


def main(output_dir, data_dir, label_map_dict, eval_to_train_ratio=0.1):
    """
    Assumes data dir looks like this

    dataDir/
        Annotations/
        JPEGImages/
        ImageSets/
            Main/
    
    set can be train or trainval
    """
    annotations_dir = os.path.join(data_dir, "Annotations")
    examples_path = os.path.join(data_dir, 'ImageSets', 'Main', 'all.txt')
    examples_list = dataset_util.read_examples_list(examples_path)

    eval_path = os.path.join(output_dir, "eval.tfrecords")
    train_path = os.path.join(output_dir, "train.tfrecords")

    eval_interval = int(eval_to_train_ratio * len(examples_list)) 

    with tf.python_io.TFRecordWriter(eval_path) as eval_writer, \
         tf.python_io.TFRecordWriter(train_path) as train_writer:
 
        for idx, example in enumerate(examples_list):
            if idx % 500 == 0:
                print('On image %d of %d' % (idx, len(examples_list)))

            path = os.path.join(annotations_dir, example + '.xml')

            with tf.gfile.GFile(path, 'r') as fid:
                xml_str = fid.read()

            xml = etree.fromstring(xml_str)
            data = dataset_util.recursive_parse_xml_to_dict(xml)['annotation']

            tf_example = dict_to_tf_example(data, data_dir, label_map_dict, path,
                                            ignore_difficult_instances=False)

            if idx % eval_interval == 0:
                eval_writer.write(tf_example.SerializeToString())
            else:
                train_writer.write(tf_example.SerializeToString())

            
    print("Done")


main(output_dir="/home/eli/model1/data/",
     data_dir="/home/eli/model1/data/farmstead1-data.pascal/",
     label_map_dict=label_map_dict)

On image 0 of 4525


  if not xml:


On image 500 of 4525
On image 1000 of 4525
On image 1500 of 4525
On image 2000 of 4525
On image 2500 of 4525
On image 3000 of 4525
On image 3500 of 4525
On image 4000 of 4525
On image 4500 of 4525
Done


## Download a pretrained model
https://github.com/tensorflow/models/blob/master/object_detection/g3doc/detection_model_zoo.md

In [21]:
%%sh
cd /home/eli/model1/models/model
wget http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_11_06_2017.tar.gz
tar -xvf ssd_inception_v2_coco_11_06_2017.tar.gz

ssd_inception_v2_coco_11_06_2017/
ssd_inception_v2_coco_11_06_2017/model.ckpt.index
ssd_inception_v2_coco_11_06_2017/model.ckpt.meta
ssd_inception_v2_coco_11_06_2017/frozen_inference_graph.pb
ssd_inception_v2_coco_11_06_2017/model.ckpt.data-00000-of-00001
ssd_inception_v2_coco_11_06_2017/graph.pbtxt


--2017-09-02 00:19:59--  http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_11_06_2017.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 172.217.5.240, 2607:f8b0:4004:801::2010
Connecting to download.tensorflow.org (download.tensorflow.org)|172.217.5.240|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 466846518 (445M) [application/x-tar]
Saving to: ‘ssd_inception_v2_coco_11_06_2017.tar.gz.1’

     0K .......... .......... .......... .......... ..........  0% 11.0M 41s
    50K .......... .......... .......... .......... ..........  0% 21.7M 31s
   100K .......... .......... .......... .......... ..........  0% 39.8M 24s
   150K .......... .......... .......... .......... ..........  0% 40.9M 21s
   200K .......... .......... .......... .......... ..........  0% 69.0M 18s
   250K .......... .......... .......... .......... ..........  0% 85.2M 16s
   300K .......... .......... .......... .......... ..........  0%  106

## Configure the training pipeline
https://github.com/KaliberLabs/models/blob/es-prototype/object_detection/g3doc/configuring_jobs.md

In [22]:
%%bash
echo '
model {
  ssd {
    num_classes: 10
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    anchor_generator {
      ssd_anchor_generator {
        num_layers: 6
        min_scale: 0.2
        max_scale: 0.95
        aspect_ratios: 1.0
        aspect_ratios: 2.0
        aspect_ratios: 0.5
        aspect_ratios: 3.0
        aspect_ratios: 0.3333
        reduce_boxes_in_lowest_layer: true
      }
    }
    image_resizer {
      fixed_shape_resizer {
        height: 300
        width: 300
      }
    }
    box_predictor {
      convolutional_box_predictor {
        min_depth: 0
        max_depth: 0
        num_layers_before_predictor: 0
        use_dropout: false
        dropout_keep_probability: 0.8
        kernel_size: 3
        box_code_size: 4
        apply_sigmoid_to_scores: false
        conv_hyperparams {
          activation: RELU_6,
          regularizer {
            l2_regularizer {
              weight: 0.00004
            }
          }
          initializer {
            truncated_normal_initializer {
              stddev: 0.03
              mean: 0.0
            }
          }
        }
      }
    }
    feature_extractor {
      type: "ssd_inception_v2"
      min_depth: 16
      depth_multiplier: 1.0
      conv_hyperparams {
        activation: RELU_6,
        regularizer {
          l2_regularizer {
            weight: 0.00004
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          train: true,
          scale: true,
          center: true,
          decay: 0.9997,
          epsilon: 0.001,
        }
      }
    }
    loss {
      classification_loss {
        weighted_sigmoid {
          anchorwise_output: true
        }
      }
      localization_loss {
        weighted_smooth_l1 {
          anchorwise_output: true
        }
      }
      hard_example_miner {
        num_hard_examples: 3000
        iou_threshold: 0.99
        loss_type: CLASSIFICATION
        max_negatives_per_positive: 3
        min_negatives_per_image: 0
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    normalize_loss_by_num_matches: true
    post_processing {
      batch_non_max_suppression {
        score_threshold: 1e-8
        iou_threshold: 0.6
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
  }
}

train_config: {
  batch_size: 30
  optimizer {
    rms_prop_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.003
          decay_steps: 800720
          decay_factor: 0.95
        }
      }
      momentum_optimizer_value: 0.9
      decay: 0.9
      epsilon: 1.0
    }
  }
  fine_tune_checkpoint: "/home/eli/model1/models/model/ssd_inception_v2_coco_11_06_2017/model.ckpt"
  from_detection_checkpoint: true
  # Note: The below line limits the training process to 200K steps, which we
  # empirically found to be sufficient enough to train the pets dataset. This
  # effectively bypasses the learning rate schedule (the learning rate will
  # never decay). Remove the below line to train indefinitely.
  num_steps: 50000
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    ssd_random_crop {
    }
  }
}

train_input_reader: {
  tf_record_input_reader {
    input_path: "/home/eli/model1/data/train.tfrecords"
  }
  label_map_path: "/home/eli/models/object_detection/labelmap.pbtxt"
}

eval_config: {
  num_examples: 2000
  # Note: The below line limits the evaluation process to 10 evaluations.
  # Remove the below line to evaluate indefinitely.
  max_evals: 10
}

eval_input_reader: {
  tf_record_input_reader {
    input_path: "/home/eli/model1/data/eval.tfrecords"
  }
  label_map_path: "/home/eli/models/object_detection/labelmap.pbtxt"
  shuffle: false
  num_readers: 1
}
' > /home/eli/model1/ssd.config

## Training a model

This takes a while

        python models/object_detection/train.py \
            --logtostderr \
            --pipeline_config_path=/home/eli/model1/faster-rcnn.config \
            --train_dir=/home/eli/model1/models/model/train      
We can moniter progress with tensorboard 

       tensorboard --logdir=model1/models/model/
       
Run an evaluation job with tensorflow.

        python models/object_detection/eval.py \
            --logtostderr \
            --pipeline_config_path=/home/eli/model1/faster-rcnn.config \
            --checkpoint_dir=/home/eli/model1/models/model/train       \
            --eval_dir=/home/eli/model1/models/model/eval      


## Convert the model checkpoint into a packaged format


        python /home/eli/models/object_detection/export_inference_graph.py --input_type image_tensor \
                --pipeline_config_path /home/eli/model1/ssd.config\
                --trained_checkpoint_prefix model1/models/model/train1/model.ckpt-50000 \
                --output_directory /home/eli/build

## Deploy with Tensorflow Model Server

Installing the ModelServer 
https://github.com/tensorflow/serving/blob/master/tensorflow_serving/g3doc/setup.md#installing-the-modelserver

    mkdir /model1/build/1
    mv model1/build/1/saved_model/* /model1/build/1
    tensorflow_model_server --model_base_path=/home/eli/model1/build/ --model_name=ssd

        2017-09-01 19:17:59.573825: I tensorflow_serving/model_servers/main.cc:147] Building single TensorFlow model file config:  model_name: ssd model_base_path: /home/eli/model1/build/
        2017-09-01 19:17:59.574048: I tensorflow_serving/model_servers/server_core.cc:434] Adding/updating models.
        2017-09-01 19:17:59.574072: I tensorflow_serving/model_servers/server_core.cc:485]  (Re-)adding model: ssd
        2017-09-01 19:17:59.674560: I tensorflow_serving/core/basic_manager.cc:705] Successfully reserved resources to load servable {name: ssd version: 1}
        2017-09-01 19:17:59.674619: I tensorflow_serving/core/loader_harness.cc:66] Approving load for servable version {name: ssd version: 1}
        2017-09-01 19:17:59.674630: I tensorflow_serving/core/loader_harness.cc:74] Loading servable version {name: ssd version: 1}
        2017-09-01 19:17:59.674670: I external/org_tensorflow/tensorflow/contrib/session_bundle/bundle_shim.cc:360] Attempting to load native SavedModelBundle in bundle-shim from: /home/eli/model1/build/1
        2017-09-01 19:17:59.674698: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:236] Loading SavedModel from: /home/eli/model1/build/1
        2017-09-01 19:17:59.739020: W external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
        2017-09-01 19:17:59.739080: W external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
        2017-09-01 19:17:59.798255: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:155] Restoring SavedModel bundle.
        2017-09-01 19:17:59.798361: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:165] The specified SavedModel has no variables; no checkpoints were restored.
        2017-09-01 19:17:59.798381: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:190] Running LegacyInitOp on SavedModel bundle.
        2017-09-01 19:17:59.804599: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:284] Loading SavedModel: success. Took 129806 microseconds.
        2017-09-01 19:17:59.805964: I tensorflow_serving/core/loader_harness.cc:86] Successfully loaded servable version {name: ssd version: 1}
        2017-09-01 19:17:59.809243: I tensorflow_serving/model_servers/main.cc:288] Running ModelServer at 0.0.0.0:8500 ...
