# Distillation of object detection pipeline

## Goals
* Minimize steps - align more closely with the steps in a final pipeline
    * For now we will neglect manual labeling, just leave a space for it
* Minimize code footprint that needs to go to docker
* Pull as much from git or S3 directly as possible

## Put all imports and globals here
This way it's easy to re-run this if you want to jump somewhere into the pipeline

In [1]:
import os
import sys

import sagemaker
from sagemaker.tensorflow import TensorFlow
import tensorflow as tf
from PIL import Image
import IPython.display as display
import matplotlib.pyplot as plt
import base64

# SageMaker session and execution role shared by the rest of the notebook
SM_SESSION = sagemaker.Session()
SM_ROLE = sagemaker.get_execution_role()

# Every local path below hangs off the directory the notebook was started in
TOPDIR = os.getcwd()

# Source tree that gets shipped to training.  Keep it light and the
# training job will start faster.
CODE_SRCDIR = f"{TOPDIR}/train_src"

# Upstream TensorFlow models repository and the local checkout location
TF_MODELS_GIT = "https://github.com/tensorflow/models.git"
TF_MODELS_SRCDIR = f"{TOPDIR}/tf_models"

# Discrete images and annotations pulled from S3
DATA_SRCDIR = f"{TOPDIR}/srcdata"
TARBALL_SRCDIR = f"{DATA_SRCDIR}/tarballs"
TARBALL_STAGING = f"{DATA_SRCDIR}/tarball_extract_tmp"
ANNOTATION_SRCDIR = f"{DATA_SRCDIR}/annotations"
JPEG_SRCDIR = f"{DATA_SRCDIR}/jpeg_images"

# tfrecord files holding the data above, one directory per split
TFRECORD_SRCDIR = f"{DATA_SRCDIR}/tfrecords"
TRAIN_TFRECORD_SRCDIR = f"{TFRECORD_SRCDIR}/train"
VALIDATE_TFRECORD_SRCDIR = f"{TFRECORD_SRCDIR}/val"
TEST_TFRECORD_SRCDIR = f"{TFRECORD_SRCDIR}/test"

# Mapping of label -> label ID used by the model
LABEL_MAP_FILE = f"{CODE_SRCDIR}/cfa_prod_label_map.pbtxt"

# S3 locations of the source images and annotations
S3_ALL_IMAGES = "s3://cfa-eadatasciencesb-sagemaker/datasets/cfa_products/all_images/"
S3_ALL_ANNOTATIONS = "s3://cfa-eadatasciencesb-sagemaker/datasets/cfa_products/all_annotations"
S3_TEST_IMAGES = "s3://cfa-eadatasciencesb-sagemaker/datasets/cfa_products/test_images"

# Bucket used by our SageMaker jobs
SM_WORKING_S3_BUCKET = "dev-eadatasciencesb-us-east-1-sagemaker-pdamore"

# S3 locations of the training job inputs
# Train tfrecords
S3_TRAIN_TFRECORDS_URI = f"s3://{SM_WORKING_S3_BUCKET}/datasets/cfa_products/train"
# Validate tfrecords
S3_VALIDATE_TFRECORDS_URI = f"s3://{SM_WORKING_S3_BUCKET}/datasets/cfa_products/val"
# We are transfer training, so this is the model we start from.
# It is passed to the training job as an input channel.
S3_BASEMODEL_URI = "s3://cfa-eadatasciencesb-sagemaker/trained-models/tensorflow_mobilenet/20180718_coco14_mobilenet_v1_ssd300_quantized"

# Base S3 path for training job outputs
S3_OUTPUT_URI = f"s3://{SM_WORKING_S3_BUCKET}/outputs"

# Train, Validate, Test split handed to voc_to_tfrecord_file
# (values sum to 100, presumably percentages - TODO confirm)
TRAINING_SPLIT_TUPLE = (60, 30, 10)
# Step counts are kept as strings; they are forwarded as hyperparameters
NUM_TRAIN_STEPS = '1'
NUM_VALIDATE_STEPS = '1'
# Suffix used when naming the model and the endpoint
MODEL_VERSION = "ptd002"

### Pull in the tf models code
Have to do this early so we can import more stuff

In [2]:
if not os.path.isdir(TF_MODELS_SRCDIR):
    ! git clone {TF_MODELS_GIT} {TF_MODELS_SRCDIR}
else:
    print("Skipping git clone of tf models because it exists.")

if not os.path.isdir(f"{CODE_SRCDIR}/object_detection"):
    ! cp -r {TF_MODELS_SRCDIR}/research/object_detection {CODE_SRCDIR}
    ! cp -r {TF_MODELS_SRCDIR}/research/slim {CODE_SRCDIR}
    ! rm -rf {CODE_SRCDIR}/object_detection/test_ckpt
    ! rm -rf {CODE_SRCDIR}/object_detection/g3doc
    ! pushd {CODE_SRCDIR}; protoc object_detection/protos/*.proto --python_out=.; popd

Skipping git clone of tf models because it exists.


### Now import the rest of the local code with the tf model dependencies

In [3]:
# Make the training source tree importable.  The membership check keeps the
# cell idempotent: re-running it no longer adds duplicate sys.path entries.
if CODE_SRCDIR not in sys.path:
    sys.path.append(CODE_SRCDIR)
from cfa_utils.tar_util import extract_tarball_directory
from cfa_utils.example_utils import voc_to_tfrecord_file

## Set up the filesystem environment
Please keep this reasonably idempotent
### Directories

In [4]:
# The data directory is .gitignore'd, so it is reasonable for none of these
# to exist yet.  exist_ok=True makes re-running the cell harmless.
for data_dir in (
    DATA_SRCDIR,
    TARBALL_SRCDIR,
    TARBALL_STAGING,
    JPEG_SRCDIR,
    ANNOTATION_SRCDIR,
    TRAIN_TFRECORD_SRCDIR,
    VALIDATE_TFRECORD_SRCDIR,
    TEST_TFRECORD_SRCDIR,
):
    os.makedirs(data_dir, exist_ok=True)

### Pull in raw images and annotations
Taken from UnderstandingImages

In [5]:
# Download and unpack the jpeg images only when the local jpeg directory is
# empty, so re-running this cell does not download them again.  The tarball
# directory is emptied first because the annotation step below uses the same
# directory.  extract_tarball_directory comes from cfa_utils.tar_util; judging
# by the message printed below, its return value is a count of extracted
# files - TODO confirm.
if len(os.listdir(f"{JPEG_SRCDIR}")) == 0:
    ! rm -f {TARBALL_SRCDIR}/*
    ! aws s3 cp {S3_ALL_IMAGES} {TARBALL_SRCDIR} --recursive --quiet
    jpg_ext = '.jpg'
    r = extract_tarball_directory(TARBALL_SRCDIR, TARBALL_STAGING, jpg_ext, JPEG_SRCDIR)
    print(f"jpeg file count: {r}")
else:
    print("Skipping jpeg copy, files aready exist")

# Same pattern for the annotations, followed by in-place sed rewrites of
# three label names in every extracted .xml file.
if len(os.listdir(f"{ANNOTATION_SRCDIR}")) == 0:
    ! rm -f {TARBALL_SRCDIR}/*
    ! aws s3 cp {S3_ALL_ANNOTATIONS} {TARBALL_SRCDIR} --recursive --quiet
    xml_ext = '.xml'
    r = extract_tarball_directory(TARBALL_SRCDIR, TARBALL_STAGING, xml_ext, ANNOTATION_SRCDIR)
    print(f"annotation file count: {r}")
    # This is from Make_TFRecords.  Apparently the labeling was not consistent and this normalizes it
    ! sed -i 's/smHotDrink/smallHotDrink/g' {ANNOTATION_SRCDIR}/*.xml
    ! sed -i 's/medColdDrink/mediumColdDrink/g' {ANNOTATION_SRCDIR}/*.xml
    ! sed -i 's/smallSauce/cfaSauce/g' {ANNOTATION_SRCDIR}/*.xml
else:
    print("Skipping annotation copy, files aready exist")

# Remove whatever is left in the tarball directory, whichever branches ran.
! rm -f {TARBALL_SRCDIR}/*

Skipping jpeg copy, files aready exist
Skipping annotation copy, files aready exist


### Create tfrecord files
From Make_TFRecords

In [6]:
# Only the train tfrecord is checked; its presence is taken to mean the
# tfrecord files were already generated.
train_tfrecord_path = f"{TRAIN_TFRECORD_SRCDIR}/train.tfrecord"
if os.path.exists(train_tfrecord_path):
    print("Skipping voc_to_tfrecord, already have tfrecord files")
else:
    voc_to_tfrecord_file(
        JPEG_SRCDIR,
        ANNOTATION_SRCDIR,
        LABEL_MAP_FILE,
        TFRECORD_SRCDIR,
        TRAINING_SPLIT_TUPLE,
    )

Skipping voc_to_tfrecord, already have tfrecord files


## Time to train

Taken from TrainModel_Step3_TrainingJob

Now we have the data we need to train the model.  We are going to go right to training in a SM Training job as that is our desired end state.

### First put the tfrecord files into S3

**Note** I did not have a great way to make this idempotent, so it's not!  Uncomment and run if you need to put stuff in your S3 bucket!

In [8]:
# Manual step: the upload commands below are deliberately left commented out
# and have to be enabled by hand when the S3 bucket needs the tfrecords.
# NOTE(review): `aws s3 sync` might make this step safely re-runnable - confirm.
print("Please uncomment me if you need to upload the tfrecords to S3!")
#print(S3_TRAIN_TFRECORDS_URI)
#print(S3_VALIDATE_TFRECORDS_URI)
#!aws s3 cp {TRAIN_TFRECORD_SRCDIR}/*.tfrecord {S3_TRAIN_TFRECORDS_URI}/
#!aws s3 cp {VALIDATE_TFRECORD_SRCDIR}/*.tfrecord {S3_VALIDATE_TFRECORDS_URI}/

Please uncomment me if you need to upload the tfrecords to S3!


### fine-tune checkpoint as a SageMaker input channel
We add a new input channel (basemodel, maybe not a great name but it made sense to me) and simply point to where our model lives in S3.  Then it gets pulled at runtime.

I have another variation of this in ``train.py`` which can also pull it in the script from S3.

We need to figure out which is better for traceability, here or in the train script, because we will need to track this as part of any version metadata we create!

More detail here https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html

In [9]:
# Passed to the estimator as its model_dir
model_dir = '/opt/ml/model'

# Keep this relatively cheap - $1.26/hr.  This demo also slashes the step
# count, so the model won't be great, but it works passably.
train_instance_type = 'ml.p2.xlarge'
#train_instance_type = 'local'

# pipeline_config_path is relative to the src_dir
hyperparameters = {
    'pipeline_config_path': 'sagemaker_mobilenet_v1_ssd_retrain.config',
    'num_train_steps': NUM_TRAIN_STEPS,
    'num_eval_steps': NUM_VALIDATE_STEPS,
}

# One input channel per S3 location: train tfrecords, validation tfrecords
# and the base model we are fine-tuning from.
inputs = {
    'train': S3_TRAIN_TFRECORDS_URI,
    'val': S3_VALIDATE_TFRECORDS_URI,
    'basemodel': S3_BASEMODEL_URI,
}
print(inputs)

{'train': 's3://dev-eadatasciencesb-us-east-1-sagemaker-pdamore/datasets/cfa_products/train', 'val': 's3://dev-eadatasciencesb-us-east-1-sagemaker-pdamore/datasets/cfa_products/val', 'basemodel': 's3://cfa-eadatasciencesb-sagemaker/trained-models/tensorflow_mobilenet/20180718_coco14_mobilenet_v1_ssd300_quantized'}


### Setting output path
We set a base path in S3 for the output files, so they don't hit the default bucket.  When you do this, the source upload directory is also based on output_path, unless you specify a code_location.

In [10]:
# Gather the estimator settings in one place, then build the estimator.
# output_path points at our own bucket rather than the default one.
estimator_settings = dict(
    entry_point='train.py',
    source_dir='train_src',
    model_dir=model_dir,
    output_path=S3_OUTPUT_URI,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    hyperparameters=hyperparameters,
    role=SM_ROLE,
    base_job_name='cfa-products-mobilenet-v1-ssd',
    framework_version='1.14',
    py_version='py3',
    script_mode=True,
)
estimator = TensorFlow(**estimator_settings)

### Run training job

In [11]:
# Start the training job, passing the train / val / basemodel input channels.
estimator.fit(inputs)

2019-10-15 03:18:48 Starting - Starting the training job...
2019-10-15 03:18:49 Starting - Launching requested ML instances......
2019-10-15 03:19:50 Starting - Preparing the instances for training......
2019-10-15 03:21:06 Downloading - Downloading input data...
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])[0m
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])[0m
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.


2019-10-15 03:22:20 Training - Training image download completed. Training in progress.[31m  Building wheel for pycocotools (setup.py): finished with status 'done'
  Created wheel for pycocotools: filename=pycocotools-2.0.0-cp36-cp36m-linux_x86_64.whl size=288491 sha256=c77dd8ab6b5ed87a61e98a08bf068b0400869d1a9dd91ce94159a87f4adf1706
  Stored in directory: /root/.cache/pip/wheels/dc/e6/36/0e1ae88c868eb42d3f92181b1c9bbd0b217a7ec3da6bd62e55[0m
[31mSuccessfully built pycocotools[0m
[31mInstalling collected packages: pycocotools[0m
[31mSuccessfully installed pycocotools-2.0.0[0m
[31mYou should consider upgrading via the 'pip install --upgrade pip' command.[0m
[31m--> installing: matplotlib[0m
[31mCollecting matplotlib
  Downloading https://files.pythonhosted.org/packages/57/4f/dd381ecf6c6ab9bcdaa8ea912e866dedc6e696756156d8ecc087e20817e2/matplotlib-3.1.1-cp36-cp36m-manylinux1_x86_64.whl (13.1MB)[0m
[31mCollecting cycler>=0.10 (from matplotlib)
  Downloading https://files.pyt

[31mcreating index...[0m
[31mindex created![0m
[31mcreating index...[0m
[31mindex created![0m
[31mRunning per image evaluation...[0m
[31mEvaluate annotation type *bbox*[0m
[31mDONE (t=4.07s).[0m
[31mAccumulating evaluation results...[0m
[31mDONE (t=0.85s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Avera

### Model and endpoint names, based on a version at top of file

In [12]:
# This is where you'll be glad you used a new version number: both names
# are derived from MODEL_VERSION.
model_name = f"model-mobilenet-v1-ssd-cfa-products-{MODEL_VERSION}"
endpoint_name = f"ep-mobilenet-v1-ssd-cfa-products-{MODEL_VERSION}"
print("model:", model_name)
print("endpoint:", endpoint_name)

model: model-mobilenet-v1-ssd-cfa-products-ptd002
endpoint: ep-mobilenet-v1-ssd-cfa-products-ptd002


### Create the Model object and deploy an endpoint.

I had originally thought that we could call estimator.create_model() to get the Model created in SM, but this only creates a programmatic Model object.  You still have to then call deploy() on the model.  There is also an issue passing ``name=`` to the create_model method, because it clashes with something in the Estimator code doing the same thing and it fails.  The Estimator actually creates the model, then directly sets the name attribute before deploy().

It seems we are down to two methods to do this:

1. Call model._create_sagemaker_model(), but this is a _ method and feels wrong
2. Make sure we are highly opinionated about our output artifact names so we can easily find them in S3 at deploy time, and create the Model at that point.
2a. We could choose at build time to immediately create an endpoint for testing, which obviates this whole issue.
3. If it's really important, maybe just use boto?

In [13]:
# Deploy the trained estimator behind an endpoint, using the version-based
# model_name and endpoint_name.  A single ml.p2.xlarge instance is requested.
# The returned predictor is what the later cells use to call the endpoint
# and to delete it.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.p2.xlarge',
                             model_name=model_name,
                             endpoint_name=endpoint_name
                            )

--------------------------------------------------------------------------------------------------!

### Same stuff as before to get a prediction

In [14]:
# This actually came from cfa_utils and I should be using it there!
def bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature backed by a BytesList."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

IMAGE_DIR = os.getcwd()
image_filename = "srcdata/jpeg_images/20190710_variety_1562781017.jpg"
image_file_path = os.path.join(IMAGE_DIR, image_filename)
pil_image = Image.open(image_file_path)

# Trying to use tf.io where I can.  This notebook is not eager though, so the
# read op is evaluated in a session further down.
f = tf.io.read_file(image_file_path)

feature = {}
#features['image/encoded'] = tf.io.FixedLenFeature((), tf.string, default_value='')
# This structure mimics what Jay defines in cfa_utils.  Need to try just passing
# in the whole thing with only image/encoded filled in so we can share the code
# Evaluate the read op inside a `with` block so the session is closed as soon
# as the image bytes are available; a bare tf.Session() is never closed.
with tf.Session() as sess:
    feature['image/encoded'] = bytes_feature(sess.run(f))
features = tf.train.Features(feature=feature)
ex = tf.train.Example(features=features)
ex_str = ex.SerializePartialToString()


# plt.imshow(pil_image)
# plt.show()
# Request body: the serialized Example is base64-encoded under the 'b64' key
d = {'signature_name': 'serving_default', 'instances': [{'b64': base64.standard_b64encode(ex_str).decode('ascii')}]}

In [15]:
# Send the request payload `d` to the deployed endpoint and keep the response.
ret = predictor.predict(d)

### Make sure we got a result
TODO: pull in Jay's new code to visualize the prediction.

In [16]:
# Print the raw response returned by predictor.predict.
print(ret)

{'predictions': [{'detection_boxes': [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 

### Always delete your endpoint!

In [None]:
# Delete the endpoint created by estimator.deploy().  Reuse the shared
# SM_SESSION from the globals cell instead of constructing a second Session.
SM_SESSION.delete_endpoint(predictor.endpoint)