# TensorFlow Estimators: Train, Evaluate, Export, Explained!

In [1]:
import os
import numpy as np
from datetime import datetime

import tensorflow as tf
from tensorflow import data

print "TensorFlow : {}".format(tf.__version__)

TensorFlow : 1.10.0


## Import Data

In [4]:
mnist = tf.contrib.learn.datasets.load_dataset("mnist")
train_data = mnist.train.images
train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
eval_data = mnist.test.images
eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)
NUM_CLASSES = 10

Extracting MNIST-data/train-images-idx3-ubyte.gz
Extracting MNIST-data/train-labels-idx1-ubyte.gz
Extracting MNIST-data/t10k-images-idx3-ubyte.gz
Extracting MNIST-data/t10k-labels-idx1-ubyte.gz


In [5]:
print "Train data shape: {}".format(train_data.shape)
print "Eval data shape: {}".format(eval_data.shape)

Train data shape: (55000, 784)
Eval data shape: (10000, 784)


In [6]:
TRAIN_DATA_SIZE = 55000
EVAL_DATA_SIZE = 10000

# Let's fix batch size to 1000
BATCH_SIZE = 1000 

## Create Estimator

In [7]:
def create_estimator(run_config):
    
    feature_columns = [tf.feature_column.numeric_column(key='input_image',shape=784, dtype=tf.float32)]
    
    estimator = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        n_classes=NUM_CLASSES,
        hidden_units=[512, 128, 64], 
        dropout=0.85,
        batch_norm=True,
        config=run_config
    )
    
    return estimator

MODELS_LOCATION = 'models/mnist'
MODEL_NAME = 'dnn_classifier'
model_dir = os.path.join(MODELS_LOCATION, MODEL_NAME)


print model_dir

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    model_dir=model_dir
)

models/mnist/dnn_classifier


## Train: Input Function
* Batch size is set
* Epochs is ignored (set to None)

Later we are going to see how to use epochs for training.


In [8]:
train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=BATCH_SIZE,
            num_epochs=None,
            shuffle=True)

## Train: Incremental Steps vs. Total Steps
* 1 batch (feed forward pass & backpropagation) corresponds to 1 training step 
* **steps**: Number of steps for which to train model. 'steps' works **incrementally**. Two calls to train(steps=100) means 200 training iterations.
* **max_steps**: Number of **total** steps for which to train model. If set, steps must be None. Two calls to train(max_steps=100) means that the second call will not do any iteration since first call did all 100 steps.


In the following function, **clean_start** flag indicates whether to delete the previous model artefacts (if any), and **incremental** flag indicates whether to use **steps** (for incremental training steps) or **max_steps** (for overall training steps). 

In [10]:
def train_experiment(training_steps, clean_start, incremental, run_config):

    if clean_start == True: 
        if tf.gfile.Exists(run_config.model_dir):
            print("Removing previous artefacts...")
            
            tf.gfile.DeleteRecursively(run_config.model_dir)

    print ""
    estimator = create_estimator(run_config)
    print ""
    
    time_start = datetime.utcnow() 
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................") 
   
    if incremental:
        estimator.train(train_input_fn, steps=training_steps)
    else:
        estimator.train(train_input_fn, max_steps=training_steps)
        
    time_end = datetime.utcnow() 
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
    
    return estimator

In [11]:
train_experiment(
    training_steps=1000, 
    clean_start=True,
    incremental=False,
    run_config=run_config
)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115519990>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:37:28
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x10203cc90>

Total number of steps 1000.

Lets run this again, with max_steps, without deleting the previous model.

In [12]:
train_experiment(
    training_steps=1000, 
    clean_start=False,
    incremental=False,
    run_config=run_config
)


INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115519990>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:38:19
.......................................
INFO:tensorflow:Skipping training since max_steps has already saved.
.......................................
Experiment finished at 19:38:19

Experiment elapsed time: 0.007219 seconds


<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x115cf5b90>

As expected, no training occured and since max_steps was reached.

Now let's try incremetal steps

In [14]:
train_experiment(
    training_steps=1000, 
    clean_start=False,
    incremental=True,
    run_config=run_config
)


INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115519990>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:38:33
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/mnist/dnn_classifier/model.ckpt-1000
INFO:tensorflow:Running local_ini

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x101b6ca90>

As shown, the total number of training steps is 2000, starting from step 1000 (from the previous run)

## Train: Steps vs Epochs

While the steps refers to how many **data batchs** are needed for training, the epochs refers to how many times the **whole training data** needs to be used for training. 

While using epochs to define the number of training iteration is a conventional practice in machine learninig, however, when working with very large datasets to train Deep Learning models, batch-level training steps (rather than the whole-training-data-level epochs) are more practical.

In [15]:
num_epochs=10

train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=True)

In [17]:
expected_training_steps = (TRAIN_DATA_SIZE/BATCH_SIZE)*num_epochs

print 'Training data size: {}'.format(TRAIN_DATA_SIZE) 
print 'Batch size: {}'.format(BATCH_SIZE) 
print 'Number of epochs (supplied): {}'.format(num_epochs) 
print 'Number of training steps (expected): {}'.format(expected_training_steps)
print ''

train_experiment(
    training_steps=None, 
    clean_start=True,
    incremental=True,
    run_config=run_config
)

Training data size: 55000
Batch size: 1000
Number of epochs (supplied): 10
Number of training steps (expected): 550

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115519990>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:40:31
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:ten

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1157f7290>

As expected, the training steps, given 10 epochs (for training data of 55000 records and batch size of 1000 records), is 501, which correspods to: ** (TRAIN_DATA_SIZE / BATCH_SIZE) * num_epochs)**

Note that, if both num_epochs (in the train_input_fn) and steps (in estimator.train) are supplied, the model will stop on the earlier criteria.

In [18]:
train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=BATCH_SIZE,
            num_epochs=1000, 
            shuffle=True)

train_experiment(
    training_steps=10, # the model will train for only 10 steps, ignoring the 1000 epochs
    clean_start=True,
    incremental=True,
    run_config=run_config
)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115519990>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:41:09
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x116a0de10>

In [21]:
train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=BATCH_SIZE,
            num_epochs=1, # the model will train for only 1 epoch (55 steps), ignoring the 1000 steps
            shuffle=True)

train_experiment(
    training_steps=10000000, 
    clean_start=True,
    incremental=True,
    run_config=run_config
)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115519990>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:41:33
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x103814450>

## Train: Checkpoints

By default, a checkpoint is saved every 600 secs (10mins). This behaviour is configured in the run_config passed to the estimator, using only one of the following parameters:

* **save_checkpoints_secs**: Save checkpoints every this many seconds. 
* **save_checkpoints_steps**: Save checkpoints every this many steps.

In addition, you can specify the number of the checkpoints to keep using **keep_checkpoint_max**  Defaults to 5 (that is, the 5 most recent checkpoint files are kept.) 


The following code trains the model for 1000 steps...

In [22]:
os.environ['MODEL_DIR'] = model_dir

In [23]:
train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=BATCH_SIZE,
            num_epochs=None,
            shuffle=True)

train_experiment(
    training_steps=1000, 
    clean_start=True,
    incremental=True,
    run_config=run_config # using the default checkpoints param values
)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115519990>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:41:42
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x117371050>

In [24]:
%%bash

ls ${MODEL_DIR}

checkpoint
graph.pbtxt
model.ckpt-0.data-00000-of-00001
model.ckpt-0.index
model.ckpt-0.meta
model.ckpt-1000.data-00000-of-00001
model.ckpt-1000.index
model.ckpt-1000.meta


As shown, since the training (1000 iterasion) finished in less than 600 seconds (default value for **save_checkpoint_sec), only 2 checkpoints where saved: the initial one, and the final one.


Now let's set **save_checkpoints_steps** in the run_config to 200, so that in 1000 steps, you produce 5 checkpoints

In [26]:
run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    model_dir=model_dir,
    save_checkpoints_steps=200, ## so in 1000 steps, you produce 5 checkpoints
    save_checkpoints_secs=None
)


In [32]:
estimator=train_experiment(
    training_steps=1000, 
    clean_start=True,
    incremental=True,
    run_config=run_config 
)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x116236f50>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': 200, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 19:44:50
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:

In [33]:
%%bash

ls ${MODEL_DIR}

checkpoint
graph.pbtxt
model.ckpt-1000.data-00000-of-00001
model.ckpt-1000.index
model.ckpt-1000.meta
model.ckpt-200.data-00000-of-00001
model.ckpt-200.index
model.ckpt-200.meta
model.ckpt-400.data-00000-of-00001
model.ckpt-400.index
model.ckpt-400.meta
model.ckpt-600.data-00000-of-00001
model.ckpt-600.index
model.ckpt-600.meta
model.ckpt-800.data-00000-of-00001
model.ckpt-800.index
model.ckpt-800.meta


Each checkpoint is labelled by the step number it was saved in.

## Evaluate: Epochs vs Steps
* **batch_size** is set (which can be bigger than batch size of training, of the batch fits in memory)
* **num_epochs** is usually set to 1 (as you want to evaluate your model on the entire evaluation data once)

In [34]:
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": eval_data},
            y=eval_labels,
            batch_size=BATCH_SIZE,
            num_epochs=1,
            shuffle=False)

For evaluation, if you set epochs to be 1, you can ignore the steps param (set it you None).


By default, the latest checkpoint is evaluated

In [35]:
estimator.evaluate(eval_input_fn, steps=None)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-14-19:45:39
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/mnist/dnn_classifier/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-14-19:45:39
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.632, average_loss = 1.3125578, global_step = 1000, loss = 1312.5579
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: models/mnist/dnn_classifier/model.ckpt-1000


{'accuracy': 0.632,
 'average_loss': 1.3125578,
 'global_step': 1000,
 'loss': 1312.5579}

This is equivalent to setting epochs to None and setting the **steps** to the number of batches in the dataset

In [36]:
steps = EVAL_DATA_SIZE / BATCH_SIZE

eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": eval_data},
            y=eval_labels,
            batch_size=BATCH_SIZE,
            num_epochs=None,
            shuffle=False)

estimator.evaluate(eval_input_fn, steps=steps)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-14-19:45:51
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/mnist/dnn_classifier/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/10]
INFO:tensorflow:Evaluation [2/10]
INFO:tensorflow:Evaluation [3/10]
INFO:tensorflow:Evaluation [4/10]
INFO:tensorflow:Evaluation [5/10]
INFO:tensorflow:Evaluation [6/10]
INFO:tensorflow:Evaluation [7/10]
INFO:tensorflow:Evaluation [8/10]
INFO:tensorflow:Evaluation [9/10]
INFO:tensorflow:Evaluation [10/10]
INFO:tensorflow:Finished evaluation at 2018-09-14-19:45:51
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.632, average_loss = 1.3125578, global_step = 1000, loss = 1312.5579
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: models/mnist/dnn_classifier/model.ckpt-1000


{'accuracy': 0.632,
 'average_loss': 1.3125578,
 'global_step': 1000,
 'loss': 1312.5579}

As shown, the same results are produced!

## Export: Serving Input Receiver Function

In [37]:
def make_serving_input_receiver_fn():
    inputs = {'input_image': tf.placeholder(shape=[None,784], dtype=tf.float32, name='input_image')}
    return tf.estimator.export.build_raw_serving_input_receiver_fn(inputs)

export_dir = os.path.join(model_dir, 'export')

if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)
        
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=make_serving_input_receiver_fn()
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures EXCLUDED from export because they cannot be be served via TensorFlow Serving APIs:
INFO:tensorflow:'serving_default' : Classification input must be a single string Tensor; got {'input_image': <tf.Tensor 'input_image:0' shape=(?, 784) dtype=float32>}
INFO:tensorflow:'classification' : Classification input must be a single string Tensor; got {'input_image': <tf.Tensor 'input_image:0' shape=(?, 784) dtype=float32>}
INFO:tensorflow:Restoring parameters from models/mnist/dnn_classifier/model.ckpt-1000
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.


'models/mnist/dnn_classifier/export/1536954375'

In [38]:
%%bash

saved_models_base=${MODEL_DIR}/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
saved_model_cli show --dir=${saved_model_dir} --all

models/mnist/dnn_classifier/export/1536954375
saved_model.pb
variables

MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['predict']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['input_image'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 784)
        name: input_image:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['class_ids'] tensor_info:
        dtype: DT_INT64
        shape: (-1, 1)
        name: dnn/head/predictions/ExpandDims:0
    outputs['classes'] tensor_info:
        dtype: DT_STRING
        shape: (-1, 1)
        name: dnn/head/predictions/str_classes:0
    outputs['logits'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        name: dnn/logits/BiasAdd:0
    outputs['probabilities'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        name: dnn/head/predictions/probabilities:0
  Method name is: tensorflow/serving/predict


## Intertwining Training & Evalution
* Use TrainSpec & EvalSpec with tf.estimator.train_and_evaluate()
* In TrainSpec:
    * **num_epochs** in the **train_input_fn** is ignored. (Set it to None)
    * You need to set **max_steps** param, otherwise it will train forever
* In EvalSpec (to evaluate the model using the whole evaluation data once):
    * **num_epochs** is set to 1 in the **eval_input_fn**
    * **steps** param is set to None
* **Evaluation** occurs when a **new checkpoint** is saved
* Checkpoints saving frequency is configures in run_config (using save_checkpoints_steps or save_checkpoints_secs)
* You can set minimum amount of time between two evaluation, using **throttle_secs** in EvalSpec. For example, if **throttle_secs** is set to 60sec, this means that the following evaluation will only occure after 60sec from the previous evaluation, even if **save_checkpoins_sec** is set to 10.
* If **throttle_secs** is set to 0, then evaluation will occure each time a checkpoint is saved, regardless the time difference between two consequtive checkpoints

In [150]:
def train_and_evaluate_experiment(params, run_config):
    
    # TrainSpec ####################################
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=params.batch_size,
            num_epochs=None,
            shuffle=True
    )
    
    train_spec = tf.estimator.TrainSpec(
        input_fn = train_input_fn,
        max_steps=params.traning_steps
    )
    ###############################################
    
    
    # EvalSpec ####################################
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": eval_data},
            y=eval_labels,
            batch_size=params.batch_size,
            num_epochs=1,
            shuffle=False
    )

    eval_spec = tf.estimator.EvalSpec(
        name=datetime.utcnow().strftime("%H%M%S"),
        input_fn = eval_input_fn,
        steps=None,
        start_delay_secs=0,
        throttle_secs=params.eval_throttle_secs
    )
    
    ###############################################

    tf.logging.set_verbosity(tf.logging.INFO)
    
    if params.clean_start:
        if tf.gfile.Exists(run_config.model_dir):
            print("Removing previous artefacts...")
            tf.gfile.DeleteRecursively(run_config.model_dir)
            

    print ''
    estimator = create_estimator(run_config)
    print ''
    
    time_start = datetime.utcnow() 
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................") 

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec, 
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow() 
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
    
    return estimator


Now let's try the following:
* Training for 1000 steps (set **num_epochs** to None and **max_steps** to 1000).
* Save a checkpoint after each 200 steps (set **save_checkpoints_steps** to 200).
* Evaluate when each checkpoint is produced (set **eval_throttle_secs** to 0). That is, 5 evaluations in total
* Keep only the latest 3 checkpoints out of the 5 checkpoints to be saved (set **keep_checkpoint_max** to 3)
* When evaluating, use the whole eval_data once (set **num_epochs** to 1 and **steps** to None).


In [151]:
params  = tf.contrib.training.HParams(
    batch_size=BATCH_SIZE,
    traning_steps=1000,
    eval_throttle_secs=0,
    clean_start=True,
)

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    save_checkpoints_steps=200,
    keep_checkpoint_max=3,
    model_dir=model_dir
)

In [152]:
train_and_evaluate_experiment(params, run_config)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 3, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x115bc7510>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': 200, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 23:23:13
.......................................
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_c

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x115599910>

In [45]:
%%bash

ls ${MODEL_DIR}

checkpoint
graph.pbtxt
model.ckpt-1000.data-00000-of-00001
model.ckpt-1000.index
model.ckpt-1000.meta
model.ckpt-600.data-00000-of-00001
model.ckpt-600.index
model.ckpt-600.meta
model.ckpt-800.data-00000-of-00001
model.ckpt-800.index
model.ckpt-800.meta


In order to train the model for **num_epochs**, you need to do the following:
* the training data size needs to be known before training
* compute the training steps as: **(TRAIN_DATA_SIZE / BATCH_SIZE) * num_epochs**
* In TrainSpec, **max_step** to the computed value
* set **num_epochs** in the train_input_fn to None

In [129]:
num_epochs = 10
computed_traning_steps = (TRAIN_DATA_SIZE/BATCH_SIZE)*num_epochs

print 'Training data size: {}'.format(TRAIN_DATA_SIZE) 
print 'Batch size: {}'.format(BATCH_SIZE) 
print 'Number of epochs (supplied): {}'.format(BATCH_SIZE) 
print 'Number of training steps (computed): {}'.format(computed_traning_steps)
print ''


params = tf.contrib.training.HParams(
    batch_size=BATCH_SIZE,
    traning_steps=computed_traning_steps,
    eval_throttle_secs=0,
    clean_start=True,
)

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    save_checkpoints_steps=250,
    model_dir=model_dir
)

train_and_evaluate_experiment(params, run_config)

Training data size: 55000
Batch size: 1000
Number of epochs (supplied): 1000
Number of training steps (computed): 550

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1172d27d0>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': 250, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 23:14:16
.......................................
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. T

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x116b35c90>

Note that, since we set **save_checkpoint_steps** to 250, and we have 550 steps, we get 4 checkpoints (and corresponding evaluations):
* in the beginning
* at step 250
* at step 500
* in the end (at step 550)

In [51]:
%%bash

ls ${MODEL_DIR}

checkpoint
graph.pbtxt
model.ckpt-0.data-00000-of-00001
model.ckpt-0.index
model.ckpt-0.meta
model.ckpt-250.data-00000-of-00001
model.ckpt-250.index
model.ckpt-250.meta
model.ckpt-500.data-00000-of-00001
model.ckpt-500.index
model.ckpt-500.meta
model.ckpt-550.data-00000-of-00001
model.ckpt-550.index
model.ckpt-550.meta


## Train, Evaluate, and Export

In [162]:
def train_evaluate_export_experiment(params, run_config, exporter):
    
    # TrainSpec ####################################
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": train_data},
            y=train_labels,
            batch_size=params.batch_size,
            num_epochs=None,
            shuffle=True
    )
    
    train_spec = tf.estimator.TrainSpec(
        input_fn = train_input_fn,
        max_steps=params.traning_steps
    )
    ###############################################

    
    # EvalSpec ####################################

    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"input_image": eval_data},
            y=eval_labels,
            batch_size=params.batch_size,
            num_epochs=1,
            shuffle=False
    )
    
    eval_spec = tf.estimator.EvalSpec(
        name=params.eval_name,
        input_fn=eval_input_fn,
        exporters=[exporter],
        steps=None,
        start_delay_secs=0,
        throttle_secs=params.eval_throttle_secs
    )
    
    ###############################################

    tf.logging.set_verbosity(tf.logging.INFO)
    
    if params.clean_start:
        if tf.gfile.Exists(run_config.model_dir):
            print("Removing previous artefacts...")
            tf.gfile.DeleteRecursively(run_config.model_dir)
            

    print ''
    estimator = create_estimator(run_config)
    print ''
    
    time_start = datetime.utcnow() 
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................") 

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec, 
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow() 
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
    
    return estimator


**Latest exporter** exports a model after each evaluation. 

You can specify the maximum number of exported models to keep using **exports_to_keep** param

In [163]:
exporter = tf.estimator.LatestExporter(
            name="estimate", 
            serving_input_receiver_fn=make_serving_input_receiver_fn(),
            exports_to_keep=3,
)

params = tf.contrib.training.HParams(
    batch_size=BATCH_SIZE,
    traning_steps=computed_traning_steps,
    eval_throttle_secs=0,
    clean_start=True,
    eval_name=datetime.utcnow().strftime("%H%M%S")
)

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    save_checkpoints_steps=250,
    model_dir=model_dir
)

train_evaluate_export_experiment(params, run_config, exporter)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x116a29510>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': 250, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 23:30:45
.......................................
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_c

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1167fc4d0>

In [96]:
%%bash

saved_models_base=${MODEL_DIR}/export/estimate/
echo 'exported model folders:'
ls ${saved_models_base}
echo ''

saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo 'last exported model: '${saved_model_dir}
ls ${saved_model_dir}
saved_model_cli show --dir=${saved_model_dir} --all

exported model folders:
1536965618
1536965632
1536965636

last exported model: models/mnist/dnn_classifier/export/estimate/1536965636
saved_model.pb
variables

MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['predict']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['input_image'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 784)
        name: input_image_11:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['class_ids'] tensor_info:
        dtype: DT_INT64
        shape: (-1, 1)
        name: dnn/head/predictions/ExpandDims:0
    outputs['classes'] tensor_info:
        dtype: DT_STRING
        shape: (-1, 1)
        name: dnn/head/predictions/str_classes:0
    outputs['logits'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        name: dnn/logits/BiasAdd:0
    outputs['probabilities'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        

**Final exporter** exports only the very last evaluated checkpoint

In [164]:
exporter = tf.estimator.FinalExporter(
            name="estimate",
            serving_input_receiver_fn=make_serving_input_receiver_fn()
)

params = tf.contrib.training.HParams(
    batch_size=BATCH_SIZE,
    traning_steps=computed_traning_steps,
    eval_throttle_secs=0,
    clean_start=True,
    eval_name=datetime.utcnow().strftime("%H%M%S")
)

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    save_checkpoints_steps=250,
    model_dir=model_dir
)

train_evaluate_export_experiment(params, run_config, exporter)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1157da250>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': 250, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 23:31:28
.......................................
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_c

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1160e0fd0>

In [74]:
%%bash

saved_models_base=${MODEL_DIR}/export/estimate/
echo 'exported model folders:'
ls ${saved_models_base}
echo ''

saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo 'last exported model: '${saved_model_dir}
ls ${saved_model_dir}
saved_model_cli show --dir=${saved_model_dir} --all

exported model folders:
1536964830

last exported model: models/mnist/dnn_classifier/export/estimate/1536964830
saved_model.pb
variables

MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['predict']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['input_image'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 784)
        name: input_image_6:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['class_ids'] tensor_info:
        dtype: DT_INT64
        shape: (-1, 1)
        name: dnn/head/predictions/ExpandDims:0
    outputs['classes'] tensor_info:
        dtype: DT_STRING
        shape: (-1, 1)
        name: dnn/head/predictions/str_classes:0
    outputs['logits'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        name: dnn/logits/BiasAdd:0
    outputs['probabilities'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        name: dnn/head/predicti

The **best exporter** runs everytime when the new model is better than any exsiting model. 

It uses the evaluation events stored under the **eval** folder. 

You need to set the **name** of the subfolder in the EvalSpec, and set the **event_file_pattern** in the BestExporter to point to this folder and perform the evalution comparesions.

In [166]:
eval_name=datetime.utcnow().strftime("%H%M%S")

exporter = tf.estimator.BestExporter(
            event_file_pattern='eval_{}/*.tfevents.*'.format(eval_name),
            name="estimate", 
            serving_input_receiver_fn=make_serving_input_receiver_fn(),
            exports_to_keep=1
)

params = tf.contrib.training.HParams(
    batch_size=BATCH_SIZE,
    traning_steps=computed_traning_steps,
    eval_throttle_secs=0,
    exporter_type='best',
    clean_start=True,
    eval_name=eval_name
)

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    save_checkpoints_steps=250,
    model_dir=model_dir
)

train_evaluate_export_experiment(params, run_config, exporter)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11582dfd0>, '_model_dir': 'models/mnist/dnn_classifier', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': 250, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 23:37:10
.......................................
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_c

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1171a4f10>

In [167]:
%%bash

saved_models_base=${MODEL_DIR}/export/estimate/
echo 'exported model folders:'
ls ${saved_models_base}
echo ''

saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo 'last exported model: '${saved_model_dir}
ls ${saved_model_dir}
saved_model_cli show --dir=${saved_model_dir} --all

exported model folders:
1536968260

last exported model: models/mnist/dnn_classifier/export/estimate/1536968260
saved_model.pb
variables

MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['predict']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['input_image'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 784)
        name: input_image_20:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['class_ids'] tensor_info:
        dtype: DT_INT64
        shape: (-1, 1)
        name: dnn/head/predictions/ExpandDims:0
    outputs['classes'] tensor_info:
        dtype: DT_STRING
        shape: (-1, 1)
        name: dnn/head/predictions/str_classes:0
    outputs['logits'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        name: dnn/logits/BiasAdd:0
    outputs['probabilities'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 10)
        name: dnn/head/predict