### This notebook describes how to implement keras code with TF1.4 APIs.

Unfortunately, **distributed training with tf.keras is not supported**, and **exporting a model is only supported by the latest TensorFlow (the nightly build)**.

The contents of this notebook are listed below.

1. Create TFRecords Files
2. Define dataset and global constants
3. Define data input function
4. Define feature columns
5. Instantiate an Estimator
6. Train, evaluate and export ML models
7. Evaluate with Estimator
8. Prediction with Exported Model
9. Distributed Training with Cloud ML Engine

In [None]:
# Swap the installed TensorFlow for the nightly build: as noted in the
# introduction, exporting a tf.keras model needs the nightly version.
!pip uninstall tensorflow -y -q
!pip install tf-nightly -q

## 1. Create TFRecords Files.

In [None]:
import cPickle
import os
import shutil
import tarfile
import tensorflow as tf

print(tf.__version__)

In [None]:
# Archive file name, the URL it is downloaded from, and the folder name that
# is joined onto the data directory to locate the extracted batch files.
CIFAR_FILENAME = 'cifar-10-python.tar.gz'
CIFAR_DOWNLOAD_URL = 'http://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME
CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py'

In [None]:
def _download_and_extract(data_dir):
  """Downloads the CIFAR-10 archive into `data_dir` and unpacks it there.

  Args:
    data_dir: Directory that receives both the downloaded archive and the
      files extracted from it.
  """
  tf.contrib.learn.datasets.base.maybe_download(
      CIFAR_FILENAME, data_dir, CIFAR_DOWNLOAD_URL)
  archive_path = os.path.join(data_dir, CIFAR_FILENAME)
  # Open the archive in a `with` block so the file handle is closed as soon as
  # extraction finishes (or fails) instead of being left to the garbage
  # collector.
  with tarfile.open(archive_path, 'r:gz') as archive:
    archive.extractall(data_dir)

In [None]:
def _get_file_names():
  """Returns the file names expected to exist in the input_dir."""
  file_names = {}
  file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)]
  file_names['validation'] = ['data_batch_5']
  file_names['eval'] = ['test_batch']
  return file_names

In [None]:
def _read_pickle_from_file(filename):
  """Loads and returns the object pickled in `filename`.

  Args:
    filename: Path of the pickle file to read.

  Returns:
    The unpickled object.
  """
  # Pickle streams are binary data, so the file is opened with 'rb'; a text
  # mode handle may try to decode the bytes and fail or corrupt them.
  # NOTE(security): unpickling can execute arbitrary code. Only call this on
  # files from a trusted source.
  with tf.gfile.Open(filename, 'rb') as f:
    return cPickle.load(f)

In [None]:
def _convert_to_tfrecord(input_files, output_file):
  """Writes every example of the pickled batch files into one TFRecords file.

  Args:
    input_files: Paths of the pickled batch files to read. Each is expected to
      unpickle to a mapping with 'data' and 'labels' entries.
    output_file: Path of the TFRecords file to write.
  """
  print('Generating %s' % output_file)
  with tf.python_io.TFRecordWriter(output_file) as writer:
    for path in input_files:
      batch = _read_pickle_from_file(path)
      pixels = batch['data']
      targets = batch['labels']
      # The label list determines how many entries are written per batch file.
      for idx, target in enumerate(targets):
        feature_map = {
          'image': _bytes_feature(pixels[idx].tobytes()),
          'label': _int64_feature(target),
        }
        example = tf.train.Example(
          features=tf.train.Features(feature=feature_map))
        writer.write(example.SerializeToString())

In [None]:
def _int64_feature(value):
  """Wraps a single integer in an int64-list `tf.train.Feature`."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [None]:
def _bytes_feature(value):
  """Wraps a single byte string in a bytes-list `tf.train.Feature`.

  Args:
    value: The payload to store. `bytes` are stored unchanged; any other value
      is converted with `str()` and encoded as UTF-8.

  Returns:
    A `tf.train.Feature` whose bytes list holds the single payload.
  """
  # Raw bytes must pass through untouched: on Python 3, `str(b'...')` returns
  # the repr text ("b'...'") instead of the payload, which would corrupt the
  # stored image. On Python 2 `bytes` is `str`, so behavior is unchanged.
  if not isinstance(value, bytes):
    value = str(value).encode('utf-8')
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [None]:
def create_tfrecords_files(data_dir='cifar-10'):
  """Downloads CIFAR-10 and writes one TFRecords file per dataset split.

  Args:
    data_dir: Directory used for the download, the extracted batches and the
      generated `<split>.tfrecords` files.
  """
  _download_and_extract(data_dir)
  batches_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER)

  for split, batch_names in _get_file_names().items():
    sources = [os.path.join(batches_dir, name) for name in batch_names]
    target = os.path.join(data_dir, split + '.tfrecords')
    # Remove a previous output if there is one; any OSError raised by the
    # removal (for example because the file does not exist) is ignored.
    try:
      os.remove(target)
    except OSError:
      pass
    # Convert the split's batches to tf.train.Example records.
    _convert_to_tfrecord(sources, target)

In [None]:
# Build the TFRecords files under the default 'cifar-10' directory.
create_tfrecords_files()

## 2. Define dataset and global constants

In [None]:
# Process images of this size, which matches the original CIFAR image size of
# 32 x 32. If one alters this number, then the entire model architecture will
# change and any model would need to be retrained.
IMAGE_HEIGHT = 32
IMAGE_WIDTH = 32
IMAGE_DEPTH = 3

# Global constants describing the CIFAR-10 data set.
NUM_CLASSES = 10

# Global constants describing model behaviors.
# MODEL_NAME names the sub-directory of trained_models/ used as the model
# directory; when USE_CHECKPOINT is False that directory is deleted before
# training starts.
MODEL_NAME = 'distributed-keras'
USE_CHECKPOINT = False

In [None]:
# TFRecords files for each split, as written by create_tfrecords_files() with
# its default data directory.
train_data_files = ['cifar-10/train.tfrecords']
valid_data_files = ['cifar-10/validation.tfrecords']
test_data_files = ['cifar-10/eval.tfrecords']

## 3. Define Data Input Function

### a. parsing CIFAR-10 dataset

In [None]:
def parse_record(serialized_example):
  """Decodes one serialized `tf.train.Example` into an (image, label) pair.

  Args:
    serialized_example: Scalar string tensor holding a serialized example with
      an 'image' bytes feature and a 'label' int64 feature.

  Returns:
    A tuple `(image, label)`: `image` is a float32 tensor laid out as
    [height, width, depth] and `label` is a one-hot tensor with NUM_CLASSES
    entries.
  """
  feature_spec = {
    'image': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64),
  }
  parsed = tf.parse_single_example(serialized_example, features=feature_spec)

  # The raw bytes are interpreted as [depth, height, width]; rebuild that
  # shape first, then move the depth axis last.
  flat_pixels = tf.decode_raw(parsed['image'], tf.uint8)
  flat_pixels.set_shape([IMAGE_DEPTH * IMAGE_HEIGHT * IMAGE_WIDTH])
  depth_major = tf.reshape(
    flat_pixels, [IMAGE_DEPTH, IMAGE_HEIGHT, IMAGE_WIDTH])
  image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)

  class_index = tf.cast(parsed['label'], tf.int32)
  label = tf.one_hot(class_index, NUM_CLASSES)

  return image, label

### b. preprocessing CIFAR-10 dataset

In [None]:
def preprocess_image(image, is_training=False):
  """Standardizes one [height, width, depth] image, augmenting it in training.

  Args:
    image: A single image tensor of layout [height, width, depth].
    is_training: When True, apply random crop and flip augmentation before
      standardization.

  Returns:
    The standardized (and, in training, augmented) image tensor.
  """
  if is_training:
    # Pad to (IMAGE_HEIGHT + 8) x (IMAGE_WIDTH + 8), i.e. four extra pixels on
    # each side.
    padded = tf.image.resize_image_with_crop_or_pad(
        image, IMAGE_HEIGHT + 8, IMAGE_WIDTH + 8)

    # Take a random [IMAGE_HEIGHT, IMAGE_WIDTH] crop out of the padded image.
    cropped = tf.random_crop(
        padded, [IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_DEPTH])

    # Randomly mirror the crop horizontally.
    image = tf.image.random_flip_left_right(cropped)

  # Apply TensorFlow's per-image standardization to the pixel values.
  return tf.image.per_image_standardization(image)

### c. data pipeline input function

In [None]:
def generate_input_fn(file_names, mode=tf.estimator.ModeKeys.EVAL, batch_size=1):
  """Builds an Estimator input function that reads CIFAR-10 TFRecords files.

  Args:
    file_names: TFRecords file paths to read.
    mode: A `tf.estimator.ModeKeys` value. In TRAIN mode the records are
      shuffled and the images are augmented.
    batch_size: Number of examples per batch.

  Returns:
    A zero-argument function returning `(features, labels)`, where `features`
    is `{'images': <batched images>}` and `labels` holds the batched one-hot
    labels.
  """
  def _input_fn():
    dataset = tf.data.TFRecordDataset(filenames=file_names)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    if is_training:
      buffer_size = batch_size * 2 + 1
      dataset = dataset.shuffle(buffer_size=buffer_size)

    # Transformation
    dataset = dataset.map(parse_record)
    dataset = dataset.map(
      lambda image, label: (preprocess_image(image, is_training), label))

    # The dataset repeats indefinitely; callers bound the run with a step
    # count (max_steps / steps).
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    # prefetch() counts dataset elements, and after batch() one element is a
    # whole batch. Buffering `2 * batch_size` batches (the previous value)
    # grows quadratically with the batch size; two batches are enough to
    # overlap input processing with the training step.
    dataset = dataset.prefetch(2)

    images, labels = dataset.make_one_shot_iterator().get_next()

    features = {'images': images}
    return features, labels

  return _input_fn

## 4. Define Feature Columns

In [None]:
def get_feature_columns():
  """Returns the feature columns of the model, keyed by feature name.

  The only feature is 'images', a numeric column shaped
  (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_DEPTH).
  """
  image_shape = (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_DEPTH)
  return {
    'images': tf.feature_column.numeric_column('images', image_shape),
  }

In [None]:
# Build the feature columns once and show them for inspection.
feature_columns = get_feature_columns()
print("Feature Columns: {}".format(feature_columns))

## 5. Instantiate an Estimator

In [None]:
def get_model():
  """Builds and compiles the tf.keras CNN used to classify CIFAR-10 images.

  The network is four 3x3 convolutions (32, 32, 64, 64 filters) with max
  pooling and dropout, followed by a 512-unit dense layer and a softmax over
  NUM_CLASSES classes.

  Returns:
    A compiled `tf.keras.models.Sequential` model whose input layer is named
    'images', matching the feature key produced by the input functions.
  """
  model = tf.keras.models.Sequential()
  # Define input tensor in Keras world. The shape is ordered
  # (height, width, depth) to match the layout produced by parse_record();
  # the previous (width, height, depth) order only worked because the images
  # are square.
  model.add(tf.keras.layers.InputLayer(
    input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_DEPTH), name='images'))

  # The first convolutional layer.
  model.add(tf.keras.layers.Conv2D(
    filters=32, kernel_size=(3, 3), padding='same', activation='relu'))
  model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2), padding='same'))

  # The second convolutional layer.
  model.add(tf.keras.layers.Conv2D(
    filters=32, kernel_size=(3, 3), padding='same', activation='relu'))
  model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2), padding='same'))
  model.add(tf.keras.layers.Dropout(0.25))

  # The third convolutional layer.
  model.add(tf.keras.layers.Conv2D(
    filters=64, kernel_size=(3, 3), padding='same', activation='relu'))

  # The fourth convolutional layer.
  model.add(tf.keras.layers.Conv2D(
    filters=64, kernel_size=(3, 3), padding='same', activation='relu'))
  model.add(tf.keras.layers.Dropout(0.25))

  model.add(tf.keras.layers.Flatten())

  model.add(tf.keras.layers.Dense(512, activation='relu'))
  model.add(tf.keras.layers.Dropout(0.5))

  # Class scores followed by a separate softmax activation layer.
  model.add(tf.keras.layers.Dense(NUM_CLASSES))
  model.add(tf.keras.layers.Activation('softmax'))

  # Labels are one-hot encoded, hence categorical cross-entropy.
  opt = tf.keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)
  model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

  return model

## 6. Train, Evaluate and Export ML Models

### a. Set HParam and RunConfig

In [None]:
# Directory that receives the estimator's checkpoints and exports.
model_dir = 'trained_models/{}'.format(MODEL_NAME)

# Keep at most five checkpoints and fix the TensorFlow random seed.
run_config = tf.estimator.RunConfig(
  keep_checkpoint_max=5,
  tf_random_seed=19851211
)

### b. Define Serving Function

In [None]:
def serving_input_fn():
  """Builds the serving input receiver used when exporting the model.

  The exported model accepts a float32 batch of raw images under the key
  'images' and applies `preprocess_image` (in its non-training form) to every
  image before it reaches the network.

  Returns:
    A `tf.estimator.export.ServingInputReceiver`.
  """
  # Use the shared image constants rather than a hard-coded [None, 32, 32, 3]
  # so the serving signature follows any change to the image size.
  receiver_tensor = {
    'images': tf.placeholder(
      shape=[None, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_DEPTH], dtype=tf.float32),
  }
  features = {'images': tf.map_fn(preprocess_image, receiver_tensor['images'])}
  return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)

### c. Train and Evaluate

In [None]:
# Put the Keras backend into the training learning phase.
tf.keras.backend.set_learning_phase(True)

# Build the compiled tf.keras model.
keras_model = get_model()

# Wrap the Keras model in an Estimator that writes to model_dir.
estimator = tf.keras.estimator.model_to_estimator(
  keras_model=keras_model, model_dir=model_dir, config=run_config)

# As of 2017-12-14, only the latest (nightly) TensorFlow supports exporters
# with Keras models. Exports are written under the name 'Servo' and at most
# five of them are kept.
exporter = tf.estimator.LatestExporter(
  name='Servo',
  serving_input_receiver_fn=serving_input_fn,
  assets_extra=None,
  as_text=False,
  exports_to_keep=5)

# Train on the training split in batches of 100, up to step 1000.
train_spec = tf.estimator.TrainSpec(
  input_fn=generate_input_fn(file_names=train_data_files,
                             mode=tf.estimator.ModeKeys.TRAIN,
                             batch_size=100),
  max_steps=1000,
  hooks=None
)

# Evaluate on the validation split: 50 steps of 100 examples per evaluation,
# running the exporter defined above.
eval_spec = tf.estimator.EvalSpec(
  input_fn=generate_input_fn(file_names=valid_data_files,
                             mode=tf.estimator.ModeKeys.EVAL,
                             batch_size=100),
  steps=50,
  name=None,
  hooks=None,
  exporters=exporter, # Iterable of Exporters, or single one or None.
  start_delay_secs=120,
  throttle_secs=600
)

In [None]:
# Start from scratch unless USE_CHECKPOINT is set: delete everything under
# model_dir (errors during removal are ignored).
if not USE_CHECKPOINT:
  print("Removing previous artifacts...")
  shutil.rmtree(model_dir, ignore_errors=True)

# Run training and evaluation according to the specs defined above.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

## 7. Evaluate with Estimator

In [None]:
# Number of test examples to evaluate; used as the batch size below.
test_size = 1000

test_input_fn = generate_input_fn(file_names=test_data_files,
                                  mode=tf.estimator.ModeKeys.EVAL,
                                  batch_size=test_size)

# One evaluation step over a single batch of `test_size` test examples.
test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print(test_results)

## 8. Prediction with Exported Model

In [None]:
export_dir = model_dir + '/export/Servo/'
# os.listdir() returns entries in arbitrary order, so indexing it with [-1]
# does not reliably pick the newest export. Exports live in sub-directories
# named by a numeric timestamp; keep only those and take the largest.
export_versions = [
  entry for entry in os.listdir(export_dir) if entry.isdigit()]
saved_model_dir = os.path.join(export_dir, max(export_versions, key=int))

# Load the exported SavedModel as a Python callable using its default serving
# signature.
predictor_fn = tf.contrib.predictor.from_saved_model(
  export_dir=saved_model_dir,
  signature_def_key=tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)

In [None]:
# Print the layer summary. The names of the first (input) and last (output)
# layers matter below: they are the keys used to feed the exported model and
# to read its predictions.
keras_model.summary()

In [None]:
import numpy

data_dict = _read_pickle_from_file('cifar-10/cifar-10-batches-py/test_batch')

N = 1000
# Each stored row is a flat image laid out as [depth, height, width]; reshape
# and transpose the first N rows to [N, height, width, depth], the layout the
# exported model takes.
images = data_dict['data'][:N].reshape([N, 3, 32, 32]).transpose([0, 2, 3, 1])
labels = data_dict['labels'][:N]

output = predictor_fn({'images': images})

# Look the prediction key up from the model instead of hard-coding
# 'activation_4': Keras numbers auto-named layers per kernel session, so the
# literal only matched when the model cell had been run exactly four times.
output_key = keras_model.output_names[0]
predicted_classes = numpy.argmax(output[output_key], axis=1)

# Fraction of the N examples whose predicted class equals the true label.
accuracy = numpy.sum(
  [ans==ret for ans, ret in zip(labels, predicted_classes)]) / float(N)

print(accuracy)

## 9. Distributed Training with Cloud ML Engine

Unfortunately, tf.keras currently doesn't support distributed training. [The issue](https://github.com/tensorflow/tensorflow/issues/14504) has already been assigned to the Keras author.

### a. Set environments

In [None]:
import os

# Fill in your own values here. They were previously overwritten right below
# by hard-coded settings of one specific project, so edits to these lines had
# no effect and jobs were pointed at someone else's project and bucket.
PROJECT = 'project-id' # REPLACE WITH YOUR PROJECT ID
BUCKET = 'bucket-name' # REPLACE WITH YOUR BUCKET NAME
REGION = 'bucket-region' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1

# Export the settings so the %%bash cells below can read them.
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET
os.environ['REGION'] = REGION

In [None]:
%%bash
# Point the gcloud CLI at the project and region exported to the environment
# by the previous cell.
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

### b. Set permission to BUCKET (NOTE: Create bucket beforehand)

In [None]:
%%bash

PROJECT_ID=$PROJECT
AUTH_TOKEN=$(gcloud auth print-access-token)

# Ask the Cloud ML API for the project's configuration and extract the service
# account from the JSON response. print() is written in function form so the
# one-liner runs under both Python 2 and Python 3.
SVC_ACCOUNT=$(curl -X GET -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_TOKEN" \
    "https://ml.googleapis.com/v1/projects/${PROJECT_ID}:getConfig" \
    | python -c "import json; import sys; response = json.load(sys.stdin); \
    print(response['serviceAccount'])")

# Expansions are quoted so an empty or unusual value cannot split into extra
# arguments.
echo "Authorizing the Cloud ML Service account $SVC_ACCOUNT to access files in $BUCKET"
gsutil -m defacl ch -u "$SVC_ACCOUNT:R" "gs://$BUCKET"
gsutil -m acl ch -u "$SVC_ACCOUNT:R" -r "gs://$BUCKET"  # error message (if bucket is empty) can be ignored
gsutil -m acl ch -u "$SVC_ACCOUNT:W" "gs://$BUCKET"

### c. Copy TFRecords files to GCS BUCKET

In [None]:
%%bash

# Replace any previous copy of the dataset in the bucket with the locally
# generated TFRecords files.
echo ${BUCKET}
gsutil -m rm -rf gs://${BUCKET}/cifar-10
gsutil -m cp cifar-10/*.tfrecords gs://${BUCKET}/cifar-10

### d. Run distributed training with Cloud MLE

#### [Success] with Keras, BASIC (single), HEAD (1.5.0-dev20171207)

In [None]:
%%bash

# Output location for this run and a job name made unique by a UTC timestamp.
OUTDIR=gs://$BUCKET/trained_models_dk1
JOBNAME=sm_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME

# Clear previous output, then submit the training job on the BASIC scale tier
# with the HEAD runtime. The options after the bare `--` are the trainer's own
# arguments.
gsutil -m rm -rf $OUTDIR
gcloud ml-engine jobs submit training $JOBNAME \
   --region=$REGION \
   --module-name='distributed-keras.task' \
   --package-path='trainer/distributed-keras' \
   --job-dir=$OUTDIR \
   --staging-bucket=gs://$BUCKET \
   --scale-tier=BASIC \
   --runtime-version=HEAD \
   -- \
   --bucket_name=$BUCKET \
   --train_data_pattern='cifar-10/train*.tfrecords' \
   --eval_data_pattern='cifar-10/eval*.tfrecords'  \
   --output_dir=$OUTDIR

#### [Failed] Keras, BASIC_GPU (single), HEAD (1.5.0-dev20171208)

In [None]:
%%bash

# Output location for this run and a job name made unique by a UTC timestamp.
OUTDIR=gs://$BUCKET/trained_models_dk2
JOBNAME=sm_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME

# Same submission as the BASIC run, but on the BASIC_GPU scale tier. The
# options after the bare `--` are the trainer's own arguments.
gsutil -m rm -rf $OUTDIR
gcloud ml-engine jobs submit training $JOBNAME \
   --region=$REGION \
   --module-name='distributed-keras.task' \
   --package-path='trainer/distributed-keras' \
   --job-dir=$OUTDIR \
   --staging-bucket=gs://$BUCKET \
   --scale-tier=BASIC_GPU \
   --runtime-version=HEAD \
   -- \
   --bucket_name=$BUCKET \
   --train_data_pattern='cifar-10/train*.tfrecords' \
   --eval_data_pattern='cifar-10/eval*.tfrecords'  \
   --output_dir=$OUTDIR

#### [Failed] Keras, STANDARD_1 (distributed), HEAD (1.5.0-dev20171208)

InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'loss/activation_1_loss/sub': Operation was explicitly assigned to /job:ps/task:2 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device. `[[Node: loss/activation_1_loss/sub = Sub[T=DT_FLOAT, _device="/job:ps/task:2"](loss/activation_1_loss/sub/x, loss/activation_1_loss/Const)]]`

In [None]:
%%bash

# Output location for this run and a job name made unique by a UTC timestamp.
OUTDIR=gs://$BUCKET/trained_models_dk3
JOBNAME=sm_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME

# Same submission as the BASIC run, but on the STANDARD_1 (distributed) scale
# tier; this is the run that fails with the InvalidArgumentError quoted above.
# The options after the bare `--` are the trainer's own arguments.
gsutil -m rm -rf $OUTDIR
gcloud ml-engine jobs submit training $JOBNAME \
   --region=$REGION \
   --module-name='distributed-keras.task' \
   --package-path='trainer/distributed-keras' \
   --job-dir=$OUTDIR \
   --staging-bucket=gs://$BUCKET \
   --scale-tier=STANDARD_1 \
   --runtime-version=HEAD \
   -- \
   --bucket_name=$BUCKET \
   --train_data_pattern='cifar-10/train*.tfrecords' \
   --eval_data_pattern='cifar-10/eval*.tfrecords'  \
   --output_dir=$OUTDIR

## Root Cause?

As shown above, tf.keras does not currently work in a distributed setting. The same problem is reported in this [GitHub issue](https://github.com/tensorflow/tensorflow/issues/14504). A minimal example that reproduces it is shown below.

In [None]:
import os
import numpy as np
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

# When True, TF_CONFIG describes a cluster with one master, one parameter
# server and two workers, with this process acting as the master.
simulate_cluster = True # InvalidArgumentError will happen if this is True
if simulate_cluster:
    os.environ["TF_CONFIG"] = '{"environment": "cloud", "cluster": {"worker": ["localhost:27184", "localhost:27185"], \
               "ps": ["localhost:27183"], "master": ["localhost:27182"]}, "job": {"args": [""], \
               "job_name": "trainer.task"}, "task": {"index": 0, "type": "master"}}'
else:
    os.environ["TF_CONFIG"] = ''

# Smallest possible Keras model: a single dense layer mapping 10 inputs to 10
# outputs.
inputs = tf.keras.layers.Input(shape=(10,))
outputs = tf.keras.layers.Dense(10)(inputs)
model = tf.keras.models.Model(inputs, outputs)
model.compile(optimizer='Adam', loss='binary_crossentropy')
est_keras = tf.keras.estimator.model_to_estimator(keras_model=model) # InvalidArgumentError thrown here if simulate_cluster is True

# Random data is used as both features and labels; the feature key is the
# model's input name.
input_name = model.input_names[0]
data = np.random.rand(1000,10).astype(np.float32)
train_input_fn = tf.estimator.inputs.numpy_input_fn({input_name:data}, data, batch_size=10, num_epochs=None, shuffle=False)

# The same input function serves training (up to step 100) and evaluation
# (10 steps).
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=100)
eval_spec = tf.estimator.EvalSpec(input_fn=train_input_fn, steps=10)
tf.estimator.train_and_evaluate(est_keras, train_spec, eval_spec)