In [29]:
import tensorflow as tf
print(tf.__version__)

1.13.1


In [30]:
# change these to try this notebook out
BUCKET = 'huiyi-sandbox'
PROJECT = 'huiyi-training'
REGION = 'us-central1'

In [31]:
import os
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

In [32]:
%%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

Updated property [core/project].
Updated property [compute/region].


In [33]:
%%bash
if ! gsutil ls | grep -q gs://${BUCKET}/; then
  gsutil mb -l ${REGION} gs://${BUCKET}
fi

In [34]:
%bash
gsutil ls gs://${BUCKET}/preproc/*-00000*

gs://huiyi-sandbox/preproc/eval.csv-00000-of-00001
gs://huiyi-sandbox/preproc/train.csv-00000-of-00001


Training on Cloud ML Engine requires:

1. Making the code a Python package
2. Using gcloud to submit the training code to Cloud ML Engine

In [35]:
%writefile tuto/trainer/task.py
import argparse
import json
import os

from . import model

import tensorflow as tf

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--bucket',
        help = 'GCS path to data. We assume that data is in gs://BUCKET/preproc/',
        required = True
    )
    parser.add_argument(
        '--output_dir',
        help = 'GCS location to write checkpoints and export models',
        required = True
    )
    parser.add_argument(
        '--batch_size',
        help = 'Number of examples to compute gradient over.',
        type = int,
        default = 128
    )
    parser.add_argument(
        '--job-dir',
        help = 'this model ignores this field, but it is required by gcloud',
        default = 'junk'
    )
 
    ## TODO 1: add the new arguments here 
    parser.add_argument(
        '--train_examples',
        help = 'Number of examples (in thousands) to run the training job over. If this is more than actual # of examples available,\
        it cycles through them. So specifying 1000 here when you have only 100k examples makes this 10 epochs.',
        type = int,
        default = 5000
    )    
    parser.add_argument(
        '--pattern',
        help = 'Specify a pattern that has to be in input files. For example 00001-of will process only one shard',
        default = 'of'
    )
    parser.add_argument(
        '--eval_steps',
        help = 'Positive number of steps for which to evaluate model. Default to None, \
        which means to evaluate until input_fn raises an end-of-input exception',
        type = int,       
        default = None
    )
        
    ## parse all arguments
    args = parser.parse_args()
    arguments = args.__dict__

    # unused args provided by service
    #arguments.pop('job_dir', None)
    arguments.pop('job-dir', None)

    ## assign the arguments to the model variables
    output_dir = arguments.pop('output_dir')
    model.BUCKET     = arguments.pop('bucket')
    model.BATCH_SIZE = arguments.pop('batch_size')
    model.TRAIN_STEPS = (arguments.pop('train_examples') * 1000) / model.BATCH_SIZE
    model.EVAL_STEPS = arguments.pop('eval_steps')    
    print ("Will train for {} steps using batch_size={}".format(model.TRAIN_STEPS, model.BATCH_SIZE))
    model.PATTERN = arguments.pop('pattern')

    # Append trial_id to path if we are doing hptuning
    # This code can be removed if you are not using hyperparameter tuning
    output_dir = os.path.join(
        output_dir,
        json.loads(
            os.environ.get('TF_CONFIG', '{}')
        ).get('task', {}).get('trial', '')
    )
    
    print("outpur_dir: {}".format(output_dir))
    
    # Run the training job
    model.train_and_evaluate(output_dir)

Overwriting tuto/trainer/task.py


In [36]:
%writefile tuto/trainer/model.py
import shutil
import numpy as np
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

BUCKET = None  # set from task.py
PATTERN = 'of' # gets all files

# Determine CSV, label, and key columns
CSV_COLUMNS = 'SALE_PRICE,BOROUGH,BLOCK,ZIP_CODE,key'.split(',')
LABEL_COLUMN = 'SALE_PRICE'
KEY_COLUMN = 'key'

# Set default values for each CSV column
DEFAULTS = [[600000.0], ['1'], [1418.0], [10065.0],['nokey']]

# Define some hyperparameters
TRAIN_STEPS = 10000
EVAL_STEPS = None
BATCH_SIZE = 128

# Create an input function reading a file using the Dataset API
# Then provide the results to the Estimator API
def read_dataset(prefix, mode, batch_size=128):
    def _input_fn():
        def decode_csv(value_column):
            columns = tf.decode_csv(value_column, record_defaults=DEFAULTS)
            features = dict(zip(CSV_COLUMNS, columns))
            label = features.pop(LABEL_COLUMN)
            return features, label
        
        # Use prefix to create file path
        file_path = 'gs://{}/preproc/{}*{}*'.format(BUCKET, prefix, PATTERN)
        
        # Create list of files that match pattern
        file_list = tf.gfile.Glob(file_path)

        # Create dataset from file list
        dataset = (tf.data.TextLineDataset(file_list)  # Read text file
                    .map(decode_csv))  # Transform each elem by applying decode_csv fn
              
        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None # indefinitely
            dataset = dataset.shuffle(buffer_size = 10 * batch_size) #melanger les donnees
        else:
            num_epochs = 1 # end-of-input after this
 
        dataset = dataset.repeat(num_epochs).batch(batch_size)
  
        return dataset.make_one_shot_iterator().get_next()
    return _input_fn


#Create feature columns for estimator
def make_feature_cols():
    # Define column types
    INPUT_COLUMNS = \
        [\
            tf.feature_column.categorical_column_with_vocabulary_list('BOROUGH', ['1', '2', '3', '4', '5']),
            tf.feature_column.numeric_column('BLOCK'),
            tf.feature_column.numeric_column('ZIP_CODE')
        ]
    
    return INPUT_COLUMNS


# Create serving input function to be able to serve predictions later using provided inputs
def serving_input_fn():
    feature_placeholders = {
        'BOROUGH': tf.placeholder(tf.string, [None]),
        'BLOCK': tf.placeholder(tf.string, [None]),
        'ZIP_CODE': tf.placeholder(tf.string, [None]),
        KEY_COLUMN: tf.placeholder_with_default(tf.constant(['nokey']), [None])
    }

    features = {
        key: tf.expand_dims(tensor, -1)
        for key, tensor in feature_placeholders.items()
    }
    
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)


# create metric for hyperparameter tuning
def my_rmse(labels, predictions):
    pred_values = predictions['predictions']
    return {'rmse': tf.metrics.root_mean_squared_error(labels, pred_values)}
  
#create estimator to train and evaluate
def train_and_evaluate(OUTDIR):
    tf.summary.FileWriterCache.clear() # ensure filewriter cache is clear for TensorBoard events file
    #Linear Regression with tf.Estimator framework
    shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

    tf.logging.set_verbosity(tf.logging.INFO)

    #create model of linear regression
    #estimator = tf.estimator.LinearRegressor(feature_columns = make_feature_cols())
    estimator = tf.estimator.LinearRegressor(
                       model_dir = OUTDIR,
                       feature_columns = make_feature_cols())
    
    EVAL_INTERVAL = 300 # seconds

    ## set the save_checkpoints_secs to the EVAL_INTERVAL
    run_config = tf.estimator.RunConfig(save_checkpoints_secs = EVAL_INTERVAL,
                                        keep_checkpoint_max = 3)

    # illustrates how to add an extra metric
    estimator = tf.contrib.estimator.add_metrics(estimator, my_rmse)
    # for batch prediction, you need a key associated with each instance
    estimator = tf.contrib.estimator.forward_features(estimator, KEY_COLUMN)

    #Set the third argument of read_dataset to BATCH_SIZE and set max_steps to TRAIN_STEPS
    train_spec = tf.estimator.TrainSpec(
        input_fn = read_dataset('train', tf.estimator.ModeKeys.TRAIN, BATCH_SIZE),
        max_steps = TRAIN_STEPS)
    
    #exporter = tf.estimator.LatestExporter('exporter', serving_input_fn, exports_to_keep=None)
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)

    # Lastly, set steps equal to EVAL_STEPS
    eval_spec = tf.estimator.EvalSpec(
        input_fn = read_dataset('eval', tf.estimator.ModeKeys.EVAL, 2**15),  # no need to batch in eval
        steps = EVAL_STEPS,
        start_delay_secs = 60, # start evaluating after N seconds
        throttle_secs = EVAL_INTERVAL,  # evaluate every N seconds
        exporters = exporter)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


Overwriting tuto/trainer/model.py


make sure it works standalone. (Note the --pattern and --train_examples lines so that I am not trying to boil the ocean on my laptop).

Test en locale avant de deployer dans Cloud ML Engine

In [9]:
%bash
echo "bucket=${BUCKET}"
rm -rf tuto_trained

export PYTHONPATH=${PYTHONPATH}:${PWD}/tuto

python -m trainer.task \
  --bucket=${BUCKET} \
  --output_dir=tuto_trained \
  --job-dir=./train_tmp \
  --pattern="00000-of-" --train_examples=1 --eval_steps=1

bucket=huiyi-sandbox
Will train for 7 steps using batch_size=128
outpur_dir: tuto_trained/

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



  from ._conv import register_converters as _register_converters
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff196788d90>, '_model_dir': 'tuto_trained/', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_evaluation_master': '', '_eval_distribute': None, '_train_distribute': None, '_master': ''}
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas

Once the code works in standalone mode, you can run it on Cloud ML Engine. 

Because this is on the entire dataset, it will take a while.  You can monitor the job from the GCP console in the Cloud Machine Learning Engine section.

In [10]:
%bash
OUTDIR=gs://${BUCKET}/tuto/trained_model
JOBNAME=tuto_$(date -u +%y%m%d_%H%M%S)

echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR

gcloud ml-engine jobs submit training $JOBNAME \
  --region=$REGION \
  --module-name=trainer.task \
  --package-path=$(pwd)/tuto/trainer \
  --job-dir=$OUTDIR \
  --staging-bucket=gs://$BUCKET \
  --scale-tier=STANDARD_1 \
  --runtime-version=$TFVERSION \
  -- \
  --bucket=${BUCKET} \
  --output_dir=${OUTDIR} \
  --train_examples=200000

gs://huiyi-sandbox/tuto/trained_model us-central1 tuto_190425_120443
jobId: tuto_190425_120443
state: QUEUED


Removing gs://huiyi-sandbox/tuto/trained_model/#1556191188781780...
Removing gs://huiyi-sandbox/tuto/trained_model/checkpoint#1556191193028981...
Removing gs://huiyi-sandbox/tuto/trained_model/eval/#1556186912153037...
Removing gs://huiyi-sandbox/tuto/trained_model/eval/events.out.tfevents.1556186912.cmle-training-master-1b52619fce-0-crmnn#1556191198642390...
Removing gs://huiyi-sandbox/tuto/trained_model/events.out.tfevents.1556186886.cmle-training-master-1b52619fce-0-crmnn#1556191195564196...
Removing gs://huiyi-sandbox/tuto/trained_model/export/#1556186914807070...
/ [1/78 objects]   1% Done                                                      / [2/78 objects]   2% Done                                                      / [3/78 objects]   3% Done                                                      Removing gs://huiyi-sandbox/tuto/trained_model/export/exporter/#1556186915097740...
Removing gs://huiyi-sandbox/tuto/trained_model/export/exporter/1556186913/#1556186921431443...
/ [

Monitor with TensorBoard

In [21]:
from google.datalab.ml import TensorBoard
TensorBoard().start('gs://{}/trained_model'.format(BUCKET))

12885

stop the tensorboard after if you finish the monitor

In [22]:
for pid in TensorBoard.list()['pid']:
    TensorBoard().stop(pid)
    print('Stopped TensorBoard with pid {}'.format(pid))

Stopped TensorBoard with pid 12885


Hyperparameter tuning

All of these are command-line parameters to my program. To do hyperparameter tuning, create hyperparam.xml and pass it as --configFile. This step will take up to 2 hours -- you can increase maxParallelTrials or reduce maxTrials to get it done faster. Since maxParallelTrials is the number of initial seeds to start searching from, you don't want it to be too large; otherwise, all you have is a random search.

In [37]:
%writefile hyperparam.yaml
trainingInput:
  scaleTier: STANDARD_1
  hyperparameters:
    hyperparameterMetricTag: rmse
    goal: MINIMIZE
    maxTrials: 20
    maxParallelTrials: 5
    enableTrialEarlyStopping: True
    params:
    - parameterName: batch_size
      type: INTEGER
      minValue: 8
      maxValue: 218
      scaleType: UNIT_LOG_SCALE
     

Overwriting hyperparam.yaml


In [28]:
%bash
OUTDIR=gs://${BUCKET}/tuto/hyperparam
JOBNAME=tuto_$(date -u +%y%m%d_%H%M%S)

echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR

gcloud ml-engine jobs submit training $JOBNAME \
  --region=$REGION \
  --module-name=trainer.task \
  --package-path=$(pwd)/tuto/trainer \
  --job-dir=$OUTDIR \
  --staging-bucket=gs://$BUCKET \
  --scale-tier=STANDARD_1 \
  --config=hyperparam.yaml \
  --runtime-version=$TFVERSION \
  -- \
  --bucket=${BUCKET} \
  --output_dir=${OUTDIR} \
  --eval_steps=10 \
  --train_examples=20000
    

gs://huiyi-sandbox/tuto/hyperparam us-central1 tuto_190425_133139
jobId: tuto_190425_133139
state: QUEUED


CommandException: 1 files/objects could not be removed.
Job [tuto_190425_133139] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe tuto_190425_133139

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs tuto_190425_133139


Repeat training

This time with tuned parameters (note last line)

In [38]:
%bash
OUTDIR=gs://${BUCKET}/tuto/trained_model_tuned
JOBNAME=tuto_$(date -u +%y%m%d_%H%M%S)

echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR

gcloud ml-engine jobs submit training $JOBNAME \
  --region=$REGION \
  --module-name=trainer.task \
  --package-path=$(pwd)/tuto/trainer \
  --job-dir=$OUTDIR \
  --staging-bucket=gs://$BUCKET \
  --scale-tier=STANDARD_1 \
  --runtime-version=$TFVERSION \
  -- \
  --bucket=${BUCKET} \
  --output_dir=${OUTDIR} \
  --train_examples=20000 --batch_size=35

gs://huiyi-sandbox/tuto/trained_model_tuned us-central1 tuto_190425_142441
jobId: tuto_190425_142441
state: QUEUED
gs://huiyi-sandbox/tuto/trained_model_tuned us-central1 tuto_190425_142441
jobId: tuto_190425_142441
state: QUEUED


CommandException: 1 files/objects could not be removed.
Job [tuto_190425_142441] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe tuto_190425_142441

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs tuto_190425_142441
CommandException: 1 files/objects could not be removed.
Job [tuto_190425_142441] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe tuto_190425_142441

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs tuto_190425_142441
