# Hyperparameter tuning with Cloud ML Engine

**Learning Objectives:**
  * Improve the accuracy of a model by hyperparameter tuning

## Create command-line program

In order to submit to Cloud ML Engine, we need to create a distributed training program. Let's convert our housing example to fit that paradigm, using Experiment and Estimator.

In [74]:
%%bash
# Start from an empty trainer/ directory and mark it as a Python package.
rm -rf trainer
mkdir trainer
touch trainer/__init__.py

In [75]:
%%writefile trainer/house.py
import os
import math
import json
import shutil
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.contrib.learn as estimators
import tensorflow.contrib.layers as tflayers
from tensorflow.contrib.learn.python.learn import learn_runner
from tensorflow.contrib.learn.python.learn.learn_io import pandas_input_fn

def train(output_dir, batch_size, learning_rate):
    """Train and evaluate a linear regressor on the California housing data.

    Reads the public training CSV, derives a ``num_rooms`` feature
    (``total_rooms / households``), randomly splits the rows roughly 80/20
    into training and evaluation sets, and runs a ``tf.contrib.learn``
    Experiment through ``learn_runner``.

    Args:
        output_dir: Base directory for model checkpoints. The ``task.trial``
            value from the ``TF_CONFIG`` environment variable, if present,
            is appended so each trial writes to its own subdirectory.
        batch_size: Number of examples per training batch.
        learning_rate: Learning rate for the FTRL optimizer. It also scales
            the number of training steps (smaller rate => more steps).
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    # read dataset and split into train and eval
    df = pd.read_csv("https://storage.googleapis.com/ml_universities/california_housing_train.csv", sep=",")
    df['num_rooms'] = df['total_rooms'] / df['households']
    # NOTE: the split is unseeded, so it differs from run to run.
    msk = np.random.rand(len(df)) < 0.8
    traindf = df[msk]
    evaldf = df[~msk]

    # train and eval input functions
    SCALE = 100000
    # num_epochs=None lets the training input repeat indefinitely so that
    # train_steps (below) decides when training stops. With num_epochs=1 the
    # input would be exhausted after a single pass and train_steps would
    # never be reached.
    train_input_fn = pandas_input_fn(x=traindf[["num_rooms"]],
                           y=traindf["median_house_value"]/SCALE,  # note the scaling
                           num_epochs=None,
                           batch_size=batch_size, # note the batch size
                           shuffle=True)
    # Evaluation makes exactly one pass over the held-out rows in one batch.
    eval_input_fn = pandas_input_fn(x=evaldf[["num_rooms"]],
                              y=evaldf["median_house_value"]/SCALE,  # note the scaling
                              num_epochs=1, batch_size=len(evaldf), shuffle=False)

    def _experiment_fn(output_dir):
        # setup output directory to not clobber
        output_dir = os.path.join(output_dir,json.loads(
          os.environ.get('TF_CONFIG', '{}')).get('task', {}).get('trial', ''))

        # create estimator
        features = [tflayers.real_valued_column('num_rooms')]
        myopt = tf.train.FtrlOptimizer(learning_rate=learning_rate) # note the learning rate
        model = estimators.LinearRegressor(model_dir=output_dir,
                                   feature_columns=features,
                                   optimizer=myopt,
                                   gradient_clip_norm=5.0)

        # compute appropriate number of steps:
        # (steps per epoch over the training split) / learning_rate,
        # e.g. if learning_rate=0.01, hundred epochs.
        # Steps must be a whole number, and at least one.
        steps_per_epoch = len(traindf) / float(batch_size)
        num_steps = max(1, int(steps_per_epoch / learning_rate))

        experiment = estimators.Experiment(model,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            #eval_metrics = {'rmse': estimators.MetricSpec(metric_fn=tf.metrics.root_mean_squared_error)},
            train_steps=num_steps
        )
        return experiment

    learn_runner.run(_experiment_fn, output_dir=output_dir)
    
if __name__ == '__main__' and "get_ipython" not in dir():
  # Command-line entry point. The second condition skips this section when
  # the name get_ipython is present in the module namespace.
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--learning_rate',
      help='learning rate for the optimizer',
      type=float, default=0.01
  )
  parser.add_argument(
      '--batch_size',
      help='number of examples per training batch',
      type=int, default=30
  )
  parser.add_argument(
      '--output_dir',
      help='GCS location to write checkpoints and export models',
      required=True
  )
  # Parsed but never read; accepted only so the flag does not cause an error.
  parser.add_argument(
      '--job-dir',
      help='this model ignores this field, but it is required by gcloud',
      default='junk'
  )
  args = parser.parse_args()
  train(args.output_dir, args.batch_size, args.learning_rate)

Writing trainer/house.py


In [None]:
%%bash
# Run the trainer locally; arguments after the bare "--" go to trainer.house.
rm -rf house_trained
gcloud ml-engine local train --module-name=trainer.house --job-dir=output_dir --package-path=$(pwd)/trainer \
   -- --output_dir=house_trained --batch_size=30 --learning_rate=0.02

## Create hyperparam.yaml

In [80]:
%%writefile hyperparam.yaml
# Hyperparameter tuning spec: search batch_size and learning_rate to minimize loss.
trainingInput:
  hyperparameters:
    goal: MINIMIZE
    maxTrials: 5
    maxParallelTrials: 1
    hyperparameterMetricTag: loss
    params:
    # parameterName values correspond to the trainer's --batch_size and
    # --learning_rate command-line flags.
    - parameterName: batch_size
      type: INTEGER
      minValue: 8
      maxValue: 512
      scaleType: UNIT_LOG_SCALE
    - parameterName: learning_rate
      type: DOUBLE
      minValue: 0.001
      maxValue: 0.1
      scaleType: UNIT_LOG_SCALE

Overwriting hyperparam.yaml


In [None]:
%%bash
OUTDIR=gs://asl-ml-immersion-temp/house_trained   # CHANGE bucket name appropriately
# Clear any previous output so old checkpoints are not reused.
gsutil rm -rf "$OUTDIR"
# Job name gets a UTC timestamp suffix; arguments after the bare "--" go to trainer.house.
gcloud ml-engine jobs submit training house_$(date -u +%y%m%d_%H%M%S) --config=hyperparam.yaml \
   --module-name=trainer.house --package-path=$(pwd)/trainer --job-dir="$OUTDIR" \
   -- --output_dir="$OUTDIR"

In [90]:
# Show the job's state and per-trial results; replace the job ID with the one from your own submission.
!gcloud ml-engine jobs describe house_170820_034011

createTime: '2017-08-20T03:40:19Z'
endTime: '2017-08-20T04:07:30Z'
jobId: house_170820_034011
startTime: '2017-08-20T03:40:23Z'
state: SUCCEEDED
trainingInput:
  args:
  - --output_dir=gs://asl-ml-immersion-temp/house_trained
  hyperparameters:
    goal: MINIMIZE
    hyperparameterMetricTag: loss
    maxParallelTrials: 1
    maxTrials: 5
    params:
    - maxValue: 512.0
      minValue: 8.0
      parameterName: batch_size
      scaleType: UNIT_LOG_SCALE
      type: INTEGER
    - maxValue: 0.1
      minValue: 0.001
      parameterName: learning_rate
      scaleType: UNIT_LOG_SCALE
      type: DOUBLE
  jobDir: gs://asl-ml-immersion-temp/house_trained
  packageUris:
  - gs://asl-ml-immersion-temp/house_trained/packages/2663496b2b798e578d89c2b8155296551772bfde42525379a8cc3c9f28153661/trainer-0.0.0.tar.gz
  pythonModule: trainer.house
  region: us-central1
trainingOutput:
  completedTrialCount: '5'
  consumedMLUnits: 0.23
  isHyperparameterTuningJob: true
  trials:
  - finalMetric:
      ob