<h1> Exploring tf.transform </h1>

In [None]:
%bash
# Upgrade tensorflow_transform in the notebook's Python environment to the
# newest release that pip can find.
# NOTE(review): the version is not pinned, so a later run may install an API
# that differs from the one the cells below were written against -- consider
# pinning once the intended version is confirmed.
pip install --upgrade tensorflow_transform

In [None]:
import google.cloud.ml as ml
import tensorflow as tf
import tensorflow_transform as tft
import shutil
# Show which TensorFlow build and which Cloud ML SDK this kernel picked up.
# The parenthesised single-argument form prints the same text on Python 2 and
# is also valid syntax on Python 3 (the bare `print x` statement is not).
print(tf.__version__)
print(ml.sdk_location)

In [None]:
import os

# Cloud settings used by this lab -- CHANGE ALL THREE to your own values.
PROJECT = 'cloud-training-demos'
BUCKET = 'cloud-training-demos-ml'
REGION = 'us-central1'

# Export the settings so that bash cells can read $PROJECT, $BUCKET and $REGION.
os.environ.update({
    'PROJECT': PROJECT,
    'BUCKET': BUCKET,
    'REGION': REGION,
})

In [22]:
import apache_beam as beam
import tensorflow as tf

from tensorflow_transform import coders
from tensorflow_transform.beam import impl as tft
from tensorflow_transform.beam import io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_transform import api
from tensorflow_transform import mappers

# Input column names. make_coder() hands them to the CSV coder in this order
# (after the label column, when one is present), and make_input_schema()
# declares every one of them as a scalar float64 feature.
INPUT_COLUMNS = [
    'pickuplon',
    'pickuplat',
    'dropofflon',
    'dropofflat',
    'passengers',
    'key',
]
# Name of the column holding the value to predict.
LABEL_COLUMN = 'fare_amount'

class PathConstants(object):
  """Names of the files and sub-directories used below the output directory.

  The values are plain class-level constants, so they can be read either from
  the class itself (`PathConstants.TEMP_DIR`) or from an instance
  (`PathConstants().TEMP_DIR`); existing callers that instantiate the class
  keep working unchanged.
  """
  # Scratch space handed to the analyze/transform step.
  TEMP_DIR = 'tmp'
  # Sub-directories for the transform function and the metadata.
  TRANSFORM_FN_DIR = 'transform_fn'
  RAW_METADATA_DIR = 'raw_metadata'
  TRANSFORMED_METADATA_DIR = 'transformed_metadata'
  # File-name prefixes of the TFRecord shards written by preprocessing.
  TRANSFORMED_TRAIN_DATA_FILE_PREFIX = 'features_train'
  TRANSFORMED_EVAL_DATA_FILE_PREFIX = 'features_eval'
  TRANSFORMED_PREDICT_DATA_FILE_PREFIX = 'features_predict'
  # Names reserved for training / serving / evaluation artifacts.
  TRAIN_RESULTS_FILE = 'train_results'
  DEPLOY_SAVED_MODEL_DIR = 'saved_model'
  MODEL_EVALUATIONS_FILE = 'model_evaluations'
  BATCH_PREDICTION_RESULTS_FILE = 'batch_prediction_results'
    
def make_preprocessing_fn():
  """Builds the preprocessing function handed to AnalyzeAndTransformDataset.

  Returns:
    A function `preprocessing_fn(inputs)` that maps the dict of raw columns to
    a dict of transformed columns: the label is passed through unchanged and
    every column in INPUT_COLUMNS is scaled with `mappers.scale_to_0_1`. Each
    output column additionally gets a trailing dimension of size 1.
  """

  def _as_column_vector(column):
    # Stop-gap: FeatureColumns expect shape (batch_size, 1), not (batch_size,).
    return api.map(lambda values: tf.expand_dims(values, -1), column)

  def preprocessing_fn(inputs):
    # The label is reshaped only; it is not rescaled.
    outputs = {LABEL_COLUMN: _as_column_vector(inputs[LABEL_COLUMN])}
    outputs.update(
        (column, _as_column_vector(mappers.scale_to_0_1(inputs[column])))
        for column in INPUT_COLUMNS)

    # Ideas for further columns (the helpers live in the `api` and `mappers`
    # modules imported above):
    #   api.map(...) to derive new columns
    #   mappers.scale_to_0_1(...)
    #   map(tf.sparse_column_with_keys, inputs['gender'], Statistic({'M', 'F'})
    #   string_to_int(inputs[name], frequency_threshold=frequency_threshold)
    return outputs

  return preprocessing_fn

def make_input_schema(mode):
  """Builds the schema describing the raw (untransformed) columns.

  Args:
    mode: a tf.contrib.learn.ModeKeys value. For INFER the label column is
      left out of the schema; for any other mode it is included.

  Returns:
    The schema produced by `dataset_schema.from_feature_spec` for a feature
    spec in which every column is a scalar float64 feature. Input columns
    default to 0; the label has no default.
  """
  is_inference = mode == tf.contrib.learn.ModeKeys.INFER

  feature_spec = {}
  if not is_inference:
    feature_spec[LABEL_COLUMN] = tf.FixedLenFeature(shape=[], dtype=tf.float64)
  feature_spec.update({
      column: tf.FixedLenFeature(shape=[], dtype=tf.float64, default_value=0)
      for column in INPUT_COLUMNS
  })
  return dataset_schema.from_feature_spec(feature_spec)

def make_coder(schema, mode):
  """Creates the CsvCoder used to parse one line of the input files.

  Args:
    schema: schema of the columns, as returned by make_input_schema().
    mode: a tf.contrib.learn.ModeKeys value. Unless it is INFER, the label
      column is expected as the first field of every line.

  Returns:
    A `coders.CsvCoder` whose column order is the (optional) label column
    followed by INPUT_COLUMNS.
  """
  is_inference = mode == tf.contrib.learn.ModeKeys.INFER
  leading_columns = [] if is_inference else [LABEL_COLUMN]
  return coders.CsvCoder(leading_columns + list(INPUT_COLUMNS), schema)

def preprocess_all(pipeline, training_data, eval_data, predict_data, output_dir, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Adds the CSV -> transformed TFRecord preprocessing steps to `pipeline`.

  The steps are only attached to the pipeline here; nothing executes until the
  caller runs the pipeline.

  Args:
    pipeline: the beam Pipeline the steps are applied to.
    training_data: path given to beam.io.ReadFromText for the training CSV.
      This data is both analyzed and transformed.
    eval_data: path given to beam.io.ReadFromText for the evaluation CSV. It
      is transformed with the transform function derived from the training
      data.
    predict_data: optional path of prediction input; the predict steps are
      skipped when this is falsy (e.g. None).
    output_dir: directory below which metadata, the transform function and
      the TFRecord files are written, using the names in PathConstants.
    mode: tf.contrib.learn.ModeKeys value forwarded to make_input_schema() and
      make_coder() for the training/eval inputs.

  Returns:
    A tuple `(transform_fn, train_dataset, evaluate_dataset)`. The two
    datasets are the transformed PCollections, each with an added dependency
    on the transform function having been written.
  """
  path_constants = PathConstants()
  # Scratch directory passed to AnalyzeAndTransformDataset below.
  work_dir = os.path.join(output_dir, path_constants.TEMP_DIR)
  
  # Schema of the raw CSV columns.
  input_schema = make_input_schema(mode)

  # CSV coder that parses one text line into the columns of that schema.
  coder = make_coder(input_schema, mode)

  # Read the CSV files as text and decode every line with the coder.
  train_data = (
      pipeline
      | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
      | 'ParseTrainingCsv' >> beam.Map(coder.decode))

  evaluate_data = (
      pipeline
      | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
      | 'ParseEvalCsv' >> beam.Map(coder.decode))

  # Metadata describing the raw data; written to <output_dir>/raw_metadata.
  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  # Analyze the training data and transform it in the same step; the resulting
  # transform_fn is reused below for the eval data.
  preprocessing_fn = make_preprocessing_fn()
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn, work_dir))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  transform_fn_is_written = (transform_fn | io.WriteTransformFn(output_dir))

  # TODO(b/34231369) Remember to eventually also save the statistics.

  # Apply the transform function learned from the training data to the eval data.
  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # Serialize the transformed training rows with ExampleProtoCoder and write
  # them as TFRecord shards named features_train-*.tfrecord.gz.
  # NOTE(review): no compression_type is passed, so whether the shards are
  # really gzip-compressed depends on WriteToTFRecord's default -- confirm.
  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  _ = (train_dataset
       | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
       | 'WriteTraining'
       >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  # Same for the transformed eval rows (features_eval-*.tfrecord.gz).
  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  _ = (evaluate_dataset
       | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
       | 'WriteEval'
       >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  if predict_data:
    # Prediction input has no label column, so it gets its own INFER-mode
    # schema and coder regardless of the `mode` argument.
    # NOTE(review): despite its name, tsv_coder is the CsvCoder returned by
    # make_coder(). The rows are re-encoded with the *raw* predict schema and
    # transform_fn is not applied in this branch, although the output uses the
    # TRANSFORMED_PREDICT_DATA_FILE_PREFIX name -- confirm this is intended.
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = make_input_schema(mode=predict_mode)
    tsv_coder = make_coder(predict_schema, mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    _ = (pipeline
         | 'ReadPredictData' >> beam.io.ReadFromText(predict_data,
                                                     coder=tsv_coder)
         # TODO(b/35194257) Obviate the need for this explicit serialization.
         | 'EncodePredictData' >> beam.Map(predict_coder.encode)
         | 'WritePredictData' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

  # Workaround b/35366670, to ensure that training and eval don't start before
  # the transform_fn is written.
  # Each Map passes its element through unchanged; the only purpose of the
  # singleton side input `y` is to make the returned datasets depend on the
  # WriteTransformFn step. The TFRecord writes above are attached to the
  # datasets before this dependency is added.
  train_dataset |= beam.Map(
      lambda x, y: x, y=beam.pvalue.AsSingleton(transform_fn_is_written))
  evaluate_dataset |= beam.Map(
      lambda x, y: x, y=beam.pvalue.AsSingleton(transform_fn_is_written))

  return transform_fn, train_dataset, evaluate_dataset

# Local CSV inputs and the directory that receives the preprocessed output.
train_data_paths = '/content/training-data-analyst/CPB102/lab1a/taxi-train.csv'
eval_data_paths = '/content/training-data-analyst/CPB102/lab1a/taxi-valid.csv'
output_dir = '/content/training-data-analyst/CPB102/lab4a/taxi_preproc'
predict_data_paths = None  # no prediction input, so the predict steps are skipped

# Start from an empty output directory. Use `output_dir` itself (rather than a
# second copy of the path) so the directory that is cleared is always the one
# the pipeline writes to.
shutil.rmtree(output_dir, ignore_errors=True)

# Build the pipeline; preprocess_all() only attaches the steps.
p = beam.Pipeline()
transform_fn, train_dataset, eval_dataset = preprocess_all(
    p, train_data_paths, eval_data_paths, predict_data_paths, output_dir)

# Execute the pipeline; the returned result object is shown as the cell output.
p.run()

INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /content/training-data-analyst/CPB102/lab4a/taxi_preproc/tmp/tmp/level_0/unbound_transform_fn/saved_model.pb


INFO:tensorflow:SavedModel written to: /content/training-data-analyst/CPB102/lab4a/taxi_preproc/tmp/tmp/level_0/unbound_transform_fn/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /content/training-data-analyst/CPB102/lab4a/taxi_preproc/tmp/unbound_transform_fn/saved_model.pb


INFO:tensorflow:SavedModel written to: /content/training-data-analyst/CPB102/lab4a/taxi_preproc/tmp/unbound_transform_fn/saved_model.pb


<apache_beam.runners.direct.direct_runner.DirectPipelineResult at 0x7f4defebddd0>

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /content/training-data-analyst/CPB102/lab4a/taxi_preproc/tmp/transform_fn/saved_model.pb


INFO:tensorflow:SavedModel written to: /content/training-data-analyst/CPB102/lab4a/taxi_preproc/tmp/transform_fn/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [23]:
!ls taxi_preproc

features_eval-00000-of-00002.tfrecord.gz
features_eval-00001-of-00002.tfrecord.gz
features_train-00000-of-00008.tfrecord.gz
features_train-00001-of-00008.tfrecord.gz
features_train-00002-of-00008.tfrecord.gz
features_train-00003-of-00008.tfrecord.gz
features_train-00004-of-00008.tfrecord.gz
features_train-00005-of-00008.tfrecord.gz
features_train-00006-of-00008.tfrecord.gz
features_train-00007-of-00008.tfrecord.gz
raw_metadata
tmp
transform_fn
transformed_metadata


<h2> Train on the preprocessed data </h2>

In [24]:
%bash
# Directory of this lab; every path below is derived from it.
LAB_DIR=/content/training-data-analyst/CPB102/lab4a

# Remove the results of any previous run. The model directory is removed by
# its absolute path so that it is always the directory passed to --output_dir.
rm -rf taxifare.tar.gz "${LAB_DIR}/taxi_trained"

# Train locally on the TFRecord files produced by the preprocessing pipeline.
# The glob patterns are quoted so they reach the trainer unexpanded instead of
# depending on how the shell treats an unmatched pattern.
gcloud beta ml local train \
   --module-name=trainer.task \
   --package-path="${LAB_DIR}/taxifare/trainer" \
   -- \
   --train_data_paths="${LAB_DIR}/taxi_preproc/features_train*" \
   --eval_data_paths="${LAB_DIR}/taxi_preproc/features_eval*" \
   --num_epochs=100 \
   --format=tfrecord \
   --output_dir="${LAB_DIR}/taxi_trained"

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': u'cloud', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc4fa36fd10>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': ''}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
Instructions for upda

In [25]:
# List the model exports written by the training run; each entry appears to be
# a timestamp-named export directory.
!ls /content/training-data-analyst/CPB102/lab4a/taxi_trained/export/Servo

1487449774892


In [14]:
%writefile /tmp/test.json
{"pickuplon": -73.885262,"pickuplat": 40.773008,"dropofflon": -73.987232,"dropofflat": 40.732403,"passengers": 2}

Overwriting /tmp/test.json


In [26]:
%bash
# The export directory is named after the time of the export, so it changes on
# every training run. Pick the last entry (the newest, since the names sort in
# time order) instead of hard-coding one particular timestamp.
EXPORT_DIR=/content/training-data-analyst/CPB102/lab4a/taxi_trained/export/Servo
MODEL_DIR="${EXPORT_DIR}/$(ls "${EXPORT_DIR}" | tail -1)"

gcloud beta ml local predict \
    --model-dir="${MODEL_DIR}" \
    --json-instances=/tmp/test.json

predictions:
- outputs: 16.3879


W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.



Copyright 2016 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License