<h1> Exploring tf.transform </h1>

tf.transform is in beta.

In [None]:
%bash
# Upgrade tensorflow_transform and pin protobuf to version 3.1.0.
pip install --upgrade tensorflow_transform
pip install --upgrade protobuf==3.1.0

In [None]:
import google.cloud.ml as ml
import tensorflow as tf
import tensorflow_transform as tft
import shutil
# Show the TensorFlow version and the Cloud ML SDK location in use.
# The parenthesised form prints the same on Python 2 and also runs on Python 3.
print(tf.__version__)
print(ml.sdk_location)

In [None]:
import os
PROJECT = 'cloud-training-demos'    # CHANGE THIS
BUCKET = 'cloud-training-demos-ml'  # CHANGE THIS
REGION = 'us-central1' # CHANGE THIS

# Mirror the settings into the process environment (for bash).
os.environ.update({'PROJECT': PROJECT, 'BUCKET': BUCKET, 'REGION': REGION})

In [None]:
import apache_beam as beam
import tensorflow as tf

from tensorflow_transform import coders
from tensorflow_transform.beam import impl as tft
from tensorflow_transform.beam import io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_transform import api
from tensorflow_transform import mappers

# Columns of the input CSV files, in file order.
CSV_COLUMNS = [
    'fare_amount', 'dayofweek', 'hourofday',
    'pickuplon', 'pickuplat', 'dropofflon', 'dropofflat',
    'passengers', 'key',
]
# Float columns that the preprocessing function passes through scale_to_0_1.
SCALE_COLUMNS = [
    'pickuplon', 'pickuplat', 'dropofflon', 'dropofflat', 'passengers',
]
LABEL_COLUMN = 'fare_amount'
KEY_FEATURE_COLUMN = 'key'

class PathConstants:
  """Names of the files and sub-directories used for pipeline outputs.

  Every instance carries the same fixed set of string attributes;
  preprocess_all joins several of them onto its output directory.
  """

  def __init__(self):
    names = {
        'TEMP_DIR': 'tmp',
        'TRANSFORM_FN_DIR': 'transform_fn',
        'RAW_METADATA_DIR': 'raw_metadata',
        'TRANSFORMED_METADATA_DIR': 'transformed_metadata',
        'TRANSFORMED_TRAIN_DATA_FILE_PREFIX': 'features_train',
        'TRANSFORMED_EVAL_DATA_FILE_PREFIX': 'features_eval',
        'TRANSFORMED_PREDICT_DATA_FILE_PREFIX': 'features_predict',
        'TRAIN_RESULTS_FILE': 'train_results',
        'DEPLOY_SAVED_MODEL_DIR': 'saved_model',
        'MODEL_EVALUATIONS_FILE': 'model_evaluations',
        'BATCH_PREDICTION_RESULTS_FILE': 'batch_prediction_results',
    }
    # Store each name as an instance attribute.
    for attribute, value in names.items():
      setattr(self, attribute, value)
    
def make_preprocessing_fn():
  """Returns the preprocessing function handed to tf.transform.

  The returned function takes a dict of input columns keyed by name and
  returns a dict with one entry per name in CSV_COLUMNS, each with a trailing
  dimension added. Columns listed in SCALE_COLUMNS are passed through
  mappers.scale_to_0_1 before the dimension is added.
  """
  # stop-gap ...
  def _add_trailing_dim(column):
    # FeatureColumns expect shape (batch_size, 1), not just (batch_size)
    return api.map(lambda tensor: tf.expand_dims(tensor, -1), column)

  def preprocessing_fn(inputs):
    outputs = {}
    for name in CSV_COLUMNS:
      outputs[name] = _add_trailing_dim(inputs[name])
    # The scaled version replaces the pass-through entry created above.
    for name in SCALE_COLUMNS:
      outputs[name] = _add_trailing_dim(mappers.scale_to_0_1(inputs[name]))

    # use tft.map() to create new columns
    # tft.scale_to_0_1
    # tft.map(tf.sparse_column_with_keys, inputs['gender'], Statistic({'M', 'F'})
    # tft.string_to_int(inputs[name], frequency_threshold=frequency_threshold)
    return outputs

  return preprocessing_fn

def make_input_schema(mode):
  """Builds the schema describing the raw input columns.

  Args:
    mode: a tf.contrib.learn.ModeKeys value. The label column is included
      for every mode except INFER.

  Returns:
    The result of dataset_schema.from_feature_spec on a feature spec with a
    scalar FixedLenFeature per column: string for 'dayofweek' and 'key',
    int64 for 'hourofday', float32 for the label and the SCALE_COLUMNS.
  """
  def _scalar(dtype, default):
    # Every input column is a scalar (shape=[]) with a default value.
    return tf.FixedLenFeature(shape=[], dtype=dtype, default_value=default)

  feature_spec = {}
  if mode != tf.contrib.learn.ModeKeys.INFER:
    feature_spec[LABEL_COLUMN] = _scalar(tf.float32, 0.0)
  feature_spec.update(
      {name: _scalar(tf.string, 'null') for name in ('dayofweek', 'key')})
  feature_spec['hourofday'] = _scalar(tf.int64, 0)
  feature_spec.update(
      {name: _scalar(tf.float32, 0.0) for name in SCALE_COLUMNS})

  return dataset_schema.from_feature_spec(feature_spec)

def make_coder(schema, mode):
  """Builds the CsvCoder used to parse input lines for the given mode.

  Args:
    schema: schema passed through to coders.CsvCoder.
    mode: a tf.contrib.learn.ModeKeys value. In INFER mode the label column
      is left out of the column list.

  Returns:
    A coders.CsvCoder over the selected column names and the given schema.
  """
  # Work on a copy so the module-level CSV_COLUMNS list is never modified.
  column_names = list(CSV_COLUMNS)
  if mode == tf.contrib.learn.ModeKeys.INFER:
    # list.pop() takes an integer index; the label has to be removed by value.
    column_names.remove(LABEL_COLUMN)
  return coders.CsvCoder(column_names, schema)

def preprocess_all(pipeline, training_data, eval_data, predict_data, output_dir, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Adds the full preprocessing graph to a Beam pipeline.

  The training data is analyzed and transformed with the function from
  make_preprocessing_fn; the resulting transform function is then applied to
  the evaluation data. Raw metadata, the transform function and the
  transformed train/eval examples are written under output_dir. Nothing is
  executed here; the stages only run when the caller runs the pipeline.

  Args:
    pipeline: the beam.Pipeline the stages are attached to.
    training_data: file pattern passed to beam.io.ReadFromText for training.
    eval_data: file pattern passed to beam.io.ReadFromText for evaluation.
    predict_data: optional file pattern of prediction input; skipped when
      falsy.
    output_dir: directory under which all outputs are written.
    mode: tf.contrib.learn.ModeKeys value used to build the input schema and
      the CSV coder for the training and evaluation data.

  Returns:
    A tuple (transform_fn, train_dataset, evaluate_dataset). Both datasets
    take the result of writing transform_fn as a side input (see the
    workaround at the end of the function).
  """
  path_constants = PathConstants()
  # Scratch directory handed to AnalyzeAndTransformDataset.
  work_dir = os.path.join(output_dir, path_constants.TEMP_DIR)
  
  # Schema of the raw input columns for the requested mode.
  input_schema = make_input_schema(mode)

  # CSV coder that decodes one text line into a row matching the schema.
  coder = make_coder(input_schema, mode)

  # Read the text files and decode every line with the coder.
  train_data = (
      pipeline
      | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
      | 'ParseTrainingCsv' >> beam.Map(coder.decode))

  evaluate_data = (
      pipeline
      | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
      | 'ParseEvalCsv' >> beam.Map(coder.decode))

  # Metadata describing the raw (untransformed) data; written to
  # <output_dir>/raw_metadata.
  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  # Analyze the training data and transform it in the same step; this also
  # yields the transform_fn that is reused for the evaluation data below.
  preprocessing_fn = make_preprocessing_fn()
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn, work_dir))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  transform_fn_is_written = (transform_fn | io.WriteTransformFn(output_dir))

  # TODO(b/34231369) Remember to eventually also save the statistics.

  # Apply the transform_fn computed from the training data to the eval data.
  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # Serialize the transformed training rows and write them as TFRecord files
  # named <output_dir>/features_train*.tfrecord.gz.
  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  _ = (train_dataset
       | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
       | 'WriteTraining'
       >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  # Same for the transformed evaluation rows (features_eval*.tfrecord.gz).
  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  _ = (evaluate_dataset
       | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
       | 'WriteEval'
       >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  # Optional prediction input: always handled in INFER mode, i.e. with a
  # schema and coder that omit the label column.
  # NOTE(review): these rows are re-encoded with the raw INFER schema and are
  # not passed through transform_fn -- confirm that this is intended.
  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = make_input_schema(mode=predict_mode)
    tsv_coder = make_coder(predict_schema, mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    _ = (pipeline
         | 'ReadPredictData' >> beam.io.ReadFromText(predict_data,
                                                     coder=tsv_coder)
         # TODO(b/35194257) Obviate the need for this explicit serialization.
         | 'EncodePredictData' >> beam.Map(predict_coder.encode)
         | 'WritePredictData' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

  # Workaround b/35366670, to ensure that training and eval don't start before
  # the transform_fn is written.
  # The identity Map takes the write result as a side input, so the returned
  # collections depend on it without changing any element.
  train_dataset |= beam.Map(
      lambda x, y: x, y=beam.pvalue.AsSingleton(transform_fn_is_written))
  evaluate_dataset |= beam.Map(
      lambda x, y: x, y=beam.pvalue.AsSingleton(transform_fn_is_written))

  return transform_fn, train_dataset, evaluate_dataset

# Inputs and output location for a local run of the preprocessing pipeline.
train_data_paths = './sample/train.csv'
eval_data_paths = './sample/valid.csv'
output_dir = './taxi_preproc'
predict_data_paths = None

# Remove the output of any previous run. Uses output_dir rather than a second
# hard-coded copy of the path, so changing output_dir cleans the right place.
shutil.rmtree(output_dir, ignore_errors=True)

p = beam.Pipeline()
transform_fn, train_dataset, eval_dataset = preprocess_all(
    p, train_data_paths, eval_data_paths, predict_data_paths, output_dir)

p.run()

In [None]:
# List what the preprocessing pipeline wrote to its output directory.
!ls taxi_preproc

<h2> Train off preprocessed data </h2>

In [None]:
%bash
# Remove the package archive and training output left by any previous run.
rm -rf taxifare.tar.gz taxi_trained
# Make the trainer package importable for `python -m trainer.task`.
export PYTHONPATH=${PYTHONPATH}:/content/training-data-analyst/courses/machine_learning/feateng/taxifare
# Train on the TFRecord files under taxi_preproc.
# NOTE(review): the patterns below only match files whose names start with
# features_train-00001 / features_eval-00001 -- confirm such shards exist.
python -m trainer.task \
   --train_data_paths="/content/training-data-analyst/courses/machine_learning/feateng/taxi_preproc/features_train-00001*" \
   --eval_data_paths="/content/training-data-analyst/courses/machine_learning/feateng/taxi_preproc/features_eval-00001*"  \
   --output_dir=/content/training-data-analyst/courses/machine_learning/feateng/taxi_trained \
   --num_epochs=10 --job-dir=/tmp \
   --format=tfrecord

In [None]:
# List the contents of the export/Servo directory under the training output.
!ls /content/training-data-analyst/courses/machine_learning/feateng/taxi_trained/export/Servo

In [None]:
%writefile /tmp/test.json
{"dayofweek":"Thu","hourofday":17,"pickuplon": -73.885262,"pickuplat": 40.773008,"dropofflon": -73.987232,"dropofflat": 40.732403,"passengers": 2}

In [None]:
%bash
# Run a local prediction with the exported model on /tmp/test.json.
servo_dir=/content/training-data-analyst/courses/machine_learning/feateng/taxi_trained/export/Servo
# The export directory may hold more than one entry; take the last one in
# ls's sorted order so --model-dir always receives a single path.
model_dir=$(ls ${servo_dir}/ | tail -1)
gcloud ml-engine local predict \
    --model-dir=${servo_dir}/${model_dir} \
    --json-instances=/tmp/test.json

Copyright 2016 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License