# Babyweight Estimation with Transformed Data

### Set global flags

In [None]:
# Notebook-wide settings. RUN_LOCAL selects whether outputs are written under
# ROOT_DIR on the local filesystem or under gs://BUCKET/ROOT_DIR.
PROJECT = 'ksalama-gcp-playground' # change to your project_Id
BUCKET = 'ksalama-gcs-cloudml' # change to your bucket name
REGION = 'europe-west1' # change to your region
ROOT_DIR = 'babyweight_tft' # directory where the output is stored locally or on GCS

# True: keep all artefacts on the local filesystem; False: use the GCS bucket.
RUN_LOCAL = True

In [None]:
import os

# Mirror the notebook settings into the process environment so that the
# %%bash cells can read them.
os.environ.update({
    'PROJECT': PROJECT,
    'BUCKET': BUCKET,
    'REGION': REGION,
    'ROOT_DIR': ROOT_DIR,
    # Exported as a lowercase 'true'/'false' string rather than a Python bool.
    'RUN_LOCAL': 'true' if RUN_LOCAL else 'false',
})

## Import required packages and modules

In [None]:
import os

import tensorflow as tf
from tensorflow import data

from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_transform.tf_metadata import metadata_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

In [None]:
# List the installed packages whose names match tensorflow, beam and
# cloud-dataflow, to record which versions this notebook ran against.
!pip list | grep 'tensorflow'
!pip list | grep 'beam'
!pip list | grep 'cloud-dataflow'

In [None]:
# Outputs live under ROOT_DIR locally, or under gs://BUCKET/ROOT_DIR otherwise.
# Plain truthiness is used (not `== True`) so this agrees with how RUN_LOCAL is
# interpreted when it is exported to the environment.
OUTPUT_DIR = ROOT_DIR if RUN_LOCAL else "gs://{}/{}".format(BUCKET, ROOT_DIR)

# Sub-directories of OUTPUT_DIR used throughout the notebook.
TRANSFORM_ARTEFACTS_DIR = os.path.join(OUTPUT_DIR, 'transform')
TRANSFORMED_DATA_DIR = os.path.join(OUTPUT_DIR, 'transformed')
TEMP_DIR = os.path.join(OUTPUT_DIR, 'tmp')
MODELS_DIR = os.path.join(OUTPUT_DIR, 'models')

## Transform Metadata

In [None]:
# Load the metadata (schema) of the transformed dataset from the
# "transformed_metadata" folder under the transform artefacts directory.
transformed_metadata = metadata_io.read_metadata(
    os.path.join(TRANSFORM_ARTEFACTS_DIR, "transformed_metadata"))

# Name of the label column; it is popped from the parsed features in the
# input function and skipped when feature columns are created.
TARGET_FEATURE_NAME = 'weight_pounds'

# print() call form: same output on Python 2 for a single argument, and
# consistent with the print() calls used elsewhere in this notebook.
print(transformed_metadata.schema)

## Input Function

In [None]:
def tfrecords_input_fn(files_name_pattern, transformed_metadata,
                       mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=1,
                       batch_size=500):
    """Read batches of transformed examples from TFRecord files.

    Args:
      files_name_pattern: file pattern of the TFRecord files to read.
      transformed_metadata: metadata whose schema supplies the feature spec
        used to parse the records.
      mode: a tf.estimator.ModeKeys value; records are shuffled only when it
        is TRAIN.
      num_epochs: forwarded as the dataset's ``num_epochs``.
      batch_size: number of records per batch; also sizes the shuffle buffer.

    Returns:
      A ``(features, target)`` pair, where ``target`` is the
      TARGET_FEATURE_NAME entry removed from the parsed features.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    feature_spec = transformed_metadata.schema.as_feature_spec()

    batched_dataset = tf.contrib.data.make_batched_features_dataset(
        file_pattern=files_name_pattern,
        batch_size=batch_size,
        features=feature_spec,
        reader=tf.data.TFRecordDataset,
        num_epochs=num_epochs,
        shuffle=is_training,
        shuffle_buffer_size=2 * batch_size + 1,
        prefetch_buffer_size=1,
    )

    # Split the label off the parsed feature dictionary.
    features = batched_dataset.make_one_shot_iterator().get_next()
    target = features.pop(TARGET_FEATURE_NAME)
    return features, target

## Feature columns

In [None]:
def create_wide_and_deep_feature_columns(transformed_metadata, hparams):
    """Derive wide (linear) and deep (DNN) feature columns from the schema.

    Float columns become numeric deep columns. Integer columns flagged as
    categorical become identity-categorical wide columns; other integer
    columns become numeric deep columns. The target column is skipped, and
    columns with any other domain type are ignored.

    Args:
      transformed_metadata: metadata whose ``schema.column_schemas`` maps
        feature names to column schemas.
      hparams: object exposing ``extend_feature_columns`` and, when that flag
        is set, ``embed_dimension``.

    Returns:
      A ``(wide_feature_columns, deep_feature_columns)`` tuple of lists.
    """
    deep_feature_columns = []
    wide_feature_columns = []

    column_schemas = transformed_metadata.schema.column_schemas

    for feature_name, column_schema in column_schemas.items():
        if feature_name == TARGET_FEATURE_NAME:
            continue

        # NOTE(review): this reads private attributes (_domain, _is_categorical,
        # _max_value) of the schema objects; prefer public accessors if the
        # installed tensorflow_transform version provides them.
        domain = column_schema._domain

        # creating numerical features
        if isinstance(domain, dataset_schema.FloatDomain):
            deep_feature_columns.append(
                tf.feature_column.numeric_column(feature_name))

        # creating categorical features with identity
        elif isinstance(domain, dataset_schema.IntDomain):
            if domain._is_categorical:
                wide_feature_columns.append(
                    tf.feature_column.categorical_column_with_identity(
                        feature_name,
                        # identity ids run from 0 to max_value inclusive
                        num_buckets=domain._max_value + 1)
                )
            else:
                deep_feature_columns.append(
                    tf.feature_column.numeric_column(feature_name))

    if hparams.extend_feature_columns:
        # Cross of mother's age bucket and race, hashed into 55 buckets; the
        # cross is used directly in the wide part and embedded for the deep part.
        mother_race_X_mother_age_bucketized = tf.feature_column.crossed_column(
            ['mother_age_bucketized', 'mother_race_index'], 55)

        wide_feature_columns.append(mother_race_X_mother_age_bucketized)

        mother_race_X_mother_age_bucketized_embedded = tf.feature_column.embedding_column(
            mother_race_X_mother_age_bucketized, hparams.embed_dimension)

        deep_feature_columns.append(mother_race_X_mother_age_bucketized_embedded)

    # print() call form: identical output on Python 2 for single arguments and
    # consistent with the print() calls used elsewhere in this notebook.
    print("Wide columns:")
    print(wide_feature_columns)
    print("")
    print("Deep columns:")
    print(deep_feature_columns)
    print("")

    return wide_feature_columns, deep_feature_columns

## Estimator

In [None]:
def create_estimator(run_config, hparams):
    """Build the wide-and-deep regressor.

    Args:
      run_config: configuration object passed to the estimator as ``config``.
      hparams: hyper-parameters; ``hidden_units`` sizes the DNN layers and the
        whole object is forwarded to the feature-column factory.

    Returns:
      A ``tf.estimator.DNNLinearCombinedRegressor``.
    """
    # Feature columns are derived from the notebook-level transformed_metadata.
    linear_columns, dnn_columns = create_wide_and_deep_feature_columns(
        transformed_metadata, hparams)

    return tf.estimator.DNNLinearCombinedRegressor(
        linear_feature_columns=linear_columns,
        dnn_feature_columns=dnn_columns,
        dnn_hidden_units=hparams.hidden_units,
        config=run_config,
    )

## Experiment

In [None]:
# Hyper-parameters of the experiment, collected in one mapping and then
# wrapped in an HParams object for attribute-style access.
_HPARAM_VALUES = {
    'num_epochs': 10,
    'batch_size': 500,
    'hidden_units': [32, 16],
    'max_steps': 100,
    'embed_dimension': 5,
    'extend_feature_columns': False,
    'evaluate_after_sec': 10,
}
hparams = tf.contrib.training.HParams(**_HPARAM_VALUES)

# The estimator uses a dedicated folder under MODELS_DIR as its model_dir;
# the random seed is fixed.
model_dir = os.path.join(MODELS_DIR, "dnn_estimator")
run_config = tf.estimator.RunConfig(
    model_dir=model_dir,
    tf_random_seed=19830610,
)

In [None]:
train_data_files = os.path.join(TRANSFORMED_DATA_DIR, "train-*.tfrecords")
eval_data_files = os.path.join(TRANSFORMED_DATA_DIR, "eval-*.tfrecords")


def _train_input_fn():
    """Input function for training: shuffled batches sized by hparams."""
    return tfrecords_input_fn(
        train_data_files,
        transformed_metadata,
        mode=tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size,
    )


def _eval_input_fn():
    """Input function for evaluation, using tfrecords_input_fn's defaults."""
    return tfrecords_input_fn(eval_data_files, transformed_metadata)


# TrainSpec: training stops once max_steps is reached.
train_spec = tf.estimator.TrainSpec(
    input_fn=_train_input_fn,
    max_steps=hparams.max_steps,
)

# EvalSpec: steps=None, with evaluations throttled to at most one every
# hparams.evaluate_after_sec seconds.
eval_spec = tf.estimator.EvalSpec(
    input_fn=_eval_input_fn,
    steps=None,
    throttle_secs=hparams.evaluate_after_sec,
)

In [None]:
from datetime import datetime


def _clock(moment):
    """Format *moment* as HH:MM:SS for the progress banners."""
    return moment.strftime("%H:%M:%S")


# Remove any previous model directory so training starts from scratch.
if tf.gfile.Exists(model_dir):
    tf.gfile.DeleteRecursively(model_dir)

estimator = create_estimator(run_config, hparams)

tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow()
print("")
print("Experiment started at {}".format(_clock(time_start)))
print(".......................................")

# Train and evaluate using the specs defined earlier in the notebook.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(_clock(time_end)))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

## Raw data metadata

In [None]:
# Column names of the raw (untransformed) data, grouped by how their schema
# entries are declared in create_raw_metadata().
CATEGORICAL_FEATURE_NAMES = ['is_male', 'mother_race']
NUMERIC_FEATURE_NAMES = ['mother_age', 'plurality', 'gestation_weeks']
# Same value as the TARGET_FEATURE_NAME assigned earlier in the notebook.
TARGET_FEATURE_NAME = 'weight_pounds'
# Key column; it is removed from the serving inputs, like the target.
KEY_COLUMN = 'key'

def create_raw_metadata():
    """Describe the raw (pre-transform) dataset as DatasetMetadata.

    Every column is declared as a scalar with a fixed representation: the key,
    the target and the numeric features as float32, the categorical features
    as string.

    Returns:
      A ``dataset_metadata.DatasetMetadata`` wrapping the raw schema.
    """

    def _scalar_column(dtype):
        """Schema entry for one scalar column of the given dtype."""
        return dataset_schema.ColumnSchema(
            dtype, [], dataset_schema.FixedColumnRepresentation())

    # key and target feature schema
    raw_data_schema = {}
    raw_data_schema[KEY_COLUMN] = _scalar_column(tf.float32)
    raw_data_schema[TARGET_FEATURE_NAME] = _scalar_column(tf.float32)

    # categorical features schema
    for column_name in CATEGORICAL_FEATURE_NAMES:
        raw_data_schema[column_name] = _scalar_column(tf.string)

    # numerical features schema
    for column_name in NUMERIC_FEATURE_NAMES:
        raw_data_schema[column_name] = _scalar_column(tf.float32)

    # create dataset_metadata given raw_schema
    return dataset_metadata.DatasetMetadata(
        dataset_schema.Schema(raw_data_schema))

import pprint
# Pretty-print the feature spec derived from the raw schema as a quick
# visual check of the column names and types.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(create_raw_metadata().schema.as_feature_spec())

## Export Estimator to SavedModel

In [None]:
def serving_input_receiver_fn():
    """Serving input function that accepts raw features and transforms them.

    Builds batched placeholders for the raw features (without the target and
    key columns), runs the saved transform function on them, and hands the
    transformed tensors to the model.

    Returns:
      A ``tf.estimator.export.ServingInputReceiver`` whose features are the
      transformed tensors and whose receiver tensors are the raw placeholders.
    """
    from tensorflow_transform.saved import saved_transform_io

    # Placeholders for every raw column, minus the ones dropped for serving.
    receiver_tensors = create_raw_metadata().schema.as_batched_placeholders()
    for excluded_column in (TARGET_FEATURE_NAME, KEY_COLUMN):
        receiver_tensors.pop(excluded_column)

    # Apply the saved transform_fn to the raw placeholders.
    transform_fn_dir = os.path.join(
        TRANSFORM_ARTEFACTS_DIR, transform_fn_io.TRANSFORM_FN_DIR)
    _unused, transformed_features = (
        saved_transform_io.partially_apply_saved_transform(
            transform_fn_dir, receiver_tensors)
    )

    return tf.estimator.export.ServingInputReceiver(
        transformed_features, receiver_tensors)

# Exports are written beneath <model_dir>/export.
export_dir = os.path.join(model_dir, 'export')

# Remove any earlier export before writing a new one.
if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)
        
# Export the trained estimator using the raw-feature serving input function.
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=serving_input_receiver_fn
)

# Expose the export location to the %%bash cell that inspects the model.
os.environ['export_dir'] = export_dir

## Inspect the Exported Model

In [None]:
%%bash
# Locate the last-listed SavedModel under ${export_dir} and show its signatures.
#
# RUN_LOCAL is exported by the notebook as the string 'true' or 'false'. A bare
# `[ ${RUN_LOCAL} ]` only tests for a non-empty string and is therefore always
# true, so compare against the literal value instead.
if [ "${RUN_LOCAL}" = "true" ]
then
  # Local filesystem: ls prints entry names only, so prepend the parent directory.
  saved_model_dir="${export_dir}/$(ls "${export_dir}" | tail -n 1)"
else
  # Remote (GCS) output: list the export directory with gsutil.
  saved_model_dir=$(gsutil ls "${export_dir}" | tail -n 1)
fi

echo "${saved_model_dir}"
saved_model_cli show --dir="${saved_model_dir}" --all

## Use Exported Model for Prediction

In [None]:
# Pick the export to load: sort the directory listing and take the last entry
# instead of relying on listing order, matching the `tail -n 1` selection in
# the inspection cell.
# NOTE(review): assumes export sub-directories are timestamp-named so the last
# entry in sorted order is the newest - confirm against export_savedmodel docs.
saved_model_dir = os.path.join(
    export_dir, sorted(tf.gfile.ListDirectory(export_dir))[-1])

# print() call form: identical output on Python 2 for a single argument and
# consistent with the print() calls used elsewhere in this notebook.
print(saved_model_dir)

def estimate_local(instance):
    """Predict the target for a single raw instance with the exported model.

    Args:
      instance: dict mapping raw feature names to scalar values.

    Returns:
      The first value of the first row of the model's 'predictions' output.
    """
    # The model is loaded from saved_model_dir on every call.
    predictor_fn = tf.contrib.predictor.from_saved_model(
        export_dir=saved_model_dir,
        signature_def_key="predict"
    )

    # Wrap each value in a list so the single instance forms a batch of one.
    batch_of_one = {name: [value] for name, value in instance.items()}
    predictions = predictor_fn(batch_of_one)['predictions']
    return predictions[0][0]

# Sample raw record (no target or key column) used to smoke-test the export.
instance = dict(
    is_male='True',
    mother_age=26.0,
    mother_race='Asian Indian',
    plurality=1.0,
    gestation_weeks=39,
)

prediction = estimate_local(instance)
print(prediction)