In [167]:
# Setup constants and globals.
import os
PROJECT = 'predictions-api-to-cloud-ml'       # CHANGE THIS
REGION = 'us-west1-a'                         # CHANGE THIS
STORAGE_BUCKET = 'papi-bucket'                # CHANGE THIS
TRAINING_DATA_FILE = 'sample/train.csv'       # CHANGE THIS
VALIDATION_DATA_FILE = 'sample/valid.csv'     # CHANGE THIS
SCHEMA_FILE = 'sample/taxifare.json'          # CHANGE THIS
TAXIFARE_REGRESSION_MODEL_ID = 'taxifare_regression'
TRAINING_DATA_FILE_BUCKET_LOCATION = 'gs://' + STORAGE_BUCKET + '/' + TRAINING_DATA_FILE
EVAL_DATA_FILE_BUCKET_LOCATION = 'gs://' + STORAGE_BUCKET + '/' + VALIDATION_DATA_FILE
SCHEMA_FILE_BUCKET_LOCATION = 'gs://' + STORAGE_BUCKET + '/' + SCHEMA_FILE
OUTDIR = 'gs://' + STORAGE_BUCKET + '/'
PREPROCESSING = 'preprocessing'
PREPROCESSING_DIR = OUTDIR + PREPROCESSING
TRAINING = 'training'
TRAINING_DIR = OUTDIR + TRAINING
FEATURES_FILE = PREPROCESSING_DIR + '/features.json'
TRAINING_REGION = 'us-central1' # This has to be one of [us-central1, us-east1, europe-west1, asia-east1] apparently.
SCALE_TIER = 'STANDARD_1'
VERSION_NAME = 'v1'

# for bash
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

In [95]:
%bash
echo "project=$PROJECT"
echo "region=$REGION"
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

project=predictions-api-to-cloud-ml
region=us-west1-a


Updated property [core/project].
Updated property [compute/region].


In [148]:
# Import necessary libs.
import google.datalab.ml as ml
import google.datalab.storage as storage
import json
import mltoolbox.regression.linear as sd
from tensorflow.python.lib.io import file_io

print('sd ' + str(sd.__version__))

cml 0.1.9.1-alpha
sd 1.0.0
tf 1.0.0


In [149]:
# Setup storage bucket for Machine Learning Engine and empty preprocessing directory.
BUCKET = storage.Bucket(STORAGE_BUCKET)
for preprocessing_obj in BUCKET.objects(prefix=PREPROCESSING):
    preprocessing_obj.delete()
# Read all files in preprocessing folder within the bucket.
for preprocessing_obj in BUCKET.objects(prefix=PREPROCESSING):
    print preprocessing_obj

In [150]:
# Initialize training and evaluation data sets and analyze training data for the model in Machine Learning Engine.
train_csv = ml.CsvDataSet(
  file_pattern=TRAINING_DATA_FILE_BUCKET_LOCATION,
  schema_file=SCHEMA_FILE_BUCKET_LOCATION)
eval_csv = ml.CsvDataSet(
  file_pattern=EVAL_DATA_FILE_BUCKET_LOCATION,
  schema_file=SCHEMA_FILE_BUCKET_LOCATION)
sd.analyze(
  dataset=train_csv,
  output_dir=PREPROCESSING_DIR,
  cloud=True
)

Track BigQuery status at
https://bigquery.cloud.google.com/queries/predictions-api-to-cloud-ml
Running numerical analysis...done.
Running categorical analysis...done.
Analyze: completed


In [152]:
# Create feature transformations for the model in Machine Learning Engine.
features = {
  "fare_amount": {"transform": "target"},
  "key": {"transform": "key"}, 
  "dayofweek": {"transform": "one_hot"},
  "hourofday": {"transform": "embedding", "embedding_dim": 2}, # group-combine the hour
}
file_io.write_string_to_file(FEATURES_FILE, json.dumps(features, indent=2))

In [153]:
# Read all files in preprocessing folder within the bucket.
for preprocessing_obj in BUCKET.objects(prefix=PREPROCESSING):
    print preprocessing_obj

Google Cloud Storage Object gs://papi-bucket/preprocessing/features.json
Google Cloud Storage Object gs://papi-bucket/preprocessing/schema.json
Google Cloud Storage Object gs://papi-bucket/preprocessing/stats.json
Google Cloud Storage Object gs://papi-bucket/preprocessing/vocab_dayofweek.csv
Google Cloud Storage Object gs://papi-bucket/preprocessing/vocab_hourofday.csv
Google Cloud Storage Object gs://papi-bucket/preprocessing/vocab_key.csv


In [154]:
# Empty training directory to prepare for training.
for training_obj in BUCKET.objects(prefix=TRAINING):
    training_obj.delete()
# Read all files in training folder within the bucket.
for training_obj in BUCKET.objects(prefix=TRAINING):
    print training_obj

In [None]:
# Train new model in Machine Learning Engine.
TRAINING_CONFIG = ml.CloudTrainingConfig(region=TRAINING_REGION, scale_tier=SCALE_TIER)
sd.train(
  train_dataset=train_csv,
  eval_dataset=eval_csv,
  features=features,
  analysis_dir=PREPROCESSING_DIR,
  output_dir=TRAINING_DIR,
  max_steps=2500,
  cloud=TRAINING_CONFIG
)

Building package and uploading to gs://papi-bucket/training/staging/trainer.tar.gz
Job request send. View status of job at
https://console.developers.google.com/ml/jobs?project=predictions-api-to-cloud-ml


In [158]:
# Read all files in training folder within the bucket.
for training_obj in BUCKET.objects(prefix=TRAINING):
    print training_obj

Google Cloud Storage Object gs://papi-bucket/training/
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/assets.extra/
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/assets.extra/features.json
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/assets.extra/schema.json
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/saved_model.pb
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/variables/
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/variables/variables.data-00000-of-00001
Google Cloud Storage Object gs://papi-bucket/training/evaluation_model/variables/variables.index
Google Cloud Storage Object gs://papi-bucket/training/features_file.json
Google Cloud Storage Object gs://papi-bucket/training/model/
Google Cloud Storage Object gs://papi-bucket/training/model/assets.extra/
Google Clo

In [159]:
# List current ML jobs.
for job in ml.Jobs().get_iterator():
    print job

{u'trainingOutput': {u'consumedMLUnits': 1.67}, u'trainingInput': {u'scaleTier': u'STANDARD_1', u'region': u'us-central1', u'args': [u'--train-data-paths=gs://papi-bucket/sample/train.csv', u'--eval-data-paths=gs://papi-bucket/sample/valid.csv', u'--preprocess-output-dir=gs://papi-bucket/preprocessing', u'--transforms-file=gs://papi-bucket/training/features_file.json', u'--model-type=linear_regression', u'--max-steps=2500', u'--train-batch-size=100', u'--eval-batch-size=16', u'--min-eval-frequency=100', u'--learning-rate=0.01', u'--epsilon=0.0005'], u'pythonModule': u'mltoolbox._structured_data.trainer.task', u'jobDir': u'gs://papi-bucket/training', u'packageUris': [u'gs://papi-bucket/training/staging/trainer.tar.gz', u'gs://cloud-datalab/deploy/tf/tensorflow-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl', u'gs://cloud-datalab/deploy/tf/protobuf-3.1.0-py2.py3-none-any.whl']}, u'jobId': u'mltoolbox_regression_linear_170321_180125', u'state': u'SUCCEEDED', u'startTime': u'2017-03-21T18:12:44Z'

In [168]:
# Delete the model in Machine Learning Engine in case it was created previously.
ml.Models().delete(model_name=TAXIFARE_REGRESSION_MODEL_ID)

Waiting for operation "projects/predictions-api-to-cloud-ml/operations/delete_model_taxifare_regression-1490190644"
Done.


In [None]:
# Deploy the model in Machine Learning Engine (create a model object for it and push the training binary into the object).
ml.Models().create(model_name=TAXIFARE_REGRESSION_MODEL_ID)
ml.ModelVersions(model_name=TAXIFARE_REGRESSION_MODEL_ID).deploy(version_name=VERSION_NAME, path=TRAINING_DIR)

Waiting for operation "projects/predictions-api-to-cloud-ml/operations/create_taxifare_regression_v1-1490190693190"


In [171]:
# List current ML models.
for model in ml.Models().get_iterator():
    print model

{u'regions': [u'us-central1'], u'defaultVersion': {u'deploymentUri': u'gs://papi-bucket/training/model', u'name': u'projects/predictions-api-to-cloud-ml/models/taxifare_regression/versions/v1', u'isDefault': True, u'createTime': u'2017-03-22T13:51:33Z'}, u'name': u'projects/predictions-api-to-cloud-ml/models/taxifare_regression'}


In [188]:
# Get predictions from new model in Machine Learning Engine.
csv_format_predictions_input = [
  'Mon,0,-73.984685,40.769262,-73.991065,40.728145,3.0,2009-06-01 00:48:00.000000-73.984740.769340.7281-73.9911',
  'Mon,0,-74.006927,40.739993,-73.950025,40.773403,3.0,2009-06-01 00:48:00.000000-74.006940.7440.7734-73.95',
  'Tue,0,-73.977345,40.779387,-73.97615,40.778867,3.0,2009-06-02 00:48:00.000000-73.977340.779440.7789-73.9762',
  'Wed,0,-73.97136,40.794413,-73.99623,40.74524,3.0,2009-06-03 00:48:00.000000-73.971440.794440.7452-73.9962',
  'Thu,0,-73.997642,40.763853,-73.99485,40.750282,3.0,2009-06-04 00:48:00.000000-73.997640.763940.7503-73.9948',
  'Fri,0,-74.004538,40.742202,-73.955823,40.773485,3.0,2009-06-05 00:48:00.000000-74.004540.742240.7735-73.9558',
  'Fri,0,-74.000589,40.73731,-73.985902,40.692725,3.0,2012-06-15 00:46:17.000000-74.000640.737340.6927-73.9859',
  'Sat,0,-73.995432,40.72114,-73.992403,40.719745,3.0,2009-06-06 00:48:00.000000-73.995440.721140.7197-73.9924',
  'Sat,0,-73.945033,40.779203,-73.952037,40.766802,3.0,2009-06-06 00:48:00.000000-73.94540.779240.7668-73.952',
]
sd.predict(
  data=csv_format_predictions_input,
  model_name=TAXIFARE_REGRESSION_MODEL_ID,
  model_version=VERSION_NAME,
  cloud=True
)

Unnamed: 0,key,predicted
0,2009-06-01 00:48:00.000000-73.984740.769340.72...,9.82895
1,2009-06-01 00:48:00.000000-74.006940.7440.7734...,9.82887
2,2009-06-02 00:48:00.000000-73.977340.779440.77...,9.96345
3,2009-06-03 00:48:00.000000-73.971440.794440.74...,10.2289
4,2009-06-04 00:48:00.000000-73.997640.763940.75...,9.73902
5,2009-06-05 00:48:00.000000-74.004540.742240.77...,10.0021
6,2012-06-15 00:46:17.000000-74.000640.737340.69...,9.99913
7,2009-06-06 00:48:00.000000-73.995440.721140.71...,9.49842
8,2009-06-06 00:48:00.000000-73.94540.779240.766...,9.49971
