<h1> Structured Data Solution </h1>

Use the structured data package in Datalab to build a model.

In [None]:
import os
# Project-wide settings. Replace all three values with your own before running.
PROJECT = 'cloud-training-demos'    # CHANGE THIS
BUCKET = 'cloud-training-demos-ml'  # CHANGE THIS
REGION = 'us-central1'              # CHANGE THIS

# Export the same values as environment variables so the %bash cells can read them.
os.environ.update({
    'PROJECT': PROJECT,
    'BUCKET': BUCKET,
    'REGION': REGION,
})

In [None]:
%bash
# Point gcloud at the project and region chosen in the configuration cell.
# Stop with a clear message if that cell has not been run in this kernel:
# otherwise the variables expand to nothing and gcloud receives malformed commands.
: "${PROJECT:?PROJECT is not set - run the configuration cell first}"
: "${BUCKET:?BUCKET is not set - run the configuration cell first}"
: "${REGION:?REGION is not set - run the configuration cell first}"
echo "project=$PROJECT"
echo "bucket=$BUCKET"
echo "region=$REGION"
# Expansions are quoted so each value reaches gcloud as exactly one argument.
gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"
# -q (quiet) disables interactive prompts, which a notebook cell cannot answer.
gcloud beta ml init-project -q

In [None]:
import tensorflow as tf
import google.cloud.ml as cml
import datalab_solutions.structured_data as sd
# Report the version of each library this notebook relies on, one per line,
# in the order: TensorFlow, structured-data package, Cloud ML SDK.
for label, module in (('tf', tf), ('sd', sd), ('cml', cml)):
    print('{} {}'.format(label, module.__version__))

<h2> Set up schema file </h2>

Schema of the training and test data, in the same JSON format as a BigQuery table schema. Only the STRING, INTEGER and FLOAT column types are supported.

In [None]:
%writefile taxifare.json
[
    {
        "mode": "NULLABLE",
        "name": "fare_amount",
        "type": "FLOAT"
    },  
    {
        "mode": "NULLABLE",
        "name": "pickuplon",
        "type": "FLOAT"
    },
    {
        "mode": "NULLABLE",
        "name": "pickuplat",
        "type": "FLOAT"
    },
    {
        "mode": "NULLABLE",
        "name": "dropofflon",
        "type": "FLOAT"
    },
    {
        "mode": "NULLABLE",
        "name": "dropofflat",
        "type": "FLOAT"
    },
    {
        "mode": "NULLABLE",
        "name": "passengers",
        "type": "FLOAT"
    },
    {
        "mode": "REQUIRED",
        "name": "key",
        "type": "STRING"
    } 
]

<h2> Local preprocessing, training and prediction </h2>

In [None]:
!rm -rf taxi_preproc taxi_model

In [None]:
# Run the package's local preprocessing step over the training CSVs in ../lab1a/,
# described by the taxifare.json schema; its output is written to ./taxi_preproc.
sd.local_preprocess(
    schema_file=os.path.join('.', 'taxifare.json'),
    input_file_pattern=os.path.join('../lab1a/', 'taxi-train*'),
    output_dir=os.path.join('.', 'taxi_preproc'),
)

In [None]:
# Train locally on the taxi-train* files and evaluate on taxi-valid*, reading the
# preprocessing output from ./taxi_preproc and writing the model to ./taxi_model.
# layer_sizes presumably lists the hidden-layer widths (64, then 4) -- confirm
# against the package documentation. top_n is left unset.
sd.local_train(
    model_type='dnn_regression',
    layer_sizes=[64, 4],
    max_steps=250,
    key_column='key',
    train_file_pattern=os.path.join('../lab1a/', 'taxi-train*'),
    eval_file_pattern=os.path.join('../lab1a/', 'taxi-valid*'),
    preprocess_output_dir=os.path.join('.', 'taxi_preproc'),
    output_dir=os.path.join('.', 'taxi_model'),
)

In [None]:
# the true fare_amount (in the input) is optional, but useful for comparison
# Ten sample rows in schema column order:
# fare_amount, pickuplon, pickuplat, dropofflon, dropofflat, passengers, key.
# The true fare_amount (first field) is optional, but useful for comparison.
local_sample_rows = [
    '6.5,-73.981648,40.755953,-73.985661,40.746788,1,0',
    '5.5,-73.967817,40.787522,-73.959902,40.780477,1,1',
    '6.9,-73.991977,40.729534,-73.996962,40.744703,1,2',
    '12.5,-73.978102,40.75265,-73.999422,40.714375,2,3',
    '6.9,-73.989687,40.771346,-73.972793,40.780418,2,4',
    '4.0,-73.865865,40.770915,-73.86766,40.768462,2,5',
    '12.6,-73.978121,40.763045,-73.925264,40.768753,1,6',
    '4.9,-73.996283,40.743747,-73.995543,40.731898,1,7',
    '6.5,-73.990883,40.750464,-73.97307,40.750428,2,8',
    '11.3,-73.971663,40.763008,-73.992665,40.752327,3,9',
]

# Predict with the locally trained model stored under ./taxi_model/model.
sd.local_predict(
    model_dir=os.path.join('.', 'taxi_model/model'),
    data=local_sample_rows,
)

In [None]:
!rm -rf batch_predict

In [None]:
# Batch-predict over the validation files with the locally trained model,
# writing the results as CSV into ./batch_predict.
sd.local_batch_predict(
    output_format='csv',
    output_dir=os.path.join('./', 'batch_predict'),
    prediction_input_file=os.path.join('../lab1a/', 'taxi-valid*'),
    model_dir=os.path.join('.', 'taxi_model/model'),
)

In [None]:
!ls batch_predict

In [None]:
!head -5 batch_predict/predictions*.csv

<h2> Cloud preprocessing and training </h2>

In [None]:
%bash
# Copy the CSV data and the schema file to Cloud Storage for the cloud cells.
# Stop early if the configuration cell has not been run: an empty BUCKET would
# otherwise produce an invalid gs:// destination.
: "${BUCKET:?BUCKET is not set - run the configuration cell first}"
echo "$BUCKET"
# The data is read from the same relative ../lab1a/ directory the local cells use,
# instead of an absolute /content/... path that only exists in one checkout layout.
gsutil cp \
   ../lab1a/*.csv \
   ./taxifare.json \
   "gs://${BUCKET}/taxifare/smallinput/"

In [None]:
%bash
# Remove the cloud outputs of any earlier run so preprocessing and training start clean.
: "${BUCKET:?BUCKET is not set - run the configuration cell first}"
echo "$BUCKET"
# On the first run neither path exists and gsutil exits non-zero; that is expected,
# so report it and carry on instead of failing the cell.
gsutil rm -rf \
   "gs://${BUCKET}/taxifare/smallinput/taxi_preproc" \
   "gs://${BUCKET}/taxifare/smallinput/taxi_model" \
   || echo "gsutil rm reported an error (expected if the outputs do not exist yet); continuing"

In [None]:
# Root of this lab's data in Cloud Storage. It ends with '/', so file names
# and sub-directories are appended to it directly.
CLOUD_ROOT = 'gs://{}/taxifare/smallinput/'.format(BUCKET)

# Run the package's cloud preprocessing step over the uploaded training CSVs,
# using the uploaded schema; its output is written to <CLOUD_ROOT>taxi_preproc.
sd.cloud_preprocess(
    schema_file=CLOUD_ROOT + 'taxifare.json',
    input_file_pattern=CLOUD_ROOT + 'taxi-train*',
    output_dir=CLOUD_ROOT + 'taxi_preproc',
)

In [None]:
# Root of this lab's data in Cloud Storage. It ends with '/', so file names
# and sub-directories are appended to it directly.
CLOUD_ROOT = 'gs://{}/taxifare/smallinput/'.format(BUCKET)

# Train in the cloud with the same model settings as the local run
# (dnn_regression, layer_sizes [64, 4], 250 steps), in REGION on the BASIC
# scale tier. top_n is left unset.
sd.cloud_train(
    model_type='dnn_regression',
    layer_sizes=[64, 4],
    max_steps=250,
    key_column='key',
    train_file_pattern=CLOUD_ROOT + 'taxi-train*',
    eval_file_pattern=CLOUD_ROOT + 'taxi-valid*',
    preprocess_output_dir=CLOUD_ROOT + 'taxi_preproc',
    output_dir=CLOUD_ROOT + 'taxi_model',
    region=REGION,
    scale_tier='BASIC',
)

<h2> Cloud deploy model and predict </h2>

In [None]:
%bash
# (Re)deploy the cloud-trained model as version v1 of the "taxifare" model.
# Stop early if the configuration cell has not been run in this kernel.
: "${BUCKET:?BUCKET is not set - run the configuration cell first}"
: "${REGION:?REGION is not set - run the configuration cell first}"
MODEL_NAME="taxifare"
MODEL_VERSION="v1"
CLOUD_ROOT="gs://${BUCKET}/taxifare/smallinput/"
echo "Deleting and deploying $MODEL_NAME $MODEL_VERSION from $CLOUD_ROOT ... this will take a few minutes"
# Delete any previous deployment first. -q disables confirmation prompts (a
# notebook cell cannot answer them), and a failed delete is reported but not
# fatal, because on the first deployment there is nothing to delete.
gcloud beta ml versions delete "${MODEL_VERSION}" --model "${MODEL_NAME}" -q \
  || echo "No existing version ${MODEL_VERSION} deleted; continuing"
gcloud beta ml models delete "${MODEL_NAME}" -q \
  || echo "No existing model ${MODEL_NAME} deleted; continuing"
gcloud beta ml models create "${MODEL_NAME}" --regions "$REGION"
gcloud beta ml versions create "${MODEL_VERSION}" --model "${MODEL_NAME}" --origin "${CLOUD_ROOT}taxi_model/model"

In [None]:
# the true fare_amount (in the input) is optional, but useful for comparison
# Ten sample rows in schema column order:
# fare_amount, pickuplon, pickuplat, dropofflon, dropofflat, passengers, key.
# The true fare_amount (first field) is optional, but useful for comparison.
cloud_sample_rows = [
    '6.5,-73.981648,40.755953,-73.985661,40.746788,1,0',
    '5.5,-73.967817,40.787522,-73.959902,40.780477,1,1',
    '6.9,-73.991977,40.729534,-73.996962,40.744703,1,2',
    '12.5,-73.978102,40.75265,-73.999422,40.714375,2,3',
    '6.9,-73.989687,40.771346,-73.972793,40.780418,2,4',
    '4.0,-73.865865,40.770915,-73.86766,40.768462,2,5',
    '12.6,-73.978121,40.763045,-73.925264,40.768753,1,6',
    '4.9,-73.996283,40.743747,-73.995543,40.731898,1,7',
    '6.5,-73.990883,40.750464,-73.97307,40.750428,2,8',
    '11.3,-73.971663,40.763008,-73.992665,40.752327,3,9',
]

# Send the rows to the deployed model: "taxifare", version "v1".
sd.cloud_predict(
    model_name='taxifare',
    model_version='v1',
    data=cloud_sample_rows,
)