## Initialize settings

In [1]:
# Notebook-wide settings -- edit these to run the notebook against your own
# Google Cloud resources.
PROJECT = 'cloudonair-ml-demo'  # project ID handed to gcloud and BigQuery
BUCKET = 'cloudonair-ml-demo'   # Cloud Storage bucket name, without the gs:// prefix
REGION = 'us-central1'          # region used for the bucket and submitted jobs

In [2]:
import os

# Mirror the settings into the process environment so that the %%bash cells
# can read them as ${BUCKET}, ${PROJECT} and ${REGION}.
os.environ.update(BUCKET=BUCKET, PROJECT=PROJECT, REGION=REGION)

In [3]:
%%bash

# Make the project and region from the settings cell the gcloud defaults.
gcloud config set core/project ${PROJECT}
gcloud config set compute/region ${REGION}

Updated property [core/project].
Updated property [compute/region].


## Explore Natality dataset

In [4]:
# SQL for a 100-row sample of the public natality table, restricted to records
# with year > 2000 (i.e. 2001 onwards). Only the baby's weight and four other
# columns (sex, mother's age, plurality, gestation weeks) are selected.
# NOTE: there is no ORDER BY, so which 100 rows come back is not guaranteed.
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 100
"""

In [5]:
# Call BigQuery and examine in dataframe
import pandas
from pandas.io import gbq

# Run the query with the standard-SQL dialect, billed to the configured
# project, and preview the first rows of the resulting DataFrame.
df = gbq.read_gbq(
    query=query,
    dialect='standard',
    project_id=os.environ['PROJECT']
)
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,3.56267,True,25,1,30
1,3.999185,False,30,1,32
2,7.438397,True,13,1,34
3,4.806077,True,19,1,34
4,4.812691,True,22,3,34


## Training on Cloud ML Engine

In [6]:
%%bash

# Create the bucket, and seed it with data, only when it is missing from the
# `gsutil ls` bucket listing.
BUCKET_URL=gs://${BUCKET}

if ! gsutil ls | grep -q ${BUCKET_URL}/; then
  gsutil mb -l ${REGION} ${BUCKET_URL}
  # Copy the canonical set of preprocessed files, for readers who did not
  # run the previous notebook.
  gsutil -m cp -R gs://cloud-training-demos/babyweight ${BUCKET_URL}
fi

In [7]:
%%bash

# List the 00000 shard of each preprocessed file set to confirm the data is in place.
PREPROC_DIR=gs://${BUCKET}/babyweight/preproc
gsutil ls ${PREPROC_DIR}/*-00000*

gs://cloudonair-ml-demo/babyweight/preproc/eval.csv-00000-of-00012
gs://cloudonair-ml-demo/babyweight/preproc/train.csv-00000-of-00043


In [13]:
%%bash

# Where the trained model is written, and a job name made unique by a UTC timestamp.
OUTDIR=gs://${BUCKET}/babyweight/trained_model
JOBNAME=babyweight_$(date -u +%y%m%d_%H%M%S)
echo ${OUTDIR} ${REGION} ${JOBNAME}

# Clear any previous output so the job starts from an empty directory.
# gsutil reports a CommandException here when there is nothing to remove.
gsutil -m rm -rf ${OUTDIR}

# Submit the trainer package; everything after the bare `--` is passed through
# to trainer.task rather than interpreted by gcloud.
gcloud ml-engine jobs submit training ${JOBNAME} \
  --region=${REGION} \
  --module-name=trainer.task \
  --package-path=$(pwd)/model_evaluation_pipeline/trainer \
  --job-dir=${OUTDIR} \
  --staging-bucket=gs://${BUCKET} \
  --scale-tier=STANDARD_1 \
  --runtime-version=1.4 \
  -- \
  --bucket=${BUCKET} \
  --output_dir=${OUTDIR} \
  --train_examples=200000

gs://cloudonair-ml-demo/babyweight/trained_model us-central1 babyweight_180516_034734
jobId: babyweight_180516_034734
state: QUEUED


CommandException: 1 files/objects could not be removed.
Job [babyweight_180516_034734] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe babyweight_180516_034734

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs babyweight_180516_034734


### Check model directory

In [32]:
%%bash

# Report the size of the exported model directory. The bucket comes from the
# settings cell, so this still works after BUCKET has been changed.
gsutil du -shc "gs://${BUCKET}/babyweight/trained_model/export/exporter/"

0 B         gs://cloudonair-ml-demo/babyweight/trained_model/export/exporter
0 B         total


### Execute Apache Beam Pipeline with DirectRunner

In [15]:
import os
import datetime

# Settings for the local (DirectRunner) evaluation run, exported so that the
# %%bash cell can read them.
os.environ['RUNNER'] = 'DirectRunner'
# A timestamp suffix keeps the job name unique from run to run.
os.environ['JOB_NAME'] = 'evaluate-ml-model-{0}'.format(datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
# Build the locations from the configured bucket and project instead of
# hard-coding them, so changing the settings cell is enough to retarget the run.
# NOTE(review): the trailing directory (1526369461) presumably identifies one
# specific exported model -- replace it with the directory your own training
# job wrote under export/exporter/.
os.environ['SAVED_MODEL_DIR'] = 'gs://{0}/babyweight/trained_model/export/exporter/1526369461'.format(os.environ['BUCKET'])
os.environ['OUTPUT_TABLE'] = '{0}:model_evaluation.sample'.format(os.environ['PROJECT'])

In [28]:
%%bash

# Run the evaluation pipeline locally with the DirectRunner.
# Stop immediately if the evaluator directory is missing; otherwise
# `python -m main` would run from the wrong working directory.
cd "$(pwd)/model_evaluation_pipeline/evaluator" || exit 1

# Expansions are quoted so that values containing spaces or glob characters
# reach the pipeline as single arguments.
python -m main \
  --datasize=100 \
  --year_from=1974 \
  --year_to=1978 \
  --saved_model_dir="${SAVED_MODEL_DIR}" \
  --output_table="${OUTPUT_TABLE}" \
  --project="${PROJECT}" \
  --runner=DirectRunner \
  --region="${REGION}" \
  --job_name="${JOB_NAME}"

  from compiler import parse, ast, pycodegen
  pipeline.replace_all(_get_transform_overrides(pipeline.options))
INFO:root:Running pipeline with DirectRunner.
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token
INFO:root:initializing predictor...
2018-05-16 12:56:59.395668: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
ERROR:root:Exception at bundle <apache_beam.runners.direct.bundle_factory._Bundle object at 0x7ffb544fb908>, due to an exception.
 Traceback (most recent call last):
  File "/usr/local/google/home/yaboo/Resources/anaconda2/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py", line 307, in call
    side_input_values)
  File "/usr/local/google/home/yaboo/Resources/anaconda2/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py", line 340, in attempt_call
  

### Query BQ Table

In [29]:
import pandas
from pandas.io import gbq

In [30]:
# Preview a few rows of the evaluation output table. OUTPUT_TABLE is stored as
# "project:dataset.table"; standard SQL wants "project.dataset.table".
query = """
SELECT * FROM `{0}` LIMIT 10
""".format(
    os.environ['OUTPUT_TABLE'].replace(':', '.')
)

df = gbq.read_gbq(
    query=query,
    dialect='standard',
    project_id=os.environ['PROJECT']
)
df.head()

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks,weight_true,weight_predicted,weight_residual,model,testdata,time_inference
0,True,26,Single(1),33,4.938355,0.292607,4.645748,gs://cloudonair-ml-demo/babyweight/trained_mod...,1974-1978,0.605822
1,True,32,Single(1),35,7.561856,0.29829,7.263566,gs://cloudonair-ml-demo/babyweight/trained_mod...,1974-1978,1.049995
2,True,21,Single(1),39,6.437498,0.291146,6.146352,gs://cloudonair-ml-demo/babyweight/trained_mod...,1974-1978,0.945091
3,True,24,Single(1),42,8.126239,0.29324,7.832999,gs://cloudonair-ml-demo/babyweight/trained_mod...,1974-1978,1.027822
4,True,20,Single(1),42,8.375361,0.29324,8.082121,gs://cloudonair-ml-demo/babyweight/trained_mod...,1974-1978,0.81706


In [31]:
# Summarize the evaluation results: RMSE of the residuals and mean inference
# time for each (model, testdata) pair in the output table.
query = """
SELECT
  model,
  testdata,
  SQRT(SUM(POW(weight_residual,2))/COUNT(*)) AS RMSE,
  AVG(time_inference) AS AVG_SEC
FROM
  `{0}`
GROUP BY
  model, testdata
""".format(
    # "project:dataset.table" -> "project.dataset.table" for standard SQL
    os.environ['OUTPUT_TABLE'].replace(':', '.')
)

df = gbq.read_gbq(
    query=query,
    dialect='standard',
    project_id=os.environ['PROJECT']
)
df.head()

Unnamed: 0,model,testdata,RMSE,AVG_SEC
0,gs://cloudonair-ml-demo/babyweight/trained_mod...,1969-1973,7.099255,2.889484
1,gs://cloudonair-ml-demo/babyweight/trained_mod...,1974-1978,7.097674,4.989076
2,gs://cloudonair-ml-demo/babyweight/trained_mod...,1984-1988,7.208219,3.528221


### Configure Dataflow job

In [1]:
import os
import datetime

# Settings for the Dataflow run, exported so that the %%bash cell can read them.
os.environ['RUNNER'] = 'DataflowRunner'
# Every Cloud Storage location is derived from the configured bucket.
os.environ['STAGING_LOCATION'] = 'gs://{0}/babyweight/staging'.format(os.environ['BUCKET'])
os.environ['TEMP_LOCATION'] = 'gs://{0}/babyweight/temp'.format(os.environ['BUCKET'])
# A timestamp suffix keeps the job name unique from run to run.
os.environ['JOB_NAME'] = 'evaluate-ml-model-{0}'.format(datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
# Use the configured bucket and project here as well, matching the staging and
# temp locations above, instead of hard-coding the demo names.
# NOTE(review): the trailing directory (1526369461) presumably identifies one
# specific exported model -- replace it with the directory your own training
# job wrote under export/exporter/.
os.environ['SAVED_MODEL_DIR'] = 'gs://{0}/babyweight/trained_model/export/exporter/1526369461'.format(os.environ['BUCKET'])
os.environ['OUTPUT_TABLE'] = '{0}:model_evaluation.prediction_results'.format(os.environ['PROJECT'])

### Execute Dataflow job

In [2]:
%%bash

# Launch the evaluation pipeline with the runner chosen in the config cell.
# Stop immediately if the evaluator directory is missing; otherwise
# `python -m main` and the setup.py path below would resolve against the
# wrong working directory.
cd "$(pwd)/model_evaluation_pipeline/evaluator" || exit 1

# Expansions are quoted so that values containing spaces or glob characters
# reach the pipeline as single arguments.
python -m main \
  --datasize=100 \
  --year_from=1969 \
  --year_to=1973 \
  --saved_model_dir="${SAVED_MODEL_DIR}" \
  --output_table="${OUTPUT_TABLE}" \
  --project="${PROJECT}" \
  --runner="${RUNNER}" \
  --region="${REGION}" \
  --staging_location="${STAGING_LOCATION}" \
  --temp_location="${TEMP_LOCATION}" \
  --job_name="${JOB_NAME}" \
  --setup_file="$(pwd)/setup.py"

running sdist
running egg_info
creating model_evaluation_demo.egg-info
writing requirements to model_evaluation_demo.egg-info/requires.txt
writing model_evaluation_demo.egg-info/PKG-INFO
writing top-level names to model_evaluation_demo.egg-info/top_level.txt
writing dependency_links to model_evaluation_demo.egg-info/dependency_links.txt
writing manifest file 'model_evaluation_demo.egg-info/SOURCES.txt'
reading manifest file 'model_evaluation_demo.egg-info/SOURCES.txt'
writing manifest file 'model_evaluation_demo.egg-info/SOURCES.txt'
running check
creating model-evaluation-demo-0.1
creating model-evaluation-demo-0.1/model_evaluation_demo.egg-info
creating model-evaluation-demo-0.1/process
copying files to model-evaluation-demo-0.1...
copying setup.py -> model-evaluation-demo-0.1
copying model_evaluation_demo.egg-info/PKG-INFO -> model-evaluation-demo-0.1/model_evaluation_demo.egg-info
copying model_evaluation_demo.egg-info/SOURCES.txt -> model-evaluation-demo-0.1/model_evaluation_demo.

  from compiler import parse, ast, pycodegen
INFO:root:Starting GCS upload to gs://cloudonair-ml-demo/babyweight/staging/evaluate-ml-model-180515-225339.1526392426.514969/pipeline.pb...
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token
INFO:root:Completed GCS upload to gs://cloudonair-ml-demo/babyweight/staging/evaluate-ml-model-180515-225339.1526392426.514969/pipeline.pb
INFO:root:Executing command: ['/usr/local/google/home/yaboo/Resources/anaconda2/bin/python', 'setup.py', 'sdist', '--dist-dir', '/tmp/tmpCcekPV']


INFO:root:Starting GCS upload to gs://cloudonair-ml-demo/babyweight/staging/evaluate-ml-model-180515-225339.1526392426.514969/workflow.tar.gz...
INFO:root:Completed GCS upload to gs://cloudonair-ml-demo/babyweight/staging/evaluate-ml-model-180515-225339.1526392426.514969/workflow.tar.gz
INFO:root:Starting GCS upload to gs://cloudonair-ml-demo/babyweight/staging/evaluate-ml-model-180515-225339.1526

In [None]:
# Per (model, testdata) pair: RMSE of the residuals and mean inference time,
# read from whichever table OUTPUT_TABLE currently names.
query = """
SELECT
  model,
  testdata,
  SQRT(SUM(POW(weight_residual,2))/COUNT(*)) AS RMSE,
  AVG(time_inference) AS AVG_SEC
FROM
  `{0}`
GROUP BY
  model, testdata
""".format(
    # "project:dataset.table" -> "project.dataset.table" for standard SQL
    os.environ['OUTPUT_TABLE'].replace(':', '.')
)

df = gbq.read_gbq(
    query=query,
    dialect='standard',
    project_id=os.environ['PROJECT']
)
df.head()