In [4]:
# Adapted from https://github.com/GoogleCloudPlatform/cloudml-samples/blob/master/notebooks/tensorflow/getting-started-keras.ipynb
import os
import sys

# Service-account key picked up through the GOOGLE_APPLICATION_CREDENTIALS
# environment variable. The path is resolved to an absolute one at the moment
# this cell runs, so it keeps pointing at the same file even if a consumer
# resolves it from a different working directory than the notebook's.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.abspath(
    '../credentials/sticky-notes1-data-access.json'
)

### Run package locally with 'python3 -m'

In [39]:
# Smoke-test the trainer package: run `trainer.task` directly as a module on
# the small 100-example TFRecord set, starting from model weights stored in GCS.
# Output goes to ../train-output. The recorded run below was interrupted with
# Ctrl-C once training had started, i.e. it only checks that the package runs.
! python3 -m trainer.task \
    --job-dir ../train-output \
    --data_dir ../data/processed/tfrecords/100-examples \
    --initial_weights_path gs://sticky-notes1/training-jobs/200430_151618/1/model-weights.tf

2020-04-30 16:47:06.556858: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-04-30 16:47:06.572175: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7ff9d41d7060 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-04-30 16:47:06.572202: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
No GPUs available
Loading initial model weights from gs://sticky-notes1/training-jobs/200430_151618/1/model-weights.tf
Train for 2 steps, validate for 1 steps
^C


### Run locally with 'gcloud ai-platform local train'

In [17]:
# Explicitly tell `gcloud ai-platform local train` to use Python 3.
# This updates the ml_engine/local_python property of the gcloud configuration.
# The command substitution is quoted so that an interpreter path containing
# spaces is still passed to gcloud as a single argument.
! gcloud config set ml_engine/local_python "$(which python3)"

# Run the trainer through `gcloud ai-platform local train`.
# Everything after the bare `--` is passed on to trainer.task as its own arguments.
! gcloud ai-platform local train \
    --package-path ./trainer \
    --module-name trainer.task \
    --job-dir ../train-output \
    -- \
    --data_dir ../data/processed/tfrecords/100-examples

Updated property [ml_engine/local_python].
2020-04-29 21:00:54.880603: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-04-29 21:00:54.896925: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe66042d710 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-04-29 21:00:54.896956: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
Train for 2 steps, validate for 1 steps
2020-04-29 21:01:06.322637: I tensorflow/core/profiler/lib/profiler_session.cc:225] Profiler session started.


### Submit training job to AI Platform with 'gcloud ai-platform jobs submit training'

In [42]:
import datetime

# Settings for the AI Platform job. They are exported as environment variables
# so that the `!` shell commands in the following cells can read them.
BUCKET_NAME = 'sticky-notes1'
gcs_root = 'gs://' + BUCKET_NAME  # common prefix of every GCS location below

os_variables = dict(
    BUCKET_NAME=BUCKET_NAME,
    REGION='europe-west1',
    JOB_NAME='sticky_job',
    JOB_DIR=gcs_root + '/training-jobs',    # parent directory of all job outputs
    DATA_DIR=gcs_root + '/training-data',   # training data location
)

# Publish every setting to the process environment in one step.
os.environ.update(os_variables)

# Submit the training job to AI Platform.
# `date` is computed once in the shell, so the job name and the job's output
# directory share the same timestamp suffix.
# Variable expansions are quoted, and the warm-start weights are addressed
# relative to $JOB_DIR instead of repeating the bucket and job root by hand
# (the resulting path is unchanged).
# NOTE(review): IPython can also interpolate `$name` / `{name}` from the Python
# namespace in `!` lines; JOB_NAME, REGION, JOB_DIR and DATA_DIR are expected to
# come from the environment, so avoid defining Python variables with these names.
! export date=$(date +%y%m%d_%H%M%S); \
  gcloud ai-platform jobs submit training "${JOB_NAME}_${date}" \
    --config config.yaml \
    --package-path trainer/ \
    --module-name trainer.task \
    --region "$REGION" \
    --python-version 3.7 \
    --runtime-version 2.1 \
    --job-dir "${JOB_DIR}/${date}" \
    -- \
    --epochs 5 \
    --data_dir "$DATA_DIR" \
    --initial_weights_path "${JOB_DIR}/200430_151618/3/model-weights.tf"

Job [sticky_job_200430_202730] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe sticky_job_200430_202730

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs sticky_job_200430_202730
jobId: sticky_job_200430_202730
state: QUEUED


Updates are available for some Cloud SDK components.  To install them,
please run:
  $ gcloud components update



In [None]:
# View training logs: start TensorBoard on $JOB_DIR, the GCS parent directory of
# all training jobs (exported to the environment in the job-submission cell).
# TensorBoard keeps running until interrupted (CTRL+C / kernel interrupt), so
# this cell does not finish on its own.
! tensorboard --logdir $JOB_DIR

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.1.1 at http://localhost:6006/ (Press CTRL+C to quit)
2020-05-01 02:51:40.236014: E tensorflow/core/platform/cloud/curl_http_request.cc:596] The transmission  of request 0x7f83bb3cdfc0 (URI: https://storage.googleapis.com/sticky-notes1/training-jobs%2F200430_151618%2F1%2Ftraining-logs%2Fvalidation%2Fevents.out.tfevents.1588252871.cmle-training-216446297068919051.410.3650.v2) has been stuck at 0 of 0 bytes for 7205 seconds and will be aborted. CURL timing information: lookup time: 0.000787 (No error), connect time: 0.005467 (No error), pre-transfer time: 0.012653 (No error), start-transfer time: 0.012653 (No error)
2020-05-01 04:51:56.884559: E tensorflow/core/platform/cloud/curl_http_request.cc:596] The transmission  of request 0x7f83ba5c5ac0 (URI: https://www.googleapis.com/storage/v1/b/sticky-notes1/o?fields=items%2Fname%2CnextPageToken&prefix=training-jobs%2F200430_151618%2F3%2Ftr

In [28]:
# MODEL_NAME="mynet"

# ! gcloud ai-platform models create $MODEL_NAME \
#   --regions $REGION

In [29]:
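# NOTE(review): model deployment is currently disabled. The commands below are
# shell syntax and would need `%%bash` or a `!` prefix to run from this notebook.
# Also confirm --runtime-version: it is 1.15 here, while the training job above
# is submitted with --runtime-version 2.1.
# MODEL_VERSION="v1"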

# # Get a list of directories in the `keras_export` parent directory. Then pick
# # the directory with the latest timestamp, in case you've trained multiple
# # times.
# SAVED_MODEL_PATH=$(gsutil ls $JOB_DIR/keras_export | tail -n 1)

# # Create model version based on that SavedModel directory
# gcloud ai-platform versions create $MODEL_VERSION \
#   --model $MODEL_NAME \
#   --runtime-version 1.15 \
#   --python-version 3.7 \
#   --framework tensorflow \
#   --origin $SAVED_MODEL_PATH