# Using TensorFlow on Google Cloud AI Platform

In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

This notebook demonstrates how [TensorFlow](https://tensorflow.org) is used in multiple services throughout the [Google Cloud AI Platform](https://cloud.google.com/ai-platform).

The first section shows how you can programmatically set up Deep Learning VM and Notebook instances configured with TensorFlow.

After that, you'll perform the following steps:
* Build a TensorFlow model for the [MNIST dataset](http://yann.lecun.com/exdb/mnist/)
* Package the model code in a container
* Train the model in the Cloud AI Platform Training service
* Deploy the model to the Cloud AI Platform Prediction service
* Make predictions with the deployed model

## Constants

In [None]:
# You must change these parameters

# Google Cloud project ID; used to build the container image URI, the
# Notebooks API parent path and the prediction request name.
PROJECT = 'change-me-123456'
# Cloud Storage bucket URI, including the gs:// scheme; JOB_DIR is derived from
# it and it is passed as the staging bucket when the model version is created.
BUCKET = 'gs://change-me'
# Region passed to the training job, the prediction model commands and used
# to build the regional prediction endpoint.
REGION = 'us-west1'
# Zone passed to the Deep Learning VM commands and used as the Notebooks API location.
ZONE = 'us-west1-b'

In [None]:
# Other parameters

# Resource names
DLVM_NAME = 'my-dlvm'
NOTEBOOK_NAME = 'my-notebook'
MODEL_NAME = 'mnist'

# Image families / project for the Deep Learning VM and the container base image
DLVM_IMAGE_FAMILY = 'tf2-ent-latest-cpu'
TF_IMAGE_PROJECT = 'deeplearning-platform-release'
TF_IMAGE_FAMILY = 'tf2-cpu'

# Custom training container and the location of the job output
IMAGE_REPO_NAME = 'tf_gcp_custom_container'
IMAGE_TAG = 'mnist'
IMAGE_URI = 'gcr.io/{}/{}:{}'.format(PROJECT, IMAGE_REPO_NAME, IMAGE_TAG)
JOB_DIR = '/'.join((BUCKET, MODEL_NAME))

## Administration tasks

### Working with a Deep Learning VM Image

In [None]:
# Create Google Compute Engine (GCE) instance using the TensorFlow image

!gcloud compute instances create $DLVM_NAME \
  --zone=$ZONE \
  --image-family=$DLVM_IMAGE_FAMILY \
  --image-project=$TF_IMAGE_PROJECT

In [None]:
# Check that instance was created

!gcloud compute instances list | grep $DLVM_NAME

In [None]:
# Delete instance

!gcloud compute instances delete $DLVM_NAME \
  --zone=$ZONE \
  --quiet

### Working with the Notebooks API

In [None]:
# Install package

!pip install google-cloud-notebooks --quiet

In [None]:
# Imports

from google.cloud.notebooks_v1beta1.services.notebook_service import NotebookServiceClient
from google.cloud.notebooks_v1beta1.types import ListInstancesRequest

In [None]:
# Initialize the client
# No arguments are passed to the constructor here.

client = NotebookServiceClient()

In [None]:
# List the notebook instances under the project/zone parent path

parent = 'projects/{}/locations/{}'.format(PROJECT, ZONE)
response = client.list_instances(
    ListInstancesRequest({"parent": parent})
)

In [None]:
# Print the name of the first instance.
# The listing may be empty (this notebook does not create a notebook instance
# itself), so fall back to a notice instead of raising IndexError.

first_instance = next(iter(response.instances), None)
first_instance.name if first_instance is not None else 'No notebook instances found'

In [None]:
# Run the same command via CLI

!gcloud beta notebooks instances list --location $ZONE

## Cloud training

### Create bundle with model and training code

In [None]:
# Imports

import datetime
import os

In [None]:
# Create directory for training files

TRAIN_DIR = 'train'

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of testing isdir() before mkdir().
os.makedirs(TRAIN_DIR, exist_ok=True)

In [None]:
# Create task.py file with training code.
# The text below is complete Python source and is written verbatim: it has no
# placeholders, and running str.format() over source code would break as soon
# as the script contained a literal brace (dict, set, f-string).

task_template = """import argparse
import datetime
import os

import tensorflow as tf


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--job-dir',
                        default='',
                        help='URL to store the job output')
    parser.add_argument('--batch-size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--version',
                        type=str,
                        default=datetime.datetime.now().strftime('%Y%m%d_%H%M%S'),
                        help='Subdirectory where the model files will be saved')
                        
    args = parser.parse_args()
    print(args)
    return args


def get_model():
    model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')])

    return model


def main():
    # Parse arguments
    args = get_args()
    batch_size = args.batch_size
    epochs = args.epochs
    job_dir = args.job_dir
    version = args.version
    print('args: ', args)

    # Load data
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Train model
    model = get_model()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
    model.evaluate(x_test,  y_test, verbose=2)

    # Export the model
    export_path = os.path.join(job_dir, 'export', version)
    model.save(export_path)


if __name__ == '__main__':
    main()
"""

with open(f'{TRAIN_DIR}/task.py', 'w') as f:
    f.write(task_template)

In [None]:
# Build a version identifier (YYYYMMDD_HHMMSS) from the current date and time

VERSION = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
VERSION

In [None]:
# Test training code before deploying it

!python $TRAIN_DIR/task.py --version $VERSION --epochs 1 --job-dir $JOB_DIR

## Create custom container

In [None]:
# Create Dockerfile.
# The f-string is fully interpolated when this cell runs, so the result is
# written as-is; formatting it a second time would re-interpret any braces
# contained in the substituted values.

dockerfile_template = f"""FROM gcr.io/{TF_IMAGE_PROJECT}/{TF_IMAGE_FAMILY}
WORKDIR /root
COPY {TRAIN_DIR}/task.py /root/task.py
ENTRYPOINT ["python", "task.py"]
"""

with open(f'{TRAIN_DIR}/Dockerfile', 'w') as f:
    f.write(dockerfile_template)

In [None]:
# Build container image

!docker build -f $TRAIN_DIR/Dockerfile -t $IMAGE_URI ./

In [None]:
# Test container locally

!docker run $IMAGE_URI --job-dir $JOB_DIR --epochs 1

In [None]:
# Push image to container registry if local test is successful

!docker push $IMAGE_URI

## Submit AI Platform training job

In [None]:
# Submit training job

JOB_NAME = 'custom_container_job_' + VERSION

!gcloud ai-platform jobs submit training $JOB_NAME \
  --region $REGION \
  --master-image-uri $IMAGE_URI \
  -- \
  --version=$VERSION \
  --job-dir=$JOB_DIR \
  --epochs=10

In [None]:
# Check the job status, to ensure it has completed before continuing.

!gcloud ai-platform jobs describe $JOB_NAME

## Deploy Model to Prediction service

In [None]:
# Create AI Platform Prediction model
# This creates the model resource named MODEL_NAME in REGION; model versions
# are added to it with a separate `versions create` command.

!gcloud ai-platform models create '{MODEL_NAME}' \
  --region='{REGION}'

In [None]:
# Build the model version string: 'v' followed by the current datetime (MMDDYYYYHHMMSS)

now = datetime.datetime.now()
MODEL_VERSION = f'v{now:%m%d%Y%H%M%S}'
MODEL_VERSION

In [None]:
# Specify location of the model that was created by the training job.
# Cloud Storage URIs always use '/' as separator, so the parts are joined
# explicitly rather than with os.path.join, which follows the local OS convention.

MODEL_URI = '/'.join([JOB_DIR.rstrip('/'), 'export', VERSION])
MODEL_URI

In [None]:
# Create a new model version. This may take several minutes.
# --origin receives MODEL_URI (the export path under JOB_DIR for VERSION) and
# --staging-bucket receives BUCKET; the version is attached to MODEL_NAME.

!gcloud ai-platform versions create {MODEL_VERSION} \
  --model={MODEL_NAME} \
  --region={REGION} \
  --origin={MODEL_URI} \
  --staging-bucket={BUCKET} \
  --runtime-version=2.3 \
  --framework='TENSORFLOW' \
  --python-version=3.7

### Use service to make predictions

In [None]:
# Imports

import googleapiclient.discovery
import matplotlib.pyplot as plt
import tensorflow as tf
from google.api_core.client_options import ClientOptions

In [None]:
# Helper function to invoke the prediction service from
# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/ml_engine/online_prediction/predict.py

def predict_json(project, model, instances, version=None):
    """Request online predictions from a deployed AI Platform model.

    The request is issued through the module-level ``service`` object, which
    must be defined before this function is called.

    Args:
        project (str): project in which the AI Platform model is deployed.
        model (str): name of the model.
        instances ([Mapping[str: Any]]): the instances to predict on. Keys
            should be the names of the Tensors the deployed model expects as
            inputs; values should be datatypes convertible to Tensors, or
            (potentially nested) lists of such datatypes.
        version (str): model version to target; when None, no version is
            added to the resource name.

    Returns:
        Mapping[str: any]: the prediction results as defined by the model.

    Raises:
        RuntimeError: if the response contains an 'error' entry.
    """
    # Resource name of the model, optionally narrowed down to one version.
    resource_name = f'projects/{project}/models/{model}'
    if version is not None:
        resource_name = f'{resource_name}/versions/{version}'

    request = service.projects().predict(
        name=resource_name,
        body={'instances': instances},
    )
    result = request.execute()

    if 'error' in result:
        raise RuntimeError(result['error'])
    return result['predictions']

In [None]:
# Initialize client

endpoint = 'https://{}-ml.googleapis.com'.format(REGION)  # Use regional endpoint
client_options = ClientOptions(api_endpoint=endpoint)
service = googleapiclient.discovery.build(
    'ml',
    'v1',
    client_options=client_options,
    cache_discovery=False,
)

In [None]:
# Load the MNIST splits and divide the image arrays by 255.0

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

In [None]:
# Show the first test image together with a colorbar

sample = x_test[0]

fig, ax = plt.subplots()
image = ax.imshow(sample)
fig.colorbar(image, ax=ax)
ax.grid(False)
plt.show()

In [None]:
# Invoke the prediction service with the image data.
# The request body takes a list of instances, so the single image is wrapped
# in a list; passing the bare 2-D image would submit every pixel row as a
# separate instance.

predictions = predict_json(PROJECT, MODEL_NAME, [sample.tolist()])
scores = predictions[0]  # scores for the one image that was sent
max_value = max(scores)
max_index = scores.index(max_value)

print(f'Predicted value: {max_index}')
print(f'Confidence:      {round(max_value, 5)}')

## Cleanup

In [None]:
# Delete model version resource
!gcloud ai-platform versions delete {MODEL_VERSION} --model {MODEL_NAME} --region {REGION} --quiet 

# Delete model resource
!gcloud ai-platform models delete {MODEL_NAME} --region {REGION} --quiet