In [1]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Custom training and online prediction

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/official/custom-training-online-prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/official/custom-training-online-prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use the AI Platform (Unified) Python client library to train and deploy a custom tabular classification model for online prediction.

### Dataset

The dataset used for this tutorial is the penguins dataset from [BigQuery public datasets](https://cloud.google.com/bigquery/public-data). The version of the dataset you will use only the fields culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g to predict the penguins species (species).

### Objective

In this notebook, you create a custom-trained model from a Python script in a Docker container using the AI Platform (Unified) client library, and then do a prediction on the deployed model by sending data. Alternatively, you can create custom-trained models using `gcloud` command-line tool, or online using the Cloud Console.

The steps performed include:

- Create an AI Platform (Unified) custom job for training a model.
- Train a TensorFlow model.
- Deploy the `Model` resource to a serving `Endpoint` resource.
- Make a prediction.
- Undeploy the `Model` resource.

### Costs

This tutorial uses billable components of Google Cloud (GCP):

* AI Platform (Unified)
* Cloud Storage

Learn about [Cloud AI Platform
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the latest (preview) version of AI Platform (Unified) client library.

In [2]:
import os

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"

In [3]:
! pip3 install {USER_FLAG} --upgrade google-cloud-aiplatform

You should consider upgrading via the '/Users/ivanmkc/Documents/code/ai-platform-samples/env2/bin/python -m pip install --upgrade pip' command.[0m


Install the latest GA version of *google-cloud-storage* library as well.

In [4]:
! pip3 install {USER_FLAG} -U google-cloud-storage

You should consider upgrading via the '/Users/ivanmkc/Documents/code/ai-platform-samples/env2/bin/python -m pip install --upgrade pip' command.[0m


In [35]:
! pip3 install {USER_FLAG} -U "google-cloud-bigquery[all]"

Collecting google-cloud-bigquery[all]
  Using cached google_cloud_bigquery-2.17.0-py2.py3-none-any.whl (223 kB)
Collecting opentelemetry-instrumentation>=0.11b0
  Using cached opentelemetry_instrumentation-0.21b0-py3-none-any.whl (16 kB)
Collecting opentelemetry-sdk>=0.11b0
  Using cached opentelemetry_sdk-1.10a0-py3-none-any.whl (50 kB)
Collecting opentelemetry-api>=0.11b0
  Using cached opentelemetry_api-1.10a0-py3-none-any.whl (40 kB)
  Using cached opentelemetry_api-1.2.0-py3-none-any.whl (37 kB)
Collecting Deprecated>=1.2.6
  Using cached Deprecated-1.2.12-py2.py3-none-any.whl (9.5 kB)
Collecting opentelemetry-sdk>=0.11b0
  Using cached opentelemetry_sdk-1.2.0-py3-none-any.whl (36 kB)
Collecting opentelemetry-semantic-conventions==0.21b0
  Using cached opentelemetry_semantic_conventions-0.21b0-py3-none-any.whl (20 kB)
Installing collected packages: Deprecated, opentelemetry-semantic-conventions, opentelemetry-api, opentelemetry-sdk, opentelemetry-instrumentation, google-cloud-bigq

Install the *pillow* library for loading images.

In [5]:
! pip3 install {USER_FLAG} -U pillow

You should consider upgrading via the '/Users/ivanmkc/Documents/code/ai-platform-samples/env2/bin/python -m pip install --upgrade pip' command.[0m


Install the *numpy* library for manipulation of image data.

In [6]:
! pip3 install {USER_FLAG} -U numpy

You should consider upgrading via the '/Users/ivanmkc/Documents/code/ai-platform-samples/env2/bin/python -m pip install --upgrade pip' command.[0m


### Restart the kernel

Once you've installed everything, you need to restart the notebook kernel so it can find the packages.

In [7]:
# TODO: Remove this
%env IS_TESTING=1

env: IS_TESTING=1


In [8]:
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Before you begin

### Select a GPU runtime

**Make sure you're running this notebook in a GPU runtime if you have that option. In Colab, select "Runtime --> Change runtime type > GPU"**

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable the AI Platform (Unified) API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

4. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

5. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [9]:
PROJECT_ID = "python-docs-samples-tests"

if not os.getenv("IS_TESTING"):
    # Get your Google Cloud project ID from gcloud
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Otherwise, set your project ID here.

In [10]:
if PROJECT_ID == "" or PROJECT_ID is None:
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append it onto the name of resources you create in this tutorial.

In [11]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Authenticate your Google Cloud account

**If you are using AI Platform Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "AI Platform"
into the filter box, and select
   **AI Platform Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [12]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on AI Platform, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you submit a training job using the Cloud SDK, you upload a Python package
containing your training code to a Cloud Storage bucket. AI Platform runs
the code from this package. In this tutorial, AI Platform also saves the
trained model that results from your job in the same bucket. Using this model artifact, you can then
create AI Platform model and endpoint resources in order to serve
online predictions.

Set the name of your Cloud Storage bucket below. It must be unique across all
Cloud Storage buckets.

You may also change the `REGION` variable, which is used for operations
throughout the rest of this notebook. Make sure to [choose a region where AI Platform (Unified) services are
available](https://cloud.google.com/vertex-ai/docs/general/locations#available_regions). You may
not use a Multi-Regional Storage bucket for training with AI Platform.

In [13]:
BUCKET_NAME = "gs://ivanmkc-test2"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}

In [14]:
if BUCKET_NAME == "" or BUCKET_NAME is None or BUCKET_NAME == "gs://[your-bucket-name]":
    BUCKET_NAME = "gs://" + PROJECT_ID + "aip-" + TIMESTAMP

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [15]:
! gsutil mb -l $REGION $BUCKET_NAME

Creating gs://ivanmkc-test2/...
ServiceException: 409 A Cloud Storage bucket named 'ivanmkc-test2' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


Finally, validate access to your Cloud Storage bucket by examining its contents:

In [16]:
! gsutil ls -al $BUCKET_NAME

      2073  2021-05-07T17:20:31Z  gs://ivanmkc-test2/aiplatform-2021-05-07-13:20:30.406-aiplatform_custom_trainer_script-0.1.tar.gz#1620408031014708  metageneration=1
      2085  2021-05-11T01:39:31Z  gs://ivanmkc-test2/aiplatform-2021-05-10-21:39:30.859-aiplatform_custom_trainer_script-0.1.tar.gz#1620697171325511  metageneration=1
      3165  2021-05-18T01:01:54Z  gs://ivanmkc-test2/aiplatform-2021-05-17-21:01:54.120-aiplatform_custom_trainer_script-0.1.tar.gz#1621299714675620  metageneration=1
      3188  2021-05-18T18:00:44Z  gs://ivanmkc-test2/aiplatform-2021-05-18-14:00:43.706-aiplatform_custom_trainer_script-0.1.tar.gz#1621360844345827  metageneration=1
      3174  2021-05-18T19:22:16Z  gs://ivanmkc-test2/aiplatform-2021-05-18-15:22:15.934-aiplatform_custom_trainer_script-0.1.tar.gz#1621365736630406  metageneration=1
      3182  2021-05-18T19:22:56Z  gs://ivanmkc-test2/aiplatform-2021-05-18-15:22:56.161-aiplatform_custom_trainer_script-0.1.tar.gz#1621365776739206  metageneration=

### Set up variables

Next, set up some variables used throughout the tutorial.

#### Import AI Platform (Unified) client library

Import the AI Platform (Unified) client library into your Python environment and initialize it.

In [17]:
import os
import sys

from google.cloud import aiplatform
from google.cloud.aiplatform import gapic as aip

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

#### Set hardware accelerators

You can set hardware accelerators for both training and prediction.

Set the variables `TRAIN_GPU/TRAIN_NGPU` and `DEPLOY_GPU/DEPLOY_NGPU` to use a container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance. For example, to use a GPU container image with 4 Nvidia Tesla K80 GPUs allocated to each VM, you would specify:

    (aip.AcceleratorType.NVIDIA_TESLA_K80, 4)

See the [locations where accelerators are available](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators).

Otherwise specify `(None, None)` to use a container image to run on a CPU.

*Note*: TensorFlow releases earlier than 2.3 for GPU support fail to load the custom model in this tutorial. This issue is caused by static graph operations that are generated in the serving function. This is a known issue, which is fixed in TensorFlow 2.3. If you encounter this issue with your own custom models, use a container image for TensorFlow 2.3 or later with GPU support.

In [18]:
TRAIN_GPU, TRAIN_NGPU = (aip.AcceleratorType.NVIDIA_TESLA_K80, 1)

DEPLOY_GPU, DEPLOY_NGPU = (aip.AcceleratorType.NVIDIA_TESLA_K80, 1)

#### Set pre-built containers

AI Platform (Unified) provides pre-built containers to run training and prediction.

For the latest list, see [Pre-built containers for training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers) and [Pre-built containers for prediction](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers)

In [19]:
TRAIN_VERSION = "tf-gpu.2-4"
DEPLOY_VERSION = "tf2-gpu.2-4"

TRAIN_IMAGE = "gcr.io/cloud-aiplatform/training/{}:latest".format(TRAIN_VERSION)
DEPLOY_IMAGE = "gcr.io/cloud-aiplatform/prediction/{}:latest".format(DEPLOY_VERSION)

print("Training:", TRAIN_IMAGE, TRAIN_GPU, TRAIN_NGPU)
print("Deployment:", DEPLOY_IMAGE, DEPLOY_GPU, DEPLOY_NGPU)

Training: gcr.io/cloud-aiplatform/training/tf-gpu.2-4:latest AcceleratorType.NVIDIA_TESLA_K80 1
Deployment: gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-4:latest AcceleratorType.NVIDIA_TESLA_K80 1


#### Set machine types

Next, set the machine types to use for training and prediction.

- Set the variables `TRAIN_COMPUTE` and `DEPLOY_COMPUTE` to configure your compute resources for training and prediction.
 - `machine type`
     - `n1-standard`: 3.75GB of memory per vCPU
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]

*Note: The following is not supported for training:*

 - `standard`: 2 vCPUs
 - `highcpu`: 2, 4 and 8 vCPUs

*Note: You may also use n2 and e2 machine types for training and deployment, but they do not support GPUs*.

In [20]:
MACHINE_TYPE = "n1-standard"

VCPU = "4"
TRAIN_COMPUTE = MACHINE_TYPE + "-" + VCPU
print("Train machine type", TRAIN_COMPUTE)

MACHINE_TYPE = "n1-standard"

VCPU = "4"
DEPLOY_COMPUTE = MACHINE_TYPE + "-" + VCPU
print("Deploy machine type", DEPLOY_COMPUTE)

Train machine type n1-standard-4
Deploy machine type n1-standard-4


# Tutorial

Now you are ready to start creating your own custom-trained model with CIFAR10.

## Create a Managed Tabular Dataset from Big Query Dataset

Your first step in training a model is to create a managed dataset instance.

In [75]:
BQ_SOURCE = "bq://bigquery-public-data.ml_datasets.penguins"

dataset = aiplatform.TabularDataset.create(
    display_name="sample-penguins", 
    bq_source=BQ_SOURCE)

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/1012616486416/locations/us-central1/datasets/65944309387493376/operations/55106973028319232
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/1012616486416/locations/us-central1/datasets/65944309387493376
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/1012616486416/locations/us-central1/datasets/65944309387493376')


## Train a model

There are two ways you can train a custom model using a container image:

- **Use a Google Cloud prebuilt container**. If you use a prebuilt container, you will additionally specify a Python package to install into the container image. This Python package contains your code for training a custom model.

- **Use your own custom container image**. If you use your own container, the container needs to contain your code for training a custom model.

### Define the command args for the training script

Prepare the command-line arguments to pass to your training script.
- `args`: The command line arguments to pass to the corresponding Python module. In this example, they will be:
  - `"--epochs=" + EPOCHS`: The number of epochs for training.
  - `"--steps=" + STEPS`: The number of steps (batches) per epoch.
  - `"--distribute=" + TRAIN_STRATEGY"` : The training distribution strategy to use for single or distributed training.
     - `"single"`: single device.
     - `"mirror"`: all GPU devices on a single compute instance.
     - `"multi"`: all GPU devices on all compute instances.

In [76]:
JOB_NAME = "custom_job_" + TIMESTAMP
MODEL_DIR = "{}/{}".format(BUCKET_NAME, JOB_NAME)

if not TRAIN_NGPU or TRAIN_NGPU < 2:
    TRAIN_STRATEGY = "single"
else:
    TRAIN_STRATEGY = "mirror"

EPOCHS = 20
STEPS = 100

CMDARGS = [
    "--epochs=" + str(EPOCHS),
    "--steps=" + str(STEPS),
    "--distribute=" + TRAIN_STRATEGY,
]

#### Training script

In the next cell, you will write the contents of the training script, `task.py`. In summary:

- Get the directory where to save the model artifacts from the environment variable `AIP_MODEL_DIR`. This variable is set by the training service.
- Loads CIFAR10 dataset from TF Datasets (tfds).
- Builds a model using TF.Keras model API.
- Compiles the model (`compile()`).
- Sets a training distribution strategy according to the argument `args.distribute`.
- Trains the model (`fit()`) with epochs and steps according to the arguments `args.epochs` and `args.steps`
- Saves the trained model (`save(MODEL_DIR)`) to the specified model directory.

In [77]:
%%writefile task.py
import tensorflow as tf
import numpy as np
import os

import pandas as pd
import tensorflow as tf

from google.cloud import bigquery

# Read environmental variables
training_data_uri = os.environ["AIP_TRAINING_DATA_URI"]
validation_data_uri = os.environ["AIP_VALIDATION_DATA_URI"]
test_data_uri = os.environ["AIP_TEST_DATA_URI"]

# Set up training variables
EPOCHS = 20
BATCH_SIZE = 10
LABEL_COLUMN = "species"
UNUSED_COLUMNS = []
NA_VALUES = ["NA", "."]

# Possible categorical values
SPECIES = ['Adelie Penguin (Pygoscelis adeliae)',
           'Chinstrap penguin (Pygoscelis antarctica)',
           'Gentoo penguin (Pygoscelis papua)']
ISLANDS = ['Dream', 'Biscoe', 'Torgersen']
SEXES = ['FEMALE', 'MALE']

# Set up BigQuery clients
bqclient = bigquery.Client()

# Download a table
def download_table(bq_table_uri: str):
    # Remove bq:// prefix if present
    prefix = "bq://"
    if bq_table_uri.startswith(prefix):
        bq_table_uri = bq_table_uri[len(prefix) :]

    table = bigquery.TableReference.from_string(bq_table_uri)
    rows = bqclient.list_rows(
        table,
    )
    return rows.to_dataframe()


df_train = download_table(training_data_uri)
df_validation = download_table(validation_data_uri)
df_test = download_table(test_data_uri)

# Remove NA values
def clean_dataframe(df):
    return df.replace(to_replace=NA_VALUES, value=np.NaN).dropna()


df_train = clean_dataframe(df_train)
df_validation = clean_dataframe(df_validation)
df_test = clean_dataframe(df_test)

_CATEGORICAL_TYPES = {
    "island": pd.api.types.CategoricalDtype(categories=ISLANDS),
    "species": pd.api.types.CategoricalDtype(categories=SPECIES),
    "sex": pd.api.types.CategoricalDtype(categories=SEXES),
}


def standardize(df):
    """Scales numerical columns using their means and standard deviation to get
    z-scores: the mean of each numerical column becomes 0, and the standard
    deviation becomes 1. This can help the model converge during training.

    Args:
      df: Pandas df

    Returns:
      Input df with the numerical columns scaled to z-scores
    """
    dtypes = list(zip(df.dtypes.index, map(str, df.dtypes)))
    # Normalize numeric columns.
    for column, dtype in dtypes:
        if dtype == "float32":
            df[column] -= df[column].mean()
            df[column] /= df[column].std()
    return df


def preprocess(df):
    """Converts categorical features to numeric. Removes unused columns.

    Args:
      df: Pandas df with raw data

    Returns:
      df with preprocessed data
    """
    df = df.drop(columns=UNUSED_COLUMNS)

    # Drop rows with NaN's
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df[numeric_columns] = df[numeric_columns].astype("float32")

    # Convert categorical columns to numeric
    cat_columns = df.select_dtypes(["object"]).columns

    df[cat_columns] = df[cat_columns].apply(
        lambda x: x.astype(_CATEGORICAL_TYPES[x.name])
    )
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df


def convert_dataframe_to_dataset(
    df_train,
    df_test,
):
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    df_train_x, df_train_y = df_train, df_train.pop(LABEL_COLUMN)
    df_test_x, df_test_y = df_test, df_test.pop(LABEL_COLUMN)

#     # This is recommended to improve training but not necessary
#     # Join train_x and eval_x to normalize on overall means and standard
#     # deviations. Then separate them again.
#     all_x = pd.concat([df_train_x, df_test_x], keys=["train", "eval"])
#     all_x = standardize(all_x)
#     df_train_x, df_test_x = all_x.xs("train"), all_x.xs("eval")

    y_train = np.asarray(df_train_y).astype("float32")
    y_test = np.asarray(df_test_y).astype("float32")

    # Convert to numpy representation
    x_train = np.asarray(df_train_x)
    x_test = np.asarray(df_test_x)

    # Convert to one-hot representation
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=len(SPECIES))
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=len(SPECIES))

    dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    return (dataset_train, dataset_test)


dataset_train, dataset_test = convert_dataframe_to_dataset(df_train, df_test)

# Shuffle train set
dataset_train = dataset_train.shuffle(len(df_train))

NUM_FEATURES = dataset_train._flat_shapes[0].dims[0].value

# Batch datasets
dataset_train = dataset_train.batch(BATCH_SIZE)
dataset_test = dataset_test.batch(BATCH_SIZE)

# Create model
Dense = tf.keras.layers.Dense
model = tf.keras.Sequential(
    [
        Dense(
            100,
            activation=tf.nn.relu,
            kernel_initializer="uniform",
            input_dim=NUM_FEATURES,
        ),
        Dense(75, activation=tf.nn.relu),
        Dense(50, activation=tf.nn.relu),
        Dense(25, activation=tf.nn.relu),
        Dense(3, activation=tf.nn.softmax),
    ]
)

# Compile Keras model
optimizer = tf.keras.optimizers.RMSprop(lr=0.001)
model.compile(
    loss="categorical_crossentropy", metrics=["accuracy"], optimizer=optimizer
)

model.fit(dataset_train, epochs=EPOCHS, validation_data=dataset_test)

tf.saved_model.save(model, os.environ["AIP_MODEL_DIR"])

Overwriting task.py


### Train the model

Define your custom training job on AI Platform (Unified).

Use the `CustomTrainingJob` class to define the job, which takes the following parameters:

- `display_name`: The user-defined name of this training pipeline.
- `script_path`: The local path to the training script.
- `container_uri`: The URI of the training container image.
- `requirements`: The list of Python package dependencies of the script.
- `model_serving_container_image_uri`: The URI of a container that can serve predictions for your model â€” either a prebuilt container or a custom container.

Use the `run` function to start training, which takes the following parameters:

- `args`: The command line arguments to be passed to the Python script.
- `replica_count`: The number of worker replicas.
- `model_display_name`: The display name of the `Model` if the script produces a managed `Model`.
- `machine_type`: The type of machine to use for training.
- `accelerator_type`: The hardware accelerator type.
- `accelerator_count`: The number of accelerators to attach to a worker replica.

The `run` function creates a training pipeline that trains and creates a `Model` object. After the training pipeline completes, the `run` function returns the `Model` object.

In [None]:
job = aiplatform.CustomTrainingJob(
    display_name=JOB_NAME,
    script_path="task.py",
    container_uri=TRAIN_IMAGE,
    requirements=["google-cloud-bigquery[all]"],
    model_serving_container_image_uri=DEPLOY_IMAGE,
)

MODEL_DISPLAY_NAME = "penguins-" + TIMESTAMP

# Start the training
if TRAIN_GPU:
    model = job.run(
        dataset=dataset,
        model_display_name=MODEL_DISPLAY_NAME,
        bigquery_destination=f'bq://{PROJECT_ID}',
        args=CMDARGS,
        replica_count=1,
        machine_type=TRAIN_COMPUTE,
        accelerator_type=TRAIN_GPU.name,
        accelerator_count=TRAIN_NGPU,
    )
else:
    model = job.run(
        dataset=dataset,
        model_display_name=MODEL_DISPLAY_NAME,
        bigquery_destination=f'bq://{PROJECT_ID}',
        args=CMDARGS,
        replica_count=1,
        machine_type=TRAIN_COMPUTE,
        accelerator_count=0,
    )

INFO:google.cloud.aiplatform.utils.source_utils:Training script copied to:
gs://ivanmkc-test2/aiplatform-2021-05-28-13:57:40.649-aiplatform_custom_trainer_script-0.1.tar.gz.
INFO:google.cloud.aiplatform.training_jobs:Training Output directory:
gs://ivanmkc-test2/aiplatform-custom-training-2021-05-28-13:57:41.006 
INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1649997517384843264?project=1012616486416
INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1012616486416/locations/us-central1/trainingPipelines/1649997517384843264 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1012616486416/locations/us-central1/trainingPipelines/1649997517384843264 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/1012616486416/locations/us-central1/trainingPip

### Deploy the model

Before you use your model to make predictions, you need to deploy it to an `Endpoint`. You can do this by calling the `deploy` function on the `Model` resource. This will do two things:

1. Create an `Endpoint` resource for deploying the `Model` resource to.
2. Deploy the `Model` resource to the `Endpoint` resource.


The function takes the following parameters:

- `deployed_model_display_name`: A human readable name for the deployed model.
- `traffic_split`: Percent of traffic at the endpoint that goes to this model, which is specified as a dictionary of one or more key/value pairs.
   - If only one model, then specify as **{ "0": 100 }**, where "0" refers to this model being uploaded and 100 means 100% of the traffic.
   - If there are existing models on the endpoint, for which the traffic will be split, then use `model_id` to specify as **{ "0": percent, model_id: percent, ... }**, where `model_id` is the model id of an existing model to the deployed endpoint. The percents must add up to 100.
- `machine_type`: The type of machine to use for training.
- `accelerator_type`: The hardware accelerator type.
- `accelerator_count`: The number of accelerators to attach to a worker replica.
- `starting_replica_count`: The number of compute instances to initially provision.
- `max_replica_count`: The maximum number of compute instances to scale to. In this tutorial, only one instance is provisioned.

### Traffic split

The `traffic_split` parameter is specified as a Python dictionary. You can deploy more than one instance of your model to an endpoint, and then set the percentage of traffic that goes to each instance.

You can use a traffic split to introduce a new model gradually into production. For example, if you had one existing model in production with 100% of the traffic, you could deploy a new model to the same endpoint, direct 10% of traffic to it, and reduce the original model's traffic to 90%. This allows you to monitor the new model's performance while minimizing the distruption to the majority of users.

### Compute instance scaling

You can specify a single instance (or node) to serve your online prediction requests. This tutorial uses a single node, so the variables `MIN_NODES` and `MAX_NODES` are both set to `1`.

If you want to use multiple nodes to serve your online prediction requests, set `MAX_NODES` to the maximum number of nodes you want to use. AI Platform (Unified) autoscales the number of nodes used to serve your predictions, up to the maximum number you set. Refer to the [pricing page](https://cloud.google.com/vertex-ai/pricing#prediction-prices) to understand the costs of autoscaling with multiple nodes.

### Endpoint

The method will block until the model is deployed and eventually return an `Endpoint` object. If this is the first time a model is deployed to the endpoint, it may take a few additional minutes to complete provisioning of resources.

In [25]:
DEPLOYED_NAME = "penguins_deployed-" + TIMESTAMP

TRAFFIC_SPLIT = {"0": 100}

MIN_NODES = 1
MAX_NODES = 1

if DEPLOY_GPU:
    endpoint = model.deploy(
        deployed_model_display_name=DEPLOYED_NAME,
        traffic_split=TRAFFIC_SPLIT,
        machine_type=DEPLOY_COMPUTE,
        accelerator_type=DEPLOY_GPU.name,
        accelerator_count=DEPLOY_NGPU,
        min_replica_count=MIN_NODES,
        max_replica_count=MAX_NODES,
    )
else:
    endpoint = model.deploy(
        deployed_model_display_name=DEPLOYED_NAME,
        traffic_split=TRAFFIC_SPLIT,
        machine_type=DEPLOY_COMPUTE,
        accelerator_type=DEPLOY_COMPUTE.name,
        accelerator_count=0,
        min_replica_count=MIN_NODES,
        max_replica_count=MAX_NODES,
    )

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/1012616486416/locations/us-central1/endpoints/946934598212780032/operations/7456210010658308096
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/1012616486416/locations/us-central1/endpoints/946934598212780032
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/1012616486416/locations/us-central1/endpoints/946934598212780032')
INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/1012616486416/locations/us-central1/endpoints/946934598212780032
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/1012616486416/locations/us-central1/endpoints/946934598212780032/operations/1028447442493767680
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/10126164864

## Make an online prediction request

Send an online prediction request to your deployed model.

### Get test data

Download test data from and preprocess them.

#### Download the test images

Download the test dataset:

In [43]:
from google.cloud import bigquery

LABEL_COLUMN = "species"
UNUSED_COLUMNS = []
NA_VALUES = ["NA", "."]

# Possible categorical values
SPECIES = ['Adelie Penguin (Pygoscelis adeliae)',
           'Chinstrap penguin (Pygoscelis antarctica)',
           'Gentoo penguin (Pygoscelis papua)']
ISLANDS = ['Dream', 'Biscoe', 'Torgersen']
SEXES = ['FEMALE', 'MALE']

# Set up BigQuery clients
bqclient = bigquery.Client()

# Download a table
def download_table(bq_table_uri: str):
    # Remove bq:// prefix if present
    prefix = "bq://"
    if bq_table_uri.startswith(prefix):
        bq_table_uri = bq_table_uri[len(prefix) :]

    table = bigquery.TableReference.from_string(bq_table_uri)
    rows = bqclient.list_rows(
        table,
    )
    return rows.to_dataframe()

df = download_table(BQ_SOURCE)

# Remove NA values
def clean_dataframe(df):
    return df.replace(to_replace=NA_VALUES, value=np.NaN).dropna()


df = clean_dataframe(df)

_CATEGORICAL_TYPES = {
    "island": pd.api.types.CategoricalDtype(categories=ISLANDS),
    "species": pd.api.types.CategoricalDtype(categories=SPECIES),
    "sex": pd.api.types.CategoricalDtype(categories=SEXES),
}


def standardize(df):
    """Scales numerical columns using their means and standard deviation to get
    z-scores: the mean of each numerical column becomes 0, and the standard
    deviation becomes 1. This can help the model converge during training.

    Args:
      df: Pandas df

    Returns:
      Input df with the numerical columns scaled to z-scores
    """
    dtypes = list(zip(df.dtypes.index, map(str, df.dtypes)))
    # Normalize numeric columns.
    for column, dtype in dtypes:
        if dtype == "float32":
            df[column] -= df[column].mean()
            df[column] /= df[column].std()
    return df


def preprocess(df):
    """Converts categorical features to numeric. Removes unused columns.

    Args:
      df: Pandas df with raw data

    Returns:
      df with preprocessed data
    """
    df = df.drop(columns=UNUSED_COLUMNS)

    # Drop rows with NaN's
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df[numeric_columns] = df[numeric_columns].astype("float32")

    # Convert categorical columns to numeric
    cat_columns = df.select_dtypes(["object"]).columns

    df[cat_columns] = df[cat_columns].apply(
        lambda x: x.astype(_CATEGORICAL_TYPES[x.name])
    )
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df


def convert_dataframe_to_list(
    df,
):
    df = preprocess(df)

    df_x, df_y = df, df.pop(LABEL_COLUMN)

#     # This is recommended to improve training but not necessary
#     # Join train_x and eval_x to normalize on overall means and standard
#     # deviations. Then separate them again.
#     all_x = pd.concat([df_train_x, df_test_x], keys=["train", "eval"])
#     all_x = standardize(all_x)
#     df_train_x, df_test_x = all_x.xs("train"), all_x.xs("eval")

    y = np.asarray(df_y).astype("float32")

    # Convert to numpy representation
    x = np.asarray(df_x)

    # Convert to one-hot representation
#     y = tf.keras.utils.to_categorical(y, num_classes=len(SPECIES))
    return x, y


x, y = convert_dataframe_to_list(df)

In [72]:
asdf = np.array(x).astype("int").tolist()

In [74]:
predictions = endpoint.predict(instances=asdf)
# y_predicted = np.argmax(predictions.predictions, axis=1)
# y_predicted
predictions

Prediction(predictions=[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0

In [26]:
# Download the test file
! gsutil -m cp -r gs://cloud-samples-data/ai-platform/penguins/penguins.test.csv .

If you experience problems with multiprocessing on MacOS, they might be related to https://bugs.python.org/issue33725. You can disable multiprocessing by editing your .boto config or by adding the following flag to your command: `-o "GSUtil:parallel_process_count=1"`. Note that multithreading is still available even if you disable multiprocessing.

Copying gs://cloud-samples-data/ai-platform/penguins/penguins.test.csv...
/ [1/1 files][  7.1 KiB/  7.1 KiB] 100% Done                                    
Operation completed over 1 objects/7.1 KiB.                                      


#### Preprocess the images
Before you can run the data through the endpoint, you need to preprocess it to match the format that your custom model defined in `task.py` expects.

`x_test`:
Normalize (rescale) the pixel data by dividing each pixel by 255. This replaces each single byte integer pixel with a 32-bit floating point number between 0 and 1.

`y_test`:
You can extract the labels from the image filenames. Each image's filename format is "image_{LABEL}_{IMAGE_NUMBER}.jpg"

In [27]:
import pandas as pd

df = pd.read_csv("penguins.test.csv", header=0)
x_test, y_test = df.iloc[:, 2:6], df.iloc[:, [0]]
x_test = x_test.values.tolist()
y_test = y_test.values.tolist()

In [28]:
'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'

('culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g')

In [29]:
instances = [{"culmen_length_mm": instance[0], "culmen_depth_mm": instance[1], "flipper_length_mm": instance[2], "body_mass_g": instance[3]}  for instance in x_test]

### Send the prediction request

Now that you have test images, you can use them to send a prediction request. Use the `Endpoint` object's `predict` function, which takes the following parameters:

- `instances`: A list of image instances. According to your custom model, each image instance should be a 3-dimensional matrix of floats. This was prepared in the previous step.

The `predict` function returns a list, where each element in the list corresponds to the corresponding image in the request. You will see in the output for each prediction:

- Confidence level for the prediction (`predictions`), between 0 and 1, for each of the ten classes.

You can then run a quick evaluation on the prediction results:
1. `np.argmax`: Convert each list of confidence levels to a label
2. Compare the predicted labels to the actual labels
3. Calculate `accuracy` as `correct/total`

In [30]:
import numpy as np

sample = {
    "island": "Biscoe",
    "culmen_length_mm": 37.8,
    "culmen_depth_mm": 17.1,
    "flipper_length_mm": 186.0,
    "body_mass_g": 3300.0,
    "sex": "MALE",
}

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
instances = [input_dict]

predictions = endpoint.predict(instances=[sample])
y_predicted = np.argmax(predictions.predictions, axis=1)

# correct = sum(y_predicted == np.array(y_test))
# accuracy = len(y_predicted)
# print(
#     f"Correct predictions = {correct}, Total predictions = {accuracy}, Accuracy = {correct/accuracy}"
# )

NameError: name 'tf' is not defined

In [None]:
import numpy as np

sample = {
    "island": "Biscoe",
    "culmen_length_mm": 37.8,
    "culmen_depth_mm": 17.1,
    "flipper_length_mm": 186.0,
    "body_mass_g": 3300.0,
    "sex": "MALE",
}


In [None]:
np.array(y_test)

## Undeploy the model

To undeploy your `Model` resource from the serving `Endpoint` resource, use the endpoint's `undeploy` method with the following parameter:

- `deployed_model_id`: The model deployment identifier returned by the endpoint service when the `Model` resource was deployed. You can retrieve the deployed models using the endpoint's `deployed_models` property.

Since this is the only deployed model on the `Endpoint` resource, you can omit `traffic_split`.

In [None]:
deployed_model_id = endpoint.list_models()[0].id
endpoint.undeploy(deployed_model_id=deployed_model_id)

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Training Job
- Model
- Endpoint
- Cloud Storage Bucket

In [None]:
delete_training_job = True
delete_model = True
delete_endpoint = True

# Warning: Setting this to true will delete everything in your bucket
delete_bucket = False

# Delete the training job
job.delete()

# Delete the model
model.delete()

# Delete the endpoint
endpoint.delete()

if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil -m rm -r $BUCKET_NAME