In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/tabular_optimized_online_prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/tabular_optimized_online_prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/tabular_optimized_online_prediction.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

# Training a tabular Criteo model and deploying it to Vertex AI Predictions using the optimized TensorFlow runtime

## Overview

In this sample, you learn how to train a tabular model on the Criteo Kaggle dataset using either the TensorFlow Keras API or the Estimator API.
Next, you deploy the trained model to the Vertex AI Prediction service twice, once using the open-source-based TensorFlow 2.7 container and once using the optimized TensorFlow runtime container, then benchmark the two deployments side by side and compare their predictions.

For additional information about Vertex AI Prediction optimized TensorFlow runtime containers, please refer to https://cloud.google.com/vertex-ai/docs/predictions/optimized-tensorflow-runtime.

### Dataset

In this sample, you use the Criteo Kaggle dataset, which takes up about 4 GB.


### Objective

In this notebook, you learn how to deploy a trained tabular model to Vertex AI Prediction using the optimized TensorFlow runtime, then compare its performance to that of the open-source-based TensorFlow container.

The steps performed include:
* Download and unpack Criteo Kaggle dataset
* Build and train a model using the Keras API
* Set up private endpoints
* Deploy a model to Vertex AI Prediction using TensorFlow 2.7 container
* Deploy a model to Vertex AI Prediction using optimized TensorFlow container
* Benchmark both models and validate their predictions

You can train a model and upload it to Vertex AI Prediction using Colab. However, because this walkthrough uses private endpoints to demonstrate Vertex AI Prediction, you must use a Jupyter VM to run the benchmarks.

### Costs

This tutorial uses the following billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Set up your local development environment

**If you are using Colab or Vertex AI Workbench Notebooks**, your environment meets the requirements to run this notebook. You can skip this step.

**If you are not using Colab or Vertex AI Workbench Notebooks**, you must have the following in your environment to meet this notebook's requirements.

* The Google Cloud SDK
* Git
* Python 3
* virtualenv
* Jupyter notebook running in a virtual environment with Python 3

The Google Cloud guide to [Setting up a Python development
environment](https://cloud.google.com/python/setup) and the [Jupyter
installation guide](https://jupyter.org/install) provide detailed instructions
for meeting these requirements. The following steps are condensed instructions:

1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)

1. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)

1. [Install
   virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)
   and create a virtual environment that uses Python 3. Activate the virtual environment.

1. To install Jupyter, run `pip3 install jupyter` on the
command-line in a terminal shell.

1. To launch Jupyter, run `jupyter notebook` on the command-line in a terminal shell.

1. Open this notebook in the Jupyter Notebook Dashboard.

### Install additional packages

Install additional package dependencies that are not installed in your notebook environment, such as the TensorFlow Serving API and the Vertex AI SDK.

In [None]:
import os

# The presence of this metadata file is used to detect the Vertex AI
# Workbench Notebook product, which has specific requirements.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Vertex AI Workbench Notebook requires dependencies to be installed with
# '--user'; everywhere else no extra pip flag is passed.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""

In [None]:
! pip3 install {USER_FLAG} --upgrade tensorflow==2.7.0 -q
! pip3 install {USER_FLAG} --upgrade tensorflow-serving-api==2.7.0 -q
! pip3 install {USER_FLAG} --upgrade google-cloud-aiplatform -q
! pip3 install {USER_FLAG} --upgrade google-cloud-storage -q

### Restart the kernel

After you install the additional packages, you must restart the notebook kernel so it can find the packages.

In [None]:
# Restart the kernel so that the packages installed above can be imported.
# The restart is skipped when the IS_TESTING environment variable is set.
import os

if not os.getenv("IS_TESTING"):
    from IPython import Application

    app = Application.instance()
    app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute and storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). 

1. [Enable the Service Networking API](https://console.cloud.google.com/flows/enableapi?apiid=servicenetworking.googleapis.com). 

1. [Enable the Cloud DNS API](https://console.cloud.google.com/flows/enableapi?apiid=dns.googleapis.com). 

1. If you are running this notebook locally, you must install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the following cell. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you can try to get your project ID using `gcloud`.

In [None]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Set your project ID here.

In [None]:
# Use the manually entered value only when no project ID has been set yet.
if PROJECT_ID in ("", None):
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, create a timestamp for each instance session, then append it to the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via OAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click **Create**. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the following cell, then run the cell.

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# The Vertex AI Workbench Notebook product has specific requirements.
# The flag is recomputed here because the kernel restart earlier in the
# notebook discards the value set in the installation cell.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Vertex AI Workbench Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    # Colab is detected by the google.colab package being present in
    # sys.modules (presumably preloaded by the Colab kernel).
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account. Skipped when the IS_TESTING environment variable is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required for all notebook environments.**

For Vertex AI Prediction to serve your model, the model must first be uploaded to a Cloud Storage bucket.

Set the name of your Cloud Storage bucket in the following cell. It must be unique across all
Cloud Storage buckets.

You can change the `REGION` variable, which is used for operations
throughout the rest of this notebook. We suggest that you [choose a region where Vertex AI services are
available](https://cloud.google.com/vertex-ai/docs/general/locations#available_regions).

In [None]:
BUCKET_URI = "gs://[your-bucket-name]"  # @param {type:"string"}
REGION = "[your-region]"  # @param {type:"string"}

In [None]:
# Empty or placeholder bucket name: derive one from the project ID and the
# session timestamp.
if BUCKET_URI in ("", None, "gs://[your-bucket-name]"):
    BUCKET_URI = "".join(["gs://", PROJECT_ID, "aip-", TIMESTAMP])

# Placeholder region: fall back to us-central1.
REGION = "us-central1" if REGION == "[your-region]" else REGION

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

The final step for your Cloud Storage bucket is to validate access to your Cloud Storage bucket by examining its contents:

In [None]:
! gsutil ls -al $BUCKET_URI

### Import libraries and define constants

In [None]:
import json
import math
import os
import re
import sys
from urllib.parse import urlparse

import grpc
import numpy as np
import requests as r
import tensorflow as tf
from tensorflow_serving.apis import (predict_pb2, prediction_log_pb2,
                                     prediction_service_pb2_grpc)

# Use TensorFlow's logger for notebook messages. Note that this binds the name
# `logging` to a logger instance, not to the standard-library `logging` module.
logging = tf.get_logger()
# Do not pass records on to ancestor loggers (presumably to avoid duplicated
# output in the notebook).
logging.propagate = False
logging.setLevel("INFO")

In [None]:
LOCAL_DIRECTORY = "~/criteo"  # @param {type:"string"}
HIDDEN_LAYERS_STR = "2048,2048,1024,512,256"  # @param {type:"string"}

# Hidden layer widths as integers, in the order given in HIDDEN_LAYERS_STR.
HIDDEN_LAYERS = [int(width) for width in HIDDEN_LAYERS_STR.split(",")]
# LOCAL_DIRECTORY with a leading "~" expanded to the user's home directory.
LOCAL_DIRECTORY_FULL = os.path.expanduser(LOCAL_DIRECTORY)

## Download the dataset

Follow the instructions at the [Criteo website](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) to download the data.

If the data is not available, you can download it using the URL below.

In [None]:
!mkdir -p $LOCAL_DIRECTORY_FULL/data

In [None]:
!cd $LOCAL_DIRECTORY_FULL/data && curl -O https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10082655/dac.tar.gz

In [None]:
!cd $LOCAL_DIRECTORY_FULL/data && tar xvzf dac.tar.gz

In [None]:
!head -n 3 $LOCAL_DIRECTORY_FULL/data/train.txt

## Read and transform the dataset

Before the model can be trained, the variables must be pre-processed.

Numerical values are normalized by subtracting their average and dividing by their standard deviation.
The average values and the standard deviations are precalculated for each numerical feature. The vocabulary sizes are precalculated for each categorical feature.

In [None]:
# Column order used when parsing the tab-separated training file: the label,
# then 13 integer features (int1..int13), then 26 categorical features
# (cat1..cat26).
COLUMN_NAMES = (
    ["label"]
    + [f"int{i}" for i in range(1, 14)]
    + [f"cat{i}" for i in range(1, 27)]
)

# Precalculated, see
# https://github.com/vlasenkoalexey/criteo_nbdev/blob/master/04_data_reader.ipynb
NUM_AVERAGE = {
    "int1": 3.5024133170753995,
    "int2": 105.8484197976657,
    "int3": 26.91304102061112,
    "int4": 7.322680248873331,
    "int5": 18538.99166487135,
    "int6": 116.06185085211605,
    "int7": 16.333130032135013,
    "int8": 12.517042137556762,
    "int9": 106.10982343805145,
    "int10": 0.6175294977722183,
    "int11": 2.7328343170173173,
    "int12": 0.9910356287721245,
    "int13": 8.21746116117401,
}
NUM_STDDEV = {
    "int1": 9.429076407105086,
    "int2": 391.4578226870704,
    "int3": 397.97258302273474,
    "int4": 8.793230712645805,
    "int5": 69394.60184622335,
    "int6": 382.5664493712363,
    "int7": 66.0497552451171,
    "int8": 16.688884567787586,
    "int9": 220.28309398647906,
    "int10": 0.6840505553977025,
    "int11": 5.199070884811354,
    "int12": 5.597723872237179,
    "int13": 16.211932558173785,
}
VOCABULARY_SIZE = {
    "cat1": 1460,
    "cat2": 583,
    "cat3": 10131226,
    "cat4": 2202607,
    "cat5": 305,
    "cat6": 23,
    "cat7": 12517,
    "cat8": 633,
    "cat9": 3,
    "cat10": 93145,
    "cat11": 5683,
    "cat12": 8351592,
    "cat13": 3194,
    "cat14": 27,
    "cat15": 14992,
    "cat16": 5461305,
    "cat17": 10,
    "cat18": 5652,
    "cat19": 2172,
    "cat20": 3,
    "cat21": 7046546,
    "cat22": 17,
    "cat23": 15,
    "cat24": 286180,
    "cat25": 104,
    "cat26": 142571,
}

In [None]:
@tf.function
def transform_row(*row_tuple):
    """Turn one parsed record into a ``(features, label)`` pair.

    Args:
        *row_tuple: One value per entry of ``COLUMN_NAMES``, in the same order.

    Returns:
        A tuple ``(features, label)``: ``features`` is a dict keyed by column
        name that holds every column except ``"label"``, and ``label`` is the
        value that was paired with ``"label"``.
    """
    features = dict(zip(COLUMN_NAMES, row_tuple))
    label = features.pop("label")
    return (features, label)


def read_gcs(batch_size=64):
    """Build a batched ``tf.data`` pipeline over the local ``train.txt`` file.

    Despite its name, this function reads
    ``LOCAL_DIRECTORY_FULL/data/train.txt`` from local disk.

    Args:
        batch_size: Number of records combined into each dataset element.

    Returns:
        A dataset of ``(features, label)`` pairs as produced by
        ``transform_row``. Batching happens before shuffling, so the shuffle
        buffer of 500 reorders whole batches rather than single records.
    """

    def default_for(column_name):
        # The label is given as a bare dtype, integer features default to 0
        # and categorical features default to the empty string.
        if column_name == "label":
            return tf.int64
        if column_name.startswith("int"):
            return tf.constant(0, dtype=tf.int64)
        return tf.constant("", dtype=tf.string)

    train_path = os.path.join(LOCAL_DIRECTORY_FULL, "data", "train.txt")
    csv_records = tf.data.experimental.CsvDataset(
        train_path,
        [default_for(column_name) for column_name in COLUMN_NAMES],
        field_delim="\t",
        header=False,
    )

    batched = csv_records.batch(batch_size)
    shuffled = batched.shuffle(500)
    return shuffled.map(transform_row).prefetch(50)

In [None]:
for row in read_gcs(batch_size=3).take(2):
    print(row)

## Train and Save Keras Model

For an overview of how to train tabular models using TensorFlow Keras APIs, see https://github.com/tensorflow/docs/blob/r2.4/site/en/tutorials/structured_data/feature_columns.ipynb

In [None]:
def make_norm_fn(column_name):
    """Return a function that standardizes values of one numeric column.

    Args:
        column_name: Key into ``NUM_AVERAGE`` and ``NUM_STDDEV``.

    Returns:
        A one-argument callable that casts its input to ``tf.float32``,
        subtracts the column's precalculated average and divides the result
        by the column's precalculated standard deviation.
    """
    mean, std = NUM_AVERAGE[column_name], NUM_STDDEV[column_name]
    return lambda v: (tf.dtypes.cast(v, tf.float32) - mean) / std


def create_feature_columns():
    """Build the feature columns for every ``int*`` and ``cat*`` column.

    Integer columns become normalized numeric columns. Categorical columns are
    hashed into at most 100000 buckets and embedded; the embedding dimension
    is ``floor(6 * vocabulary_size ** 0.25)``, capped at 50.

    Returns:
        A list with all numeric columns first, followed by all embedding
        columns, each group in ``COLUMN_NAMES`` order.
    """
    numeric_columns = [
        tf.feature_column.numeric_column(
            name,
            dtype=tf.dtypes.int64,
            normalizer_fn=make_norm_fn(name),
        )
        for name in COLUMN_NAMES
        if name.startswith("int")
    ]

    embedding_columns = []
    for name in COLUMN_NAMES:
        if not name.startswith("cat"):
            continue
        vocabulary_size = VOCABULARY_SIZE[name]
        bucket_count = min(vocabulary_size, 100000)
        dimension = int(min(50, math.floor(6 * vocabulary_size**0.25)))
        hashed = tf.feature_column.categorical_column_with_hash_bucket(
            name, bucket_count, dtype=tf.dtypes.string
        )
        embedding_columns.append(
            tf.feature_column.embedding_column(hashed, dimension)
        )

    return numeric_columns + embedding_columns

In [None]:
def create_keras_model_sequential():
    """Build and compile the sequential Keras classifier.

    The network is a ``DenseFeatures`` input layer over the columns from
    ``create_feature_columns()``, followed by one BatchNormalization ->
    Dense(ReLU) -> Dropout(0.05) group per entry of ``HIDDEN_LAYERS``, and a
    single sigmoid output unit.

    Returns:
        The model, compiled with Adam (learning rate 0.01), binary
        cross-entropy loss and the accuracy metric.
    """
    keras_layers = tf.keras.layers

    stack = [
        keras_layers.DenseFeatures(create_feature_columns(), name="feature_layer")
    ]
    # Layers are instantiated in network order, the same order the model
    # applies them.
    for units in HIDDEN_LAYERS:
        stack.append(keras_layers.BatchNormalization())
        stack.append(keras_layers.Dense(units, activation=tf.nn.relu))
        stack.append(keras_layers.Dropout(0.05))
    stack.append(keras_layers.Dense(1, activation=tf.nn.sigmoid))
    model = tf.keras.Sequential(stack)

    logging.info("compiling sequential keras model")
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.01),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["accuracy"],
    )
    return model


model = create_keras_model_sequential()

Train the model. The expected loss is about 0.35.

In [None]:
model.fit(read_gcs(batch_size=256).take(1000), epochs=3)

Validate the model. The expected loss is about 0.45.

In [None]:
model.evaluate(read_gcs(batch_size=256).skip(1000).take(1000))

In [None]:
model.summary()

In [None]:
model.save(os.path.join(LOCAL_DIRECTORY_FULL, "keras"), include_optimizer=False)

Check the model signature to see which fields a prediction request should have.

In [None]:
!saved_model_cli show --dir $LOCAL_DIRECTORY_FULL/keras --all

## (Optional) Train and Save Estimator Model

Another option to train a model is to use the TensorFlow Estimator API. For more information, see
https://github.com/tensorflow/docs/blob/r2.4/site/en/tutorials/estimator/premade.ipynb

The following code is provided only for illustration purposes. You use the Keras model for deployment.

In [None]:
# Estimator counterpart of the Keras model: it is configured with the same
# feature columns, hidden layer sizes, dropout rate, batch normalization and
# Adam learning rate, and predicts two classes.
feature_columns = create_feature_columns()
estimator = tf.estimator.DNNClassifier(
    optimizer=tf.optimizers.Adam(learning_rate=0.01),
    feature_columns=feature_columns,
    hidden_units=HIDDEN_LAYERS,
    dropout=0.05,
    batch_norm=True,
    n_classes=2,
)

In [None]:
# Training input: the first 2000 batches of 256 records.
# Evaluation input: 100 batches of the default batch size (64), taken after
# skipping 500 batches.
# NOTE(review): read_gcs shuffles its batches, so the evaluation batches are
# presumably not guaranteed to be disjoint from the training batches. This is
# acceptable only because the cell is for illustration.
tf.estimator.train_and_evaluate(
    estimator,
    train_spec=tf.estimator.TrainSpec(
        input_fn=lambda: read_gcs(batch_size=256).take(2000)
    ),
    eval_spec=tf.estimator.EvalSpec(input_fn=lambda: read_gcs().skip(500).take(100)),
)

In [None]:
!rm -r -f $LOCAL_DIRECTORY_FULL/estimator

In [None]:
# WARNING: this turns off eager execution for the whole kernel. Later cells
# in this notebook call `.numpy()` on dataset elements (see row_to_dict),
# which needs eager execution, so restart the runtime before continuing.
tf.compat.v1.disable_eager_execution()  # You'll have to restart Runtime after running this
# One placeholder per feature column, describing the raw serving inputs:
# integer features have shape (1,), categorical features are scalar strings.
spec_dict = {}
for column_name in COLUMN_NAMES:
    if column_name.startswith("int"):
        spec_dict[column_name] = tf.compat.v1.placeholder(
            name=column_name, shape=(1,), dtype=tf.int64
        )
    if column_name.startswith("cat"):
        spec_dict[column_name] = tf.compat.v1.placeholder(
            name=column_name, shape=(), dtype=tf.string
        )

serving_input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(spec_dict)
estimator_base_path = os.path.join(LOCAL_DIRECTORY_FULL, "estimator")
estimator_path = estimator.export_saved_model(estimator_base_path, serving_input_fn)
# The export path is decoded to text here so that it can be interpolated into
# the shell command in the next cell.
estimator_path = estimator_path.decode("ascii")
estimator_path

In [None]:
!saved_model_cli show --dir $estimator_path --all

## Generate prediction requests

Now we can generate requests to send to our model for inference.
Requests are generated in the JSON Lines format, one request per line.

In [None]:
!mkdir -p $LOCAL_DIRECTORY_FULL/requests

In [None]:
def wrap_value(value, wrap_value):
    """Optionally wrap ``value`` in a single-element list.

    Args:
        value: The value to return or wrap.
        wrap_value: When truthy, ``[value]`` is returned; otherwise ``value``
            is returned unchanged. Inside the body this parameter shadows the
            function's own name.

    Returns:
        ``[value]`` or ``value``, depending on ``wrap_value``.
    """
    return [value] if wrap_value else value


def row_to_dict(row, wrap_values):
    """Convert one dataset element into a JSON-serializable request dict.

    Args:
        row: A ``(features, label)`` pair; only ``row[0]``, the feature dict,
            is read. Each feature value must support ``.numpy().tolist()``.
        wrap_values: Passed to ``wrap_value`` for every individual value; when
            truthy, each value is wrapped in its own single-element list.

    Returns:
        A dict that maps each feature key containing ``"int"`` or ``"cat"`` to
        a list of values. Values of ``"cat"`` keys are decoded to ``str``.
    """

    def listed(tensor, convert):
        return [wrap_value(convert(v), wrap_values) for v in tensor.numpy().tolist()]

    request = {}
    for key, tensor in row[0].items():
        if "int" in key:
            request[key] = listed(tensor, lambda v: v)
        if "cat" in key:
            request[key] = listed(tensor, lambda v: v.decode())
    return request


def export_requests_jsonl(file_name, rows=100, batch_size=64, wrap_values=True):
    """Write prediction requests to a JSON Lines file, one request per line.

    Args:
        file_name: Destination path; opened for writing with
            ``tf.io.gfile.GFile``.
        rows: Number of requests (lines) to write. Each request holds one
            dataset element, that is, up to ``batch_size`` records. Values of
            zero or less produce an empty file.
        batch_size: Batch size passed to ``read_gcs``.
        wrap_values: Passed to ``row_to_dict``; controls whether each value is
            wrapped in its own single-element list.
    """
    # The previous countdown only stopped when the counter hit exactly 0, so
    # rows <= 0 exported the entire dataset. Dataset.take() treats a negative
    # count as "take everything", hence the clamp.
    request_count = max(int(rows), 0)
    with tf.io.gfile.GFile(file_name, mode="w") as f:
        for row in read_gcs(batch_size).take(request_count):
            f.write(json.dumps(row_to_dict(row, wrap_values)))
            f.write("\n")

In [None]:
# One JSON Lines file per (number of requests, batch size) combination. The
# file name encodes both values: requests_<requests>_<batch size>.jsonl.
for request_count, request_batch_size in [
    (1, 1),
    (1, 512),
    (10, 1),
    (10, 512),
    (10, 1024),
    (100, 1),
    (100, 512),
    (100, 1024),
]:
    export_requests_jsonl(
        os.path.join(
            LOCAL_DIRECTORY_FULL,
            "requests",
            f"requests_{request_count}_{request_batch_size}.jsonl",
        ),
        rows=request_count,
        batch_size=request_batch_size,
    )

If you want to export requests for the Estimator model, you must set `wrap_values` to `False`. 

In [None]:
export_requests_jsonl(
    os.path.join(LOCAL_DIRECTORY_FULL, "requests", "requests_estimator_10_1.jsonl"),
    rows=10,
    batch_size=1,
    wrap_values=False,
)

## (Optional) Generate warmup requests

The TensorFlow runtime has components that are lazily initialized. Lazy initialization might result in high latency for the first requests that are sent to a model after it's loaded. This latency can be several orders of magnitude higher than that of a single inference request.

For more information about SavedModel warmup, see https://www.tensorflow.org/tfx/serving/saved_model_warmup.

For Vertex AI Prediction using the optimized TensorFlow runtime, when the model is precompiled the first request for each new batch size has higher latency. Precompilation is enabled when the `allow_precompilation` flag is set to true.

To mitigate high latency, provide a warmup request for the runtime to load when it starts.
The warmup file should include the various batch sizes you expect your model to receive in production.

Note that providing a warmup request with multiple batch sizes increases the time it takes for each node to start.

If you expect the model to receive multiple batch sizes, you can use automatic server-side request batching with a set of `allowed_batch_sizes`. For more information, see https://www.tensorflow.org/tfx/serving/serving_config#batching_configuration.

To enable auto-batching for a model running on Vertex AI Prediction, put your batching configuration into the [config/batching_parameters_config](https://cloud.google.com/vertex-ai/docs/training/exporting-model-artifacts#enable_server-side_request_batching_for_tensorflow) file in the same GCS directory as saved_model.pb.

In [None]:
!mkdir -p $LOCAL_DIRECTORY_FULL/keras/assets.extra

In [None]:
def build_grpc_request(
    row_dict, model_name="default", signature_name="serving_default"
):
    """Generate gRPC inference request with payload.

    Args:
        row_dict: Maps input names to values. Keys containing ``"cat"`` are
            encoded as string tensors, all other keys as int64 tensors.
        model_name: Written to ``model_spec.name``.
        signature_name: Written to ``model_spec.signature_name``.

    Returns:
        A ``predict_pb2.PredictRequest`` with one input tensor per key.
    """
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name
    request.model_spec.signature_name = signature_name
    for key, value in row_dict.items():
        tensor_dtype = tf.string if "cat" in key else tf.int64
        request.inputs[key].CopyFrom(tf.make_tensor_proto(value, dtype=tensor_dtype))
    return request


def export_warmup_file(
    request_files, export_path, model_name="default", signature_name="serving_default"
):
    """Write a TFRecord warmup file built from JSON Lines request files.

    Only the first line of each request file is used, so the warmup file
    receives exactly one record per entry of ``request_files``.

    Args:
        request_files: Paths of JSON Lines files with prediction requests.
        export_path: Path of the TFRecord file to write.
        model_name: Passed to ``build_grpc_request``.
        signature_name: Passed to ``build_grpc_request``.
    """
    with tf.io.TFRecordWriter(export_path) as writer:
        for request_file_path in request_files:
            with open(request_file_path) as f:
                first_line = f.readline()
            request = build_grpc_request(
                json.loads(first_line), model_name, signature_name
            )
            predict_log = prediction_log_pb2.PredictLog(request=request)
            record = prediction_log_pb2.PredictionLog(predict_log=predict_log)
            writer.write(record.SerializeToString())


export_warmup_file(
    [
        os.path.join(LOCAL_DIRECTORY_FULL, "requests", "requests_1_1.jsonl"),
        os.path.join(LOCAL_DIRECTORY_FULL, "requests", "requests_1_512.jsonl"),
    ],
    os.path.join(
        LOCAL_DIRECTORY_FULL, "keras", "assets.extra", "tf_serving_warmup_requests"
    ),
)

## Deploy model to Vertex AI Prediction

To deploy a model to Vertex AI Prediction service, you must put it in a GCS bucket.

In [None]:
# WARNING: this deletes every object currently stored in BUCKET_URI. Do not
# run this cell against a bucket that holds data you want to keep.
!gsutil rm -r $BUCKET_URI/*

In [None]:
!gsutil cp -r $LOCAL_DIRECTORY_FULL/keras/* $BUCKET_URI

Import the Vertex AI Python client library into your notebook environment.

In [None]:
from google.cloud.aiplatform import gapic as aip

Define the node type to use for deployments. To learn about Vertex AI Prediction options, see [configure compute resources](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute). 

In [None]:
# Machine type used by every deployment in this notebook.
DEPLOY_COMPUTE = "n1-standard-16"
# Accelerator attached to the GPU deployments; the CPU deployment sets
# accelerator_count to 0 and does not use it.
DEPLOY_GPU = aip.AcceleratorType.NVIDIA_TESLA_T4

The Vertex AI Python client library works as a client/server model. 

The following clients are used in this sample:
- Model Service for managing models.
- Endpoint Service for deployment.

In [None]:
# Regional API host and parent resource path, both derived from REGION and
# PROJECT_ID; PARENT is passed to every upload/create/list call below.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

# Both service clients talk to the same regional endpoint.
client_options = {"api_endpoint": API_ENDPOINT}
model_service_client = aip.ModelServiceClient(client_options=client_options)
endpoint_service_client = aip.EndpointServiceClient(client_options=client_options)

### Setup private endpoint for online prediction

The throughput and latency of the Criteo model you trained is sensitive to network performance.

Notice that a single request with a batch size of 512 takes ~200 KB.

In [None]:
!ls -alh $LOCAL_DIRECTORY_FULL/requests/requests_1_512.jsonl

For the best performance, use a Vertex AI Prediction private endpoint.

To use a private endpoint, set up a VPC peering network between your project and the Vertex AI Prediction service project that is hosting VMs running your model. This eliminates additional hops in network traffic and allows using the efficient gRPC protocol.

For more information about private endpoints, see https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints.

For more information about VPC peering in Vertex AI, see https://cloud.google.com/vertex-ai/docs/general/vpc-peering.

**IMPORTANT: you can only set up one VPC peering to servicenetworking.googleapis.com per VPC network.**

For simplicity, you set up VPC peering to the default network. You can create a different network for your project.

If you set up VPC peering with any other network, make sure that the network already exists and that your VM is running on that network.

In [None]:
# This is for display only; you can name the range anything.
PEERING_RANGE_NAME = "vertex-ai-prediction-peering-range"
# VPC network that is peered with the service; it is also used later to build
# the full network name for the private endpoints.
NETWORK = "default"

In [None]:
# NOTE: `prefix-length=16` means a CIDR block with mask /16 will be
# reserved for use by Google services, such as Vertex AI.
!gcloud compute addresses create $PEERING_RANGE_NAME \
  --global \
  --prefix-length=16 \
  --description="peering range for Google service" \
  --network=$NETWORK \
  --purpose=VPC_PEERING

Create the VPC connection.

In [None]:
!gcloud services vpc-peerings connect \
  --service=servicenetworking.googleapis.com \
  --network=$NETWORK \
  --ranges=$PEERING_RANGE_NAME \
  --project=$PROJECT_ID

If you receive a permission error when running this command, then try running it with your user account.

To run this command with your user account, do the following:
- Add `echo` before the command in the cell above (`echo gcloud services vpc-peerings ...`).
- run the cell and copy its output
- start new terminal window, and run `gcloud auth login` to authenticate using your user account.
- paste and run the copied command in the terminal.

Check the status of your peering connections.

In [None]:
!gcloud compute networks peerings list --network $NETWORK

### Upload model to Vertex AI Prediction

Learn more about [model_service.upload_model](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.model_service.ModelServiceClient#google_cloud_aiplatform_v1_services_model_service_ModelServiceClient_upload_model).


`artifact_uri` argument should point to a GCS path where `saved_model.pb` file is located for your model.

`image_uri` specifies which docker image to use. Here we upload the same model using the TF2.7 CPU and GPU images and the Vertex AI Prediction optimized TensorFlow runtime image.

In order to be able to send requests to your models over gRPC, you need to set `model_name` argument and update `predict_route` and `health_route` accordingly.

Please note that gRPC support in Vertex AI Prediction is still experimental.

In [None]:
# Model spec for the stock TF2.7 CPU serving image, reading the SavedModel
# from BUCKET_URI.
tf27_cpu_model_dict = {
    "display_name": "Criteo Kaggle TF2.7 CPU model",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-7:latest",
        "args": [
            # 8500 is the port the gRPC benchmark later connects to.
            "--port=8500",
            # REST port; matches the exposed container_port below.
            "--rest_api_port=8080",
            # Must agree with the model name in predict_route and health_route.
            "--model_name=default",
            "--model_base_path=$(AIP_STORAGE_URI)",
        ],
        "ports": [{"container_port": 8080}],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}
# Wait up to 180 seconds for the upload operation and keep only the model's
# resource name (a "projects/<number>/..." string).
tf27_cpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf27_cpu_model_dict)
    .result(timeout=180)
    .model
)
tf27_cpu_model

In [None]:
# Same spec as the CPU model, but using the TF2.7 GPU serving image.
tf27_gpu_model_dict = {
    "display_name": "Criteo Kaggle TF2.7 GPU model",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-7:latest",
        "args": [
            # 8500 is the port the gRPC benchmark later connects to.
            "--port=8500",
            # REST port; matches the exposed container_port below.
            "--rest_api_port=8080",
            # Must agree with the model name in predict_route and health_route.
            "--model_name=default",
            "--model_base_path=$(AIP_STORAGE_URI)",
        ],
        "ports": [{"container_port": 8080}],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}
# Wait up to 180 seconds for the upload operation and keep only the model's
# resource name.
tf27_gpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf27_gpu_model_dict)
    .result(timeout=180)
    .model
)
tf27_gpu_model

For deploying a model using Vertex AI Prediction optimized TensorFlow runtime, use the `us-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.nightly:latest` container.

Two optimization options are applied to the model.
- *allow_precompilation* - turns on model pre-compilation for better performance. Note that model precompilation happens when the first request with the new batch size arrives, and the response for that request is sent after precompilation is complete. To mitigate this, specify a warmup file (see the section earlier in this colab). Model precompilation works for different kinds of models, and in most cases has a positive effect on performance. However, we recommend that you try it out for your model before you enable it in production.
- *allow_precision_affecting_optimizations* - enables precision affecting optimizations. In some cases this makes the model run significantly faster at the cost of very minimal loss to model prediction power. You should assess the precision impact to your model when using this optimization.

For the list of available optimized TensorFlow runtime containers and options, see https://cloud.google.com/vertex-ai/docs/predictions/optimized-tensorflow-runtime.

In [None]:
# Model spec for the optimized TensorFlow runtime image with precompilation
# enabled and precision-affecting (lossy) optimizations disabled.
tf_opt_gpu_model_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.nightly:latest",
        "args": [
            # Must agree with the model name in predict_route and health_route.
            "--model_name=default",
            "--allow_precompilation=true",
            "--allow_precision_affecting_optimizations=false",
        ],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}

# Wait up to 180 seconds for the upload operation and keep only the model's
# resource name.
tf_opt_gpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf_opt_gpu_model_dict)
    .result(timeout=180)
    .model
)
tf_opt_gpu_model

In [None]:
# Same as the optimized runtime model above, except that precision-affecting
# (lossy) optimizations are turned on.
tf_opt_lossy_gpu_model_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model with lossy optimizations",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.nightly:latest",
        "args": [
            # Must agree with the model name in predict_route and health_route.
            "--model_name=default",
            "--allow_precompilation=true",
            "--allow_precision_affecting_optimizations=true",
        ],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}

# Wait up to 180 seconds for the upload operation and keep only the model's
# resource name.
tf_opt_lossy_gpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf_opt_lossy_gpu_model_dict)
    .result(timeout=180)
    .model
)
tf_opt_lossy_gpu_model

List all models.

In [None]:
model_service_client.list_models(parent=PARENT)

### Create endpoints

Learn more about [endpoint_service.create_endpoint](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.endpoint_service.EndpointServiceClient#google_cloud_aiplatform_v1_services_endpoint_service_EndpointServiceClient_create_endpoint).

In [None]:
# Pull the numeric project number out of the uploaded model's resource name
# ("projects/<number>/...") and use it to build the fully qualified name of
# the peered network. If the name does not match, re.match returns None and
# the subscript raises TypeError.
project_number = re.match(r"projects/(\d+)/.+", tf27_cpu_model)[1]
full_network_name = f"projects/{project_number}/global/networks/{NETWORK}"
full_network_name

In [None]:
# Endpoint spec for the TF2.7 CPU model; "network" attaches the endpoint to
# the peered VPC network.
tf27_cpu_endpoint_dict = {
    "display_name": "Criteo Kaggle TF2.7 CPU private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name,
# which the deploy, benchmark and cleanup cells use to refer to it.
tf27_cpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf27_cpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf27_cpu_endpoint

In [None]:
# Endpoint spec for the TF2.7 GPU model; "network" attaches the endpoint to
# the peered VPC network.
tf27_gpu_endpoint_dict = {
    "display_name": "Criteo Kaggle TF2.7 GPU private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name.
tf27_gpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf27_gpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf27_gpu_endpoint

In [None]:
# Endpoint spec for the optimized runtime GPU model; "network" attaches the
# endpoint to the peered VPC network.
tf_opt_gpu_endpoint_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name.
tf_opt_gpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf_opt_gpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf_opt_gpu_endpoint

In [None]:
# Endpoint spec for the optimized runtime GPU model with lossy optimizations;
# "network" attaches the endpoint to the peered VPC network.
tf_opt_lossy_gpu_endpoint_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU with lossy optimizations private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name.
tf_opt_lossy_gpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf_opt_lossy_gpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf_opt_lossy_gpu_endpoint

### Deploy models to endpoints

Learn more about [endpoint_service.deploy_model](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.endpoint_service.EndpointServiceClient#google_cloud_aiplatform_v1_services_endpoint_service_EndpointServiceClient_deploy_model).

In [None]:
# Deployment spec for the TF2.7 CPU model: exactly one replica
# (min == max == 1) on DEPLOY_COMPUTE with no accelerator.
tf27_cpu_deployed_model_dict = {
    "model": tf27_cpu_model,
    "display_name": "Criteo Kaggle TF2.7 CPU deployed model",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_count": 0,
        },
    },
}

# Blocks until the deploy operation finishes (no timeout is set). The stored
# value is the operation's result object, not just an ID.
tf27_cpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf27_cpu_endpoint, deployed_model=tf27_cpu_deployed_model_dict
).result()
tf27_cpu_deployed_model

In [None]:
# Deployment spec for the TF2.7 GPU model: exactly one replica
# (min == max == 1) on DEPLOY_COMPUTE with a single DEPLOY_GPU accelerator.
tf27_gpu_deployed_model_dict = {
    "model": tf27_gpu_model,
    "display_name": "Criteo Kaggle TF2.7 GPU deployed model",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_type": DEPLOY_GPU,
            "accelerator_count": 1,
        },
    },
}

# Blocks until the deploy operation finishes (no timeout is set). The stored
# value is the operation's result object, not just an ID.
tf27_gpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf27_gpu_endpoint, deployed_model=tf27_gpu_deployed_model_dict
).result()
tf27_gpu_deployed_model

In [None]:
# Deployment spec for the optimized runtime model: same hardware as the
# TF2.7 GPU deployment (one replica, one DEPLOY_GPU accelerator).
tf_opt_gpu_deployed_model_dict = {
    "model": tf_opt_gpu_model,
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_type": DEPLOY_GPU,
            "accelerator_count": 1,
        },
    },
}

# Blocks until the deploy operation finishes (no timeout is set). The stored
# value is the operation's result object, not just an ID.
tf_opt_gpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf_opt_gpu_endpoint, deployed_model=tf_opt_gpu_deployed_model_dict
).result()
tf_opt_gpu_deployed_model

In [None]:
# Deployment spec for the optimized runtime model with lossy optimizations:
# same hardware as the other GPU deployments (one replica, one DEPLOY_GPU).
tf_opt_lossy_gpu_deployed_model_dict = {
    "model": tf_opt_lossy_gpu_model,
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model with lossy optimizations",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_type": DEPLOY_GPU,
            "accelerator_count": 1,
        },
    },
}

# Blocks until the deploy operation finishes (no timeout is set). The stored
# value is the operation's result object, not just an ID.
tf_opt_lossy_gpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf_opt_lossy_gpu_endpoint,
    deployed_model=tf_opt_lossy_gpu_deployed_model_dict,
).result()
tf_opt_lossy_gpu_deployed_model

## (optional) Compare performance of deployed models

To access private endpoints, the VM used to send requests must be deployed in the same network where you set up VPC peering. Because of this, you can't send requests to your models that are deployed with private endpoints from Colab.

To get the best performance, be sure the VM is in the same region as your model.

Import helper functions for benchmarking models.

In [None]:
!curl https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/benchmark.py -o benchmark.py

In [None]:
from benchmark import benchmark

This code sends a specified number of requests asynchronously and uniformly at a given QPS, then records the observed latency. Next, the latency results are aggregated and percentiles are calculated.
The `actual_qps` that the model can handle is calculated as the number of requests sent divided by the time it takes the model to process them. 
By providing different implementations for `send_request` and `build_request` functions, the same code can be used for benchmarking models running locally or on Vertex AI Prediction using gRPC and REST protocols.

The main goal of this benchmark is to measure model latency on different loads, and maximum throughput the model can handle. In order to find maximum throughput, gradually increase QPS until `actual_qps` stops increasing and latency increases dramatically.

On the production deployment, the workload is not uniform, and therefore the maximum model throughput is likely to be lower.
We are not trying to simulate production workload here. This benchmark is meant to compare latency and throughput for the same model running on different environments.

Details about deployed models can be accessed using the [endpoint_service_client.get_endpoint](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.endpoint_service.EndpointServiceClient#google_cloud_aiplatform_v1_services_endpoint_service_EndpointServiceClient_get_endpoint) API:

In [None]:
# NOTE: this rebinds tf_opt_gpu_endpoint_dict. Earlier in the notebook the
# name held the plain dict passed to create_endpoint; from here on it holds
# the object returned by get_endpoint, which includes the deployed models.
tf_opt_gpu_endpoint_dict = endpoint_service_client.get_endpoint(
    name=tf_opt_gpu_endpoint
)
tf_opt_gpu_endpoint_dict

First, verify that you can access your models.

In [None]:
# Health-check URI of the first deployed model on the private endpoint; the
# next cell passes it to curl.
health_url = tf_opt_gpu_endpoint_dict.deployed_models[
    0
].private_endpoints.health_http_uri
health_url

In [None]:
!curl $health_url

Define helper methods to run benchmarks against private endpoints using REST protocol.
The URI where requests should be sent can be found in `deployed_model.private_endpoints.predict_http_uri`.

In [None]:
def build_rest_request(
    row_dict, model_name="default", signature_name="serving_default"
):
    """Serialize one request row into a JSON body for the REST predict call.

    Args:
        row_dict: Model inputs, placed under the "inputs" key.
        model_name: Accepted but not used when building the payload.
        signature_name: Placed under the "signature_name" key.

    Returns:
        The JSON-encoded request body as a string.
    """
    body = {"signature_name": signature_name, "inputs": row_dict}
    return json.dumps(body)


def benchmark_rest_private_endpoint(
    endpoint_name, qps_list, model_name=None, duration_seconds=5
):
    """Benchmark the first model deployed on a private endpoint over REST.

    Args:
        endpoint_name: Resource name of the endpoint to look up.
        qps_list: QPS values forwarded to ``benchmark``.
        model_name: Forwarded to ``benchmark`` as ``model_name``.
        duration_seconds: Forwarded to ``benchmark``.

    Returns:
        Whatever ``benchmark`` returns.
    """
    first_deployed_model = endpoint_service_client.get_endpoint(
        name=endpoint_name
    ).deployed_models[0]
    predict_uri = first_deployed_model.private_endpoints.predict_http_uri

    def send_rest_request(request):
        # Any status other than 200 fails the assertion.
        response = r.post(predict_uri, data=request)
        assert response.status_code == 200
        return response

    requests_path = f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl"
    return benchmark(
        send_rest_request,
        build_rest_request,
        requests_path,
        qps_list,
        duration_seconds,
        model_name=model_name,
    )

You can also benchmark your models deployed on private endpoint using gRPC protocol. 

The gRPC address is the host name of `predict_http_uri` (or `health_http_uri`) with port 8500.
The gRPC destination has the format `<endpoint_id>-<deployed_model_id>` and is passed as the `grpc-destination` metadata header.

Please note that gRPC support in Vertex AI Prediction is still experimental.

In [None]:
def parse_endpoint_dict(endpoint_dict):
    """Derive the gRPC address and destination of an endpoint's first deployed model.

    Args:
        endpoint_dict: Endpoint object exposing ``name`` and ``deployed_models``;
            the first deployed model must expose ``id`` and
            ``private_endpoints.predict_http_uri``.

    Returns:
        A ``(grpc_uri, grpc_destination)`` tuple, where ``grpc_uri`` is
        ``"<host>:8500"`` and ``grpc_destination`` is
        ``"<endpoint_id>-<deployed_model_id>"``.

    Raises:
        ValueError: If the endpoint name contains no numeric endpoint ID or
            the predict URI has no host name.
    """
    name_match = re.match(r".+/endpoints/(\d+)", endpoint_dict.name)
    if name_match is None:
        raise ValueError(
            f"Cannot extract endpoint ID from resource name: {endpoint_dict.name!r}"
        )
    endpoint_id = name_match[1]

    deployed_model = endpoint_dict.deployed_models[0]
    grpc_destination = f"{endpoint_id}-{deployed_model.id}"

    predict_http_uri = deployed_model.private_endpoints.predict_http_uri
    # Use hostname rather than netloc: netloc keeps any explicit port or
    # userinfo, which would yield an invalid "host:port:8500" address.
    host = urlparse(predict_http_uri).hostname
    if not host:
        raise ValueError(f"Predict URI has no host name: {predict_http_uri!r}")
    grpc_uri = f"{host}:8500"
    return (grpc_uri, grpc_destination)


def benchmark_grpc_private_endpoint(endpoint_name, qps_list, duration_seconds=5):
    """Benchmark the first model deployed on a private endpoint over gRPC.

    Args:
        endpoint_name: Resource name of the endpoint to look up.
        qps_list: QPS values forwarded to ``benchmark``.
        duration_seconds: Forwarded to ``benchmark``.

    Returns:
        Whatever ``benchmark`` returns.
    """
    endpoint_dict = endpoint_service_client.get_endpoint(name=endpoint_name)
    grpc_uri, grpc_destination = parse_endpoint_dict(endpoint_dict)

    # The destination is sent with every call as request metadata.
    grpc_metadata = [("grpc-destination", grpc_destination)]

    # Use the channel as a context manager so its connection is closed once
    # the benchmark has returned, instead of leaking one channel per call.
    with grpc.insecure_channel(grpc_uri) as grpc_channel:
        grpc_stub = prediction_service_pb2_grpc.PredictionServiceStub(grpc_channel)

        def send_grpc_request(request):
            # 60 is the per-call timeout in seconds.
            return grpc_stub.Predict(request, 60, metadata=grpc_metadata)

        return benchmark(
            send_grpc_request,
            build_grpc_request,
            f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl",
            qps_list,
            duration_seconds,
            model_name="default",
        )

Now we can run a benchmark test for each endpoint and compare the results.

In [None]:
# gRPC benchmark of the TF2.7 CPU endpoint at the listed target QPS values,
# using the default duration_seconds of 5.
tf27_cpu_results = benchmark_grpc_private_endpoint(
    tf27_cpu_endpoint, [10, 20, 30, 40, 50, 55]
)
tf27_cpu_results

In [None]:
# gRPC benchmark of the TF2.7 GPU endpoint at the listed target QPS values,
# using the default duration_seconds of 5.
tf27_gpu_results = benchmark_grpc_private_endpoint(
    tf27_gpu_endpoint, [10, 20, 30, 40, 50, 60, 70, 75]
)
tf27_gpu_results

In [None]:
# gRPC benchmark of the optimized runtime GPU endpoint; the target QPS values
# go much higher than for the TF2.7 endpoints.
tf_opt_gpu_results = benchmark_grpc_private_endpoint(
    tf_opt_gpu_endpoint, [10, 50, 100, 150, 200, 250, 275, 300, 325, 350]
)
tf_opt_gpu_results

In [None]:
# gRPC benchmark of the optimized runtime GPU endpoint with lossy
# optimizations; this run uses the highest target QPS values (up to 800).
tf_opt_lossy_gpu_results = benchmark_grpc_private_endpoint(
    tf_opt_lossy_gpu_endpoint, [10, 50, 100, 200, 300, 400, 500, 600, 700, 800]
)
tf_opt_lossy_gpu_results

Combine and visualize results.

In [None]:
import matplotlib
import matplotlib.pyplot as plt


def build_graph(x_key, y_key, results_dict, axis):
    """Plot one latency curve per benchmark result on a shared figure.

    Args:
        x_key: Key of the x values in each result.
        y_key: Key of the y values in each result; also shown in the title.
        results_dict: Mapping of legend label to a benchmark result.
        axis: Axis limits handed to ``Axes.axis``.

    Returns:
        The matplotlib figure that was drawn.

    Note:
        Sets the global ``figure.figsize`` rcParam as a side effect.
    """
    matplotlib.rcParams["figure.figsize"] = [10.0, 7.0]

    figure, axes = plt.subplots(facecolor=(1, 1, 1))
    axes.set_xlabel("QPS")
    axes.set_ylabel("Latency(ms)")

    for series_label in results_dict:
        series = results_dict[series_label]
        axes.plot(
            np.array(series[x_key]),
            np.array(series[y_key]),
            label=series_label,
        )

    axes.legend()
    axes.axis(axis)
    axes.set_title(f"Criteo model {y_key} latency, batch size 512")
    return figure

In [None]:
# Median (p50) latency against achieved QPS for all four deployments.
# The tuple is handed to Axes.axis() as (xmin, xmax, ymin, ymax).
fig = build_graph(
    "actual_qps",
    "p50",
    {
        "TF2.7 CPU": tf27_cpu_results,
        "TF2.7 GPU": tf27_gpu_results,
        "TF opt GPU": tf_opt_gpu_results,
        "TF opt GPU lossy": tf_opt_lossy_gpu_results,
    },
    (0, 800, 0, 60),
)
fig.savefig("criteo_p50_latency_512.png", bbox_inches="tight")

In [None]:
# Tail (p99) latency against achieved QPS for all four deployments.
# The tuple is handed to Axes.axis() as (xmin, xmax, ymin, ymax); the y range
# is wider than in the p50 plot.
fig = build_graph(
    "actual_qps",
    "p99",
    {
        "TF2.7 CPU": tf27_cpu_results,
        "TF2.7 GPU": tf27_gpu_results,
        "TF opt GPU": tf_opt_gpu_results,
        "TF opt GPU lossy": tf_opt_lossy_gpu_results,
    },
    (0, 800, 0, 100),
)
fig.savefig("criteo_p99_latency_512.png", bbox_inches="tight")

You can see that the Vertex AI Prediction optimized TensorFlow runtime has significantly higher throughput and lower latency compared to TensorFlow 2.7.

## (Optional) Compare performance of deployed models using MLPerf Inference loadgen

MLPerf Inference is a benchmark suite for measuring how fast systems can run models in a variety of deployment scenarios. MLPerf is now an industry standard way of measuring model performance. You can follow the instructions at https://github.com/tensorflow/tpu/tree/master/models/experimental/inference/load_test to run the MLPerf Inference benchmark for deployed models.


## (Optional) Compare prediction results

In this sample the Vertex AI Prediction optimized TensorFlow runtime is used with the `allow_precision_affecting_optimizations` flag set to `true` to gain additional speedup. Now let's check how those optimizations affect prediction results.

We compare the prediction results for 51,200 instances (100 requests with a batch size of 512) for a model running on the optimized TensorFlow runtime with lossy optimizations and on TF2.7.

In [None]:
def get_predictions(endpoint, requests_file_path):
    """Send every request in a JSON-lines file to an endpoint over REST.

    Args:
        endpoint: Resource name of the endpoint; requests go to the
            ``predict_http_uri`` of its first deployed model.
        requests_file_path: Path of a JSON-lines file with one request per
            line. Blank lines are skipped.

    Returns:
        A numpy array holding the first element of every entry in each
        response's "outputs" list, in request order.

    Raises:
        RuntimeError: If a request returns an HTTP status other than 200.
    """
    endpoint_dict = endpoint_service_client.get_endpoint(name=endpoint)
    predict_uri = endpoint_dict.deployed_models[0].private_endpoints.predict_http_uri

    responses = []
    with tf.io.gfile.GFile(requests_file_path, "r") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # A trailing or stray blank line is not a request.
                continue
            request = build_rest_request(json.loads(line))
            response = r.post(predict_uri, data=request)
            # Fail with the server's message instead of a KeyError on "outputs".
            if response.status_code != 200:
                raise RuntimeError(
                    f"Prediction request on line {line_number} failed with "
                    f"HTTP {response.status_code}: {response.text}"
                )
            responses.extend(
                output[0] for output in json.loads(response.text)["outputs"]
            )

    return np.array(responses)

In [None]:
# Baseline predictions from the TF2.7 GPU endpoint for the benchmark request file.
tf27_gpu_predictions = get_predictions(
    tf27_gpu_endpoint, f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl"
)

In [None]:
# Predictions for the same request file from the optimized runtime endpoint
# with lossy optimizations, so the two arrays can be compared element-wise.
tf_opt_lossy_gpu_predictions = get_predictions(
    tf_opt_lossy_gpu_endpoint, f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl"
)

In [None]:
# Mean of the signed differences, scaled by 100 (a percentage, presumably
# because the outputs are probabilities in [0, 1] - TODO confirm).
# Positive and negative deviations cancel out in this figure.
np.average(tf_opt_lossy_gpu_predictions - tf27_gpu_predictions) * 100

In [None]:
# Largest absolute difference between the two prediction arrays, scaled by 100.
np.max(np.abs(tf_opt_lossy_gpu_predictions - tf27_gpu_predictions)) * 100

You can see that, on average, the results differ by less than 0.0016%. In the worst case the difference is 0.05%.

## Cleanup

After you are done, it's safe to remove the endpoints you created and the model you deployed.

In [None]:
def cleanup(endpoint, model_name, deployed_model_id):
    """Undeploy a model, then delete its endpoint and the model itself.

    Each step starts a long-running operation, prints its name, waits for it
    to finish and prints its result before the next step starts.

    Args:
        endpoint: Resource name of the endpoint to undeploy from and delete.
        model_name: Resource name of the model to delete.
        deployed_model_id: ID of the deployed model to undeploy.
    """

    def _wait_and_report(label, operation):
        # Print the operation name first, then block on its result.
        print(label, operation.operation.name)
        print(operation.result())

    _wait_and_report(
        "running undeploy_model operation:",
        endpoint_service_client.undeploy_model(
            endpoint=endpoint, deployed_model_id=deployed_model_id
        ),
    )
    _wait_and_report(
        "running delete_endpoint operation:",
        endpoint_service_client.delete_endpoint(name=endpoint),
    )
    _wait_and_report(
        "running delete_model operation:",
        model_service_client.delete_model(name=model_name),
    )

In [None]:
# Each *_deployed_model variable holds the whole result of the deploy_model
# operation, while undeploy_model needs the deployed model's ID string, so
# pass the ID of the DeployedModel carried in that response.
cleanup(
    tf27_cpu_endpoint, tf27_cpu_model, tf27_cpu_deployed_model.deployed_model.id
)
cleanup(
    tf27_gpu_endpoint, tf27_gpu_model, tf27_gpu_deployed_model.deployed_model.id
)
cleanup(
    tf_opt_gpu_endpoint, tf_opt_gpu_model, tf_opt_gpu_deployed_model.deployed_model.id
)
cleanup(
    tf_opt_lossy_gpu_endpoint,
    tf_opt_lossy_gpu_model,
    tf_opt_lossy_gpu_deployed_model.deployed_model.id,
)

You can now also remove the model from the GCS bucket.

In [None]:
# Set this to True only if you'd like to delete your bucket
delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    !gsutil rm -r $BUCKET_URI