In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden: Google Proprietary Model Image Object Detection

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_proprietary_image_object_detection.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>

  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_proprietary_image_object_detection.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_proprietary_image_object_detection.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a>
  </td>
</table>

**_NOTE_**: This notebook has been tested in the following environment:

* Python version = 3.9

## Overview

This notebook demonstrates how to use Google proprietary image object detection model training/deployment in [Vertex AI Model Garden](https://cloud.google.com/model-garden).

### Objective

* Train new models using Vertex SDK

* Test trained models
  * View the trained model in [Vertex AI Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction)
  * Deploy uploaded models
  * Run predictions

* Cleanup resources

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Dataset

The dataset used for this tutorial is the Salads category of the [OpenImages dataset](https://www.tensorflow.org/datasets/catalog/open_images_v4) from [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview). This dataset does not require any feature engineering. The version of the dataset you will use in this tutorial is stored in a public Cloud Storage bucket. The trained model predicts the bounding box locations and corresponding type of salad items in an image from a class of five items: salad, seafood, tomato, baked goods, or cheese.

## Before you begin

In [None]:
# Install, or upgrade to, the newest release of the Vertex AI SDK for Python.
! pip3 install --upgrade google-cloud-aiplatform

# Automatically restart kernel after installs
import IPython

app = IPython.Application.instance()
# The True argument requests a restart rather than a plain shutdown.
# NOTE(review): if the lines below live in this same cell, the restart may
# interrupt them — confirm the authentication step still runs.
app.kernel.do_shutdown(True)
# Only when running inside Google Colab: authenticate the current user.
if "google.colab" in str(get_ipython()):
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

1. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

In [None]:
import os

from google.cloud import aiplatform

# The project and bucket are for experiments below.
# Both are empty placeholders here and must be filled in before running.
PROJECT_ID = ""  # @param {type:"string"}
BUCKET_URI = ""  # @param {type:"string"}

# You can choose a region from https://cloud.google.com/about/locations.
# Only regions prefixed by "us", "europe", or "asia" are supported.
REGION = "us-central1"  # @param {type:"string"}
# Text before the first "-" of the region, e.g. "us" for "us-central1".
REGION_PREFIX = REGION.split("-")[0]
# Fail fast on an unsupported region, before gcloud or the SDK is configured.
assert REGION_PREFIX in (
    "us",
    "europe",
    "asia",
), f'{REGION} is not supported. It must be prefixed by "us", "europe", or "asia".'

# Point the gcloud CLI at the project chosen above.
! gcloud config set project $PROJECT_ID

# Staging location handed to the SDK: a "temporal" entry under BUCKET_URI.
# NOTE(review): os.path.join uses the host OS path separator; presumably
# BUCKET_URI is a gs:// URI and the notebook runs on a POSIX host — confirm.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

# Set the default project, region and staging bucket for later SDK calls.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

### Define constants

In [None]:
# Short tag for the task; it becomes part of the deployed model's display name.
OBJECTIVE = "iod"

# Dataset constants.
# Prefix of the display name given to the Dataset resource.
DATASET_PREFIX = "dataset-iod"

# Training constants.
# Prefix of the timestamped training job display name.
TRAINING_JOB_PREFIX = "train"
# The image object detection salad dataset used to train the model
DATASET_FILE = "gs://cloud-samples-data/vision/salads.csv"

# Evaluation constants.
# NOTE(review): not referenced by the other cells shown in this notebook; the
# training cell defines its own METRIC_SPEC_KEY — confirm whether it is needed.
EVALUATION_METRIC = "AP50"

# Prefix of the timestamped deployed-model display name.
DEPLOY_JOB_PREFIX = "deploy"

### Define common libraries

In [None]:
import base64
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from google.cloud import aiplatform
from PIL import Image, ImageColor, ImageDraw, ImageFont


def get_job_name_with_datetime(prefix: str):
    """Return ``prefix`` followed by a timestamp of the current local time.

    The appended suffix has the form ``_YYYYMMDD_HHMMSS``.

    Args:
        prefix: Text placed in front of the timestamp.

    Returns:
        The prefix and the timestamp suffix joined into one string.
    """
    timestamp = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return "".join((prefix, timestamp))


def load_img(path):
    """Read a JPEG file and return it as an RGB PIL image.

    Args:
        path: Location of the image file, handed to ``tf.io.read_file``.

    Returns:
        A PIL image in "RGB" mode built from the decoded 3-channel pixels.
    """
    encoded = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(encoded, channels=3)
    pixels = np.uint8(decoded)
    return Image.fromarray(pixels).convert("RGB")


def display_image(image):
    """Show ``image`` on a new 20x15 matplotlib figure with the grid turned off."""
    plt.figure(figsize=(20, 15))
    plt.grid(False)
    plt.imshow(image)


def draw_bounding_box_on_image(
    image, ymin, xmin, ymax, xmax, color, font, thickness=4, display_str_list=()
):
    """Draw one box outline and its text labels onto ``image`` in place.

    Args:
        image: PIL image to draw on; it is modified in place.
        ymin: Top edge; multiplied by the image height.
        xmin: Left edge; multiplied by the image width.
        ymax: Bottom edge; multiplied by the image height.
        xmax: Right edge; multiplied by the image width.
        color: Color of the box outline and of the label backgrounds.
        font: PIL font used to measure and render the label text.
        thickness: Line width of the box outline.
        display_str_list: Label strings. They are drawn stacked from the
            anchor line upwards, the last string nearest the anchor.
    """

    def _text_size(text):
        """Return the (width, height) of ``text`` rendered with ``font``."""
        # ImageFont.getsize() was removed in Pillow 10; getbbox() is the
        # replacement. Keep getsize() as a fallback for fonts/Pillow releases
        # that do not provide getbbox().
        if hasattr(font, "getbbox"):
            bbox = font.getbbox(text)
            return bbox[2], bbox[3]
        return font.getsize(text)

    draw = ImageDraw.Draw(image)
    im_width, im_height = image.size
    (left, right, top, bottom) = (
        xmin * im_width,
        xmax * im_width,
        ymin * im_height,
        ymax * im_height,
    )
    # Closed polyline: the first corner is repeated to finish the rectangle.
    draw.line(
        [(left, top), (left, bottom), (right, bottom), (right, top), (left, top)],
        width=thickness,
        fill=color,
    )

    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    display_str_heights = [_text_size(ds)[1] for ds in display_str_list]
    # Each display_str has a top and bottom margin of 0.05x.
    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

    if top > total_display_str_height:
        text_bottom = top
    else:
        text_bottom = top + total_display_str_height
    # Reverse list and print from bottom to top.
    for display_str in display_str_list[::-1]:
        text_width, text_height = _text_size(display_str)
        margin = np.ceil(0.05 * text_height)
        # Filled background rectangle behind the label text.
        draw.rectangle(
            [
                (left, text_bottom - text_height - 2 * margin),
                (left + text_width, text_bottom),
            ],
            fill=color,
        )
        draw.text(
            (left + margin, text_bottom - text_height - margin),
            display_str,
            fill="black",
            font=font,
        )
        # NOTE(review): the step is text_height - 2 * margin although each
        # label is text_height + 2 * margin tall, so stacked labels overlap
        # slightly. Left unchanged to keep the existing rendering — confirm.
        text_bottom -= text_height - 2 * margin


def draw_boxes(image, boxes, class_names, scores, max_boxes=40, min_score=0.05):
    """Draw labeled detection boxes on ``image`` and return the image.

    Args:
        image: PIL image that the boxes are drawn onto.
        boxes: Sequence of ``(ymin, xmin, ymax, xmax)`` entries.
        class_names: Label for each box, indexed like ``boxes``.
        scores: Score for each box, indexed like ``boxes``.
        max_boxes: Only the first ``max_boxes`` entries are considered.
        min_score: Entries scoring below this value are not drawn.

    Returns:
        The same ``image`` object that was passed in.
    """
    palette = list(ImageColor.colormap.values())
    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/liberation/LiberationSansNarrow-Regular.ttf", 25
        )
    except OSError:
        print("Font not found, using default font.")
        font = ImageFont.load_default()

    candidate_count = min(len(boxes), max_boxes)
    for idx in range(candidate_count):
        score = scores[idx]
        if not (score >= min_score):
            continue
        ymin, xmin, ymax, xmax = boxes[idx]
        label = class_names[idx]
        # Caption shows the class name and the score as a whole percentage.
        caption = "{}: {}%".format(label, int(100 * score))
        # The palette entry is selected from the hash of the class name.
        box_color = palette[hash(label) % len(palette)]
        draw_bounding_box_on_image(
            image,
            ymin,
            xmin,
            ymax,
            xmax,
            box_color,
            font,
            display_str_list=[caption],
        )
    return image

## Create a dataset

This tutorial uses a version of the Salads dataset that is stored in a public Cloud Storage bucket, using a CSV index file.

Start by doing a quick peek at the data. You count the number of examples by counting the number of rows in the CSV index file  (`wc -l`) and then peek at the first few rows.

In [None]:
# Count the rows of the CSV index file. The shell output is captured into
# `count`, whose first entry holds the number printed by `wc -l`.
count = ! gsutil cat $DATASET_FILE | wc -l
print("Number of Examples", int(count[0]))

print("First 10 rows")
# Stream the index file and show only its leading rows.
! gsutil cat $DATASET_FILE | head

Next, create the `Dataset` resource using the `create` method for the `ImageDataset` class, which takes the following parameters:

- `display_name`: The human readable name for the `Dataset` resource.
- `gcs_source`: A list of one or more dataset index files to import the data items into the `Dataset` resource.
- `import_schema_uri`: The data labeling schema for the data items.

This operation may take several minutes.

In [None]:
# Import the Salads CSV index file into an image dataset, using the
# bounding-box import schema.
dataset = aiplatform.ImageDataset.create(
    display_name=DATASET_PREFIX + "_salads",
    gcs_source=[DATASET_FILE],
    import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
)

# Print the resource name of the dataset.
print(dataset.resource_name)

## Train new models

### Create and run training pipeline

To train an AutoML model, you perform two steps:
1.  Create a training pipeline.
2.  Run the pipeline.

#### Create training pipeline

An AutoML training pipeline is created with the `AutoMLImageTrainingJob` class, with the following parameters:

- `display_name`: The human readable name for the `TrainingJob` resource.
- `prediction_type`: The type of task to train the model for.
  - `classification`: An image classification model.
  - `object_detection`: An image object detection model.
- `model_type`: The type of model for deployment. For image object detection, we currently support the following:
  - `SPINENET`: A model that is available in Vertex Model Garden image object detection training with customizable hyperparameters. Best tailored to be used within Google Cloud, and cannot be exported externally.
  - `YOLO`: A model that is available in Vertex Model Garden image object detection training with customizable hyperparameters. Best tailored to be used within Google Cloud, and cannot be exported externally.
- `checkpoint_name`: Optional. The field is reserved for Model Garden model training, based on the provided pre-trained model checkpoint.
- `trainer_config`: Optional. The field is usually used together with the Model Garden model training, when passing the customized configs for the trainer. `anchor_size` cannot be used with `YOLO`.

  Example with all supported parameters:
```
  trainer_config = {
    'global_batch_size': '8',
    'learning_rate': '0.001',
    'optimizer_type': 'sgd',
    'optimizer_momentum': '0.9',
    'train_steps': '10000',
    'accelerator_count': '2',
    'anchor_size': '8',
  }
```
  The global_batch_size should be divisible by accelerator_count.
  Supported values for optimizer_type are 'sgd', 'adam', 'adamw', 'lamb', 'rmsprop', 'lars', 'adagrad', and 'slide'.
  Supported values for accelerator_count are '2', '4', and '8'.
- `metric_spec`: Dictionary representing metrics to optimize. The dictionary key is the metric_id, which is reported by your training job, with possible values being ('loss', 'AP50'), and the dictionary value is the optimization goal of the metric ('minimize' or 'maximize').
For example:  `metric_spec = {'loss': 'minimize', 'AP50': 'maximize'}`
- `parameter_spec`: Dictionary representing parameters to optimize. The dictionary key is the `parameter_id` (for example `learning_rate`), which is passed into your training job as a command line keyword argument, and the dictionary value is the parameter
specification of that parameter. Supported parameter specifications can be found in `aiplatform.hyperparameter_tuning`.
```
  from google.cloud.aiplatform import hyperparameter_tuning as hpt

  parameter_spec = {
    'learning_rate': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
  }
```
- `search_algorithm`: The search algorithm specified for the Study. Accepts one of the following:
  - `None`: If you do not specify an algorithm, your job uses the default
  Vertex AI algorithm. The default algorithm applies Bayesian optimization
  to arrive at the optimal solution with a more effective search over the
  parameter space.
  - `grid`: A simple grid search within the feasible space. This option is
  particularly useful if you want to specify a quantity of trials that is greater than the number of points in the feasible space. In such cases, if you do not specify a grid search, the Vertex AI default algorithm may generate duplicate suggestions. To use grid search, all parameter specs must be of type `IntegerParameterSpec`, `CategoricalParameterSpec`, or `DiscreteParameterSpec`.
  - `random`: A simple random search within the feasible space.
- `measurement_selection`: This indicates which measurement to use
if/when the service automatically selects the final measurement from
previously reported intermediate measurements.
  Accepts: `best`, `last` Choose this based on two considerations:
    - A): Do you expect your measurements to monotonically improve? If so,
    choose `last`. On the other hand, if you're in a situation where
    your system can **over-train** and you expect the performance to get
    better for a while but then start declining, choose `best`.
    - B): Are your measurements significantly noisy and/or irreproducible? If
    so, `best` will tend to be over-optimistic, and it may be better
    to choose `last`. If both or neither of (A) and (B) apply, it
    doesn't matter which selection type is chosen.


In [None]:
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Fixed trainer settings passed to the job as `trainer_config`; every value is
# supplied as a string.
TRAINER_CONFIG = {
    "global_batch_size": "8",
    "learning_rate": "0.001",
    "train_steps": "10000",
    "accelerator_count": "2",
}

# Metric to optimize and the optimization goal for it.
METRIC_SPEC_KEY = "AP50"
METRIC_SPEC_VALUE = "maximize"

# Hyperparameter study settings.
SEARCH_ALGORITHM = "random"
MEASUREMENT_SELECTION = "best"

MODEL_TYPE = "SPINENET"  # @param {type:"string"} one of the values ["SPINENET", "YOLO"]

# Search space for tuning; the tuned parameters depend on the model type.
if MODEL_TYPE != "YOLO":
    PARAMETER_SPEC = {
        "learning_rate": hpt.DiscreteParameterSpec(
            values=[0.001, 0.01],
            scale="linear",
        ),
        "anchor_size": hpt.DiscreteParameterSpec(
            values=[2, 4],
            scale="reverse_log",
        ),
    }
else:
    PARAMETER_SPEC = {
        "learning_rate": hpt.DiscreteParameterSpec(
            values=[0.001, 0.1],
            scale="linear",
        ),
        "weight_decay": hpt.DiscreteParameterSpec(
            values=[0.0001, 0.001],
            scale="linear",
        ),
    }

# Build the training job definition from the settings above.
job = aiplatform.AutoMLImageTrainingJob(
    display_name=get_job_name_with_datetime(TRAINING_JOB_PREFIX),
    prediction_type="object_detection",
    model_type=MODEL_TYPE,
    base_model=None,
    trainer_config=TRAINER_CONFIG,
    metric_spec={METRIC_SPEC_KEY: METRIC_SPEC_VALUE},
    parameter_spec=PARAMETER_SPEC,
    search_algorithm=SEARCH_ALGORITHM,
    measurement_selection=MEASUREMENT_SELECTION,
)

print(job)

#### Run the training pipeline

Next, run the DAG to start the training job by invoking the method `run`, with the following parameters:

- `dataset`: The `Dataset` resource to train the model.
- `model_display_name`: The human readable name for the trained model.
- `training_fraction_split`: The percentage of the dataset to use for training.
- `test_fraction_split`: The percentage of the dataset to use for test (holdout data).
- `validation_fraction_split`: The percentage of the dataset to use for validation.
- `budget_milli_node_hours`: (optional) Maximum training budget, specified in milli node hours (1,000 = 1 node hour).
- `disable_early_stopping`: If `True`, the entire budget is used. If `False`, early stopping stays enabled and training may be completed before using the entire budget if the service believes it cannot further improve on the model objective measurements.

The `run` method when completed returns the `Model` resource.

The execution of the training pipeline will take up to 60 minutes.

In [None]:
# Train on the dataset with an 80/10/10 training/validation/test split and
# keep the returned model for the deployment and prediction cells.
model = job.run(
    dataset=dataset,
    model_display_name=get_job_name_with_datetime("salads"),
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    # Training budget, expressed in milli node hours.
    budget_milli_node_hours=20000,
    # Early stopping is left enabled.
    disable_early_stopping=False,
)

print("Model is: ", model)

## Test trained models
This section shows how to test the trained models.
1. Deploy models from Model Registry
2. Run online predictions

In [None]:
# @title Deploy model from Model Registry
# Model does not support dedicated deployment resources.
# An n1-standard-4 machine with 1 P100 GPU will be used.

# Timestamped display name built from the deploy prefix and the objective tag.
deploy_model_name = get_job_name_with_datetime(
    "_".join([DEPLOY_JOB_PREFIX, OBJECTIVE])
)
print("The deployed job name is: ", deploy_model_name)

# Deploy with exactly one replica (min = max = 1) receiving all the traffic;
# the call returns the endpoint used by the prediction and cleanup cells.
endpoint = model.deploy(
    deployed_model_display_name=deploy_model_name,
    traffic_split={"0": 100},
    min_replica_count=1,
    max_replica_count=1,
)

endpoint_id = endpoint.name
print("endpoint id is: ", endpoint_id)

In [None]:
# @title Run online predictions

# test image file path from a Cloud Storage bucket
test_filepath = ""  # @param {type:"string"}

# Read the raw bytes of the test image.
with tf.io.gfile.GFile(test_filepath, "rb") as f:
    content = f.read()

# The format of each instance should conform to the deployed model's prediction input schema.
# The image bytes are sent as base64 text under the "content" key.
encoded_content = base64.b64encode(content).decode("utf-8")
instances = [{"content": encoded_content}]

prediction = endpoint.predict(instances=instances)

# Show the test image, then print the raw prediction response.
img = load_img(test_filepath)
display_image(img)
print(prediction)

# Run batch predictions
Now that your Model resource is trained, you can make a batch prediction by invoking the `batch_predict()` method, with the following parameters:

* `job_display_name`: The human readable name for the batch prediction job.
* `gcs_source`: A jsonl file path from a Cloud Storage bucket, with a list of one or more images.
* `gcs_destination_prefix`: The Cloud Storage location for storing the batch prediction results.
* `sync`: If set to `True`, the call blocks while waiting for the asynchronous batch job to complete.

In [None]:
# A jsonl file path from a Cloud Storage bucket, with all the to-be-predicted images.
gcs_source = ""  # @param {type:"string"}

# BUCKET_URI may already carry the "gs://" scheme. Add the scheme only when it
# is missing so the destination never becomes "gs://gs://<bucket>".
if BUCKET_URI.startswith("gs://"):
    gcs_destination_prefix = BUCKET_URI
else:
    gcs_destination_prefix = f"gs://{BUCKET_URI}"

# Start the batch prediction job without blocking (sync=False).
batch_predict_job = model.batch_predict(
    job_display_name=get_job_name_with_datetime("salads_bp"),
    gcs_source=gcs_source,
    gcs_destination_prefix=gcs_destination_prefix,
    sync=False,
)
print(batch_predict_job)

# Wait for the batch prediction job to finish
batch_predict_job.wait()


# Get the batch prediction results
import json

import tensorflow as tf

bp_iter_outputs = batch_predict_job.iter_outputs()

# Keep only the output objects whose file name starts with "prediction".
prediction_results = [
    blob.name
    for blob in bp_iter_outputs
    if blob.name.split("/")[-1].startswith("prediction")
]

# Print the first record of every prediction file as a quick sanity check.
for prediction_result in prediction_results:
    gfile_name = f"gs://{bp_iter_outputs.bucket.name}/{prediction_result}"
    with tf.io.gfile.GFile(name=gfile_name, mode="r") as gfile:
        # Only one line is needed, so avoid loading the whole file.
        first_line = gfile.readline()
    # An empty result file has no record to parse.
    if first_line.strip():
        print(json.loads(first_line))

## Clean up

In [None]:
# Each resource is deleted only if the cell that created it was actually run,
# so this cell is safe to execute after a partial run of the notebook.

# Delete the dataset.
if "dataset" in globals():
    dataset.delete()

# Undeploy model and delete endpoint.
if "endpoint" in globals():
    endpoint.undeploy_all()
    endpoint.delete(force=True)

# Delete models.
if "model" in globals():
    model.delete()

# Delete the batch prediction job.
# The guard has to test the name the job is bound to (`batch_predict_job`);
# testing any other name would silently skip the deletion.
if "batch_predict_job" in globals():
    batch_predict_job.delete()