In [None]:
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Detectron2

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_detectron2.ipynb">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_detectron2.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_detectron2.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates finetuning Detectron2-based [Faster R-CNN](https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md#faster-r-cnn) and
[RetinaNet](https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md#retinanet)
models for the object detection task and [Mask R-CNN](https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md#coco-instance-segmentation-baselines-with-mask-r-cnn) for the instance segmentation task, and then deploying them on Vertex AI for online prediction. This notebook assumes that the input training data is in [COCO format](https://opencv.org/introduction-to-the-coco-dataset/). If you do not have your own dataset, this notebook also shows how to download and prepare the Balloon dataset for training.

### Objective

- Finetune a Detectron2 based Faster R-CNN, RetinaNet, or Mask R-CNN model.
- Upload the model to [Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction).
- Deploy the model on [Endpoint](https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints).
- Run online predictions for image object detection and segmentation.

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Install the necessary packages

# Install libraries for COCO format conversion of datasets: pycocotools
# supplies the mask/RLE helpers and OpenCV is used to read image dimensions.
# Both versions are pinned so the conversion behaves the same on every run.
!pip install --upgrade --quiet pycocotools==2.0.6
!pip install --upgrade --quiet opencv-python==4.7.0.72

Restart the notebook kernel after installs.

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. For finetuning, **[click here](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Frestricted_image_training_nvidia_a100_80gb_gpus)** to check if your project already has the required 8 Nvidia A100 80 GB GPUs in the us-central1 region. If yes, then run this notebook in the us-central1 region. If you do not have 8 Nvidia A100 80 GB GPUs or need more GPUs than this, then schedule your job with Nvidia H100 GPUs via Dynamic Workload Scheduler using [these instructions](https://cloud.google.com/vertex-ai/docs/training/schedule-jobs-dws). For Dynamic Workload Scheduler, check the [us-central1](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_h100_gpus) or [europe-west4](https://console.cloud.google.com/iam-admin/quotas?location=europe-west4&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_h100_gpus) quota for Nvidia H100 GPUs. If you do not have enough GPUs, then you can follow [these instructions](https://cloud.google.com/docs/quotas/view-manage#viewing_your_quota_console) to request quota.

# @markdown 3. For serving, **[click here](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_l4_gpus)** to check if your project already has the required 1 L4 GPU in the us-central1 region.  If yes, then run this notebook in the us-central1 region. If you need more L4 GPUs for your project, then you can follow [these instructions](https://cloud.google.com/docs/quotas/view-manage#viewing_your_quota_console) to request more. Alternatively, if you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# @markdown 4. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 5. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

import base64
import datetime
import importlib
# Import the necessary packages
import json
import os
import uuid

import cv2
from google.cloud import aiplatform

# Upgrade Vertex AI SDK.
# NOTE(review): `aiplatform` was already imported above, so the upgraded
# version presumably only takes effect after a kernel restart — confirm.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.103.0'

# Outside Colab Enterprise, TensorFlow is upgraded as well.
if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
# Fetch the samples repository that ships the shared notebook utilities.
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

# The clone directory name contains hyphens, so the helper module cannot be
# named in a regular `import` statement; it is loaded by dotted path instead.
common_util = importlib.import_module(
    "vertex-ai-samples.notebooks.community.model_garden.docker_source_codes.notebook_util.common_util"
)

# Empty registries; NOTE(review): presumably filled in by later cells.
models, endpoints = {}, {}


# Get the default cloud project id.
# Raises KeyError when GOOGLE_CLOUD_PROJECT is not set in the environment.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
# An explicit REGION from the form wins; otherwise fall back to the
# GOOGLE_CLOUD_REGION environment variable and fail if that is empty too.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "detectron2")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"
import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

In [None]:
# @title Define helper functions and constants

# The pre-built training docker image. It contains training scripts and models.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-detectron2-train"
# The pre-built serving docker image. It contains serving scripts and models.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-detectron2-serve"

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type); uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}


def gcs_fuse_path(path: str) -> str:
    """Maps a `gs://` URI to its `/gcs/` path; other paths are only stripped.

    Args:
      path: A `gs://bucket/...` URI or any other path string.

    Returns:
      The whitespace-stripped path, with a leading `gs://` replaced by `/gcs/`.
    """
    gcs_scheme = "gs://"
    cleaned = path.strip()
    if not cleaned.startswith(gcs_scheme):
        return cleaned
    return "/gcs/" + cleaned[len(gcs_scheme):]


def deploy_model(
    project: str,
    location: str,
    display_name: str,
    serving_container_image_uri: str,
    model_pth_file: str,
    model_cfg_yaml_file: str,
    publisher_model_id: str,
    service_account: str,
    test_threshold: float = 0.5,
    use_dedicated_endpoint: bool = True,
):
    """Creates an endpoint, uploads a Detectron2 model and deploys it there.

    Args:
      project: Not read by this function.
      location: Not read by this function.
      display_name: Display name used for both the endpoint and the model.
      serving_container_image_uri: URI of the serving docker image.
      model_pth_file: Value of the `MODEL_PTH_FILE` serving environment variable.
      model_cfg_yaml_file: Value of the `CONFIG_YAML_FILE` serving environment
        variable.
      publisher_model_id: Appended to `publishers/google/models/` to form the
        Model Garden source model name.
      service_account: Service account passed to the deployment.
      test_threshold: Value of the `TEST_THRESHOLD` serving environment variable.
      use_dedicated_endpoint: Whether the endpoint is created as a dedicated
        endpoint.

    Returns:
      The endpoint created by this call.
    """
    # The endpoint is created first, then the model is uploaded and deployed.
    target_endpoint = aiplatform.Endpoint.create(
        display_name=display_name,
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    source_model_name = f"publishers/google/models/{publisher_model_id}"
    uploaded_model = aiplatform.Model.upload(
        display_name=display_name,
        serving_container_image_uri=serving_container_image_uri,
        serving_container_ports=[7080],
        serving_container_predict_route="/predictions/detectron2_serving",
        serving_container_health_route="/ping",
        serving_container_environment_variables={
            "MODEL_PTH_FILE": model_pth_file,
            "CONFIG_YAML_FILE": model_cfg_yaml_file,
            "TEST_THRESHOLD": test_threshold,
            "DEPLOY_SOURCE": "notebook",
        },
        model_garden_source_model_name=source_model_name,
    )

    # Labels recording which notebook and environment triggered the deployment.
    deployment_labels = {
        "NOTEBOOK_NAME": "model_garden_pytorch_detectron2.ipynb",
        "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
    }
    uploaded_model.deploy(
        endpoint=target_endpoint,
        machine_type="n1-highmem-16",
        deploy_request_timeout=1800,
        service_account=service_account,
        system_labels=deployment_labels,
    )

    print(uploaded_model.display_name)
    print(uploaded_model.resource_name)
    return target_endpoint


# Prediction


def get_prediction_instances(local_test_filepath):
    """Builds a one-element prediction request from a local file.

    Args:
      local_test_filepath: Path of the local file to send.

    Returns:
      A list with a single instance of the form `{"data": {"b64": <text>}}`,
      where the text is the base64 encoding of the file's bytes.
    """
    with open(local_test_filepath, "rb") as image_file:
        raw_bytes = image_file.read()
    b64_text = base64.b64encode(raw_bytes).decode("utf-8")
    return [{"data": {"b64": b64_text}}]


import numpy as np
# Mask encoding related
import pycocotools.mask as mask_util


def decode_rle_masks(pred_masks_rle):
    """Decodes each RLE mask with pycocotools and stacks them along a new first axis.

    Args:
      pred_masks_rle: Iterable of RLE-encoded masks accepted by
        `pycocotools.mask.decode`.

    Returns:
      A numpy array holding one decoded mask per input entry. `np.stack` raises
      ValueError when the input is empty.
    """
    decoded_masks = []
    for rle in pred_masks_rle:
        decoded_masks.append(mask_util.decode(rle))
    return np.stack(decoded_masks)


import collections

# Visualization
from PIL import Image, ImageColor, ImageDraw, ImageFont


def load_img(local_path):
    """Opens the image at `local_path` and returns it converted to RGB mode."""
    opened = Image.open(local_path)
    return opened.convert("RGB")


def draw_bounding_box_on_image_array(
    image,
    ymin,
    xmin,
    ymax,
    xmax,
    color="red",
    thickness=4,
    display_str_list=(),
    use_normalized_coordinates=True,
):
    """Draws one labeled bounding box onto a numpy image, in place.

    The array is rendered through a temporary PIL image and the result is
    copied back into `image`; nothing is returned.

    Args:
      image: a numpy array with shape [height, width, 3].
      ymin: ymin of bounding box.
      xmin: xmin of bounding box.
      ymax: ymax of bounding box.
      xmax: xmax of bounding box.
      color: color to draw bounding box. Default is red.
      thickness: line thickness. Default value is 4.
      display_str_list: list of strings to display in box
                        (each to be shown on its own line).
      use_normalized_coordinates: If True (default), treat coordinates
        ymin, xmin, ymax, xmax as relative to the image.  Otherwise treat
        coordinates as absolute.
    """
    pil_canvas = Image.fromarray(np.uint8(image)).convert("RGB")
    draw_bounding_box_on_image(
        pil_canvas,
        ymin,
        xmin,
        ymax,
        xmax,
        color=color,
        thickness=thickness,
        display_str_list=display_str_list,
        use_normalized_coordinates=use_normalized_coordinates,
    )
    # Write the rendered pixels back into the caller's array.
    rendered = np.array(pil_canvas)
    np.copyto(image, rendered)


def get_font_size(font: ImageFont.FreeTypeFont, text: str):
    """Returns the `(width, height)` of `text` from the font's bounding box."""
    x0, y0, x1, y1 = font.getbbox(text)
    width = x1 - x0
    height = y1 - y0
    return width, height


def draw_bounding_box_on_image(
    image,
    ymin,
    xmin,
    ymax,
    xmax,
    color="red",
    thickness=4,
    display_str_list=(),
    use_normalized_coordinates=True,
):
    """Adds a bounding box to an image.

    Bounding box coordinates can be specified in either absolute (pixel) or
    normalized coordinates by setting the use_normalized_coordinates argument.

    Each string in display_str_list is displayed on a separate line above the
    bounding box in black text on a rectangle filled with the input 'color'.
    If there is not enough room above the bounding box for all the strings,
    they are displayed below the bounding box instead.

    The image is modified in place; nothing is returned.

    Args:
      image: a PIL.Image object.
      ymin: ymin of bounding box.
      xmin: xmin of bounding box.
      ymax: ymax of bounding box.
      xmax: xmax of bounding box.
      color: color to draw bounding box. Default is red.
      thickness: line thickness. Default value is 4.
      display_str_list: list of strings to display in box
                        (each to be shown on its own line).
      use_normalized_coordinates: If True (default), treat coordinates
        ymin, xmin, ymax, xmax as relative to the image.  Otherwise treat
        coordinates as absolute.
    """
    draw = ImageDraw.Draw(image)
    im_width, im_height = image.size
    if use_normalized_coordinates:
        (left, right, top, bottom) = (
            xmin * im_width,
            xmax * im_width,
            ymin * im_height,
            ymax * im_height,
        )
    else:
        (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
    draw.line(
        [(left, top), (left, bottom), (right, bottom), (right, top), (left, top)],
        width=thickness,
        fill=color,
    )
    # Fall back to PIL's built-in font when Arial is not available.
    try:
        font = ImageFont.truetype("arial.ttf", 24)
    except OSError:
        font = ImageFont.load_default()

    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    display_str_heights = [get_font_size(font, ds)[1] for ds in display_str_list]
    # Each display_str has a top and bottom margin of 0.05x.
    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

    if top > total_display_str_height:
        text_bottom = top
    else:
        text_bottom = bottom + total_display_str_height
    # Reverse list and print from bottom to top.
    for display_str in display_str_list[::-1]:
        text_width, text_height = get_font_size(font, display_str)
        margin = np.ceil(0.05 * text_height)
        draw.rectangle(
            [
                (left, text_bottom - text_height - 2 * margin),
                (left + text_width, text_bottom),
            ],
            fill=color,
        )
        draw.text(
            (left + margin, text_bottom - text_height - margin),
            display_str,
            fill="black",
            font=font,
        )
        # Move up by the full height of the rectangle just drawn (text plus
        # top and bottom margins) so the next label sits directly above it
        # instead of overlapping it.
        text_bottom -= text_height + 2 * margin


def draw_mask_on_image_array(image, mask, color="red", alpha=0.4):
    """Blends a binary mask onto an image in place.

    Args:
      image: uint8 numpy array with shape (img_height, img_width, 3)
      mask: a uint8 numpy array of shape (img_height, img_width) whose
        values are either 0 or 1.
      color: color to draw the mask with. Default is red.
      alpha: transparency value between 0 and 1. (default: 0.4)

    Raises:
      ValueError: On incorrect data type for image or masks, on mask values
        other than 0 and 1, or when the spatial dimensions do not match.
    """
    # Validate inputs before touching any pixels.
    if image.dtype != np.uint8:
        raise ValueError("`image` not of type np.uint8")
    if mask.dtype != np.uint8:
        raise ValueError("`mask` not of type np.uint8")
    has_invalid_values = np.any((mask != 0) & (mask != 1))
    if has_invalid_values:
        raise ValueError("`mask` elements should be in [0, 1]")
    image_hw = image.shape[:2]
    if image_hw != mask.shape:
        raise ValueError(
            "The image has spatial dimensions %s but the mask has "
            "dimensions %s" % (image_hw, mask.shape)
        )

    # A solid-color layer the size of the image, blended in through the mask.
    rgb_row = np.reshape(list(ImageColor.getrgb(color)), [1, 1, 3])
    color_layer = np.ones_like(mask)[:, :, np.newaxis] * rgb_row
    pil_color_layer = Image.fromarray(np.uint8(color_layer)).convert("RGBA")

    # The mask scaled by alpha controls how much of the color layer shows.
    pil_blend_mask = Image.fromarray(np.uint8(255.0 * alpha * mask)).convert("L")

    pil_base = Image.fromarray(image)
    blended = Image.composite(pil_color_layer, pil_base, pil_blend_mask)
    np.copyto(image, np.array(blended.convert("RGB")))


# Palette of color names used when drawing detections: the visualization code
# below picks an entry by class id modulo the length of this list, so classes
# beyond the palette size wrap around and reuse colors.
STANDARD_COLORS = [
    "AliceBlue",
    "Chartreuse",
    "Aqua",
    "Aquamarine",
    "Azure",
    "Beige",
    "Bisque",
    "BlanchedAlmond",
    "BlueViolet",
    "BurlyWood",
    "CadetBlue",
    "AntiqueWhite",
    "Chocolate",
    "Coral",
    "CornflowerBlue",
    "Cornsilk",
    "Crimson",
    "Cyan",
    "DarkCyan",
    "DarkGoldenRod",
    "DarkGrey",
    "DarkKhaki",
    "DarkOrange",
    "DarkOrchid",
    "DarkSalmon",
    "DarkSeaGreen",
    "DarkTurquoise",
    "DarkViolet",
    "DeepPink",
    "DeepSkyBlue",
    "DodgerBlue",
    "FireBrick",
    "FloralWhite",
    "ForestGreen",
    "Fuchsia",
    "Gainsboro",
    "GhostWhite",
    "Gold",
    "GoldenRod",
    "Salmon",
    "Tan",
    "HoneyDew",
    "HotPink",
    "IndianRed",
    "Ivory",
    "Khaki",
    "Lavender",
    "LavenderBlush",
    "LawnGreen",
    "LemonChiffon",
    "LightBlue",
    "LightCoral",
    "LightCyan",
    "LightGoldenRodYellow",
    "LightGray",
    "LightGrey",
    "LightGreen",
    "LightPink",
    "LightSalmon",
    "LightSeaGreen",
    "LightSkyBlue",
    "LightSlateGray",
    "LightSlateGrey",
    "LightSteelBlue",
    "LightYellow",
    "Lime",
    "LimeGreen",
    "Linen",
    "Magenta",
    "MediumAquaMarine",
    "MediumOrchid",
    "MediumPurple",
    "MediumSeaGreen",
    "MediumSlateBlue",
    "MediumSpringGreen",
    "MediumTurquoise",
    "MediumVioletRed",
    "MintCream",
    "MistyRose",
    "Moccasin",
    "NavajoWhite",
    "OldLace",
    "Olive",
    "OliveDrab",
    "Orange",
    "OrangeRed",
    "Orchid",
    "PaleGoldenRod",
    "PaleGreen",
    "PaleTurquoise",
    "PaleVioletRed",
    "PapayaWhip",
    "PeachPuff",
    "Peru",
    "Pink",
    "Plum",
    "PowderBlue",
    "Purple",
    "Red",
    "RosyBrown",
    "RoyalBlue",
    "SaddleBrown",
    "Green",
    "SandyBrown",
    "SeaGreen",
    "SeaShell",
    "Sienna",
    "Silver",
    "SkyBlue",
    "SlateBlue",
    "SlateGray",
    "SlateGrey",
    "Snow",
    "SpringGreen",
    "SteelBlue",
    "GreenYellow",
    "Teal",
    "Thistle",
    "Tomato",
    "Turquoise",
    "Violet",
    "Wheat",
    "White",
    "WhiteSmoke",
    "Yellow",
    "YellowGreen",
]


def visualize_boxes_and_labels_on_image_array(
    image,
    boxes,
    classes,
    scores,
    category_index,
    instance_masks=None,
    use_normalized_coordinates=False,
    max_boxes_to_draw=20,
    min_score_thresh=0.5,
    agnostic_mode=False,
    line_thickness=4,
    groundtruth_box_visualization_color="black",
    skip_scores=False,
    skip_labels=False,
):
    """Overlays labeled boxes (and optional masks) on an image.

    Boxes with identical coordinates are grouped: their display strings are
    collected together and drawn on a single box. The image is modified in
    place and the same array is returned.

    Args:
      image: uint8 numpy array with shape (img_height, img_width, 3)
      boxes: a numpy array of shape [N, 4]; each row is unpacked as
        (xmin, ymin, xmax, ymax).
      classes: a numpy array of shape [N] of class indices that are looked up
        in `category_index`.
      scores: a numpy array of shape [N] or None.  If scores=None, then
        this function assumes that the boxes to be plotted are groundtruth
        boxes and plots all boxes in the groundtruth color with no classes or
        scores.
      category_index: a dict containing category dictionaries (each holding
        at least the category name under `name`) keyed by category indices.
      instance_masks: a numpy array of shape [N, image_height, image_width] with
        values of 0 or 1, can be None.
      use_normalized_coordinates: whether boxes is to be interpreted as
        normalized coordinates or not.
      max_boxes_to_draw: maximum number of boxes to visualize.  If None, draw
        all boxes.
      min_score_thresh: a box is drawn only if its score is strictly greater
        than this threshold.
      agnostic_mode: boolean (default: False) controlling whether to evaluate in
        class-agnostic mode or not.  This mode will display scores but ignore
        classes.
      line_thickness: integer (default: 4) controlling line width of the boxes.
      groundtruth_box_visualization_color: box color for visualizing groundtruth
        boxes
      skip_scores: whether to skip score when drawing a single detection
      skip_labels: whether to skip label when drawing a single detection

    Returns:
      uint8 numpy array with shape (img_height, img_width, 3) with overlaid boxes.
    """
    # Per-box display strings, color and mask, keyed by the box coordinates.
    labels_by_box = collections.defaultdict(list)
    color_by_box = collections.defaultdict(str)
    mask_by_box = {}

    num_boxes = boxes.shape[0]
    draw_limit = max_boxes_to_draw if max_boxes_to_draw else num_boxes
    is_groundtruth = scores is None

    for idx in range(min(draw_limit, num_boxes)):
        # Detections at or below the threshold are not drawn.
        if not is_groundtruth and not (scores[idx] > min_score_thresh):
            continue

        box_key = tuple(boxes[idx].tolist())
        if instance_masks is not None:
            mask_by_box[box_key] = instance_masks[idx]

        if is_groundtruth:
            color_by_box[box_key] = groundtruth_box_visualization_color
            continue

        label = ""
        if not skip_labels and not agnostic_mode:
            if classes[idx] in category_index.keys():
                class_name = category_index[classes[idx]]["name"]
            else:
                class_name = "N/A"
            label = str(class_name)
        if not skip_scores:
            percent = int(100 * scores[idx])
            if label:
                label = "{}: {}%".format(label, percent)
            else:
                label = "{}%".format(percent)
        labels_by_box[box_key].append(label)

        if agnostic_mode:
            color_by_box[box_key] = "DarkOrange"
        else:
            palette_slot = classes[idx] % len(STANDARD_COLORS)
            color_by_box[box_key] = STANDARD_COLORS[palette_slot]

    # Draw all boxes onto image.
    for box_key, box_color in color_by_box.items():
        # Using Detectron2 style output.
        xmin, ymin, xmax, ymax = box_key
        if instance_masks is not None:
            draw_mask_on_image_array(image, mask_by_box[box_key], color=box_color)
        draw_bounding_box_on_image_array(
            image,
            ymin,
            xmin,
            ymax,
            xmax,
            color=box_color,
            thickness=line_thickness,
            display_str_list=labels_by_box[box_key],
            use_normalized_coordinates=use_normalized_coordinates,
        )

    return image

## Optional - Download a sample dataset

In [None]:
# @title Download the balloon dataset

# @markdown This step is only necessary if you don't have your own dataset and wish to use the Balloon dataset for demonstration purposes.

# @markdown In case you are using your own dataset, kindly convert it to [COCO format](https://opencv.org/introduction-to-the-coco-dataset/).

# Fetch the dataset archive and unpack it; unzip's file listing is discarded.
!wget https://github.com/matterport/Mask_RCNN/releases/download/v2.1/balloon_dataset.zip
!unzip balloon_dataset.zip > /dev/null

# Local dataset directory; later cells read its `train` and `val` subfolders.
# NOTE(review): presumably the archive extracts to `balloon/` — confirm.
local_balloon_data_directory = "balloon"
# Destination prefix inside the experiment bucket for the uploaded dataset.
BALLOON_DATA_GCS_PATH = os.path.join(BUCKET_URI, "balloon_dataset")

In [None]:
# @title Convert Balloon data to COCO format


def save_coco_format_json(img_dir, output_coco_format_json_filename):
    """Converts one Balloon dataset split to a COCO-format JSON file.

    Reads `via_region_data.json` from `img_dir`, builds the COCO `categories`,
    `images` and `annotations` sections (single category: balloon) and writes
    the result to `output_coco_format_json_filename` inside `img_dir`.

    Args:
      img_dir: Directory holding the images and `via_region_data.json`.
      output_coco_format_json_filename: Name of the JSON file to create in
        `img_dir`.

    Raises:
      FileNotFoundError: If an image listed in the annotations cannot be read.
    """
    # Load original balloon data json file
    json_file = os.path.join(img_dir, "via_region_data.json")
    with open(json_file) as f:
        imgs_anns = json.load(f)

    output_coco_format_dict = {}
    # We only have one class: balloon.
    output_coco_format_dict["categories"] = [{"id": 0, "name": "balloon"}]
    output_coco_format_dict["images"] = []
    output_coco_format_dict["annotations"] = []
    annotation_idx = 0
    for image_idx, v in enumerate(imgs_anns.values()):
        filename = os.path.join(img_dir, v["filename"])
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; fail with a clear message rather than on `.shape`.
        image = cv2.imread(filename)
        if image is None:
            raise FileNotFoundError("Could not read image file: %s" % filename)
        height, width = image.shape[:2]
        image_item = {
            "id": image_idx,
            "width": width,
            "height": height,
            "file_name": v["filename"],
        }
        output_coco_format_dict["images"].append(image_item)

        # Process all regions in this image.
        annos = v["regions"]
        for _, anno in annos.items():
            assert not anno["region_attributes"]
            anno = anno["shape_attributes"]
            px = anno["all_points_x"]
            py = anno["all_points_y"]
            # Shift vertices by half a pixel, then flatten the (x, y) pairs
            # into the [x0, y0, x1, y1, ...] layout.
            poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)]
            poly = [p for x in poly for p in x]

            # Annotation ids start at 1.
            annotation_idx += 1
            annotation_item = {
                "id": annotation_idx,
                "image_id": image_idx,
                # x, y, width, height
                "bbox": [
                    int(np.min(px)),
                    int(np.min(py)),
                    int(np.max(px) - np.min(px)),
                    int(np.max(py) - np.min(py)),
                ],
                "iscrowd": 0,
                # Only have one category.
                "category_id": 0,
                "segmentation": [poly],
            }
            # frPyObjects takes (objects, height, width); passing the
            # dimensions in that order keeps polygons from being clipped to a
            # transposed canvas, which would distort the computed area.
            RLEs = mask_util.frPyObjects([poly], height, width)
            RLE = mask_util.merge(RLEs)
            annotation_item["area"] = float(mask_util.area(RLE))
            output_coco_format_dict["annotations"].append(annotation_item)

    # Save output file.
    json_file = os.path.join(img_dir, output_coco_format_json_filename)
    with open(json_file, "w") as f:
        json.dump(output_coco_format_dict, f)


# Convert both splits; each COCO JSON is written into the same directory as
# the split's images.
save_coco_format_json(
    os.path.join(local_balloon_data_directory, "train"),
    "balloon_train_coco_format.json",
)
save_coco_format_json(
    os.path.join(local_balloon_data_directory, "val"), "balloon_val_coco_format.json"
)

In [None]:
# @title Upload the Balloon data to Cloud Storage.


def get_bucket_and_blob_name(filepath):
    """Splits a `gs://<bucket-name>/<blob-name>` path into its parts.

    Args:
      filepath: A path containing `gs://`.

    Returns:
      A `(bucket_name, blob_name)` tuple, or a one-element tuple holding only
      the bucket name when the path has no `/` after the bucket.

    Raises:
      IndexError: If `filepath` does not contain `gs://`.
    """
    pieces = filepath.split("gs://", 1)
    # Everything after the scheme: "<bucket-name>/<blob-name>".
    remainder = pieces[1]
    bucket_and_blob = remainder.split("/", 1)
    return tuple(bucket_and_blob)


def upload_local_dir_to_gcs(local_dir_path, gcs_dir_path):
    """Uploads files in a local directory to a GCS directory."""
    ! gcloud storage cp -R $local_dir_path $gcs_dir_path


# Copy each local split directory to the same-named location under
# BALLOON_DATA_GCS_PATH.
for split_name in ("train", "val"):
    upload_local_dir_to_gcs(
        os.path.join(local_balloon_data_directory, split_name),
        os.path.join(BALLOON_DATA_GCS_PATH, split_name),
    )

## Finetune

In [None]:
# @title Finetune with Detectron2
# @markdown You will use the Vertex AI SDK to create and run the training job with the model-garden detectron2 training docker. You can choose one of the Faster R-CNN, RetinaNet, or Mask R-CNN models to finetune by uncommenting the corresponding code sections below. The training uses one V100 GPU and runs for around 3 mins once the training job begins.
# The timestamp makes the job name (and the output directory derived from it) unique per run.
TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
JOB_NAME = "detectron2_balloon_" + TIMESTAMP

container_uri = TRAIN_DOCKER_URI
staging_bucket = os.path.join(BUCKET_URI, "training/temporal")
TRAINING_ACCELERATOR_TYPE = "NVIDIA_TESLA_V100"
TRAINING_MACHINE_TYPE = "n1-standard-4"
TRAINING_ACCELERATOR_COUNT = 1
# Misspelled alias kept for backward compatibility: the docker argument list
# and the training-job cell still refer to this name.
TRAINING_AACCELERATOR_COUNT = TRAINING_ACCELERATOR_COUNT

# Dataset and output directory related parameters.
# Every dataset location below is passed through gcs_fuse_path before it is
# handed to the training container as a docker argument.
train_dataset_name = "balloon_train"  # @param {type:"string"}
train_coco_json_file = gcs_fuse_path(
    os.path.join(BALLOON_DATA_GCS_PATH, "train/balloon_train_coco_format.json")
)
train_image_root = gcs_fuse_path(os.path.join(BALLOON_DATA_GCS_PATH, "train"))
val_dataset_name = "balloon_val"  # @param {type:"string"}
val_coco_json_file = gcs_fuse_path(
    os.path.join(BALLOON_DATA_GCS_PATH, "val/balloon_val_coco_format.json")
)
val_image_root = gcs_fuse_path(os.path.join(BALLOON_DATA_GCS_PATH, "val"))
# The output directory is kept in its unconverted form here; it is converted
# only where the docker arguments are assembled.
output_dir = os.path.join(BUCKET_URI, JOB_NAME)

#################################################
# Model and dataset related parameters for Mask R-CNN.
config_file = gcs_fuse_path("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
# Flat list of alternating config keys and values; it is appended after the
# named docker flags.
remainder_args_list = [
    "DATASETS.TRAIN",
    f'("{train_dataset_name}",)',
    "DATASETS.TEST",
    f'("{val_dataset_name}",)',
    "DATALOADER.NUM_WORKERS",
    "2",
    "SOLVER.IMS_PER_BATCH",
    "2",
    "SOLVER.MAX_ITER",
    "300",
    "SOLVER.STEPS",
    "[]",
    "MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE",
    "128",
    "MODEL.ROI_HEADS.NUM_CLASSES",
    "1",
]
publisher_model_id = "mask-r-cnn"
#################################################

# #################################################
# # Model and dataset related parameters for RetinaNet.
# config_file='COCO-Detection/retinanet_R_50_FPN_3x.yaml'
# config_file = gcs_fuse_path(config_file)
# remainder_args_list = []
# remainder_args_list += ['DATASETS.TRAIN'] + ['("{train_dataset_name}",)'.format(train_dataset_name=train_dataset_name)]
# remainder_args_list += ['DATASETS.TEST'] + ['("{val_dataset_name}",)'.format(val_dataset_name=val_dataset_name)]
# remainder_args_list += ['DATALOADER.NUM_WORKERS'] + ['2']
# remainder_args_list += ['SOLVER.IMS_PER_BATCH'] + ['2']
# remainder_args_list += ['SOLVER.MAX_ITER'] + ['300']
# remainder_args_list += ['SOLVER.STEPS'] + ['[]']
# remainder_args_list += ['MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE'] + ['128']
# remainder_args_list += ['MODEL.ROI_HEADS.NUM_CLASSES'] + ['1']
# remainder_args_list += ['MODEL.RETINANET.NUM_CLASSES'] + ['1']
# publisher_model_id = "retinanet"
# #################################################

# #################################################
# # Model and dataset related parameters for Faster R-CNN.
# config_file='COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml'
# config_file = gcs_fuse_path(config_file)
# remainder_args_list = []
# remainder_args_list += ['DATASETS.TRAIN'] + ['("{train_dataset_name}",)'.format(train_dataset_name=train_dataset_name)]
# remainder_args_list += ['DATASETS.TEST'] + ['("{val_dataset_name}",)'.format(val_dataset_name=val_dataset_name)]
# remainder_args_list += ['DATALOADER.NUM_WORKERS'] + ['2']
# remainder_args_list += ['SOLVER.IMS_PER_BATCH'] + ['2']
# remainder_args_list += ['SOLVER.MAX_ITER'] + ['300']
# remainder_args_list += ['SOLVER.STEPS'] + ['[]']
# remainder_args_list += ['MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE'] + ['128']
# remainder_args_list += ['MODEL.ROI_HEADS.NUM_CLASSES'] + ['1']
# publisher_model_id = "faster-r-cnn"
# #################################################

# Create argument list for docker.
# NOTE: Config file flag name has hyphen instead
# of underscore: 'config-file'.
lr = 0.00025
# Each (flag, value) pair is flattened in order and every value is rendered
# as a string; the model-specific overrides follow the named flags.
docker_args_list = [
    str(token)
    for flag_and_value in (
        ("--train_dataset_name", train_dataset_name),
        ("--train_coco_json_file", train_coco_json_file),
        ("--train_image_root", train_image_root),
        ("--val_dataset_name", val_dataset_name),
        ("--val_coco_json_file", val_coco_json_file),
        ("--val_image_root", val_image_root),
        ("--lr", lr),
        ("--num-gpus", TRAINING_AACCELERATOR_COUNT),
        ("--output_dir", gcs_fuse_path(output_dir)),
        ("--config-file", config_file),
    )
    for token in flag_and_value
] + remainder_args_list

In [None]:
# @title Create and run training job.
# Click on the generated link in the output under "View backing custom job:" to see your run in the Cloud Console.
# Initialize the SDK for this project/region with the staging bucket chosen in
# the finetune cell, then describe the job by its display name and container.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=staging_bucket)
training_job = aiplatform.CustomContainerTrainingJob(
    display_name=JOB_NAME,
    container_uri=container_uri,
)

# Quota check for the accelerator type/count that the job is about to request.
# NOTE(review): what happens on insufficient quota is defined by
# common_util.check_quota, which is not visible here — confirm.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=TRAINING_ACCELERATOR_TYPE,
    accelerator_count=TRAINING_AACCELERATOR_COUNT,
    is_for_training=True,
)

# NOTE(review): LABEL is not referenced again in this cell and is reassigned
# in the deploy cell — confirm whether this assignment is still needed.
LABEL = "detectron2-training"

# Launch a single-replica job with the docker arguments assembled earlier;
# output_dir is also used later to locate the files for deployment.
training_job.run(
    args=docker_args_list,
    base_output_dir=f"{output_dir}",
    replica_count=1,
    machine_type=TRAINING_MACHINE_TYPE,
    accelerator_type=TRAINING_ACCELERATOR_TYPE,
    accelerator_count=TRAINING_AACCELERATOR_COUNT,
)

## Deploy model

In [None]:
# @title Upload models to model registry

# @markdown This section uploads the model to Model Registry and deploys it on an Endpoint resource. The model deployment step will take ~15 minutes to complete. You need to set the model path below from the training output Cloud Storage directory.

# Weights and config are looked up under the training job's output directory.
PRETRAINED_MODEL_PTH_FILE = os.path.join(output_dir, "model_final.pth")
PRETRAINED_MODEL_CFG_YAML_FILE = os.path.join(output_dir, "config.yaml")
TEST_THRESHOLD = 0.7
PREDICTION_CONTAINER_URI = SERVE_DOCKER_URI
# Timestamp suffix keeps the uploaded model's display name unique per run.
PREDICTION_DISPLAY_NAME = f"upload_detectron2_{datetime.datetime.now():%Y%m%d_%H%M%S}"

LABEL = "detectron2-prediction"
# The resulting endpoint is stored under LABEL so that the prediction and
# cleanup cells can find it.
endpoints[LABEL] = deploy_model(
    # Where to deploy.
    project=PROJECT_ID,
    location=REGION,
    service_account=SERVICE_ACCOUNT,
    use_dedicated_endpoint=use_dedicated_endpoint,
    # What to deploy.
    display_name=PREDICTION_DISPLAY_NAME,
    serving_container_image_uri=PREDICTION_CONTAINER_URI,
    publisher_model_id=publisher_model_id,
    model_pth_file=PRETRAINED_MODEL_PTH_FILE,
    model_cfg_yaml_file=PRETRAINED_MODEL_CFG_YAML_FILE,
    test_threshold=TEST_THRESHOLD,
)

print("The uploaded model name is: ", PREDICTION_DISPLAY_NAME)

In [None]:
# @title Run predictions

endpoint_id = endpoints[LABEL].name

# One image from the validation split serves as the test input.
local_test_filepath = os.path.join(
    local_balloon_data_directory, "val/410488422_5f8991f26e_b.jpg"
)
instances = get_prediction_instances(local_test_filepath)

response = endpoints[LABEL].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

# The first prediction is a JSON string; parse it before use.
prediction = json.loads(response.predictions[0])
print(prediction)

print("Predict the test image: ", local_test_filepath)

# Draw boxes and masks.
img = load_img(local_test_filepath)

boxes, classes, scores = (
    prediction[field] for field in ("boxes", "classes", "scores")
)
# Masks are decoded only when the response carries a non-empty "masks_rle".
masks_numpy = (
    decode_rle_masks(prediction["masks_rle"]) if prediction["masks_rle"] else None
)
img.save("./sample.jpg")
output_image_array = visualize_boxes_and_labels_on_image_array(
    image=np.array(img),
    boxes=np.array(boxes),
    classes=np.array(classes),
    scores=np.array(scores),
    category_index={0: {"name": "balloon"}},
    instance_masks=masks_numpy,
)
# Cast to uint8 before converting back to a PIL image for saving.
output_image = Image.fromarray(np.uint8(output_image_array))
output_image.save("./sample_preds.jpg")
print('Prediction image saved to "./sample_preds.jpg" ')

## Clean up resources

In [None]:
# @title Delete the resources
# @markdown This section deletes the training job and the endpoint to recycle the resources and avoid unnecessary continuous charges that may incur.
# @markdown Delete the bucket if you don't need it anymore.

training_job.delete()

# Delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)


delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME