In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RS Imagery Batch Inference on VertexAI

This notebook shows how to run a Batch Prediction Job with deployed VLMs (image and text) on Vertex AI.

**Prepare the environment for interacting with Vertex AI:**

Initialize the Vertex AI SDK using the aiplatform.init() function.

Configure the SDK to work with your specific Google Cloud project (PROJECT_ID) and region (REGION), both of which are set in the setup cell below. This step is necessary before using other SDK functions to manage Vertex AI resources.

In [None]:
# @title Setup Notebook

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to the Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in the selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

import base64
import importlib
import io
import json
import os
from typing import Any

import vertexai
from google.cloud import aiplatform, storage
from PIL import Image

# Import common utils
if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.notebooks.community.model_garden.docker_source_codes.notebook_util.common_util"
)

# Setup GCP & VertexAI

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

! gcloud config set project $PROJECT_ID
import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)


def to_png_bytes(img: Image.Image) -> bytes:
    """Serializes `img` in PNG format and returns the encoded bytes."""
    with io.BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        # Read the content while the in-memory buffer is still open.
        png_data = png_buffer.getvalue()
    return png_data


def to_b64_png(img: Image.Image) -> str:
    """Encodes `img` as PNG and returns the base64 text of those bytes."""
    png_data = to_png_bytes(img)
    encoded = base64.b64encode(png_data)
    return encoded.decode()


def write_jsonl_instances(
    bucket: storage.Bucket, path: str, instances: list[dict[str, Any]]
) -> None:
    """Writes the list of instances (dicts) as a JSONL serialized file.

    Every instance is serialized with `json.dumps` and written on its own
    newline-terminated line.

    Each dict is an inference instance, matching one of the following structures:

      {'image': <b64 image>} - Image inference only.
      {'text': <str>} - Text inference only.
      {'image': <b64 image>, 'texts': list<str>} - Image & text inference.

    Args:
      bucket: Bucket in which the file is written.
      path: Object path of the file within `bucket`.
      instances: Inference instances to serialize, one per output line.
    """
    with bucket.blob(path).open("wt") as f:
        # JSONL needs one JSON document per line; `writelines` adds no line
        # separators by itself, so the newline is appended explicitly.
        f.writelines(f"{json.dumps(instance)}\n" for instance in instances)

In [None]:
# @title Initialize data bucket
# @markdown ### Enter a GCS bucket name and a path within the bucket:

BUCKET_NAME = ""  # @param { type : 'string' }
OUTPUT_PATH = "batch_inference/inputs"  # @param { type : 'string' }

storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# Download sample image
!wget -O harbor.jpg https://mrsg.aegean.gr/images/uploads/it2zi0eidej4ql33llj.jpg
harbor_img = Image.open("harbor.jpg")

In [None]:
# @title Prepare data: Image list file

# Encode the sample image once; every uploaded object has identical content,
# so there is no need to re-encode it on each loop iteration.
harbor_png = to_png_bytes(harbor_img)

input_uris = []

# The sample image (harbor) is replicated 10 times as an example.
for i in range(10):
    img_path = f"{OUTPUT_PATH}/images/img{i}.png"
    input_uris.append(f"gs://{BUCKET_NAME}/{img_path}")
    bucket.blob(img_path).upload_from_string(harbor_png, content_type="image/png")

# Write the file list: one gs:// URI per line.
with bucket.blob(f"{OUTPUT_PATH}/input_uris.txt").open("wt") as f:
    f.writelines(f"{uri}\n" for uri in input_uris)

In [None]:
# @title Prepare JSONL input

# This cell generates sample inputs (JSONL files) in 3 formats: image, text and
# image & texts. The text and image inputs are written as several shards, which
# can be used to reduce the size of each file. A file pattern (with wildcards)
# is then used as input for the batch pipeline, e.g.
# "gs://bucket_path/image*.jsonl"

# Encode the sample image once and reuse it for every instance below.
harbor_b64 = to_b64_png(harbor_img)

# Write 3 shards of text input instances (10 instances each).
instances = [{"text": "test string"}] * 10
for shard in range(3):
    write_jsonl_instances(bucket, f"{OUTPUT_PATH}/text{shard}.jsonl", instances)

# Write 3 shards of image input instances (10 instances each).
instances = [{"image": harbor_b64}] * 10
for shard in range(3):
    write_jsonl_instances(bucket, f"{OUTPUT_PATH}/image{shard}.jsonl", instances)

# Write 10 image & texts input instances into a single file.
instances = [{"image": harbor_b64, "texts": ["text1", "text2"]}] * 10
write_jsonl_instances(bucket, f"{OUTPUT_PATH}/combined.jsonl", instances)

In [None]:
# @title Run batch inference

JOB_DISPLAY_NAME = "batch-inference-vlm-test"  # @param { type: 'string' }
# @markdown Enter the project number and model id, the project number is not the
# @markdown same as project id (it can be found in the project settings).
PROJECT_NUMBER = "<project_number_here>"  # @param { type: 'string' }
MODEL_ID = "<model_id_here>"  # @param { type: 'string' }
MODEL_RESOURCE_NAME = f"projects/{PROJECT_NUMBER}/locations/{REGION}/models/{MODEL_ID}"

# @markdown Choose the input (instances) format, either a file-list of images or
# @markdown a JSONL file pattern of JSON formatted inputs.
INPUT_SOURCE_FORMAT = "file-list"  # @param["file-list", "jsonl"]
# @markdown Configure the batch input source. This can use string wildcards such as
# @markdown '*' and '?' to support sharded inputs.
# The default file name matches the file list written by the
# "Prepare data: Image list file" cell (`input_uris.txt`).
INPUT_SOURCE_PATTERN = "gs://<bucket>/batch_inference/inputs/input_uris.txt"  # @param { type: 'string' }
# @markdown Configure the output folder path, predictions will be written here.
GCS_OUTPUT_PATH = "gs://<bucket>/batch_inference/outputs"  # @param { type: 'string' }
# @markdown Configure the batch runtime setup
USE_GPU = True  # @param { type: 'boolean' }
BATCH_SIZE = 16  # @param { type: 'number' }
REPLICA_COUNT = 1  # @param { type: 'number' }
MAX_REPLICA_COUNT = 4  # @param { type: 'number' }

# NOTE(review): the same machine type is used with and without a GPU; confirm
# that "g2-standard-8" is accepted when no accelerator is requested, or select
# a CPU-only machine type for USE_GPU = False.
machine_type = "g2-standard-8"
if USE_GPU:
    accelerator_type = "NVIDIA_L4"
    accelerator_count = 1
else:
    accelerator_type = None
    accelerator_count = None

# Reference the model by its full resource name.
model = aiplatform.Model(MODEL_RESOURCE_NAME)

# sync=False submits the job without blocking this cell until it completes.
job = model.batch_predict(
    job_display_name=JOB_DISPLAY_NAME,
    gcs_source=INPUT_SOURCE_PATTERN,
    gcs_destination_prefix=GCS_OUTPUT_PATH,
    instances_format=INPUT_SOURCE_FORMAT,
    machine_type=machine_type,
    accelerator_count=accelerator_count,
    accelerator_type=accelerator_type,
    starting_replica_count=REPLICA_COUNT,
    max_replica_count=MAX_REPLICA_COUNT,
    labels={
        "task": "batch-inference",
        "vertex-ai-pipelines-run-billing-id": JOB_DISPLAY_NAME,
    },
    batch_size=BATCH_SIZE,
    sync=False,
)

print(f"Batch prediction job started: {job}")

In [None]:
# Monitor the job status

# The job was submitted with sync=False, so its backing resource may not exist
# yet when this cell runs; wait for the creation before reading properties such
# as the resource name and state.
job.wait_for_resource_creation()

print(
    f"Running batch prediction {job.display_name}, resource:"
    f" {job.resource_name}. State: {job.state}"
)