In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - PaliGemma 2 (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_hf_paligemma2_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_hf_paligemma2_deployment.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook provides a practical introduction to using the PaliGemma 2 model, a powerful vision-language model developed by Google. We'll demonstrate how to leverage its multimodal capabilities to perform tasks like visual question answering. Consult the [model card](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/paligemma) for more information.


### Objective

- Deploy PaliGemma 2 to a Vertex AI Endpoint.
- Make predictions to the endpoint including:
  - Answering questions about a given image.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Setup Google Cloud project

# Used for common utilities.
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

# Import the necessary packages
import importlib
from typing import Any, Dict, Tuple

from google.cloud import aiplatform

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 4. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-east5, europe-west4, us-west1, asia-southeast1 |

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

models, endpoints = {}, {}

# @markdown You must provide a Hugging Face User Access Token (read) to access the PaLiGemma 2 models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.
HF_TOKEN = ""  # @param {type:"string", isTemplate:true}

# The pre-built serving docker images.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-one-serve:20250110_0822_RC00"


def deploy_model(
    model_id: str = None,
    task: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    service_account: str = None,
    serving_port: int = 7080,
    serving_route: str = "/predict",
    serving_docker_uri: str = SERVE_DOCKER_URI,
    hf_token: str = None,
) -> Tuple[aiplatform.Endpoint, aiplatform.Model]:
    """Deploys a model to a real-time prediction endpoint.

    Creates a new Vertex AI endpoint, uploads the serving container as a model
    to the Model Registry and deploys that model to the endpoint.

    Args:
        model_id: The model ID, passed to the container as `MODEL_ID`.
        task: The task to perform, passed to the container as `TASK` and also
            used as the display name of the uploaded model.
        machine_type: The machine type.
        accelerator_type: The accelerator type.
        accelerator_count: The accelerator count.
        service_account: The service account.
        serving_port: The serving port.
        serving_route: The serving route.
        serving_docker_uri: The serving container image URI.
        hf_token: HuggingFace token for model access.

    Returns:
        A tuple `(endpoint, model)` containing the created endpoint and the
        deployed model, in that order.

    Raises:
        ValueError: If `model_id` or `task` is not provided.
    """
    # Fail fast, before any billable resource (endpoint, model) is created.
    if not model_id or not task:
        raise ValueError("Both `model_id` and `task` must be provided.")

    endpoint = aiplatform.Endpoint.create(
        display_name=common_util.get_job_name_with_datetime(prefix="paligemma-2")
    )

    # Container environment variables must be strings, so settings that were
    # left unset (None) are dropped instead of being forwarded.
    serving_env = {
        name: value
        for name, value in {
            "MODEL_ID": model_id,
            "DEPLOY_SOURCE": "notebook",
            "HF_TOKEN": hf_token,
            "TASK": task,
        }.items()
        if value is not None
    }
    model = aiplatform.Model.upload(
        display_name=task,
        serving_container_image_uri=serving_docker_uri,
        serving_container_ports=[serving_port],
        serving_container_predict_route=serving_route,
        serving_container_health_route="/ping",
        serving_container_environment_variables=serving_env,
        model_garden_source_model_name="publishers/google/models/paligemma",
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        service_account=service_account,
    )
    return endpoint, model


def vqa_predict(
    endpoint: aiplatform.Endpoint,
    image_url: str,
    text_prompt: str,
    parameters: Dict[str, Any] = None,
) -> str:
    """Asks a deployed endpoint a question about an image.

    Args:
        endpoint: The deployed Vertex AI endpoint.
        image_url: URL of the image to ask about.
        text_prompt: The text prompt question. When empty, no request is sent.
        parameters: Additional parameters for the prediction request.

    Returns:
        The predicted answer string, or None if the prompt is empty or the
        endpoint returned no prediction.
    """
    # An empty prompt is ignored: do not send a request with no instances.
    if not text_prompt:
        return None

    instances = [{"text_prompt": text_prompt, "image_url": image_url}]
    response = endpoint.predict(instances=instances, parameters=parameters)
    if not response.predictions:
        return None

    text = response.predictions[0]["text"]
    # The answer is taken from the second line of the returned text
    # (presumably the first line echoes the prompt - confirm against the
    # serving container). Fall back to the whole text when the response is a
    # single line instead of raising IndexError.
    lines = text.split("\n")
    return lines[1] if len(lines) > 1 else text

## Deploy Model to a Vertex AI Endpoint

In [None]:
# @title Deploy

# @markdown This section uploads the selected prebuilt PaliGemma 2 model to Model Registry and deploys it to a Vertex AI Endpoint. It takes approximately 15 minutes to finish.

# @markdown Select the prebuilt model (and therefore its input resolution) to deploy. Higher-resolution and larger models can result in better inference results, but may require additional GPU.


MODEL_ID = "google/paligemma2-3b-pt-224"  # @param ["google/paligemma2-3b-pt-224", "google/paligemma2-3b-pt-448", "google/paligemma2-10b-ft-docci-448"]
TASK = "paligemma_VQA"  # @param ["paligemma_VQA"]
accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4"]
accelerator_count = 1  # @param [1]
machine_type = "g2-standard-8"  # @param  ["g2-standard-8"]


# @markdown If you want to use other accelerator types not listed above, then check other Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute. You may need to manually set the `machine_type`, `accelerator_type`, and `accelerator_count` in the code by clicking `Show code` first.


# Verify the accelerator quota up front, before any resource is created.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

# `deploy_model` returns `(endpoint, model)`, in that order. Unpack it so that
# the endpoint lands in `endpoints` and the model in `models`; the clean-up
# cell calls `delete(force=True)` on every entry of `endpoints`.
endpoints["paligemma2"], models["paligemma2"] = deploy_model(
    model_id=MODEL_ID,
    task=TASK,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    service_account=SERVICE_ACCOUNT,
    serving_port=7080,
    serving_route="/predict",
    serving_docker_uri=SERVE_DOCKER_URI,
    hf_token=HF_TOKEN,
)

# The prediction cell reads the endpoint from the `endpoint` variable.
endpoint = endpoints["paligemma2"]

In [None]:
# @title [Optional] Loading an existing Endpoint
# @markdown If you've already deployed an Endpoint, you can load it by filling in the Endpoint's ID below.
# @markdown You can view deployed Endpoints at [Vertex Online Prediction](https://console.cloud.google.com/vertex-ai/online-prediction/endpoints).
endpoint_id = ""  # @param {type: "string"}

# With an empty ID nothing is assigned, so whichever `endpoint` is already
# bound in the kernel stays in use.
if endpoint_id:
    existing_endpoint_args = {
        "endpoint_name": endpoint_id,
        "project": PROJECT_ID,
        "location": REGION,
    }
    endpoint = aiplatform.Endpoint(**existing_endpoint_args)

### Predict

The following sections will use images from [pexels.com](https://www.pexels.com/) for demoing purposes. All the images have the following license: https://www.pexels.com/license/.

Images will be resized to a width of 1000 pixels by default since requests made to a Vertex Endpoint are limited to 1.5 MB.

In [None]:
# @title Visual Question Answering

# @markdown This section uses the deployed PaliGemma model to answer questions about a given image.

# @markdown ![](https://images.pexels.com/photos/1006293/pexels-photo-1006293.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2)
image_url = "https://images.pexels.com/photos/1006293/pexels-photo-1006293.jpeg"  # @param {type:"string"}

# @markdown You may leave question prompts empty and they will be ignored.
question_prompt = "What animal is shown in the picture?"  # @param {type: "string"}

# @markdown The question prompt can be non-English languages.

# Request parameters sent alongside the instance; `max_new_tokens` is passed
# through to the endpoint unchanged.
parameters_with_tokens = dict(max_new_tokens=50)

# Arguments follow the order of the `vqa_predict` signature:
# endpoint, image URL, text prompt, parameters.
predictions_with_tokens = vqa_predict(
    endpoint, image_url, question_prompt, parameters_with_tokens
)

print(f"Prediction Response: {predictions_with_tokens}")
# @markdown Click "Show Code" to see more details.

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME