In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hugging Face DLCs: Serving PaliGemma using Pytorch Inference on Vertex AI with Custom Handler

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fserving%2Fvertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_pytorch_inference_paligemma_with_custom_handler.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
| Author(s) |  [Ivan Nardini](https://github.com/inardini), [Alvaro Bartolome](https://github.com/alvarobartt) |

## Overview

> [**PaliGemma**](https://ai.google.dev/gemma) is a lightweight open vision-language model (VLM) inspired by PaLI-3, and based on open components like the SigLIP vision model and the Gemma language model.

> [**Hugging Face DLCs**](https://github.com/huggingface/Google-Cloud-Containers) are pre-built and optimized Deep Learning Containers (DLCs) maintained by Hugging Face and Google Cloud teams to simplify environment configuration for your ML workloads.

> [**Google Vertex AI**](https://cloud.google.com/vertex-ai) is a Machine Learning (ML) platform that lets you train and deploy ML models and AI applications, and customize large language models (LLMs) for use in your AI-powered applications.

This notebook showcases how to deploy Google PaliGemma from the Hugging Face Hub on Vertex AI using the Hugging Face Deep Learning Container (DLC) for Pytorch Inference in combination with a [custom handler](https://huggingface.co/docs/inference-endpoints/en/guides/custom_handler#create-custom-inference-handler).

By the end of this notebook, you will learn how to:

- Create a custom handler and test it
- Register any LLM from the Hugging Face Hub on Vertex AI
- Deploy an LLM on Vertex AI
- Send online predictions on Vertex AI

## Get started

### Install Vertex AI SDK and other required packages


In [None]:
# Install, in order: the PyTorch stack, the Hugging Face libraries, and the Vertex AI SDK
# (with its `prediction` extra) together with `crcmod` and `etils`.
# NOTE(review): apart from the `accelerate` lower bound, no versions are pinned, so each
# fresh run installs whatever is latest — consider pinning for reproducibility.
%pip install --upgrade --user --quiet 'torch' 'torchvision' 'torchaudio'
%pip install --upgrade --user --quiet 'transformers' 'accelerate>=0.26.0'
%pip install --upgrade --user --quiet 'google-cloud-aiplatform[prediction]' 'crcmod' 'etils'

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Grab the running IPython application and ask its kernel to shut down; passing True
# requests a restart, so the packages installed above are importable afterwards.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says "Your session crashed for an unknown reason." This is expected. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Only attempt Colab authentication when the `google.colab` package is already loaded
# in this process; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Hugging Face `HF_HOME` variable

In [None]:
import os

from etils.epath import Path

# Working folder for this tutorial, relative to the notebook's current directory.
ROOT_PATH = Path(".")
TUTORIAL_PATH = ROOT_PATH / "deploy_paligemma_with_custom_handler_tutorial"

# Point Hugging Face at the tutorial folder; later cells look for the downloaded
# model snapshot under TUTORIAL_PATH / "hub".
# NOTE(review): presumably this must run before any Hugging Face library is imported
# for the setting to take effect — confirm.
os.environ["HF_HOME"] = str(TUTORIAL_PATH)

### Authenticate your Hugging Face account

As [`google/paligemma-3b-mix-448`](https://huggingface.co/google/paligemma-3b-mix-448) is a gated model, you need to have a Hugging Face Hub account and accept Google's usage license for PaliGemma. Once that's done, you need to generate a new user access token with read-only access so that the weights can be downloaded from the Hub in this notebook.

> Note that the user access token can only be generated via [the Hugging Face Hub UI](https://huggingface.co/settings/tokens/new), where you can either select read-only access to your account, or follow the recommendations and generate a fine-grained token with read-only access to [`google/paligemma-3b-mix-448`](https://huggingface.co/google/paligemma-3b-mix-448).

Then you can log in with the `huggingface_hub` library (installed alongside `transformers`), whose `interpreter_login` helper prompts for the token generated in advance, so that the token can then be safely retrieved via `huggingface_hub.get_token`.

In [None]:
from huggingface_hub import interpreter_login

# Log in to the Hugging Face Hub with the access token generated beforehand.
interpreter_login()

Read more about [Hugging Face Security](https://huggingface.co/docs/hub/en/security), specifically about [Hugging Face User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens).

### Requirements

#### Set Project ID and Location

To get started using Vertex AI, you must have an existing Google Cloud project and [enable these APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,artifactregistry.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

PROJECT_NUMBER = !gcloud projects describe {PROJECT_ID} --format="get(projectNumber)"[0]
PROJECT_NUMBER = PROJECT_NUMBER[0]

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

#### Set and create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as models.

In [None]:
# Name of the Cloud Storage bucket (without the `gs://` scheme); replace the placeholder.
BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}

# Full `gs://` URI derived from the bucket name; used by the gsutil and Vertex AI calls.
BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

In [None]:
# Create the bucket in LOCATION under PROJECT_ID.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

#### Set Service Account and permissions

You will need to have the following IAM roles set:

- Vertex AI User (roles/aiplatform.user)
- Artifact Registry Reader (roles/artifactregistry.reader)
- Storage Object Admin (roles/storage.objectAdmin)

For more information about granting roles, see [Manage access](https://cloud.google.com/iam/docs/granting-changing-revoking-access).


> If you are using Vertex AI Workbench, run the following commands directly in a terminal.


In [None]:
# Service account address built from the project number (`<number>-compute@developer...`);
# the IAM roles below are granted to this account.
SERVICE_ACCOUNT = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

In [None]:
# Grant each required role to SERVICE_ACCOUNT on the project, one gcloud call per role.
for role in ['aiplatform.user', 'storage.objectAdmin', 'artifactregistry.reader']:

    ! gcloud projects add-iam-policy-binding {PROJECT_ID} \
      --member=serviceAccount:{SERVICE_ACCOUNT} \
      --role=roles/{role} --condition=None

#### Accept Google PaliGemma usage license on Hugging Face Hub

To access PaliGemma on Hugging Face, you are required to review and agree to Google's usage license on the Hugging Face Hub for any of the models from the [PaliGemma release collection](https://huggingface.co/collections/google/paligemma-release-6643a9ffbf57de2ae0448dda), and the access request will be processed immediately.

### Initiate Vertex AI SDK

Initiate Vertex AI client session.

In [None]:
import vertexai

# Set the project, region and staging bucket used by the Vertex AI SDK calls below.
vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### Import libraries

Import relevant libraries.

In [None]:
import base64
import gc
from io import BytesIO
import json

from PIL import Image
from etils import epath
from google.cloud.aiplatform import Endpoint, Model
from google.cloud.aiplatform.prediction import LocalModel
import requests
import torch
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

### Helpers

Define some helpers.

In [None]:
def hf_model_path(hf_home_path):
    """
    Locate a downloaded model snapshot inside a Hugging Face cache folder.

    Each immediate sub-directory of ``hf_home_path`` is checked for a
    ``snapshots`` folder, and the first directory found inside such a folder
    is returned. Nothing is copied or modified.

    Args:
        hf_home_path: Cache folder to scan; a leading ``~`` is expanded.

    Returns:
        Path of the first snapshot directory encountered.

    Raises:
        FileNotFoundError: If ``hf_home_path`` does not exist, or if no
            snapshot directory is found beneath it.
    """
    cache_root = Path(hf_home_path).expanduser()
    if not cache_root.exists():
        raise FileNotFoundError(f"Cache path does not exist: {cache_root}")

    for candidate in cache_root.iterdir():
        # Plain files at the top level of the cache are not model folders.
        if not candidate.is_dir():
            continue

        snapshots_root = candidate / "snapshots"
        if not snapshots_root.exists():
            continue

        # Return as soon as a snapshot folder turns up (usually there's only one).
        for entry in snapshots_root.iterdir():
            if entry.is_dir():
                return entry

    raise FileNotFoundError("No snapshot directory found in the cache")


def get_cuda_device_names():
    """Return the CUDA device indices as a list of strings, or None when CUDA is unavailable."""
    if torch.cuda.is_available():
        return list(map(str, range(torch.cuda.device_count())))

    return None


def empty_gpu_ram():
    """Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache."""
    # Collect first so objects that are no longer referenced are gone before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

## Prepare a custom handler to serve PaliGemma on Vertex AI

Custom Handlers are custom classes in Python that define the pre-processing, inference, and post-processing steps required to run the inference using HuggingFace Pytorch Prediction container on Vertex AI.

Think of Custom Handlers as personalized instructions for Hugging Face models. They define how to prepare the input data, run the model, and handle the results. In this sense, Custom Handlers add flexibility. They let you customize how data is prepared and processed, add extra steps, and even build in custom measurements or logging. This means you can tailor the process to your exact needs when the standard setup isn't sufficient.

These instructions are stored in a file named `handler.py`. If you need additional dependencies, you can list them in a `requirements.txt` file. The PyTorch container automatically finds and uses these files if they're present.

Have a look at [🤗 Serve Anything with Inference Endpoints + Custom Handlers](https://huggingface.co/blog/alvarobartt/serve-anything-inference-endpoints-custom-code) to learn more.

### Test the `handler` locally

Before building the handler module, you can test its code locally.

#### Load Paligemma from HuggingFace Hub

Load the pre-trained PaliGemma model for image-conditioned text generation. You first set up the processor, which handles tasks like tokenization (breaking text into words or sub-words) and preparing the image. Then you load the actual model itself, optimized for lower memory usage and automatic device placement (likely GPU if available).

In [None]:
# Processor that prepares the text prompt and image for the model.
paligemma_processor = PaliGemmaProcessor.from_pretrained("google/paligemma-3b-mix-448")

# Load the weights with automatic device placement and switch to evaluation mode.
# NOTE(review): no dtype is passed here, whereas the handler written later requests
# bfloat16 — confirm the local test is meant to use the default precision.
paligemma_model = PaliGemmaForConditionalGeneration.from_pretrained(
    "google/paligemma-3b-mix-448",
    low_cpu_mem_usage=True,
    device_map="auto",
).eval()

#### Collect image for generating predictions

Download an image from a URL and open it using the Pillow library. The code first fetches the image data from the given URL, then checks if the download was successful (raising an error if not). Finally, it opens the downloaded image data directly from memory using BytesIO, making it ready for further processing with Pillow.

In [None]:
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
# A timeout keeps the cell from hanging indefinitely if the host never responds.
image_response = requests.get(image_url, timeout=60)
# Stop here on an HTTP error instead of trying to decode an error page as an image.
image_response.raise_for_status()
# Open the downloaded bytes straight from memory.
image = Image.open(BytesIO(image_response.content))

#### Prepare the prediction request

Prepare the image captioning request by creating a dictionary with image data and captioning instructions.

The image, downloaded in the previous step (`image_response`), is encoded into a base64 string. `max_new_tokens` limits the caption to 100 newly generated tokens, and `do_sample=False` tells the model to generate the most likely caption rather than a more creative, potentially less accurate one.


In [None]:
# Request payload: a list of instances, each carrying the prompt, the image bytes
# encoded as a base64 string, and optional per-instance generation settings.
prediction_request = {
    "instances": [
        {
            "prompt": "caption it",
            "image_base64": base64.b64encode(image_response.content).decode("utf-8"),
            "generation_kwargs": {"max_new_tokens": 100, "do_sample": False},
        }
    ]
}

#### Generate prediction

Process a list of prediction requests, each containing an image and a text prompt.

For each request, you decode the base64-encoded image, prepare the image and prompt for the model (`paligemma_model`) using the processor (`paligemma_processor`), and then generate a text response using the model. Finally, you decode the generated response and add it to a list of predictions.

Error handling ensures that each request includes the necessary keys and that the image can be loaded correctly. Custom generation parameters can be provided via generation_kwargs for each instance.

In [None]:
# Generation settings applied when an instance does not carry its own `generation_kwargs`.
default_generation_kwargs = {"max_new_tokens": 100, "do_sample": False}

predictions = []

for instance in prediction_request["instances"]:
    # Every instance must provide both the text prompt and the base64-encoded image.
    if any(key not in instance for key in ("prompt", "image_base64")):
        raise ValueError(
            "The request body for each instance should contain both the `prompt` and the `image_base64` key with a valid image."
        )

    try:
        image_bytes = base64.b64decode(instance["image_base64"])
        image_file = BytesIO(image_bytes)
        image = Image.open(image_file)
    except Exception as e:
        # Chain the original exception so the root cause stays visible in the traceback.
        raise ValueError(
            f"The provided image cannot be loaded (with exception {e})."
        ) from e

    # Build the model inputs and move them to the device the model lives on.
    inputs = paligemma_processor(
        text=instance["prompt"], images=image, return_tensors="pt"
    ).to(paligemma_model.device)

    # Length of the prompt part, used below to slice it off the generated sequence.
    input_len = inputs["input_ids"].shape[-1]
    generation_kwargs = instance.get("generation_kwargs", default_generation_kwargs)

    with torch.inference_mode():
        generation = paligemma_model.generate(**inputs, **generation_kwargs)
        # Keep only the newly generated tokens of the first (and only) sequence.
        generation = generation[0][input_len:]
        response = paligemma_processor.decode(generation, skip_special_tokens=True)
        predictions.append(response)

Get the prediction.

In [None]:
# Show the generated text, one entry per instance in the request.
print(predictions)

###  Define the `handler.py` module

After testing the handler code, you assemble the code in a Python module which defines the custom inference handler for PaliGemma.

You write this handler code to a file named handler.py within the model directory.

In [None]:
# Snapshot folder of the downloaded model inside the tutorial's Hugging Face cache;
# the handler file is written into it and the folder is later uploaded to the bucket.
model_path = hf_model_path(TUTORIAL_PATH / "hub")

In [None]:
# Source of the custom handler, kept as a string so it can be written next to the
# model files. Only `#` comments are used inside it, because the surrounding literal
# is delimited by triple double quotes.
handler_module = """
from typing import Any, Dict, List
import torch
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
import base64
from io import BytesIO
from PIL import Image
import logging
import sys

# Configure logging to output to stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger('huggingface_inference_toolkit')

# Generation settings used when a request does not provide its own.
DEFAULT_GENERATION_KWARGS = {"max_new_tokens": 100, "do_sample": False}


class EndpointHandler:
    # Custom inference handler: loads PaliGemma once at start-up and turns
    # {"instances": [...]} requests into {"predictions": [...]} responses.

    def __init__(
        self,
        model_dir: str = '/opt/huggingface/model',
        **kwargs: Any,
    ) -> None:
        # Load the processor and the model from the directory holding the model files.
        self.processor = PaliGemmaProcessor.from_pretrained(model_dir)
        self.model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_dir,
            low_cpu_mem_usage=True,
            device_map="auto",
            revision="bfloat16",
            torch_dtype=torch.bfloat16,
        ).eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
        # Each instance needs a `prompt` and an `image_base64` entry and may carry its
        # own `generation_kwargs`. Raises ValueError on a malformed instance.
        logger.info("Processing new request")
        predictions = []

        for instance in data['instances']:
            logger.info(f"Processing instance: {instance.get('prompt', '')[:100]}...")

            if any(key not in instance for key in ("prompt", "image_base64")):
                error_msg = "Missing prompt or image_base64 in request body"
                logger.error(error_msg)
                raise ValueError(error_msg)

            try:
                image_bytes = BytesIO(base64.b64decode(instance['image_base64']))
                image = Image.open(image_bytes)
                logger.info("Image loaded successfully")
            except Exception as e:
                error_msg = f"Failed to load image: {str(e)}"
                logger.error(error_msg)
                # Chain the original exception so the root cause stays in the traceback.
                raise ValueError(error_msg) from e

            inputs = self.processor(
                text=instance["prompt"], images=image, return_tensors="pt"
            ).to(self.model.device)
            input_len = inputs["input_ids"].shape[-1]
            logger.info(f"Input processed, length: {input_len}")

            # Generation settings are per instance (that is where the requests in this
            # notebook put them); a top-level `generation_kwargs` entry is still
            # honoured as a fallback before the defaults.
            generation_kwargs = instance.get(
                "generation_kwargs",
                data.get("generation_kwargs", DEFAULT_GENERATION_KWARGS),
            )
            logger.info(f"Generation kwargs: {generation_kwargs}")

            with torch.inference_mode():
                generation = self.model.generate(**inputs, **generation_kwargs)
                # Drop the prompt tokens and keep only what the model generated.
                generation = generation[0][input_len:]
                response = self.processor.decode(generation, skip_special_tokens=True)
                logger.info(f"Generated response: {response[:100]}...")
                predictions.append(response)

        logger.info(f"Successfully processed {len(predictions)} instances")
        return {"predictions": predictions}
"""

# Write the handler next to the model files. The `with` block closes the file on
# exit, so no explicit close() call is needed afterwards.
with open(model_path / "handler.py", "w") as handler_file:
    handler_file.write(handler_module)

### Copy model with custom handler on Cloud Bucket

Efficiently upload the model directory to Google Cloud Storage using `gsutil`.

Note that `-m` enables multi-threaded uploads for faster transfer, especially for large directories. `-o GSUtil:parallel_composite_upload_threshold=150M` optimizes large file uploads by splitting them into smaller parts for parallel transfer, significantly speeding up the process for files larger than 150MB.

In [None]:
# Destination for the model files inside the bucket; create it if it is not there yet.
model_uri = epath.Path(BUCKET_URI) / "model"
model_uri.mkdir(parents=True, exist_ok=True)

In [None]:
# Copy everything in the local snapshot folder (model files plus handler.py) to the bucket.
! gsutil -o GSUtil:parallel_composite_upload_threshold=150M -m cp -r {str(model_path)}/* {str(model_uri)}

### (Optional) Testing the handler offline using serving container with Vertex AI LocalModel

For debugging purpose, Vertex AI provides `LocalModel` class, accessible through the Vertex AI SDK for Python. This class allows you to build and deploy your model locally, simulating the Vertex AI environment. Using LocalModel involves creating a Docker image that encapsulates your custom predictor code and the associated handler.

> **Important**: Running the LocalModel class requires a local Docker installation. This allows the model to be encapsulated within a container for consistent execution across different environments.

> If you haven't already installed Docker Engine, please refer to the official installation guide: [Install Docker Engine](https://docs.docker.com/engine/install/). This documentation provides detailed instructions for various operating systems and will guide you through the installation process. Ensure Docker is running correctly before proceeding with the LocalModel examples.


#### Create a LocalModel instance

Set up a local model by specifying the container image to use (a Hugging Face Transformers model optimized for PyTorch and GPUs) and the port it will listen on (5000).


In [None]:
# Describe the serving container to run locally: the Hugging Face PyTorch inference
# image and the container port it serves on.
local_paligemma_model = LocalModel(
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cu121.2-2.transformers.4-44.ubuntu2204.py311",
    serving_container_ports=[5000],
)

#### Create a LocalEndpoint instance

Deploy the model to a local endpoint for serving. You load the model from the specified location (model_uri) and start serving the model locally.

The `gpu_device_ids` argument lets the container leverage the available GPUs, if present.


In [None]:
# Create a local endpoint backed by the uploaded artifacts; `gpu_device_ids` is None
# when no CUDA device is available on this machine.
local_paligemma_endpoint = local_paligemma_model.deploy_to_local_endpoint(
    artifact_uri=str(model_uri), gpu_device_ids=get_cuda_device_names()
)

# Start the serving container so it can accept prediction requests.
local_paligemma_endpoint.serve()

#### Monitoring Your Containerized Deployment

To keep track of your container's deployment progress and identify any potential issues, you can use the following Docker commands within your terminal:

1. **List all containers:** `docker container ls -a` displays a list of all running and stopped containers. Locate the container associated with your local endpoint and copy its ID.  This ID is essential for the next step.

2. **Stream container logs:** `docker logs --follow <CONTAINER_ID>`  provides a real-time stream of your container's logs. Replace `<CONTAINER_ID>` with the ID you copied in the previous step. Monitoring these logs allows you to observe the deployment process, identify any errors or warnings, and understand the container's overall health.

#### Generate predictions

Send a prediction request to a local Vertex AI endpoint.

You convert the request data (prediction_request) into a JSON string, send it to the endpoint, and then print the predictions from the JSON response.


In [None]:
# The local endpoint takes the raw request body, so serialize the payload to JSON first.
vertex_prediction_request = json.dumps(prediction_request)
vertex_prediction_response = local_paligemma_endpoint.predict(
    request=vertex_prediction_request, headers={"Content-Type": "application/json"}
)
# The handler returns its results under the "predictions" key.
print(vertex_prediction_response.json()["predictions"])

## Register Google Paligemma on Vertex AI

To serve PaliGemma with Pytorch Inference on Vertex AI, you start importing the model on Vertex AI Model Registry, a central repository where you can manage the lifecycle of your ML models on Vertex AI.

Before going into the code to upload or import a model on Vertex AI, let's quickly review the arguments provided to the `aiplatform.Model.upload` method:

* **`display_name`** is the name that will be shown in the Vertex AI Model Registry.

* **`serving_container_image_uri`** is the location of the Hugging Face DLC for Pytorch Inference that will be used for serving the model.

* (optional) **`serving_container_ports`** is the port where the Vertex AI endpoint will be exposed, by default 8080.

For more information on the supported `aiplatform.Model.upload` arguments, check [its Python reference](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_upload).

In [None]:
# Register the model: the artifacts in the bucket are paired with the Hugging Face
# PyTorch inference container, serving on port 8080.
model = Model.upload(
    display_name="google--paligemma-3b-mix-448",
    artifact_uri=str(model_uri),
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cu121.2-2.transformers.4-44.ubuntu2204.py311",
    serving_container_ports=[8080],
)
# Block until the upload has finished before moving on to deployment.
model.wait()

## Deploy Google PaliGemma on Vertex AI

After the model is registered on Vertex AI, you can deploy the model to an endpoint.

You need to first deploy a model to an endpoint before that model can be used to serve online predictions. Deploying a model associates physical resources with the model so it can serve online predictions with low latency.

Before going into the code to deploy a model to an endpoint, let's quickly review the arguments provided to the `aiplatform.Model.deploy` method:

- **`endpoint`** is the endpoint to deploy the model to, which is optional, and by default will be set to the model display name with the `_endpoint` suffix.
- **`machine_type`**, **`accelerator_type`** and **`accelerator_count`** are arguments that define which instance to use, and additionally, the accelerator to use and the number of accelerators, respectively. The `machine_type` and the `accelerator_type` are tied together, so you will need to select an instance that supports the accelerator that you are using and vice-versa. More information about the different instances at [Compute Engine Documentation - GPU machine types](https://cloud.google.com/compute/docs/gpus), and about the `accelerator_type` naming at [Vertex AI Documentation - MachineSpec](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec).

For more information on the supported `aiplatform.Model.deploy` arguments, you can check [its Python reference](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_deploy).

In [None]:
# Deploy the registered model to a newly created endpoint on one NVIDIA L4 GPU.
# NOTE(review): the endpoint display name says "7b-it" although the model uploaded above
# is "paligemma-3b-mix-448"; the later prediction cells look the endpoint up by this
# exact name, so any rename has to be applied everywhere at once.
deployed_model = model.deploy(
    endpoint=Endpoint.create(display_name="google--paligemma-7b-it-endpoint"),
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
)

> Note that the model deployment on Vertex AI can take around 15 to 25 minutes; most of the time being the allocation / reservation of the resources, setting up the network and security, and such.

## Online predictions on Vertex AI

Once the model is deployed on Vertex AI, you can run the online predictions using the `aiplatform.Endpoint.predict` method, which will send the requests to the running endpoint in the `/predict` route specified within the container following Vertex AI I/O payload formatting.

### Via Python

#### Within the same session

To run the online prediction via the Vertex AI SDK, you can simply use the `predict` method.

In [None]:
# Send the same instances used for the local test to the deployed endpoint.
output = deployed_model.predict(instances=prediction_request["instances"])
# Bare last expression so the notebook displays the response.
output

#### From a different session

To run the online prediction from a different session, you can run the following snippet.

In [None]:
import base64
from io import BytesIO
import os

from PIL import Image
from google.cloud import aiplatform
import requests

PROJECT_ID = "[your-project-id]"  # @param {type:"string", isTemplate: true}
if PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

aiplatform.init(project=PROJECT_ID, location=LOCATION)

ENDPOINT_DISPLAY_NAME = (
    "google--paligemma-7b-it-endpoint"  # @param {type:"string", isTemplate: true}
)

# Iterates over all the Vertex AI Endpoints within the current project and keeps the first match (if any), otherwise set to None
ENDPOINT_ID = next(
    (
        endpoint.name
        for endpoint in aiplatform.Endpoint.list()
        if endpoint.display_name == ENDPOINT_DISPLAY_NAME
    ),
    None,
)
# The console link is built from the PROJECT_ID resolved above, not from a separate
# `PROJECT_ID` environment variable that may not be set in this session.
assert ENDPOINT_ID, (
    "`ENDPOINT_ID` is not set, please make sure that the `ENDPOINT_DISPLAY_NAME` is correct at "
    f"https://console.cloud.google.com/vertex-ai/online-prediction/endpoints?project={PROJECT_ID}"
)

# Initiate the endpoint
endpoint = aiplatform.Endpoint(
    f"projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}"
)

# Get the image (the timeout keeps the cell from hanging if the host never responds)
IMAGE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"  # @param {type:"string"}
image_response = requests.get(IMAGE_URL, timeout=60)
image_response.raise_for_status()
image = Image.open(BytesIO(image_response.content))

# Generate the prediction
output = endpoint.predict(
    instances=[
        {
            "prompt": "caption it",
            "image_base64": base64.b64encode(image_response.content).decode("utf-8"),
            "generation_kwargs": {"max_new_tokens": 100, "do_sample": False},
        }
    ],
)
output

### Via gcloud

You can also send the requests using the `gcloud` CLI via the `gcloud ai endpoints` command.

> Note that, before proceeding, you should either replace the values or set the following environment variables in advance from the Python variables set in the example, as follows:
>
> ```python
> import os
> os.environ["PROJECT_ID"] = PROJECT_ID
> os.environ["LOCATION"] = LOCATION
> os.environ["ENDPOINT_NAME"] = "google--paligemma-7b-it-endpoint"
> ```

In [None]:
%%bash
# Get Endpoint ID
ENDPOINT_ID=$(gcloud ai endpoints list \
  --project=$PROJECT_ID \
  --region=$LOCATION \
  --filter="display_name=$ENDPOINT_NAME" \
  --format="value(name)" \
  | cut -d'/' -f6)

# Get the image in base64
IMAGE_URL="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"  # @param {type:"string"}
BASE64_IMAGE=$(curl -s "$IMAGE_URL" | base64 -w 0)

# Generate the prediction
# The request is fed through an unquoted here-document so that ${BASE64_IMAGE} is
# expanded; inside single quotes the literal text "${BASE64_IMAGE}" would be sent.
gcloud ai endpoints predict $ENDPOINT_ID \
  --project=$PROJECT_ID \
  --region=$LOCATION \
  --json-request="-" <<EOF
{
  "instances": [
    {
        "prompt": "caption it",
        "image_base64": "${BASE64_IMAGE}",
        "generation_kwargs": {"max_new_tokens": 100, "do_sample": false}
    }
  ]
}
EOF

### Via cURL

Alternatively, you can also send the requests via `cURL`.

> Note that, before proceeding, you should either replace the values or set the following environment variables in advance from the Python variables set in the example, as follows:
>
> ```python
> import os
> os.environ["PROJECT_ID"] = PROJECT_ID
> os.environ["LOCATION"] = LOCATION
> os.environ["ENDPOINT_NAME"] = "google--paligemma-7b-it-endpoint"
> ```

In [None]:
%%bash
# Get Endpoint ID
ENDPOINT_ID=$(gcloud ai endpoints list \
  --project=$PROJECT_ID \
  --region=$LOCATION \
  --filter="display_name=$ENDPOINT_NAME" \
  --format="value(name)" \
  | cut -d'/' -f6)

# Get the image in base64
IMAGE_URL="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true" # @param {type:"string"}
BASE64_IMAGE=$(curl -s "$IMAGE_URL" | base64 -w 0)

# Generate the prediction
# The JSON body is passed on stdin (-d @-) from the here-document below, which must be
# closed by a line containing only EOF.
curl -X POST \
    -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    -H "Content-Type: application/json" \
    "https://${LOCATION}-aiplatform.googleapis.com/v1/projects/${PROJECT_ID}/locations/${LOCATION}/endpoints/${ENDPOINT_ID}:predict" \
    -d @- <<EOF
{
    "instances": [
        {
            "prompt": "caption it",
            "image_base64": "${BASE64_IMAGE}",
            "generation_kwargs": {"max_new_tokens": 100, "do_sample": false}
        }
    ]
}
EOF

## Cleaning up

In [None]:
delete_endpoint = False
delete_model = False
delete_bucket = False

if delete_endpoint:
    deployed_model.undeploy_all()
    deployed_model.delete()

if delete_endpoint:
    delete_model.delete()

if delete_bucket:
    ! gsutil rm -r {BUCKET_URI}