In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving Open Models on Vertex AI using vLLM with GPU

<table align="left">
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/model_serving/vertexai_serving_gpu.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fmodel_serving%2Fvertexai_serving_gpu.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_serving/vertexai_serving_gpu.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

There are multiple ways of serving open models such as Llama 3.2 on Google Cloud Vertex AI Platform. The Llama models are available in [Model Garden](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama), and Model Garden allows single-click self-deployment of the models. This notebook demonstrates how open models can be served via a Vertex AI Endpoint using a custom vLLM container image built for the GPU. This notebook does the following:

- Builds a custom docker container image using customized vLLM source code
- Uploads the model to Model Registry using custom docker container image
- Creates a public Endpoint for Online Prediction
- Deploys model to the Endpoint
- Downloads the Llama 3.2 3B model from Hugging Face during deployment

The code in this notebook can be used for serving other open models supported by vLLM. This notebook has been tested with Python 3.10 and the latest version of `google-cloud-aiplatform` SDK, which currently is `1.101.0`.

## Get Started

### Install Vertex AI SDK for Python and other required packages

In [None]:
!pip install --upgrade --quiet google-cloud-aiplatform==1.101.0

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only act on Google Colab, detected through the already-loaded `google.colab` module.
if "google.colab" in sys.modules:
    import IPython

    # Ask the running kernel to shut down with restart=True so that the freshly
    # installed packages are picked up.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.

In [None]:
import sys

# Outside Google Colab this cell does nothing; on Colab it runs the interactive
# user-authentication flow provided by the `google.colab` package.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
import vertexai

# Google Cloud project and region used by every subsequent cell.
# Replace the PROJECT_ID placeholder with your own project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Point the Vertex AI SDK at the chosen project and region.
vertexai.init(location=LOCATION, project=PROJECT_ID)

## Create vLLM Custom Container Image for Vertex AI

Vertex AI requires [requests](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#inference) and [responses](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#response_requirements) in specific formats. The vLLM API server implements the OpenAI API protocol, so it does not support the Vertex AI request and response requirements out of the box. The vLLM API server (vllm.entrypoints.openai.api_server.py) therefore needs to be updated to support the Vertex AI request and response formats.

### Enable Artifact Registry API
Enable the Artifact Registry API service for the Google Cloud project. This tutorial requires the [gcloud CLI](https://cloud.google.com/sdk/docs/install) to be installed.

In [None]:
! gcloud components update --quiet && gcloud services enable artifactregistry.googleapis.com

### Create a private Docker repository
Create a Docker repository in [Artifact Registry](https://cloud.google.com/artifact-registry/docs/overview).

In [None]:
DOCKER_REPOSITORY = "my-docker-repo"
! gcloud artifacts repositories create {DOCKER_REPOSITORY} --repository-format=docker --location={LOCATION} --description="Vertex AI Docker repository"

### Build vLLM Custom Docker Container Image for GPU

Build docker container image from vLLM source.

**NOTE:** Building the container image from a notebook or Colab may crash the Python kernel. Run the following commands from a shell instead.

In [None]:
# Fully qualified image name in Artifact Registry:
# <location>-docker.pkg.dev/<project>/<repository>/<image>
DOCKER_URI = f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{DOCKER_REPOSITORY}/vllm-gcp-gpu"
# Builds from a local `docker/` directory using `Dockerfile.gpu` and tags the result
# with DOCKER_URI. NOTE(review): nothing visible in this notebook creates `docker/`;
# presumably it ships alongside the notebook - confirm before running.
! cd docker && docker build -f Dockerfile.gpu -t {DOCKER_URI} .

Configure the [authentication for Google Artifact Registry's Docker repository](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling#auth) before pushing the container image to the repository.

In [None]:
# Configure Docker authentication for the regional Artifact Registry host
# ({LOCATION}-docker.pkg.dev); --quiet skips the confirmation prompt.
! gcloud auth configure-docker {LOCATION}-docker.pkg.dev --quiet

Push docker container image to Artifact Registry repository.

In [None]:
# Upload the locally built image tagged DOCKER_URI to the Artifact Registry repository.
! docker push {DOCKER_URI}

## Deploy Model to Vertex AI Endpoint

The following steps are required to serve the model via a Vertex AI Prediction Endpoint:
- Import the model to Model Registry
- Create an Online Prediction Endpoint
- Deploy the model to the Endpoint

### Define Variables

In [None]:
# Deployment settings consumed by the model upload and deployment cells.
# NOTE: hf_token is a secret. Replace the placeholder locally and do not commit
# or share the notebook with a real token saved in this cell.
hf_token = "[your-hugging-face-auth-token]"  # @param {type:"string"}
model_name = "gpu-llama3_2_3B-serve-vllm"  # @param {type:"string"}
model_id = "meta-llama/Llama-3.2-3B"  # @param {type:"string"}
machine_type = "g2-standard-8"  # @param {type:"string"}
accelerator_type = "NVIDIA_L4"  # @param {type:"string"}
# Declared as an integer form field so editing it in Colab keeps the value an int.
accelerator_count = 1  # @param {type:"integer"}

### Import model to Model Registry

In [None]:
from google.cloud import aiplatform

def upload_model(
    model_name: str,
    model_id: str,
    hf_token: str,
    accelerator_count: int,
) -> aiplatform.Model:
    """Register the custom vLLM serving container as a model in Vertex AI.

    The container is started with the vLLM OpenAI-compatible API server listening
    on port 8080; predictions are routed to ``/v1/completions`` and health checks
    to ``/health``. The image is taken from the notebook-level ``DOCKER_URI``.

    Args:
        model_name: Display name given to the uploaded model.
        model_id: Model identifier passed to vLLM via ``--model``.
        hf_token: Hugging Face token exposed to the container as ``HF_TOKEN``.
        accelerator_count: Value used for vLLM's ``--tensor-parallel-size``.

    Returns:
        The ``aiplatform.Model`` returned by ``aiplatform.Model.upload``.
    """
    # Launcher for the vLLM API server module, followed by its flags.
    server_launcher = ["python", "-m", "vllm.entrypoints.openai.api_server"]
    server_flags = [
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        "--max-model-len=2048",
        "--gpu-memory-utilization=0.9",
        "--enable-prefix-caching",
        f"--tensor-parallel-size={accelerator_count}",
    ]
    container_args = server_launcher + server_flags

    shared_memory_mb = 16 * 1024  # 16 GB

    return aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=DOCKER_URI,
        serving_container_args=container_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/v1/completions",
        serving_container_health_route="/health",
        serving_container_environment_variables={"HF_TOKEN": hf_token},
        serving_container_shared_memory_size_mb=shared_memory_mb,
        serving_container_deployment_timeout=1800,
    )

# Upload the model; accelerator_count is coerced to int before being passed on.
vertexai_model = upload_model(
    model_name=model_name,
    model_id=model_id,
    hf_token=hf_token,
    accelerator_count=int(accelerator_count),
)

### Create Vertex AI Endpoint for Online Prediction

In [None]:
def create_model_endpoint(model_name: str) -> aiplatform.Endpoint:
    """Create a Vertex AI endpoint named after the model.

    Args:
        model_name: Model name; the endpoint display name is ``"<model_name>-endpoint"``.

    Returns:
        The ``aiplatform.Endpoint`` returned by ``aiplatform.Endpoint.create``,
        created with ``dedicated_endpoint_enabled=False``.
    """
    endpoint_display_name = f"{model_name}-endpoint"
    return aiplatform.Endpoint.create(
        display_name=endpoint_display_name,
        dedicated_endpoint_enabled=False,
    )

# Create the endpoint that the uploaded model will be deployed to.
vertexai_endpoint = create_model_endpoint(model_name)

### Deploy Model to Endpoint
**NOTE**: The model deployment may take around 30 minutes to complete.

In [None]:
def deploy_model(
    model: aiplatform.Model,
    endpoint: aiplatform.Endpoint,
    model_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
):
    """Deploy an uploaded model to an endpoint and route all traffic to it.

    Args:
        model: Model to deploy.
        endpoint: Endpoint that receives the deployment.
        model_name: Display name used for the deployed model and in the log line.
        machine_type: Machine type passed to ``model.deploy``.
        accelerator_type: Accelerator type passed to ``model.deploy``.
        accelerator_count: Number of accelerators passed to ``model.deploy``.
    """
    status_line = (
        f"Deploying {model_name} to endpoint: {endpoint.resource_name} "
        f"using machine type: {machine_type}"
    )
    print(status_line)

    deployment_options = {
        "endpoint": endpoint,
        "deployed_model_display_name": model_name,
        "machine_type": machine_type,
        "accelerator_type": accelerator_type,
        "accelerator_count": accelerator_count,
        "traffic_percentage": 100,  # send all endpoint traffic to this deployment
        "deploy_request_timeout": 1800,
    }
    model.deploy(**deployment_options)

# Deploy the uploaded model to the endpoint using the machine and accelerator
# settings defined earlier; accelerator_count is coerced to int before the call.
deploy_model(
    endpoint=vertexai_endpoint,
    model=vertexai_model,
    model_name=model_name,
    accelerator_count=int(accelerator_count),
    accelerator_type=accelerator_type,
    machine_type=machine_type,
)

## Test Endpoint

In [None]:
# Text for the model to complete.
PROMPT = "Distance of moon from earth is "

# One request instance carrying the prompt and a temperature of 0.0.
instances = [{"prompt": PROMPT, "temperature": 0.0}]

response = vertexai_endpoint.predict(instances=instances)

# Print each returned prediction on its own line.
for item in response.predictions:
    print(item)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, delete the resources created in this tutorial.

### Delete private docker repository

In [None]:
! gcloud artifacts repositories delete {DOCKER_REPOSITORY} --location={LOCATION} --quiet