In [None]:
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deploy LoRA Fine-tuned Models on Vertex AI with Custom vLLM Container

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/vertexai_serving_vllm/vertexai_serving_vllm_mistral_7b_lora_custom_container.ipynb">
      <img src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fvertexai_serving_vllm%2Fvertexai_serving_vllm_mistral_7b_lora_custom_container.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/vertexai_serving_vllm/vertexai_serving_vllm_mistral_7b_lora_custom_container.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/vertexai_serving_vllm/vertexai_serving_vllm_mistral_7b_lora_custom_container.ipynb">
      <img width="32px" src="https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook demonstrates how to deploy a LoRA (Low-Rank Adaptation) fine-tuned model on Vertex AI using a custom-built vLLM container. This approach gives you more control over the vLLM configuration and allows you to customize the serving environment.

### Models used in this example

**Base Model:** [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)

Mistral-7B-Instruct-v0.3 is an instruction-tuned version of the Mistral-7B model, optimized for following instructions and conversational tasks. It features a 32k context window and uses the v3 tokenizer with extended vocabulary.

**LoRA Adapter:** [Research-Reasoner-7B-v0.3](https://huggingface.co/Raymond-dev-546730/Research-Reasoner-7B-v0.3)

Research-Reasoner-7B-v0.3 is a LoRA adapter fine-tuned on top of Mistral-7B to enhance research planning and reasoning capabilities. It enables the model to provide structured, step-by-step research methodologies.

### What you will learn

In this tutorial, you will learn how to:

* Set up Artifact Registry for custom container storage
* Build and push a custom vLLM container using Cloud Build
* Configure vLLM serving arguments for LoRA deployment
* Register the model in Vertex AI Model Registry
* Deploy the model to a Vertex AI Endpoint
* Run inference using both the base model and LoRA adapter

## Get started

### Install required packages

Install the Vertex AI SDK for Python. This package provides the `aiplatform` module for:

- Uploading models to the Model Registry
- Creating and managing endpoints
- Deploying models for online predictions

**Note**: Unlike the prebuilt container notebook, we don't need `huggingface_hub` here because the models are assumed to already be in GCS.

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform

### Import libraries

In [1]:
import json
import os
import sys

import vertexai
from google.cloud import aiplatform

### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.

In [2]:
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = "us-central1"  # @param {type:"string"}

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
if not BUCKET_NAME or BUCKET_NAME == "[your-bucket-name]":
    BUCKET_NAME = f"{PROJECT_ID}-vertex-ai-models"

In [3]:
PROJECT_ID = "shamika-customer-ml-dev"  # <--- REPLACE WITH YOUR PROJECT ID
BUCKET_NAME = "shamika-ml-dev-vertex-ai-artifacts" # <--- REPLACE WITH YOUR BUCKET NAME
LOCATION = "us-central1"
SERVICE_ACCOUNT = "shamika-ml-dev-vertex-deployer@shamika-customer-ml-dev.iam.gserviceaccount.com"

### Initialize Vertex AI SDK

In [4]:
vertexai.init(project=PROJECT_ID, location=LOCATION)
print(f"Initialized Vertex AI for project: {PROJECT_ID}")

Initialized Vertex AI for project: shamika-customer-ml-dev


## Configure model paths

Define the GCS paths where your base model and LoRA adapter are stored.

**Prerequisites**: Before running this notebook, ensure that:
1. The Mistral-7B-Instruct-v0.3 base model is uploaded to `gs://{BUCKET_NAME}/deployments/lora/basemodels/`
2. The Research-Reasoner LoRA adapter is uploaded to `gs://{BUCKET_NAME}/deployments/lora/adapters/`

You can use the prebuilt container notebook to download and upload these models, or use the Hugging Face CLI:
```bash
huggingface-cli download mistralai/Mistral-7B-Instruct-v0.3 --local-dir ./model
gsutil -m cp -r ./model gs://{BUCKET_NAME}/deployments/lora/basemodels/mistralai/Mistral-7B-Instruct-v0.3
```

In [5]:
GCS_BASE_MODEL_URI = f"gs://{BUCKET_NAME}/deployments/lora/basemodels/mistralai/Mistral-7B-Instruct-v0.3"
GCS_ADAPTER_URI = f"gs://{BUCKET_NAME}/deployments/lora/adapters/Raymond-dev-546730/Research-Reasoner-7B-v0.3"

## Enable required APIs

Enable the Google Cloud APIs needed for building and deploying custom containers:

- **Artifact Registry API**: For storing Docker container images
- **Cloud Build API**: For building Docker images in the cloud

These APIs are required to build the custom vLLM container and push it to your project's Artifact Registry.

In [7]:
! gcloud services enable artifactregistry.googleapis.com cloudbuild.googleapis.com

Operation "operations/acat.p2-39042042168-21db2887-d325-4b3b-87bb-c22286db35b4" finished successfully.


## Create Artifact Registry repository

Create a Docker repository in Artifact Registry to store your custom vLLM container image.

**What is Artifact Registry?**
Artifact Registry is Google Cloud's package management service for storing, managing, and securing container images and other artifacts. It integrates with Cloud Build and Vertex AI for seamless deployment workflows.

**Repository configuration:**
- **Format**: Docker (for container images)
- **Location**: Same region as your Vertex AI deployment for lower latency
- **Access**: Vertex AI automatically has access to pull images from Artifact Registry in the same project

In [8]:
DOCKER_REPOSITORY = "vllm-lora-vertex-ai-docker-repo"

! gcloud artifacts repositories create {DOCKER_REPOSITORY} \
    --repository-format=docker \
    --location={LOCATION} \
    --description="vLLM LoRA Vertex AI Docker repository" \
    --quiet || echo "Repository may already exist"

Create request issued for: [vllm-lora-vertex-ai-docker-repo]
Waiting for operation [projects/shamika-customer-ml-dev/locations/us-central1/o
perations/d786b8f9-ecac-4a3f-bfec-e92b986cb382] to complete...done.            
Created repository [vllm-lora-vertex-ai-docker-repo].


## Build custom vLLM container

Build a custom vLLM Docker container using Cloud Build. This approach offers several advantages over the prebuilt container:

**Why use a custom container?**
- Install specific vLLM versions or patches
- Add custom dependencies or modifications
- Include model-specific optimizations
- Bundle additional tools for debugging

**Cloud Build configuration:**

| Parameter | Description |
|-----------|-------------|
| `--config` | Path to cloudbuild.yaml defining build steps |
| `--region` | Build region (should match deployment region) |
| `--timeout` | Maximum build time (2 hours for large images) |
| `--machine-type` | Build machine (e2-highcpu-32 for faster builds) |
| `--substitutions` | Variables passed to cloudbuild.yaml |

**Required files in `custom_container/` directory:**
- `Dockerfile`: Container build instructions
- `cloudbuild.yaml`: Cloud Build configuration

The build process typically takes 10-15 minutes.

In [10]:
! cd ./custom_container \
    && gcloud builds submit \
        --config=cloudbuild.yaml \
        --region={LOCATION} \
        --timeout="2h" \
        --machine-type=e2-highcpu-32 \
        --gcs-source-staging-dir=gs://{BUCKET_NAME}/cloudbuild-staging \
        --substitutions=_REPOSITORY={DOCKER_REPOSITORY} \
        --gcs-log-dir=gs://{BUCKET_NAME}/cloudbuild-staging/logs

Creating temporary archive of 5 file(s) totalling 12.5 KiB before compression.
Uploading tarball of [.] to [gs://shamika-ml-dev-vertex-ai-artifacts/cloudbuild-staging/1770056864.082014-67ee5944961e4c8e97f5eaa3a40b26fc.tgz]
[1;31mERROR:[0m (gcloud.builds.submit) INVALID_ARGUMENT: could not resolve source: googleapi: Error 403: 39042042168-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object. Permission 'storage.objects.get' denied on resource (or it may not exist)., forbidden
- '@type': type.googleapis.com/google.rpc.DebugInfo
  detail: "could not resolve source: googleapi: Error 403: 39042042168-compute@developer.gserviceaccount.com\
    \ does not have storage.objects.get access to the Google Cloud Storage object.\
    \ Permission 'storage.objects.get' denied on resource (or it may not exist).,\
    \ forbidden"


## Deploy model to Vertex AI Endpoint

Deployment to Vertex AI involves three main steps:

1. **Upload model to Model Registry**: Register your model configuration including container image, serving arguments, and routes
2. **Create an Endpoint**: Create a Vertex AI Endpoint resource that will host the deployed model
3. **Deploy model to Endpoint**: Deploy the model to the endpoint with specified hardware configuration

This separation allows you to:
- Reuse the same model across multiple endpoints
- Update endpoints independently of model versions
- Manage traffic splitting between model versions

### Configure deployment settings

Define the hardware configuration and container image URI for deployment.

**Hardware selection:**

| Parameter | Value | Description |
|-----------|-------|-------------|
| `machine_type` | g2-standard-12 | 12 vCPUs, 48GB RAM, 1 NVIDIA L4 GPU |
| `accelerator_type` | NVIDIA_L4 | Cost-effective GPU with 24GB VRAM |
| `accelerator_count` | 1 | Single GPU (sufficient for 7B model) |

**Why NVIDIA L4?**
- 24GB GPU memory fits 7B models with LoRA adapters
- Good price-performance ratio for inference workloads
- Supports bfloat16 for efficient inference

**Container URI format:**
`{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE_NAME}`

In [None]:
machine_type = "g2-standard-12"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

DOCKER_URI = f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{DOCKER_REPOSITORY}/vllm-gpu"
model_name = "mistral-7B-instruct-v0.3-lora-custom-container"

### Upload model to Model Registry

Register the model in Vertex AI Model Registry with the custom container and vLLM serving arguments.

**vLLM serving parameters:**

| Parameter | Value | Description |
|-----------|-------|-------------|
| `--model` | GCS URI | Path to base model in GCS |
| `--served-model-name` | mistral-base-custom-container | Model name for API calls |
| `--tensor-parallel-size` | 1 | GPUs for model parallelism |
| `--gpu-memory-utilization` | 0.90 | Use 90% of GPU memory |
| `--max-model-len` | 8192 | Maximum context length |
| `--max-num-seqs` | 64 | Concurrent request batching |
| `--dtype` | bfloat16 | Use bfloat16 precision |

**LoRA configuration:**

| Parameter | Description |
|-----------|-------------|
| `--enable-lora` | Enable LoRA adapter support |
| `--max-loras` | Max adapters in GPU memory |
| `--max-lora-rank` | Maximum LoRA rank (64) |
| `--lora-modules` | Register adapter as `researcher={GCS_URI}` |

**Key difference from prebuilt container**: With a custom container, the LoRA adapter is loaded directly from GCS using the full GCS URI, rather than from a local path.

In [None]:
max_model_len = 8192
gpu_memory_utilization = 0.90
max_num_seqs = 64

vllm_args = [
    "python3",
    "-m",
    "vllm.entrypoints.openai.api_server",
    "--host=0.0.0.0",
    "--port=8080",
    f"--model={GCS_BASE_MODEL_URI}",
    "--served-model-name=mistral-base-custom-container",
    f"--tensor-parallel-size={accelerator_count}",
    "--swap-space=16",
    f"--gpu-memory-utilization={gpu_memory_utilization}",
    f"--max-model-len={max_model_len}",
    f"--max-num-seqs={max_num_seqs}",
    "--dtype=bfloat16",
    # LoRA configuration
    "--enable-lora",
    "--max-loras=1",
    "--max-cpu-loras=1",
    "--max-lora-rank=64",
    f"--lora-modules=researcher={GCS_ADAPTER_URI}"
]

model = aiplatform.Model.upload(
    display_name=model_name,
    serving_container_image_uri=DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_ports=[8080],
    serving_container_predict_route="/v1/completions",
    serving_container_health_route="/health",
    serving_container_shared_memory_size_mb=(16 * 1024),
    serving_container_deployment_timeout=7200,
)
print(f"Model uploaded: {model.resource_name}")

### Create Vertex AI Endpoint

Create a Vertex AI Endpoint resource. An endpoint is a cloud resource that provides a URL for serving online predictions.

**Why separate endpoint creation?**
- Endpoints can host multiple deployed models for A/B testing
- Traffic can be split between model versions
- Endpoints persist even when models are undeployed
- Enables blue-green deployments

In [None]:
endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
print(f"Endpoint created: {endpoint.resource_name}")

### Deploy model to endpoint

Deploy the model to the endpoint with autoscaling configuration.

**Deployment parameters:**

| Parameter | Value | Description |
|-----------|-------|-------------|
| `min_replica_count` | 1 | Minimum running replicas |
| `max_replica_count` | 4 | Maximum replicas for autoscaling |
| `autoscaling_target_accelerator_duty_cycle` | 60 | Scale up when GPU utilization exceeds 60% |
| `traffic_percentage` | 100 | Send all traffic to this model |
| `deploy_request_timeout` | 1800 | 30 minutes for deployment |

**Autoscaling behavior:**
- Vertex AI monitors GPU utilization
- When average utilization exceeds 60%, new replicas are added (up to max)
- When utilization drops, replicas are removed (down to min)
- Scaling decisions are made based on a rolling average

**Note**: Deployment takes 10-20 minutes as the container downloads model weights and loads them into GPU memory.

In [None]:
model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=model_name,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    min_replica_count=1,
    max_replica_count=4,
    autoscaling_target_accelerator_duty_cycle=60,
    traffic_percentage=100,
    deploy_request_timeout=1800,
    sync=True,
)
print(f"Model deployed to endpoint: {endpoint.resource_name}")

## Run inference

Test the deployed endpoint by sending prompts to both the base model and the LoRA adapter.

**How it works:**

The vLLM server exposes an OpenAI-compatible API. You can select which model to use via the `model` field:
- `"model": "mistral-base-custom-container"` - Uses the base Mistral model
- `"model": "researcher"` - Uses the base model with the Research-Reasoner LoRA adapter

**Expected differences:**

| Aspect | Base Model | LoRA Adapter |
|--------|------------|--------------|
| Response style | General instruction-following | Structured research methodology |
| Output format | Variable | Step-by-step research plan |
| Domain focus | Broad | Research planning & reasoning |

The `raw_predict` method sends raw HTTP requests to the vLLM container, enabling the use of the OpenAI-compatible `/v1/completions` endpoint.

In [None]:
def get_response(endpoint, prompt, model_name):
    """Send a prompt to the model and get a response."""
    payload = {
        "model": model_name,
        "prompt": prompt,
        "max_tokens": 1000,
        "temperature": 0.0,
        "top_p": 1.0,
        "top_k": 1
    }

    payload_bytes = json.dumps(payload).encode("utf-8")

    response = endpoint.raw_predict(
        body=payload_bytes,
        headers={"Content-Type": "application/json"}
    )

    result = json.loads(response.content)
    generated_text = result["choices"][0]["text"]
    return generated_text

In [None]:
prompt = "<s>[INST] Research Topic: \"Hybrid Quantum-Classical Algorithms for Scalable Variational Quantum Simulation of Strongly Correlated Materials\"\nLet's think step by step:? [/INST]"

# Get response from base model
base_response = get_response(endpoint, prompt, "mistral-base-custom-container")
print("Base Model Response:")
print(base_response)

In [None]:
# Get response from LoRA adapter
adapter_response = get_response(endpoint, prompt, "researcher")
print("LoRA Adapter Response:")
print(adapter_response)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources created in this tutorial:

**Resources to clean up:**
1. **Vertex AI Endpoint**: Undeploy models and delete the endpoint
2. **Vertex AI Model**: Delete from Model Registry
3. **Artifact Registry**: Delete the Docker repository (optional)
4. **Cloud Storage**: Delete uploaded model artifacts (optional)

**Cost considerations:**
- Running endpoints incur costs even with no traffic
- GPU instances (g2-standard-12) are billed per second
- Artifact Registry and Cloud Storage have storage costs

In [None]:
# Set this to True to delete resources
delete_resources = False

if delete_resources:
    # Undeploy model from endpoint and delete endpoint
    endpoint.undeploy_all()
    endpoint.delete()
    print("Endpoint deleted.")

    # Delete the model from Model Registry
    model.delete()
    print("Model deleted.")

    # Delete Artifact Registry repository
    # ! gcloud artifacts repositories delete {DOCKER_REPOSITORY} --location={LOCATION} --quiet
    # print("Artifact Registry repository deleted.")

    # Optionally delete GCS artifacts
    # ! gsutil -m rm -r gs://{BUCKET_NAME}/deployments/lora/