In [None]:
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deploy LoRA Fine-tuned Models on Vertex AI with Prebuilt vLLM Container

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/vertexai_serving_vllm/vertexai_serving_vllm_mistral_7b_lora_prebuilt_container.ipynb">
      <img src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fvertexai_serving_vllm%2Fvertexai_serving_vllm_mistral_7b_lora_prebuilt_container.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/vertexai_serving_vllm/vertexai_serving_vllm_mistral_7b_lora_prebuilt_container.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/vertexai_serving_vllm/vertexai_serving_vllm_mistral_7b_lora_prebuilt_container.ipynb">
      <img width="32px" src="https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook demonstrates how to deploy a LoRA (Low-Rank Adaptation) fine-tuned model on Vertex AI using the prebuilt vLLM serving container. LoRA is an efficient fine-tuning technique that allows you to adapt large language models with minimal additional parameters.

### Models used in this example

**Base Model:** [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)

Mistral-7B-Instruct-v0.3 is an instruction-tuned version of the Mistral-7B model, optimized for following instructions and conversational tasks. It features a 32k context window and uses the v3 tokenizer with extended vocabulary.

**LoRA Adapter:** [Research-Reasoner-7B-v0.3](https://huggingface.co/Raymond-dev-546730/Research-Reasoner-7B-v0.3)

Research-Reasoner-7B-v0.3 is a LoRA adapter fine-tuned on top of Mistral-7B to enhance research planning and reasoning capabilities. It enables the model to provide structured, step-by-step research methodologies.

### What you will learn

In this tutorial, you will learn how to:

* Download a base model and LoRA adapter from Hugging Face
* Upload model artifacts to Google Cloud Storage
* Configure vLLM serving arguments for LoRA deployment
* Register the model in Vertex AI Model Registry
* Deploy the model to a Vertex AI Endpoint
* Run inference using both the base model and LoRA adapter

## Get started

### Install required packages

Install the necessary Python packages:

- **google-cloud-aiplatform**: The Vertex AI SDK for model deployment and management
- **google-cloud-storage**: For uploading model artifacts to Google Cloud Storage
- **transformers**: Hugging Face library for working with transformer models
- **huggingface_hub**: For downloading models from the Hugging Face Hub

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform google-cloud-storage transformers huggingface_hub

### Import libraries

In [None]:
import json
import os
import sys

from google.cloud import aiplatform
from google.cloud import storage
from huggingface_hub import snapshot_download

### Authenticate your notebook environment (Colab only)

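If you are running this notebook on Google Colab, run the following cell to authenticate your environment. The cell does nothing in other environments.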

In [None]:
# Authenticate only when the google.colab package is already loaded in this
# interpreter; otherwise this cell is a no-op.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

**Service Account Requirements:**

For Vertex AI deployment, you need a service account with the following roles:
- `roles/storage.objectViewer` - To read model artifacts from GCS buckets
- `roles/aiplatform.user` - To deploy models to Vertex AI

You can find your service account details in [IAM & Admin > Service Accounts](https://console.cloud.google.com/iam-admin/serviceaccounts).

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. The lookup is deliberately NOT wrapped in
    # str(): a missing variable would otherwise become the literal string "None"
    # and be used as the project ID and as the default bucket-name prefix.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    # Fail here, before any model is downloaded or uploaded.
    raise ValueError(
        "PROJECT_ID is not set. Enter your project ID above or set the "
        "GOOGLE_CLOUD_PROJECT environment variable."
    )

LOCATION = "us-central1"  # @param {type:"string"}

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
if not BUCKET_NAME or BUCKET_NAME == "[your-bucket-name]":
    # Default bucket name derived from the (now validated) project ID.
    BUCKET_NAME = f"{PROJECT_ID}-vertex-ai-models"

# Service account for Vertex AI deployment - must have storage.objectViewer access to BUCKET_NAME
# Format: your-service-account@your-project.iam.gserviceaccount.com
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type: "string", placeholder: "[your-service-account@your-project.iam.gserviceaccount.com]", isTemplate: true}

### Initialize Vertex AI SDK

In [None]:
# Set the project and location that later Vertex AI SDK calls in this notebook use.
aiplatform.init(
    location=LOCATION,
    project=PROJECT_ID,
)
print(f"Initialized Vertex AI for project: {PROJECT_ID}")

## Define helper functions

The following helper function uploads a local directory to Google Cloud Storage (GCS). This is needed because Vertex AI requires model artifacts to be stored in GCS for deployment. The function:

1. Parses the GCS URI to extract the bucket name and path prefix
2. Walks through all files in the local directory
3. Uploads each file to the corresponding path in GCS

In [None]:
def upload_directory_to_gcs(local_path, gcs_uri, client=None):
    """Recursively upload the files under a local directory to Cloud Storage.

    Each file is stored as ``<prefix>/<path relative to local_path>`` in the
    bucket named by ``gcs_uri``. Object names always use ``/`` separators,
    regardless of the local operating system.

    Args:
        local_path: Directory whose files are uploaded.
        gcs_uri: Destination such as ``gs://bucket/some/prefix``. A leading
            ``gs://`` scheme and leading/trailing slashes on the prefix are
            optional; a bare bucket uploads to the bucket root.
        client: Optional client object exposing ``bucket(name)``. When omitted,
            a ``storage.Client`` for ``PROJECT_ID`` is created.

    Raises:
        ValueError: If ``gcs_uri`` does not contain a bucket name.
        NotADirectoryError: If ``local_path`` is not an existing directory.
    """
    scheme = "gs://"
    # Strip the scheme only when it is a real prefix, never from the middle.
    gcs_path = gcs_uri[len(scheme):] if gcs_uri.startswith(scheme) else gcs_uri
    bucket_name, _, prefix = gcs_path.partition("/")
    # Normalise so "gs://b", "gs://b/" and "gs://b/p/" never yield object names
    # that start with "/" or contain an accidental "//".
    prefix = prefix.strip("/")

    if not bucket_name:
        raise ValueError(f"No bucket name found in GCS URI: {gcs_uri!r}")
    if not os.path.isdir(local_path):
        # os.walk() yields nothing for a missing directory, which would make
        # the upload "succeed" without transferring a single file.
        raise NotADirectoryError(f"Local directory does not exist: {local_path!r}")

    if client is None:
        client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(bucket_name)

    for root, _dirs, files in os.walk(local_path):
        for file_name in files:
            local_file = os.path.join(root, file_name)
            relative_path = os.path.relpath(local_file, local_path)
            # GCS object names use "/" even when the local separator differs.
            object_suffix = relative_path.replace(os.sep, "/")
            blob_name = f"{prefix}/{object_suffix}" if prefix else object_suffix

            bucket.blob(blob_name).upload_from_filename(local_file)
            print(f"Uploaded {relative_path}")

## Configure model paths

Define the Hugging Face model identifiers and construct the GCS paths where the models will be stored.

- **BASE_MODEL_NAME**: The Hugging Face repository ID for the base Mistral model
- **LORA_MODEL_ADAPTER**: The Hugging Face repository ID for the LoRA adapter
- **GCS_BASE_MODEL_URI**: The GCS path where the base model will be uploaded
- **GCS_ADAPTER_URI**: The GCS path where the LoRA adapter will be uploaded

The models are organized in a hierarchical structure in GCS, with adapters stored under the base model directory.

In [None]:
# Hugging Face repository IDs for the base model and the LoRA adapter.
BASE_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
LORA_MODEL_ADAPTER = "Raymond-dev-546730/Research-Reasoner-7B-v0.3"

# Bucket-qualified object paths (no gs:// scheme). The adapter path is nested
# under the base model path.
GCS_BASE_MODEL_PATH = "/".join([BUCKET_NAME, "deployments/lora/models", BASE_MODEL_NAME])
GCS_ADAPTER_PATH = "/".join([GCS_BASE_MODEL_PATH, "adapters", LORA_MODEL_ADAPTER])

# The same two locations expressed as gs:// URIs.
GCS_BASE_MODEL_URI = "gs://" + GCS_BASE_MODEL_PATH
GCS_ADAPTER_URI = "gs://" + GCS_ADAPTER_PATH

## Download the base model

Download the Mistral-7B-Instruct-v0.3 base model from Hugging Face Hub using `snapshot_download()`. This function:

- Downloads all model files (weights, tokenizer, config) to your local cache
- Returns the local path where the model is stored
- Supports resumable downloads if interrupted

**Note**: The Mistral-7B model is approximately 14GB in size. Ensure you have sufficient disk space and bandwidth.

In [None]:
# Fetch the base model repository; the return value is the local directory
# that holds the downloaded snapshot.
local_base_model_path = snapshot_download(BASE_MODEL_NAME)
print(f"Base model downloaded to: {local_base_model_path}")

## Download the LoRA adapter

Download only the LoRA adapter weights from the Research-Reasoner repository. The `allow_patterns` parameter filters the download to include only the LoRA adapter files, which are much smaller than the full model (typically a few hundred MB).

LoRA adapters contain:
- **adapter_config.json**: Configuration specifying the LoRA rank, alpha, and target modules
- **adapter_model.safetensors**: The trained low-rank weight matrices
- **Tokenizer files**: If the adapter uses a modified tokenizer

In [None]:
# Location of the adapter files inside the Hugging Face repository.
ADAPTER_SUBDIR = "Model_Weights/LoRA_adapter"

# Restrict the download to the adapter directory only.
local_repo_path = snapshot_download(
    repo_id=LORA_MODEL_ADAPTER,
    allow_patterns=[f"{ADAPTER_SUBDIR}/**"],
)

local_adapter_path = os.path.join(local_repo_path, *ADAPTER_SUBDIR.split("/"))
if not os.path.isdir(local_adapter_path):
    # snapshot_download() does not fail when allow_patterns matches nothing, so
    # check now rather than discovering missing adapter files at deployment time.
    raise FileNotFoundError(
        f"Expected adapter directory {local_adapter_path!r} was not downloaded; "
        f"check that {LORA_MODEL_ADAPTER!r} still contains {ADAPTER_SUBDIR!r}."
    )
print(f"Adapter downloaded to: {local_adapter_path}")

## Upload model and adapter to GCS

Upload both the base model and LoRA adapter to Google Cloud Storage. Vertex AI will load these artifacts during container startup.

**Why GCS?**
- Vertex AI prediction containers automatically mount GCS paths to `/tmp/model_dir/`
- This allows the vLLM server to access the model weights without bundling them in the container
- Separating model storage from the container enables easier model updates

In [None]:
# Upload the base model first, then the adapter, each to its own GCS location.
for source_dir, destination_uri in (
    (local_base_model_path, GCS_BASE_MODEL_URI),
    (local_adapter_path, GCS_ADAPTER_URI),
):
    upload_directory_to_gcs(source_dir, destination_uri)

## Deploy with prebuilt vLLM container

This section deploys the model using Google's prebuilt vLLM serving container from Vertex AI Model Garden.

**Architecture:** Prebuilt vLLM Container + Single LoRA loaded on startup.

**Why use the prebuilt container?**
- No need to build or maintain custom Docker images
- Optimized for Vertex AI with proper health checks and logging
- Regularly updated with the latest vLLM optimizations
- Supports LoRA adapters out of the box

The container URI points to Google's Artifact Registry where the prebuilt vLLM image is hosted.

In [None]:
# Prebuilt vLLM serving image, pinned to a specific tag.
VLLM_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-vllm-serve:20251216_0916_RC01"
)

### Configure vLLM serving arguments

Configure the vLLM server with optimal settings for serving the Mistral model with LoRA.

**Key parameters explained:**

| Parameter | Value | Description |
|-----------|-------|-------------|
| `--model` | GCS URI | Path to the base model in GCS |
| `--served-model-name` | mistral-base | Name used to reference the base model in API calls |
| `--tensor-parallel-size` | 1 | Number of GPUs for tensor parallelism (1 for single L4) |
| `--gpu-memory-utilization` | 0.90 | Fraction of GPU memory to use (90%) |
| `--max-model-len` | 8192 | Maximum sequence length for generation |
| `--max-num-seqs` | 64 | Maximum concurrent sequences for batching |
| `--dtype` | bfloat16 | Use bfloat16 for faster inference with minimal quality loss |

**LoRA-specific parameters:**

| Parameter | Description |
|-----------|-------------|
| `--enable-lora` | Enable LoRA adapter support |
| `--max-loras` | Maximum number of LoRA adapters to load in GPU memory |
| `--max-cpu-loras` | Maximum adapters to keep in CPU memory for swapping |
| `--max-lora-rank` | Maximum LoRA rank supported (64 for this adapter) |
| `--lora-modules` | Register adapter with format `name=path` |

In [None]:
# Path of the adapter as passed to the server inside the serving container.
# Not to be confused with `local_adapter_path`, the download directory on this machine.
LOCAL_ADAPTER_PATH = "/tmp/model_dir/" + GCS_ADAPTER_PATH

# Tunable serving limits, interpolated into the flags below.
max_model_len = 8192
gpu_memory_utilization = 0.90
max_num_seqs = 64

# Entry point: the OpenAI-compatible vLLM API server.
server_command = ["python", "-m", "vllm.entrypoints.openai.api_server"]

# Network, base model and engine settings.
engine_flags = [
    "--host=0.0.0.0",
    "--port=8080",
    f"--model={GCS_BASE_MODEL_URI}",
    "--served-model-name=mistral-base",
    "--tensor-parallel-size=1",
    "--swap-space=16",
    f"--gpu-memory-utilization={gpu_memory_utilization}",
    f"--max-model-len={max_model_len}",
    f"--max-num-seqs={max_num_seqs}",
    "--dtype=bfloat16",
]

# LoRA configuration: one adapter, registered under the name "researcher".
lora_flags = [
    "--enable-lora",
    "--max-loras=1",
    "--max-cpu-loras=1",
    "--max-lora-rank=64",
    f"--lora-modules=researcher={LOCAL_ADAPTER_PATH}",
]

vllm_args = server_command + engine_flags + lora_flags

### Upload model to Vertex AI Model Registry

Register the model in Vertex AI Model Registry. This creates a model resource that can be deployed to endpoints.

**Key configuration options:**

- **serving_container_image_uri**: The prebuilt vLLM Docker image
- **serving_container_args**: Command-line arguments passed to vLLM
- **serving_container_ports**: Port 8080 for the OpenAI-compatible API
- **serving_container_predict_route**: `/v1/completions` for the completions endpoint
- **serving_container_health_route**: `/health` for Vertex AI health checks
- **serving_container_shared_memory_size_mb**: 16GB shared memory for model loading
- **serving_container_deployment_timeout**: 2 hours to allow for large model loading

In [None]:
# Named values for the two numeric settings passed to the upload call.
shared_memory_mb = 16 * 1024  # 16 GB expressed in MB
deployment_timeout_s = 7200  # 2 hours, in seconds

# Register the serving configuration (image, arguments, routes) as a model resource.
model = aiplatform.Model.upload(
    display_name="mistral-7B-instruct-v0.3-with-lora-adapter-prebuilt",
    serving_container_image_uri=VLLM_DOCKER_URI,
    serving_container_args=vllm_args,
    serving_container_ports=[8080],
    serving_container_predict_route="/v1/completions",
    serving_container_health_route="/health",
    serving_container_shared_memory_size_mb=shared_memory_mb,
    serving_container_deployment_timeout=deployment_timeout_s,
)
print(f"Model uploaded: {model.resource_name}")

### Deploy model to endpoint

Deploy the model to a Vertex AI Endpoint for online predictions.

**Hardware configuration:**

- **machine_type**: `g2-standard-12` provides 12 vCPUs, 48GB RAM, and 1 NVIDIA L4 GPU
- **accelerator_type**: `NVIDIA_L4` - A cost-effective GPU with 24GB memory, suitable for 7B parameter models
- **accelerator_count**: 1 GPU (matching `tensor-parallel-size=1`)

**Scaling configuration:**

- **min_replica_count**: 1 - Always keep at least one replica running
- **max_replica_count**: 1 - Fixed scaling for this example (increase for production)
- **deploy_request_timeout**: 1800 seconds (30 minutes) for deployment to complete

**Note**: Deployment takes approximately 10-15 minutes as the container starts, downloads model weights from GCS, and loads them into GPU memory.

In [None]:
# Deployment settings collected in one place, grouped by concern.
deploy_options = dict(
    # Hardware: 1x NVIDIA L4
    machine_type="g2-standard-12",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
    # Scaling: exactly one replica
    min_replica_count=1,
    max_replica_count=1,
    # Execution: block until the deployment finishes or the request times out
    sync=True,
    deploy_request_timeout=1800,
    # Identity the deployment runs as
    service_account=SERVICE_ACCOUNT,
)

endpoint = model.deploy(**deploy_options)
print(f"Model deployed to endpoint: {endpoint.resource_name}")

## Run inference

Test the deployed endpoint by sending prompts to both the base model and the LoRA adapter.

**How it works:**

The vLLM server exposes an OpenAI-compatible API. You can select which model to use via the `model` field in your request:
- `"model": "mistral-base"` - Uses the base Mistral model without any adapter
- `"model": "researcher"` - Uses the base model with the Research-Reasoner LoRA adapter applied

The `raw_predict` method sends HTTP requests directly to the container, bypassing Vertex AI's standard prediction format.

**Request parameters:**

| Parameter | Description |
|-----------|-------------|
| `model` | Model name to use (base or LoRA adapter name) |
| `prompt` | Input text to complete |
| `max_tokens` | Maximum tokens to generate |
| `temperature` | Controls randomness (0.0 = deterministic) |
| `top_p` | Nucleus sampling parameter |
| `top_k` | Top-k sampling parameter |

In [None]:
def get_response(
    endpoint,
    prompt,
    model_name,
    max_tokens=1000,
    temperature=0.0,
    top_p=1.0,
    top_k=1,
):
    """Send a completion request through ``endpoint.raw_predict`` and return the text.

    Args:
        endpoint: Object exposing ``raw_predict(body=..., headers=...)`` whose
            return value has a ``content`` attribute holding a JSON document.
        prompt: Input text to complete.
        model_name: Value sent in the ``model`` field of the request.
        max_tokens: Value sent in the ``max_tokens`` field.
        temperature: Value sent in the ``temperature`` field.
        top_p: Value sent in the ``top_p`` field.
        top_k: Value sent in the ``top_k`` field.

    Returns:
        The ``text`` of the first entry in the response's ``choices`` list.

    Raises:
        RuntimeError: If the decoded response has no ``choices`` (for example an
            error payload); the message includes the decoded response.
    """
    payload = {
        "model": model_name,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    }

    response = endpoint.raw_predict(
        body=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    result = json.loads(response.content)
    choices = result.get("choices") if isinstance(result, dict) else None
    if not choices:
        # Surface the server's own message instead of an opaque KeyError/IndexError.
        raise RuntimeError(
            f"No completion returned for model {model_name!r}; server response: {result!r}"
        )
    return choices[0]["text"]

In [None]:
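# Optional: reuse an endpoint that is already deployed instead of deploying again.
# Uncomment the lines below and set ENDPOINT_ID to the numeric ID of your endpoint.
# ENDPOINT_ID = "[your-endpoint-id]"
# endpoint = aiplatform.Endpoint(f"projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}")
# print(f"Using endpoint: {endpoint.resource_name}")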

In [None]:
# Prompt sent unchanged to both served models; split across literals for readability.
prompt = (
    '<s>[INST] Research Topic: "Hybrid Quantum-Classical Algorithms for Scalable '
    'Variational Quantum Simulation of Strongly Correlated Materials"\n'
    "Let's think step by step:? [/INST]"
)

# Query the base model via its served name.
base_response = get_response(endpoint, prompt, model_name="mistral-base")
print("Base Model Response:", base_response, sep="\n")

In [None]:
# Same prompt, this time addressed to the "researcher" LoRA adapter.
adapter_response = get_response(endpoint, prompt, model_name="researcher")
print("LoRA Adapter Response:", adapter_response, sep="\n")

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources created in this tutorial:

**Resources to clean up:**
1. **Vertex AI Endpoint**: Undeploy models and delete the endpoint
2. **Vertex AI Model**: Delete from Model Registry
3. **Cloud Storage**: Delete uploaded model artifacts (optional)

**Cost considerations:**
- Running endpoints incur costs even with no traffic
- GPU instances (g2-standard-12) are billed per second while the endpoint is active
- Cloud Storage has minimal storage costs for model artifacts

In [None]:
# Set this to True to delete resources
delete_resources = True

if delete_resources:
    # Models must be undeployed before the endpoint itself is deleted.
    endpoint.undeploy_all()

    # Delete the endpoint first, then the model from Model Registry,
    # reporting each deletion as it completes.
    for resource, message in (
        (endpoint, "Endpoint deleted."),
        (model, "Model deleted."),
    ):
        resource.delete()
        print(message)

    # Optionally delete GCS artifacts
    # ! gsutil -m rm -r gs://{BUCKET_NAME}/deployments/lora/