In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3.1 (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/instances">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_llama3_1_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_1_deployment.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates downloading, deploying, and serving prebuilt Llama 3.1 models with [Hex-LLM](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm) or [vLLM](https://github.com/vllm-project/vllm) (standard and optimized).


### Objective

- Deploy Llama 3.1 8B and 70B with Hex-LLM on TPU.
- Deploy Llama 3.1 8B Instruct with the Fast Deployment feature.
- Deploy Llama 3.1 8B, 70B and 405B with standard vLLM on GPU, optionally with dynamic LoRA adapters.
- Deploy Llama 3.1 8B and 70B with optimized vLLM on GPU.

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [5]:
# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.93.1'
! pip3 install --upgrade --quiet 'openai==1.85.0'

In [7]:
# Import the necessary packages

import importlib
import os
import re
from typing import Tuple

import requests
from google import auth
from google.cloud import aiplatform

if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

models, endpoints = {}, {}


# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
REGION = "us-central1"

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

! gcloud config set project $PROJECT_ID
import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

fatal: destination path 'vertex-ai-samples' already exists and is not an empty directory.
Initializing Vertex AI API.
Updated property [core/project].


## Deploy prebuilt Llama 3.1 8B and 70B with Hex-LLM

**Hex-LLM** is a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel (LLM) TPU serving solution built with **XLA**, which is being developed by Google Cloud.

Refer to the "Request for TPU quota" section for TPU quota.

In [8]:
MODEL_ID = "Meta-Llama-3.1-8B-Instruct"
TPU_DEPLOYMENT_REGION = "us-central1"
VERTEX_AI_MODEL_GARDEN_LLAMA_3_1 = "gs://vertex-model-garden-restricted-us/llama3.1"
model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_3_1, MODEL_ID)
hf_model_id = "meta-llama/" + MODEL_ID

# The pre-built serving docker images.
HEXLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/hex-llm-serve:stable"
use_dedicated_endpoint = True

# Sets ct5lp-hightpu-4t (4 TPU chips) to deploy models.
machine_type = "ct5lp-hightpu-4t"
# Note: 1 TPU V5 chip has only one core.
tpu_type = "TPU_V5e"

enable_prefix_cache_hbm = True

disagg_topo = None

if "8B" in MODEL_ID:
    tpu_count = 4
    tpu_topo = "1x4"
elif "70B" in MODEL_ID:
    tpu_count = 16
    tpu_topo = "4x4"
else:
    raise ValueError(f"Unsupported MODEL_ID: {MODEL_ID}")


common_util.check_quota(
    project_id=PROJECT_ID,
    region=TPU_DEPLOYMENT_REGION,
    accelerator_type=tpu_type,
    accelerator_count=tpu_count,
    is_for_training=False,
)

# Server parameters.
tensor_parallel_size = tpu_count


# Fraction of HBM memory allocated for KV cache after model loading. A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.
hbm_utilization_factor = 0.8
# Maximum number of running sequences in a continuous batch.
max_running_seqs = 256
# Maximum context length for a request.
max_model_len = 4096

# Endpoint configurations.
min_replica_count = 1
max_replica_count = 1


In [9]:
def deploy_model_hexllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    base_model_id: str = None,
    data_parallel_size: int = 1,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct5lp-hightpu-1t",
    tpu_topology: str = "1x1",
    disagg_topology: str = None,
    hbm_utilization_factor: float = 0.6,
    max_running_seqs: int = 256,
    max_model_len: int = 4096,
    enable_prefix_cache_hbm: bool = False,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    use_dedicated_endpoint: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    
    print(f"model_name              :{model_name}")
    print(f"model_id                :{model_id}")
    print(f"publisher               :{publisher}")
    print(f"publisher_model_id      :{publisher_model_id}")
    print(f"base_model_id           :{base_model_id}")
    print(f"data_parallel_size      :{data_parallel_size}")
    print(f"tensor_parallel_size    :{tensor_parallel_size}")
    print(f"machine_type            :{machine_type}")
    print(f"tpu_topology            :{tpu_topology}")
    print(f"disagg_topology         :{disagg_topology}")
    print(f"hbm_utilization_factor  :{hbm_utilization_factor}")
    print(f"max_running_seqs        :{max_running_seqs}")
    print(f"max_model_len           :{max_model_len}")
    print(f"enable_prefix_cache_hbm :{enable_prefix_cache_hbm}")
    print(f"endpoint_id             :{endpoint_id}")
    print(f"min_replica_count       :{min_replica_count}")
    print(f"max_replica_count       :{max_replica_count}")
    print(f"use_dedicated_endpoint  :{use_dedicated_endpoint}")
    
    """Deploys models with Hex-LLM on TPU in Vertex AI."""
    if endpoint_id:
        print(f"endpoint_id             :{endpoint_id}")
        aip_endpoint_name = (
            f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_id}"
        )
        endpoint = aiplatform.Endpoint(aip_endpoint_name)
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=TPU_DEPLOYMENT_REGION,
            dedicated_endpoint_enabled=use_dedicated_endpoint,
        )

    if not base_model_id:
        base_model_id = model_id
    print(f"base_model_id           :{base_model_id}")

    if not tensor_parallel_size:
        tensor_parallel_size = int(machine_type[-2])
    print(f"tensor_parallel_size    :{tensor_parallel_size}")

    num_hosts = int(tpu_topology.split("x")[0])
    print(f"num_hosts               :{num_hosts}")

    # Learn more about the supported arguments and environment variables at https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#config-server.
    hexllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--data_parallel_size={data_parallel_size}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--num_hosts={num_hosts}",
        f"--hbm_utilization_factor={hbm_utilization_factor}",
        f"--max_running_seqs={max_running_seqs}",
        f"--max_model_len={max_model_len}",
    ]
    if disagg_topology:
        hexllm_args.append(f"--disagg_topo={disagg_topology}")
    if enable_prefix_cache_hbm and not disagg_topology:
        hexllm_args.append("--enable_prefix_cache_hbm")
    print(f"hexllm_args             :{hexllm_args}")

    env_vars = {
        "MODEL_ID": base_model_id,
        "HEX_LLM_LOG_LEVEL": "info",
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars.update({"HF_TOKEN": HF_TOKEN})
        print(f"env_vars                :{env_vars}")
    except:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=HEXLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "hex_llm.server.api_server"],
        serving_container_args=hexllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        location=TPU_DEPLOYMENT_REGION,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        tpu_topology=tpu_topology if num_hosts > 1 else None,
        deploy_request_timeout=1800,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_1_deployment.ipynb",
            "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
        },
    )
    return model, endpoint

In [10]:
LABEL = "hexllm_tpu"
models[LABEL], endpoints[LABEL] = deploy_model_hexllm(
    model_name=common_util.get_job_name_with_datetime(prefix=MODEL_ID),
    model_id=model_id,
    publisher="meta",
    publisher_model_id="llama3_1",
    base_model_id=hf_model_id,
    tensor_parallel_size=tensor_parallel_size,
    machine_type=machine_type,
    tpu_topology=tpu_topo,
    disagg_topology=disagg_topo,
    hbm_utilization_factor=hbm_utilization_factor,
    max_running_seqs=max_running_seqs,
    max_model_len=max_model_len,
    enable_prefix_cache_hbm=enable_prefix_cache_hbm,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

model = models[LABEL]
endpoint = endpoints[LABEL]

model_name              :Meta-Llama-3.1-8B-Instruct-20250610-095922
model_id                :gs://vertex-model-garden-restricted-us/llama3.1/Meta-Llama-3.1-8B-Instruct
publisher               :meta
publisher_model_id      :llama3_1
base_model_id           :meta-llama/Meta-Llama-3.1-8B-Instruct
data_parallel_size      :1
tensor_parallel_size    :4
machine_type            :ct5lp-hightpu-4t
tpu_topology            :1x4
disagg_topology         :None
hbm_utilization_factor  :0.8
max_running_seqs        :256
max_model_len           :4096
enable_prefix_cache_hbm :True
endpoint_id             :
min_replica_count       :1
max_replica_count       :1
use_dedicated_endpoint  :True
Creating Endpoint
Create Endpoint backing LRO: projects/979398597045/locations/us-central1/endpoints/8807964449253097472/operations/1644534422063546368
Endpoint created. Resource name: projects/979398597045/locations/us-central1/endpoints/8807964449253097472
To use this Endpoint in another session:
endpoint = aiplatform.

In [21]:
print(endpoints["hexllm_tpu"].gca_resource)

name: "projects/979398597045/locations/us-central1/endpoints/8807964449253097472"
display_name: "Meta-Llama-3.1-8B-Instruct-20250610-095922-endpoint"
deployed_models {
  id: "6420641031351435264"
  model: "projects/979398597045/locations/us-central1/models/3790521256781021184"
  display_name: "Meta-Llama-3.1-8B-Instruct-20250610-095922"
  create_time {
    seconds: 1749549567
    nanos: 740805000
  }
  dedicated_resources {
    machine_spec {
      machine_type: "ct5lp-hightpu-4t"
    }
    min_replica_count: 1
    max_replica_count: 1
  }
  model_version_id: "1"
  status {
    available_replica_count: 1
  }
}
traffic_split {
  key: "6420641031351435264"
  value: 100
}
etag: "AMEw9yPmcJFndG1suqtWngndiwzFJ7v7vNmuj_qtWzs-45khQqDqXM3Tz7NNVX9g-w=="
create_time {
  seconds: 1749549563
  nanos: 74893000
}
update_time {
  seconds: 1749550948
  nanos: 1787000
}
dedicated_endpoint_enabled: true
dedicated_endpoint_dns: "8807964449253097472.us-central1-979398597045.prediction.vertexai.goog"



### 原生SDK

In [14]:
prompt = "What is a car?" 
max_tokens = 50
temperature = 1.0
top_p = 1.0
top_k = 1

# Overrides parameters for inferences.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    },
]
response = endpoints["hexllm_tpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)

 A car is a road vehicle, typically with four wheels, powered by an internal combustion engine or an electric motor. Cars are used for transportation, recreation, and other purposes. They come in various shapes, sizes, and models, ranging from small hatch


### Chat completion推理

In [17]:
# @title Chat completion

temp_region = REGION
REGION = TPU_DEPLOYMENT_REGION

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints["hexllm_tpu"].gca_resource.dedicated_endpoint_dns
    ENDPOINT_RESOURCE_NAME = "projects/{}/locations/{}/endpoints/{}".format(
        PROJECT_ID, REGION, endpoints["hexllm_tpu"].name)

! pip install -qU openai google-auth requests

user_message = "How is your day going?"
max_tokens = 500
temperature = 1.0
stream = False


import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

print(f'BASE_URL : {BASE_URL}')
client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)

BASE_URL : https://8807964449253097472.us-central1-979398597045.prediction.vertexai.goog/v1beta1/projects/ali-icbu-gpu-project/locations/us-central1/endpoints/8807964449253097472
ChatCompletion(id=None, choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I'm just a language model, I don't have emotions or experiences like humans do, so I don't have good or bad days. However, I'm functioning properly and ready to assist you with any questions or tasks you may have. How about you? How's your day going?", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[]))], created=None, model='', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=59, prompt_tokens=42, total_tokens=101, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=0)))


## Deploy prebuilt Llama 3.1 8B, 70B and 405B with standard vLLM

In [16]:
base_model_name = "Meta-Llama-3.1-8B-Instruct"
model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_3_1, base_model_name)
ENABLE_DYNAMIC_LORA = True
version_id = base_model_name.lower()[5:]
hf_model_id = "meta-llama/" + base_model_name
PUBLISHER_MODEL_NAME = f"publishers/meta/models/llama3_1@{version_id}"

# The pre-built serving docker images.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241210_0916_RC00"

use_dedicated_endpoint = True
if "8b" in base_model_name.lower():
    accelerator_type = "NVIDIA_L4"
    machine_type = "g2-standard-12"
    accelerator_count = 1
    max_loras = 5
elif "70b" in base_model_name.lower():
    accelerator_type = "NVIDIA_H100_80GB"
    machine_type = "a3-highgpu-4g"
    accelerator_count = 4
    max_loras = 1
elif "405b" in base_model_name.lower():
    accelerator_type = "NVIDIA_H100_80GB"
    machine_type = "a3-highgpu-8g"
    accelerator_count = 8
    max_loras = 1
else:
    raise ValueError(
        f"Recommended GPU setting not found for: {accelerator_type} and {base_model_name}."
    )
    
print(f'accelerator_type : {accelerator_type}')
print(f'machine_type     : {machine_type}')
print(f'accelerator_count: {accelerator_count}')
print(f'max_loras        : {max_loras}')

common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

accelerator_type : NVIDIA_L4
machine_type     : g2-standard-12
accelerator_count: 1
max_loras        : 5


In [22]:
gpu_memory_utilization = 0.95
max_model_len = 8192  # Maximum context length.

# Enable automatic prefix caching using GPU HBM
enable_prefix_cache = True
# Setting this value >0 will use the idle host memory for a second-tier prefix kv
# cache beneath the HBM cache. It only has effect if enable_prefix_cache=True.
# The range of this value: [0, 1)
# Setting host_prefix_kv_cache_utilization_target to 0 will disable the host memory prefix kv cache.
host_prefix_kv_cache_utilization_target = 0.7

is_spot = False

min_replica_count = 1
max_replica_count = 1
required_replica_count = 1

# Set the target of GPU duty cycle or CPU usage between 1 and 100 for auto-scaling.
autoscale_by_gpu_duty_cycle_target = 0
autoscale_by_cpu_usage_target = 0

# Note: GPU duty cycle is not the most accurate metric for scaling workloads. More advanced auto-scaling metrics are coming soon. See [the public doc](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute#autoscaling) for more details.


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
    enable_llama_tool_parser: bool = False,
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    required_replica_count: int = 1,
    autoscale_by_gpu_duty_cycle_target: int = 0,
    autoscale_by_cpu_usage_target: int = 0,
    is_spot: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys trained models with vLLM into Vertex AI."""
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )
    
    print(f"model_name                              :{model_name}")
    print(f"model_id                                :{model_id}")
    print(f"publisher                               :{publisher}")
    print(f"publisher_model_id                      :{publisher_model_id}")
    print(f"base_model_id                           :{base_model_id}")
    print(f"machine_type                            :{machine_type}")
    print(f"accelerator_type                        :{accelerator_type}")
    print(f"accelerator_count                       :{accelerator_count}")
    print(f"gpu_memory_utilization                  :{gpu_memory_utilization}")
    print(f"max_model_len                           :{max_model_len}")
    print(f"dtype                                   :{dtype}")
    print(f"enable_trust_remote_code                :{enable_trust_remote_code}")
    print(f"enforce_eager                           :{enforce_eager}")
    print(f"enable_lora                             :{enable_lora}")
    print(f"enable_chunked_prefill                  :{enable_chunked_prefill}")
    print(f"enable_prefix_cache                     :{enable_prefix_cache}")
    print(f"host_prefix_kv_cache_utilization_target :{host_prefix_kv_cache_utilization_target}")
    print(f"max_loras                               :{max_loras}")
    print(f"max_cpu_loras                           :{max_cpu_loras}")
    print(f"use_dedicated_endpoint                  :{use_dedicated_endpoint}")
    print(f"max_num_seqs                            :{max_num_seqs}")
    print(f"model_type                              :{model_type}")
    print(f"enable_llama_tool_parser                :{enable_llama_tool_parser}")
    print(f"min_replica_count                       :{min_replica_count}")
    print(f"max_replica_count                       :{max_replica_count}")
    print(f"required_replica_count                  :{required_replica_count}")
    print(f"autoscale_by_gpu_duty_cycle_target      :{autoscale_by_gpu_duty_cycle_target}")
    print(f"autoscale_by_cpu_usage_target           :{autoscale_by_cpu_usage_target}")
    print(f"is_spot                                 :{is_spot}")


    if not base_model_id:
        base_model_id = model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
    ]

    if enable_trust_remote_code:
        vllm_args.append("--trust-remote-code")

    if enforce_eager:
        vllm_args.append("--enforce-eager")

    if enable_lora:
        vllm_args.append("--enable-lora")

    if enable_chunked_prefill:
        vllm_args.append("--enable-chunked-prefill")

    if enable_prefix_cache:
        vllm_args.append("--enable-prefix-caching")

    if 0 < host_prefix_kv_cache_utilization_target < 1:
        vllm_args.append(
            f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
        )

    if model_type:
        vllm_args.append(f"--model-type={model_type}")

    if enable_llama_tool_parser:
        if "Llama-4" not in model_id:
            vllm_args.append("--enable-auto-tool-choice")
            vllm_args.append("--tool-call-parser=vertex-llama-3")
        else:
            vllm_args.append("--enable-auto-tool-choice")
            vllm_args.append("--tool-call-parser=llama3_json")
            
    print(f"vllm_args                               :{vllm_args}")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
        print(f"env_vars                                :{env_vars}")
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )

    creds, _ = auth.default()
    auth_req = auth.transport.requests.Request()
    creds.refresh(auth_req)

    url = f"https://{REGION}-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}:deployModel"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {creds.token}",
    }
    data = {
        "deployedModel": {
            "model": model.resource_name,
            "displayName": model_name,
            "dedicatedResources": {
                "machineSpec": {
                    "machineType": machine_type,
                    "acceleratorType": accelerator_type,
                    "acceleratorCount": accelerator_count,
                },
                "minReplicaCount": min_replica_count,
                "requiredReplicaCount": required_replica_count,
                "maxReplicaCount": max_replica_count,
            },
            "system_labels": {
                "NOTEBOOK_NAME": "model_garden_pytorch_llama3_1_deployment.ipynb",
                "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
            },
        },
    }
    if is_spot:
        data["deployedModel"]["dedicatedResources"]["spot"] = True
    if autoscale_by_gpu_duty_cycle_target > 0 or autoscale_by_cpu_usage_target > 0:
        data["deployedModel"]["dedicatedResources"]["autoscalingMetricSpecs"] = []
        if autoscale_by_gpu_duty_cycle_target > 0:
            data["deployedModel"]["dedicatedResources"][
                "autoscalingMetricSpecs"
            ].append(
                {
                    "metricName": "aiplatform.googleapis.com/prediction/online/accelerator/duty_cycle",
                    "target": autoscale_by_gpu_duty_cycle_target,
                }
            )
        if autoscale_by_cpu_usage_target > 0:
            data["deployedModel"]["dedicatedResources"][
                "autoscalingMetricSpecs"
            ].append(
                {
                    "metricName": "aiplatform.googleapis.com/prediction/online/cpu/utilization",
                    "target": autoscale_by_cpu_usage_target,
                }
            )
    response = requests.post(url, headers=headers, json=data)
    print(f"Deploy Model response: {response.json()}")
    if response.status_code != 200 or "name" not in response.json():
        raise ValueError(f"Failed to deploy model: {response.text}")
    common_util.poll_and_wait(response.json()["name"], REGION, 7200)
    print("endpoint_name:", endpoint.name)

    return model, endpoint


In [23]:
LABEL = "vllm_gpu"
models[LABEL], endpoints[LABEL] = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="llama3_1-serve"),
    model_id=model_id,
    publisher="meta",
    publisher_model_id="llama3_1",
    base_model_id=hf_model_id,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
    max_loras=max_loras,
    enforce_eager=True,
    enable_lora=ENABLE_DYNAMIC_LORA,
    enable_chunked_prefill=not ENABLE_DYNAMIC_LORA,
    enable_prefix_cache=enable_prefix_cache,
    host_prefix_kv_cache_utilization_target=host_prefix_kv_cache_utilization_target,
    use_dedicated_endpoint=use_dedicated_endpoint,
    enable_llama_tool_parser=True,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    required_replica_count=required_replica_count,
    autoscale_by_gpu_duty_cycle_target=autoscale_by_gpu_duty_cycle_target,
    autoscale_by_cpu_usage_target=autoscale_by_cpu_usage_target,
    is_spot=is_spot,
)

model = models[LABEL]
endpoint = endpoints[LABEL]

Creating Endpoint
Create Endpoint backing LRO: projects/979398597045/locations/us-central1/endpoints/991967295951601664/operations/8971609540818632704
Endpoint created. Resource name: projects/979398597045/locations/us-central1/endpoints/991967295951601664
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/979398597045/locations/us-central1/endpoints/991967295951601664')
model_name                              :llama3-1-serve-20250610-103803
model_id                                :gs://vertex-model-garden-restricted-us/llama3.1/Meta-Llama-3.1-8B-Instruct
publisher                               :meta
publisher_model_id                      :llama3_1
base_model_id                           :meta-llama/Meta-Llama-3.1-8B-Instruct
machine_type                            :g2-standard-12
accelerator_type                        :NVIDIA_L4
accelerator_count                       :1
gpu_memory_utilization                  :0.95
max_model_len                       

In [26]:
print(endpoint.gca_resource)

name: "projects/979398597045/locations/us-central1/endpoints/991967295951601664"
display_name: "llama3-1-serve-20250610-103803-endpoint"
etag: "AMEw9yNe6PBFD7WR9baQldSKtlqTImFGRPr03oGFTeDkpe17vbXBBvF9cC9c1APSPrEM"
create_time {
  seconds: 1749551884
  nanos: 55702000
}
update_time {
  seconds: 1749551885
  nanos: 434051000
}
dedicated_endpoint_enabled: true



### 原生SDK推理

In [30]:

prompt = "What is a car?" 
max_tokens = 500
temperature = 1.0
top_p = 1.0
top_k = 1
# Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = False 
lora_id = ""

# Overrides parameters for inferences.
instance = {
    "prompt": prompt,
    "max_tokens": max_tokens,
    "temperature": temperature,
    "top_p": top_p,
    "top_k": top_k,
    "raw_response": raw_response,
}
if lora_id:
    instance["dynamic-lora"] = lora_id
instances = [instance]
response = endpoints["vllm_gpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)


Prompt:
What is a car?
Output:
 A car is a road vehicle, typically with four wheels, powered by an internal combustion engine or an electric motor. Cars are used for transportation, and they come in a wide range of shapes, sizes, and styles. From compact sedans to luxury SUVs, cars are an essential part of modern life.
The first cars were invented in the late 19th century, and they were powered by steam engines. However, it wasn't until the early 20th century that cars became widely available and affordable for the general public. Since then, cars have evolved significantly, with advances in technology, design, and safety features.
Today, cars are a ubiquitous part of modern life, with millions of vehicles on the road worldwide. They are used for commuting, road trips, and other forms of transportation. Cars also come with a range of features, such as air conditioning, GPS navigation, and infotainment systems, making them a comfortable and convenient mode of transportation.
Types of Ca

### OpenAI SDK推理

In [29]:
# @title Chat completion

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints[LABEL].gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = endpoints[LABEL].resource_name


! pip install -qU openai google-auth requests

user_message = "How is your day going?" 
max_tokens = 500
temperature = 1.0
stream = False

# Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)


ChatCompletion(id='chatcmpl-6de8c92690954cdeb443392eaaf8e210', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I'm just a language model, so I don't have emotions or experiences like a human would. However, I'm functioning properly and ready to help with any questions or tasks you may have. It's always great to have someone to chat with, so feel free to ask me anything or start a conversation on a topic that interests you.", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[]), stop_reason=None)], created=1749553572, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=69, prompt_tokens=42, total_tokens=111, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)


## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()