In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - DeepSeek (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/instances">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_deepseek_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_deepseek_deployment.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates serving DeepSeek models with [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), or [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). [DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3) is a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) is one of the first-generation reasoning models introduced by DeepSeek and offers performance comparable to OpenAI-o1 across math, code, and reasoning tasks.


### Objective

- Deploy the largest variants of DeepSeek-V3 and DeepSeek-R1 with vLLM, SGLang, or TensorRT-LLM on GPUs, using single-host or multi-host serving and, optionally, [Spot VMs](https://cloud.google.com/compute/docs/instances/spot). Multi-host GPU serving is a preview feature.

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Request for quota

# @markdown To deploy the largest variants of the DeepSeek models, you need 1 host with 8 x H200 GPUs, or 2 hosts with 8 x H100 GPUs each (16 x H100s in total). Check that you have sufficient quota:
# @markdown - For Spot VM quota, check [`CustomModelServingPreemptibleH100GPUsPerProjectPerRegion`](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_preemptible_nvidia_h100_gpus). H200 GPUs are currently not available in Spot VM quota.
# @markdown - For regular VM quota, check [`CustomModelServingH200GPUsPerProjectPerRegion`](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h200_gpus) and [`CustomModelServingH100GPUsPerProjectPerRegion`](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).
#
# @markdown If you don't have sufficient quota, request more by following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If left unchanged, the region defaults to us-east4 for using H200 GPUs.

REGION = "us-east4"  # @param {type:"string"}

# @markdown 3. If you want to run predictions with H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have the associated quota in the selected regions. Click the links to see your current quota for Spot VM H100 GPUs: [`CustomModelServingPreemptibleH100GPUsPerProjectPerRegion`](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_preemptible_nvidia_h100_gpus) and regular VM H100 GPUs: [`CustomModelServingH100GPUsPerProjectPerRegion`](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a3-highgpu-8g (Spot VM) | 8 NVIDIA_H100_80GB | us-central1, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-8g (regular VM) | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.93.1'

# Import the necessary packages
import importlib
import json
import os
import time
from typing import Tuple

import requests
from google import auth
from google.cloud import aiplatform

# Upgrade TensorFlow when not running in Colab Enterprise.
if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)


def check_quota(
    project_id: str,
    region: str,
    resource_id: str,
    accelerator_count: int,
):
    """Verifies that `resource_id` has enough quota in `region`.

    The quota is looked up with `common_util.get_quota` and compared against
    the number of accelerators about to be requested.

    Args:
        project_id: Project whose quota is inspected.
        region: Region whose quota is inspected.
        resource_id: Quota identifier handed to `common_util.get_quota`.
        accelerator_count: Number of accelerators the deployment needs.

    Raises:
        ValueError: If the lookup returns -1 (reported as "quota not found")
            or the quota is smaller than `accelerator_count`.
    """
    remediation = (
        "Either use a different region or request additional quota. "
        "Follow instructions here "
        "https://cloud.google.com/docs/quotas/view-manage#requesting_higher_quota "
        "to check quota in a region or request additional quota for your project."
    )
    available = common_util.get_quota(project_id, region, resource_id)

    if available == -1:
        problem = f"Quota not found for: {resource_id} in {region}."
    elif available < accelerator_count:
        problem = (
            f"Quota not enough for {resource_id} in {region}:"
            f" {available} < {accelerator_count}."
        )
    else:
        # Enough quota: nothing to report.
        return

    raise ValueError(f"{problem} {remediation}")


# Label for the vLLM deployment; the same string is used as the key into
# `models` / `endpoints` by the cells below.
LABEL = "vllm_gpu"
# Registries of uploaded models and created endpoints.
models = {}
endpoints = {}

# The project id always comes from the environment (KeyError if it is unset).
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# An empty REGION form value falls back to the region from the environment.
REGION = REGION or os.environ["GOOGLE_CLOUD_REGION"]

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(location=REGION, project=PROJECT_ID)

! gcloud config set project $PROJECT_ID

import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

## Deploy DeepSeek-V3 and DeepSeek-R1 with vLLM

In [None]:
# @title Set the model variants

# @markdown It's recommended to use the region selected by the deployment button on the model card. If the deployment button is not available, it's recommended to stay with the default region of the notebook.

# @markdown Multi-host GPU serving is a preview feature.

# @markdown Set the model to deploy.

base_model_name = "DeepSeek-R1"  # @param ["DeepSeek-V3", "DeepSeek-V3-Base", "DeepSeek-V3-0324", "DeepSeek-R1", "DeepSeek-R1-0528"] {isTemplate:true}
# Hugging Face repository id; kept separately because `model_id` may be
# rewritten to a Cloud Storage path below.
hf_model_id = f"deepseek-ai/{base_model_name}"
if "R1" not in hf_model_id:
    model_user_id = "deepseek-v3"
    model_id = hf_model_id
else:
    model_user_id = "deepseek-r1"
    # R1 variants point at a Cloud Storage path rather than the Hugging Face id.
    model_id = f"gs://vertex-model-garden-restricted-us/{hf_model_id}"

# Publisher model reference: model family plus the lower-cased variant name.
PUBLISHER_MODEL_NAME = (
    "publishers/deepseek-ai/models/" + model_user_id + "@" + base_model_name.lower()
)

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

In [None]:
# @title Deploy with customized configs

# @markdown This section uploads DeepSeek models to Model Registry and deploys them to a Vertex Prediction Endpoint. It takes ~1 hour to finish.

# @markdown The following vLLM container version has been validated. The version will be continuously updated to incorporate latest optimizations and features.
# The pre-built serving docker image for vLLM past v0.7.3, https://github.com/vllm-project/vllm/commit/f6bb18fd9a19e5e4fb1991339638fc666d06b27a.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250312_0916_RC01"

# @markdown Choose whether to use a [Spot VM](https://cloud.google.com/compute/docs/instances/spot) for the deployment.
is_spot = False  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
accelerator_type = "NVIDIA_H200_141GB"  # @param ["NVIDIA_H200_141GB", "NVIDIA_H100_80GB"] {isTemplate:true}
# GPUs per host; the total requested is accelerator_count * multihost_gpu_node_count.
accelerator_count = 8
# Pick the machine type, host count and quota id that match the accelerator.
if accelerator_type == "NVIDIA_H200_141GB":
    machine_type = "a3-ultragpu-8g"
    multihost_gpu_node_count = 1
    if is_spot:
        raise ValueError("H200 GPUs are currently not available in Spot VM quota.")
    else:
        resource_id = "custom_model_serving_nvidia_h200_gpus"
else:
    machine_type = "a3-highgpu-8g"
    multihost_gpu_node_count = 2
    # Spot and regular H100 capacity are tracked under different quota ids.
    if is_spot:
        resource_id = "custom_model_serving_preemptible_nvidia_h100_gpus"
    else:
        resource_id = "custom_model_serving_nvidia_h100_gpus"

# Fail before deploying if the quota does not cover the GPUs on all hosts.
check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    resource_id=resource_id,
    accelerator_count=int(accelerator_count * multihost_gpu_node_count),
)

# Engine settings per accelerator; these values are forwarded to the vLLM
# command line by the deploy function below.
if accelerator_type == "NVIDIA_H200_141GB":
    # @markdown With a single host of 8 x H200s, speculative decoding with MTP and a context length of 8192 are supported in the specified configuration. The configuration has been validated for stability and performance.
    pipeline_parallel_size = 1
    gpu_memory_utilization = 0.75
    max_model_len = 8192  # Maximum context length.
    enable_chunked_prefill = False
    max_num_seqs = 64
    kv_cache_dtype = "auto"
    num_speculative_tokens = 3
    speculative_draft_tensor_parallel_size = 8
else:
    # @markdown With 2 hosts of 8 x H100s, chunked prefill and a context length of 163840 are supported in the specified configuration. The configuration has been validated for stability and performance.
    pipeline_parallel_size = 2
    gpu_memory_utilization = 0.82
    max_model_len = 163840  # Maximum context length.
    enable_chunked_prefill = True
    max_num_seqs = 64
    kv_cache_dtype = "auto"
    # None keeps the speculative decoding flags off the vLLM command line.
    num_speculative_tokens = None
    speculative_draft_tensor_parallel_size = None


# # The pre-built serving docker image and configuration for vLLM v0.7.2.
# VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250304_0916_RC01"
# accelerator_type = "NVIDIA_H100_80GB"
# accelerator_count = 8
# machine_type = "a3-highgpu-8g"
# multihost_gpu_node_count = 2
# pipeline_parallel_size = 2
# gpu_memory_utilization = 0.8
# max_model_len = 4096  # Maximum context length.
# enable_chunked_prefill = False
# max_num_seqs = 64
# kv_cache_dtype = "auto"
# num_speculative_tokens = None
# speculative_draft_tensor_parallel_size = None

# # The pre-built serving docker image and configuration for vLLM v0.6.6.post1.
# VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250130_0916_RC01"
# accelerator_type = "NVIDIA_H100_80GB"
# accelerator_count = 8
# machine_type = "a3-highgpu-8g"
# multihost_gpu_node_count = 2
# pipeline_parallel_size = 1
# gpu_memory_utilization = 0.9
# max_model_len = 32768  # Maximum context length.
# enable_chunked_prefill = False
# max_num_seqs = 128
# kv_cache_dtype = "fp8"
# num_speculative_tokens = None
# speculative_draft_tensor_parallel_size = None


# Enable automatic prefix caching using GPU HBM
enable_prefix_cache = False
# Setting this value >0 will use the idle host memory for a second-tier prefix kv
# cache beneath the HBM cache. It only has effect if enable_prefix_cache=True.
# The range of this value: [0, 1)
# Setting host_prefix_kv_cache_utilization_target to 0 will disable the host memory prefix kv cache.
# The deploy function only forwards the flag when the value is strictly between 0 and 1.
host_prefix_kv_cache_utilization_target = 0

# @markdown To enable the auto-scaling in deployment, you can set the following options:

min_replica_count = 1  # @param {type:"integer"}
max_replica_count = 1  # @param {type:"integer"}
required_replica_count = 1  # @param {type:"integer"}

# @markdown Set the target of GPU duty cycle or CPU usage between 1 and 100 for auto-scaling.
# A target of 0 leaves that metric out of the deployment request (only values > 0 are sent).
autoscale_by_gpu_duty_cycle_target = 0  # @param {type:"integer"}
autoscale_by_cpu_usage_target = 0  # @param {type:"integer"}

# @markdown Note: GPU duty cycle is not the most accurate metric for scaling workloads. More advanced auto-scaling metrics are coming soon. See [the public doc](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute#autoscaling) for more details.


def deploy_model_vllm_multihost_spec_decode(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str = None,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    multihost_gpu_node_count: int = 1,
    pipeline_parallel_size: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    kv_cache_dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    num_speculative_tokens: int = None,
    speculative_draft_tensor_parallel_size: int = None,
    model_type: str = None,
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    required_replica_count: int = 1,
    autoscale_by_gpu_duty_cycle_target: int = 0,
    autoscale_by_cpu_usage_target: int = 0,
    is_spot: bool = True,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model and deploys it to a new Vertex AI endpoint.

    The function creates an endpoint, uploads a model that runs the
    `VLLM_DOCKER_URI` container with a vLLM command line built from the
    arguments, sends a `deployModel` REST request for the endpoint and waits
    for the returned operation with `common_util.poll_and_wait`.

    It reads the notebook globals `VLLM_DOCKER_URI`, `REGION` and `PROJECT_ID`,
    and `HF_TOKEN` when that name is defined.

    Args:
        model_name: Display name of the uploaded model and deployed model; the
            endpoint is named `{model_name}-endpoint`.
        model_id: Value passed to vLLM as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: If set, sent as `serviceAccount` of the deployed model.
        base_model_id: Exported to the container as `MODEL_ID`; falls back to
            `model_id` when empty.
        machine_type: Machine type of the dedicated resources.
        accelerator_type: Accelerator type of the dedicated resources.
        accelerator_count: Accelerators per host.
        multihost_gpu_node_count: Number of hosts; values above 1 prefix the
            command with `/vllm-workspace/ray_launcher.sh`.
        pipeline_parallel_size: `--pipeline-parallel-size`; the tensor parallel
            size is the total GPU count divided by this value.
        gpu_memory_utilization: `--gpu-memory-utilization`.
        max_model_len: `--max-model-len`.
        dtype: `--dtype`.
        kv_cache_dtype: `--kv-cache-dtype`.
        enable_trust_remote_code: Adds `--trust-remote-code`.
        enforce_eager: Adds `--enforce-eager`.
        enable_lora: Adds `--enable-lora`.
        enable_chunked_prefill: Adds `--enable-chunked-prefill`.
        enable_prefix_cache: Adds `--enable-prefix-caching`.
        host_prefix_kv_cache_utilization_target: Forwarded as
            `--host-prefix-kv-cache-utilization-target` only when strictly
            between 0 and 1.
        max_loras: `--max-loras`.
        max_cpu_loras: `--max-cpu-loras`.
        use_dedicated_endpoint: Whether the endpoint is created as a dedicated
            endpoint.
        max_num_seqs: `--max-num-seqs`.
        num_speculative_tokens: `--num-speculative-tokens` when not None.
        speculative_draft_tensor_parallel_size:
            `--speculative-draft-tensor-parallel-size` when not None.
        model_type: `--model-type` when truthy.
        min_replica_count: `minReplicaCount` of the dedicated resources.
        max_replica_count: `maxReplicaCount` of the dedicated resources.
        required_replica_count: `requiredReplicaCount` of the dedicated
            resources.
        autoscale_by_gpu_duty_cycle_target: Accelerator duty-cycle autoscaling
            target; only sent when greater than 0.
        autoscale_by_cpu_usage_target: CPU utilization autoscaling target; only
            sent when greater than 0.
        is_spot: If true, sets `spot` on the dedicated resources.

    Returns:
        The uploaded model and the endpoint it was deployed to.

    Raises:
        ValueError: If the `deployModel` request does not return HTTP 200 with
            a JSON object containing the operation `name`.
    """
    # `auth.transport.requests` is a submodule: it is only reachable as an
    # attribute of `auth` after it has been imported somewhere. Import it
    # explicitly instead of relying on another package having done so.
    from google.auth.transport import requests as google_auth_requests

    # Used both as the container deployment timeout and as the polling budget.
    deploy_timeout_seconds = 7200
    total_gpu_count = int(accelerator_count * multihost_gpu_node_count)

    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    if not base_model_id:
        base_model_id = model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={int(accelerator_count * multihost_gpu_node_count / pipeline_parallel_size)}",
        f"--pipeline-parallel-size={pipeline_parallel_size}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--kv-cache-dtype={kv_cache_dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-requests",
    ]

    # Multi-host deployments start the server through the Ray launcher script.
    if multihost_gpu_node_count > 1:
        vllm_args = ["/vllm-workspace/ray_launcher.sh"] + vllm_args

    # Boolean switches, appended in a fixed order when enabled.
    for enabled, flag in (
        (enable_trust_remote_code, "--trust-remote-code"),
        (enforce_eager, "--enforce-eager"),
        (enable_lora, "--enable-lora"),
        (enable_chunked_prefill, "--enable-chunked-prefill"),
        (enable_prefix_cache, "--enable-prefix-caching"),
    ):
        if enabled:
            vllm_args.append(flag)

    if 0 < host_prefix_kv_cache_utilization_target < 1:
        vllm_args.append(
            f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
        )

    if num_speculative_tokens is not None:
        vllm_args.append(f"--num-speculative-tokens={num_speculative_tokens}")

    if speculative_draft_tensor_parallel_size is not None:
        vllm_args.append(
            f"--speculative-draft-tensor-parallel-size={speculative_draft_tensor_parallel_size}"
        )

    if model_type:
        vllm_args.append(f"--model-type={model_type}")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=deploy_timeout_seconds,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {total_gpu_count} {accelerator_type} GPU(s)."
    )

    # The deployModel request is sent over REST with a fresh access token.
    creds, _ = auth.default()
    creds.refresh(google_auth_requests.Request())

    url = f"https://{REGION}-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}:deployModel"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {creds.token}",
    }

    dedicated_resources = {
        "machineSpec": {
            "machineType": machine_type,
            "multihostGpuNodeCount": multihost_gpu_node_count,
            "acceleratorType": accelerator_type,
            "acceleratorCount": accelerator_count,
        },
        "minReplicaCount": min_replica_count,
        "requiredReplicaCount": required_replica_count,
        "maxReplicaCount": max_replica_count,
    }
    if is_spot:
        dedicated_resources["spot"] = True

    # Only metrics with a positive target are sent; GPU duty cycle comes first.
    autoscaling_metric_specs = [
        {"metricName": metric_name, "target": target}
        for metric_name, target in (
            (
                "aiplatform.googleapis.com/prediction/online/accelerator/duty_cycle",
                autoscale_by_gpu_duty_cycle_target,
            ),
            (
                "aiplatform.googleapis.com/prediction/online/cpu/utilization",
                autoscale_by_cpu_usage_target,
            ),
        )
        if target > 0
    ]
    if autoscaling_metric_specs:
        dedicated_resources["autoscalingMetricSpecs"] = autoscaling_metric_specs

    deployed_model = {
        "model": model.resource_name,
        "displayName": model_name,
        "dedicatedResources": dedicated_resources,
        "system_labels": {
            "NOTEBOOK_NAME": "model_garden_pytorch_deepseek_deployment.ipynb",
            "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
        },
    }
    if service_account:
        deployed_model["serviceAccount"] = service_account
    data = {"deployedModel": deployed_model}

    response = requests.post(url, headers=headers, json=data)
    # Parse the body once. An error response may not be JSON at all; in that
    # case keep the raw text so the real failure is reported instead of a
    # JSON decoding error.
    try:
        response_json = response.json()
    except ValueError:
        response_json = None
    print(
        "Deploy Model response: "
        f"{response_json if response_json is not None else response.text}"
    )
    if (
        response.status_code != 200
        or not isinstance(response_json, dict)
        or "name" not in response_json
    ):
        raise ValueError(f"Failed to deploy model: {response.text}")
    common_util.poll_and_wait(response_json["name"], REGION, deploy_timeout_seconds)
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Deploy with the configuration chosen above and remember the resulting model
# and endpoint under the "vllm_gpu" key for the prediction cells.
models["vllm_gpu"], endpoints["vllm_gpu"] = deploy_model_vllm_multihost_spec_decode(
    model_name=common_util.get_job_name_with_datetime(prefix="deepseek-serve"),
    model_id=model_id,
    publisher="deepseek-ai",
    # Any model id without "V3" in it is attributed to deepseek-r1.
    publisher_model_id=("deepseek-v3" if "V3" in model_id else "deepseek-r1"),
    # The Hugging Face id is exported to the container as MODEL_ID.
    base_model_id=hf_model_id,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    multihost_gpu_node_count=multihost_gpu_node_count,
    pipeline_parallel_size=pipeline_parallel_size,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
    max_num_seqs=max_num_seqs,
    kv_cache_dtype=kv_cache_dtype,
    enable_trust_remote_code=True,
    enforce_eager=False,
    enable_lora=False,
    enable_chunked_prefill=enable_chunked_prefill,
    num_speculative_tokens=num_speculative_tokens,
    speculative_draft_tensor_parallel_size=speculative_draft_tensor_parallel_size,
    enable_prefix_cache=enable_prefix_cache,
    host_prefix_kv_cache_utilization_target=host_prefix_kv_cache_utilization_target,
    use_dedicated_endpoint=use_dedicated_endpoint,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    required_replica_count=required_replica_count,
    autoscale_by_gpu_duty_cycle_target=autoscale_by_gpu_duty_cycle_target,
    autoscale_by_cpu_usage_target=autoscale_by_cpu_usage_target,
    is_spot=is_spot,
)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown User: What is the best way to diagnose and fix a flickering light in my house?
# @markdown Assistant: Okay, so I need to figure out how to diagnose and fix a flickering light in my house. Hmm, where do I start? Let's think. First, I remember that flickering lights can be caused by various issues. Maybe the bulb is loose? That's a common problem. Let me start with the simplest things first.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["vllm_gpu"].name` allows us to get the
#   endpoint name of the endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint. It stores the
# endpoint under `endpoints["vllm_gpu"]`, which is what the request below uses.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoints["vllm_gpu"] = aiplatform.Endpoint(aip_endpoint_name)

# @markdown A chat template formatted prompt for the DeepSeek-R1 model is shown below as an example.
# @markdown A chat template formatted prompt for the DeepSeek-V3 model would be: "<｜begin▁of▁sentence｜><｜User｜>What is the best way to diagnose and fix a flickering light in my house?<｜Assistant｜>\n"
prompt = "<｜begin▁of▁sentence｜><｜User｜>What is the best way to diagnose and fix a flickering light in my house?<｜Assistant｜><think>\n"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens = 1024  # @param {type:"integer"}
temperature = 0.6  # @param {type:"number"}
top_p = 0.95  # @param {type:"number"}
# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = True  # @param {type:"boolean"}

# Overrides parameters for inferences.
# A single-instance batch; the sampling parameters travel with the instance.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "raw_response": raw_response,
    },
]
# `use_dedicated_endpoint` is the same flag the endpoint was created with.
response = endpoints["vllm_gpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

# Print one prediction per line.
for prediction in response.predictions:
    print(prediction)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Chat completion

# The dedicated DNS name is only read when a dedicated endpoint was requested.
if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints["vllm_gpu"].gca_resource.dedicated_endpoint_dns
# Resource name of the endpoint; it becomes the path of the request base URL.
ENDPOINT_RESOURCE_NAME = endpoints["vllm_gpu"].resource_name

# @markdown Because the DeepSeek-R1 model generates detailed reasoning steps, the output is expected to be long. We recommend using streaming for a better generation experience.
# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, for example by setting `max_tokens` to 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
# When True, the response is consumed chunk by chunk and printed as it arrives.
stream = False  # @param {type: "boolean"}

# @markdown Now we can send a request.

import google.auth
# `google.auth.transport.requests` is a submodule and must be imported
# explicitly before it can be used as an attribute of `google.auth`.
import google.auth.transport.requests
import openai

# Obtain a fresh access token; it is used as the API key of the OpenAI client.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Default to the regional endpoint; switch to the dedicated DNS name when a
# dedicated endpoint is in use.
BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    # Best effort: keep the regional URL if the dedicated settings are undefined.
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    # Text pieces in arrival order; "".join(contents) is the full reply.
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
        # Chunks may carry no choices (e.g. a usage-only chunk) ...
        if not chunk.choices:
            continue
        # ... and a delta may carry no text (e.g. role-only or final chunk).
        delta_text = chunk.choices[0].delta.content
        if not delta_text:
            continue
        print(delta_text, end="", flush=True)
        contents.append(delta_text)
    print(f"\n\n{usage}")
else:
    print(model_response)

# @markdown Click "Show Code" to see more details.

## Deploy DeepSeek-V3 and DeepSeek-R1 with SGLang

In [None]:
# @title Deploy

# @markdown This section uploads DeepSeek models to Model Registry and deploys them to a Vertex Prediction Endpoint. It takes ~1 hour to finish.

# @markdown It's recommended to use the region selected by the deployment button on the model card. If the deployment button is not available, it's recommended to stay with the default region of the notebook.

# @markdown Multi-host GPU serving is a preview feature.

# @markdown Set the model to deploy.

base_model_name = "DeepSeek-R1"  # @param ["DeepSeek-V3", "DeepSeek-V3-Base", "DeepSeek-V3-0324", "DeepSeek-R1", "DeepSeek-R1-0528"] {isTemplate:true}
# In this cell `model_id` stays the Hugging Face id for every variant.
model_id = "deepseek-ai/" + base_model_name
hf_model_id = model_id

# The pre-built serving docker images.
SGLANG_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/sglang-serve.cu124.0-4.ubuntu2204.py310:20250427-1800-rc0"

# @markdown Choose whether to use a [Spot VM](https://cloud.google.com/compute/docs/instances/spot) for the deployment.
is_spot = False  # @param {type:"boolean"}

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
accelerator_type = "NVIDIA_H200_141GB"  # @param ["NVIDIA_H200_141GB", "NVIDIA_H100_80GB"] {isTemplate:true}
# GPUs per host; the total requested is accelerator_count * multihost_gpu_node_count.
accelerator_count = 8
# Pick the machine type, host count and quota id that match the accelerator.
if accelerator_type == "NVIDIA_H200_141GB":
    machine_type = "a3-ultragpu-8g"
    multihost_gpu_node_count = 1
    if is_spot:
        raise ValueError("H200 GPUs are currently not available in Spot VM quota.")
    else:
        resource_id = "custom_model_serving_nvidia_h200_gpus"
else:
    machine_type = "a3-highgpu-8g"
    multihost_gpu_node_count = 2
    # Spot and regular H100 capacity are tracked under different quota ids.
    if is_spot:
        resource_id = "custom_model_serving_preemptible_nvidia_h100_gpus"
    else:
        resource_id = "custom_model_serving_nvidia_h100_gpus"

# Fail before deploying if the quota does not cover the GPUs on all hosts.
check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    resource_id=resource_id,
    accelerator_count=int(accelerator_count * multihost_gpu_node_count),
)

# @markdown The maximum context length 163840 is supported in the following configurations.
# @markdown These configurations have been validated for stability and performance.
# @markdown 1. Low latency: This profile optimizes for low latency on small batches of incoming requests.
# @markdown 2. High throughput: This profile optimizes for high throughput on large batches of incoming requests.
profile = "Low latency"  # @param ["Low latency", "High throughput"] {isTemplate:true}

# Set this value to the expected number of concurrent requests.
torch_compile_max_bs = 4

# Settings shared by both profiles.
disable_cuda_graph = False
enable_jit_deepgemm = True

# Speculative decoding is off unless the low-latency branch turns it on.
speculative_algorithm = None
speculative_draft_model_path = ""

if profile != "Low latency":
    # High throughput: data-parallel attention, no torch.compile.
    enable_torch_compile = False
    enable_dp_attention = True
    dp_size = 8
else:
    # Low latency: torch.compile plus a draft model where one is listed.
    enable_torch_compile = True
    enable_dp_attention = False
    dp_size = 1
    if base_model_name in ("DeepSeek-V3", "DeepSeek-V3-0324", "DeepSeek-R1"):
        speculative_algorithm = "EAGLE"
        speculative_draft_model_path = f"lmsys/{base_model_name}-NextN"
    else:
        print(
            f"No speculative draft model is available for {base_model_name}. Performance will be degraded."
        )


def poll_operation(op_name: str) -> bool:  # noqa: F811
    """Checks once whether a long-running operation has finished.

    Refreshes the application-default credentials, fetches the operation from
    the regional Vertex AI endpoint for `REGION` and inspects the JSON body.

    Args:
        op_name: Operation resource name, appended to the request URL.

    Returns:
        The value of the `done` field of the response, or False if the field
        is absent.

    Raises:
        ValueError: If the response body contains an `error` field.
    """
    credentials, _ = auth.default()
    credentials.refresh(auth.transport.requests.Request())

    operation = requests.get(
        f"https://{REGION}-aiplatform.googleapis.com/ui/{op_name}",
        headers={"Authorization": f"Bearer {credentials.token}"},
    ).json()

    if "error" in operation:
        raise ValueError(f"Operation failed: {operation['error']}")
    return operation.get("done", False)


def poll_and_wait(op_name: str, total_wait: int, interval: int = 60):  # noqa: F811
    """Blocks until the operation is done, polling every `interval` seconds.

    Args:
        op_name: Operation resource name passed to `poll_operation`.
        total_wait: Wait budget in seconds; once the accumulated sleep time
            exceeds it, the wait is abandoned.
        interval: Seconds to sleep between two polls.

    Raises:
        TimeoutError: If the operation is still not done after the accumulated
            wait time has exceeded `total_wait`.
    """
    elapsed = 0
    while True:
        if poll_operation(op_name):
            return
        if elapsed > total_wait:
            raise TimeoutError("Operation timed out")
        # "\r" rewrites the same output line instead of adding a new one.
        print(
            f"\rStill waiting for operation... Waited time in second: {elapsed:<6}",
            end="",
            flush=True,
        )
        elapsed += interval
        time.sleep(interval)


def deploy_model_sglang_multihost(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str = "",
    base_model_id: str = "",
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    multihost_gpu_node_count: int = 1,
    gpu_memory_utilization: float | None = None,
    context_length: int | None = None,
    dtype: str | None = None,
    enable_trust_remote_code: bool = False,
    enable_torch_compile: bool = False,
    torch_compile_max_bs: int | None = None,
    attention_backend: str = "",
    enable_flashinfer_mla: bool = False,
    disable_cuda_graph: bool = False,
    speculative_algorithm: str | None = None,
    speculative_draft_model_path: str = "",
    speculative_num_steps: int = 3,
    speculative_eagle_topk: int = 1,
    speculative_num_draft_tokens: int = 4,
    enable_jit_deepgemm: bool = False,
    enable_dp_attention: bool = False,
    dp_size: int = 1,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int | None = None,
    is_spot: bool = True,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys a model served by SGLang to a new Vertex AI endpoint.

    Creates an endpoint, uploads the model with the SGLang serving container,
    posts a `deployModel` request that carries the multi-host machine spec,
    and blocks until the returned operation reports completion.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `{model_name}-endpoint`.
        model_id: Value passed to the server as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: If non-empty, set as the deployed model's service
            account.
        base_model_id: Value of the `MODEL_ID` container environment variable.
            Falls back to `model_id` when empty.
        machine_type: Machine type in the machine spec.
        accelerator_type: Accelerator type in the machine spec.
        accelerator_count: Accelerator count in the machine spec.
        multihost_gpu_node_count: Node count in the machine spec. The server
            is started with `--tp` set to
            `accelerator_count * multihost_gpu_node_count`.
        gpu_memory_utilization: If set, passed as `--mem-fraction-static`.
        context_length: If set, passed as `--context-length`.
        dtype: If set, passed as `--dtype`.
        enable_trust_remote_code: Adds `--trust-remote-code`.
        enable_torch_compile: Adds `--enable-torch-compile`.
        torch_compile_max_bs: If set and torch compile is enabled, passed as
            `--torch-compile-max-bs`.
        attention_backend: If non-empty, passed as `--attention-backend`.
        enable_flashinfer_mla: Adds `--enable-flashinfer-mla`.
        disable_cuda_graph: Adds `--disable-cuda-graph`.
        speculative_algorithm: If set, passed as `--speculative-algorithm`
            together with the four `speculative_*` values below.
        speculative_draft_model_path: `--speculative-draft-model-path` value.
        speculative_num_steps: `--speculative-num-steps` value.
        speculative_eagle_topk: `--speculative-eagle-topk` value.
        speculative_num_draft_tokens: `--speculative-num-draft-tokens` value.
        enable_jit_deepgemm: Sets `SGL_ENABLE_JIT_DEEPGEMM=1` in the container
            environment.
        enable_dp_attention: Adds `--enable-dp-attention`.
        dp_size: Value passed as `--dp`.
        use_dedicated_endpoint: Whether the endpoint is created with a
            dedicated endpoint enabled.
        max_num_seqs: If set, passed as `--max-running-requests`.
        is_spot: If True, sets `spot` on the dedicated resources.

    Returns:
        The uploaded model and the endpoint it was deployed to.

    Raises:
        ValueError: If the deploy request does not return HTTP 200 with a JSON
            object containing the operation `name`.
        TimeoutError: Propagated from `poll_and_wait` when the wait budget of
            7200 seconds is exceeded.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    if not base_model_id:
        base_model_id = model_id

    # See https://docs.sglang.ai/backend/server_arguments.html for a list of possible arguments with descriptions.
    sglang_args = [
        f"--model={model_id}",
        f"--tp={accelerator_count * multihost_gpu_node_count}",
        f"--dp={dp_size}",
    ]

    if context_length:
        sglang_args.append(f"--context-length={context_length}")

    if gpu_memory_utilization:
        sglang_args.append(f"--mem-fraction-static={gpu_memory_utilization}")

    if max_num_seqs:
        sglang_args.append(f"--max-running-requests={max_num_seqs}")

    if dtype:
        sglang_args.append(f"--dtype={dtype}")

    if enable_trust_remote_code:
        sglang_args.append("--trust-remote-code")

    if enable_torch_compile:
        sglang_args.append("--enable-torch-compile")
        if torch_compile_max_bs:
            sglang_args.append(f"--torch-compile-max-bs={torch_compile_max_bs}")

    if attention_backend:
        sglang_args.append(f"--attention-backend={attention_backend}")

    if enable_flashinfer_mla:
        sglang_args.append("--enable-flashinfer-mla")

    if disable_cuda_graph:
        sglang_args.append("--disable-cuda-graph")

    if speculative_algorithm:
        sglang_args.append(f"--speculative-algorithm={speculative_algorithm}")
        sglang_args.append(
            f"--speculative-draft-model-path={speculative_draft_model_path}"
        )
        sglang_args.append(f"--speculative-num-steps={speculative_num_steps}")
        sglang_args.append(f"--speculative-eagle-topk={speculative_eagle_topk}")
        sglang_args.append(
            f"--speculative-num-draft-tokens={speculative_num_draft_tokens}"
        )

    if enable_dp_attention:
        sglang_args.append("--enable-dp-attention")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    if enable_jit_deepgemm:
        env_vars["SGL_ENABLE_JIT_DEEPGEMM"] = "1"

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SGLANG_DOCKER_URI,
        serving_container_args=sglang_args,
        serving_container_ports=[30000],
        serving_container_predict_route="/vertex_generate",
        serving_container_health_route="/health",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {int(accelerator_count * multihost_gpu_node_count)} {accelerator_type} GPU(s)."
    )

    # The deployment is sent as a raw REST call rather than through the SDK so
    # that `multihostGpuNodeCount` can be set in the machine spec.
    creds, _ = auth.default()
    auth_req = auth.transport.requests.Request()
    creds.refresh(auth_req)

    url = f"https://{REGION}-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}:deployModel"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {creds.token}",
    }
    data = {
        "deployedModel": {
            "model": model.resource_name,
            "displayName": model_name,
            "dedicatedResources": {
                "machineSpec": {
                    "machineType": machine_type,
                    "multihostGpuNodeCount": multihost_gpu_node_count,
                    "acceleratorType": accelerator_type,
                    "acceleratorCount": accelerator_count,
                },
                "minReplicaCount": 1,
                "maxReplicaCount": 1,
            },
            "system_labels": {
                "NOTEBOOK_NAME": "model_garden_pytorch_deepseek_deployment.ipynb",
                "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
            },
        },
    }
    if service_account:
        data["deployedModel"]["serviceAccount"] = service_account
    if is_spot:
        data["deployedModel"]["dedicatedResources"]["spot"] = True
    response = requests.post(url, headers=headers, json=data)

    # Parse the body once. A failed request may return a non-JSON body; in
    # that case report the raw text instead of failing with a decode error
    # that hides the actual cause.
    try:
        response_json = response.json()
    except ValueError:
        response_json = None
    print(
        f"Deploy Model response: {response_json if response_json is not None else response.text}"
    )
    if (
        response.status_code != 200
        or not isinstance(response_json, dict)
        or "name" not in response_json
    ):
        raise ValueError(f"Failed to deploy model: {response.text}")
    poll_and_wait(response_json["name"], 7200)
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Deploy with the settings chosen above and remember the resulting resources
# under the "sglang_gpu" key for the inference and clean-up cells.
models["sglang_gpu"], endpoints["sglang_gpu"] = deploy_model_sglang_multihost(
    # Model identity and Model Garden provenance.
    model_name=common_util.get_job_name_with_datetime(prefix="deepseek-serve"),
    model_id=model_id,
    base_model_id=hf_model_id,
    publisher="deepseek-ai",
    publisher_model_id=("deepseek-r1" if "V3" not in model_id else "deepseek-v3"),
    # Hardware.
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    multihost_gpu_node_count=multihost_gpu_node_count,
    is_spot=is_spot,
    use_dedicated_endpoint=use_dedicated_endpoint,
    # SGLang server options derived from the selected profile.
    enable_trust_remote_code=True,
    attention_backend="fa3",
    enable_torch_compile=enable_torch_compile,
    torch_compile_max_bs=torch_compile_max_bs,
    disable_cuda_graph=disable_cuda_graph,
    speculative_algorithm=speculative_algorithm,
    speculative_draft_model_path=speculative_draft_model_path,
    enable_jit_deepgemm=enable_jit_deepgemm,
    enable_dp_attention=enable_dp_attention,
    dp_size=dp_size,
)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Raw predict


# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by SGLang can be found [here](https://docs.sglang.ai/backend/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown User: What is the best way to diagnose and fix a flickering light in my house?
# @markdown Assistant: Okay, so I need to figure out how to diagnose and fix a flickering light in my house. Hmm, where do I start? Let's think. First, I remember that flickering lights can be caused by various issues. Maybe the bulb is loose? That's a common problem. Let me start with the simplest things first.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["sglang_gpu"].name` gives the name of the
#   endpoint created in the deployment cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint. The request
# further down is sent through `endpoints["sglang_gpu"]`, so the loaded
# endpoint is stored under that key.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoints["sglang_gpu"] = aiplatform.Endpoint(aip_endpoint_name)

# @markdown A chat template formatted prompt for the DeepSeek-V3 model is shown below as an example.
prompt = "<｜begin▁of▁sentence｜><｜User｜>What is the best way to diagnose and fix a flickering light in my house?<｜Assistant｜>"  # @param {type: "string"}
# @markdown For the DeepSeek-R1 model, `<think>` should be appended to the prompt, as shown below.
# The check is an exact suffix match on the lower-cased model id, so model ids
# with a trailing version suffix (e.g. "-0528") do not get `<think>` appended.
if model_id.lower().endswith("deepseek-r1"):
    prompt += "<think>\n"

# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_new_tokens`.
max_new_tokens = 1024  # @param {type:"integer"}
temperature = 0.6  # @param {type:"number"}
top_p = 0.95  # @param {type:"number"}

# Overrides parameters for inferences.
# The prompt is sent as a single instance under the "text" key; the sampling
# settings are nested under "sampling_params".
instances = [{"text": prompt}]
parameters = {
    "sampling_params": {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }
}
response = endpoints["sglang_gpu"].predict(
    instances=instances,
    parameters=parameters,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Chat completion

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints["sglang_gpu"].gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = endpoints["sglang_gpu"].resource_name

# @markdown Because the DeepSeek-R1 / DeepSeek-V3 model generates detailed reasoning steps, the output is expected to be long. We recommend using streaming for a better generation experience.
# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = False  # @param {type: "boolean"}

# @markdown Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)

# @markdown Click "Show Code" to see more details.

## Deploy DeepSeek-V3 and DeepSeek-R1 with TensorRT-LLM

In [None]:
# @title Deploy

# @markdown This section uploads DeepSeek models to Model Registry and deploys them to a Vertex Prediction Endpoint. It takes ~1 hour to finish.

# @markdown It's recommended to use the region selected by the deployment button on the model card. If the deployment button is not available, it's recommended to stay with the default region of the notebook.

# @markdown Set the model to deploy.

base_model_name = "DeepSeek-R1"  # @param ["DeepSeek-V3", "DeepSeek-V3-Base", "DeepSeek-V3-0324", "DeepSeek-R1", "DeepSeek-R1-0528"] {isTemplate:true}
model_id = "deepseek-ai/" + base_model_name
hf_model_id = model_id

# The pre-built serving docker images.
TRTLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/tensorrt-llm.cu128.0-18.ubuntu2404.py312:deepseek"

# @markdown Choose whether to use a [Spot VM](https://cloud.google.com/compute/docs/instances/spot) for the deployment.
is_spot = False  # @param {type:"boolean"}

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
trtllm_accelerator_type = "NVIDIA_H200_141GB"  # @param ["NVIDIA_H200_141GB"] {isTemplate:true}
accelerator_count = 8
trtllm_region = "us-east4"  # @param ["us-east4"] {isTemplate:true}

# Reject anything but H200 up front; the settings below are specific to it.
if trtllm_accelerator_type != "NVIDIA_H200_141GB":
    raise ValueError("Only NVIDIA_H200_141GB is supported for DeepSeek-R1.")
machine_type = "a3-ultragpu-8g"
multihost_gpu_node_count = 1
resource_id = "custom_model_serving_nvidia_h200_gpus"

# Quota is checked in the TensorRT-LLM region, not the notebook-wide REGION.
check_quota(
    project_id=PROJECT_ID,
    region=trtllm_region,
    resource_id=resource_id,
    accelerator_count=int(accelerator_count * multihost_gpu_node_count),
)

# 18K context length. This is the maximum supported by the current version of TensorRT-LLM on DeepSeek V3/R1 models.
MAX_INPUT_LEN = 18000
MAX_MODEL_LEN = 18000
MAX_NUM_SEQS = 128
GPU_MEMORY_UTILIZATION = 0.55


def poll_operation(
    op_name: str, trtllm_region: str | None = None
) -> bool:  # noqa: F811
    """Checks once whether a long-running operation has finished.

    This definition rebinds the notebook-global `poll_operation`. The region
    is optional so that helpers written against the one-argument form (such as
    `poll_and_wait`, which calls `poll_operation(op_name)`) keep working after
    this cell has run.

    Args:
        op_name: Operation resource name, appended to the request URL.
        trtllm_region: Region whose Vertex AI endpoint is queried. Falls back
            to the notebook-wide `REGION` when omitted or empty.

    Returns:
        The value of the `done` field of the response, or False if the field
        is absent.

    Raises:
        ValueError: If the response body contains an `error` field.
    """
    region = trtllm_region or REGION
    creds, _ = auth.default()
    auth_req = auth.transport.requests.Request()
    creds.refresh(auth_req)
    headers = {
        "Authorization": f"Bearer {creds.token}",
    }
    get_resp = requests.get(
        f"https://{region}-aiplatform.googleapis.com/ui/{op_name}",
        headers=headers,
    )
    opjs = get_resp.json()
    if "error" in opjs:
        raise ValueError(f"Operation failed: {opjs['error']}")
    return opjs.get("done", False)


def poll_and_wait_trtllm(
    op_name: str, total_wait: int, trtllm_region: str, interval: int = 60
):  # noqa: F811
    """Blocks until the operation is done, polling every `interval` seconds.

    Args:
        op_name: Operation resource name passed to `poll_operation`.
        total_wait: Wait budget in seconds; once the accumulated sleep time
            exceeds it, the wait is abandoned.
        trtllm_region: Region forwarded to `poll_operation`.
        interval: Seconds to sleep between two polls.

    Raises:
        TimeoutError: If the operation is still not done after the accumulated
            wait time has exceeded `total_wait`.
    """
    elapsed = 0
    while True:
        if poll_operation(op_name, trtllm_region):
            return
        if elapsed > total_wait:
            raise TimeoutError("Operation timed out")
        # "\r" rewrites the same output line instead of adding a new one.
        print(
            f"\rStill waiting for operation... Waited time in second: {elapsed:<6}",
            end="",
            flush=True,
        )
        elapsed += interval
        time.sleep(interval)

def deploy_model_tensorrt_llm_multihost(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str = "",
    base_model_id: str = "",
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    multihost_gpu_node_count: int = 1,
    gpu_memory_utilization: float | None = None,
    max_input_len: int | None = None,
    max_model_len: int | None = None,
    max_num_seqs: int | None = None,
    enable_trust_remote_code: bool = False,
    enable_chunked_prefill: bool = False,
    use_dedicated_endpoint: bool = False,
    is_spot: bool = True,
    trtllm_region: str = REGION,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys a model served by TensorRT-LLM to a new Vertex AI endpoint.

    Creates an endpoint and uploads the model in `trtllm_region`, posts a
    `deployModel` request that carries the multi-host machine spec, and blocks
    until the returned operation reports completion.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `{model_name}-endpoint`.
        model_id: Value passed to the server as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: If non-empty, set as the deployed model's service
            account.
        base_model_id: Value of the `MODEL_ID` container environment variable.
            Falls back to `model_id` when empty.
        machine_type: Machine type in the machine spec.
        accelerator_type: Accelerator type in the machine spec.
        accelerator_count: Accelerator count in the machine spec.
        multihost_gpu_node_count: Node count in the machine spec. The server
            is started with `--tensor-parallel-size` set to
            `accelerator_count * multihost_gpu_node_count`.
        gpu_memory_utilization: If set, passed as `--gpu-memory-utilization`.
        max_input_len: If set, passed as `--max-input-len`.
        max_model_len: If set, passed as `--max-model-len`.
        max_num_seqs: If set, passed as `--max-num-seqs`.
        enable_trust_remote_code: Adds `--trust-remote-code=True`.
        enable_chunked_prefill: Adds `--enable-chunked-prefill=True`.
        use_dedicated_endpoint: Whether the endpoint is created with a
            dedicated endpoint enabled.
        is_spot: If True, sets `spot` on the dedicated resources.
        trtllm_region: Region used for the endpoint, the model upload, the
            deploy request and operation polling. The default is the value of
            `REGION` at the time this function is defined.

    Returns:
        The uploaded model and the endpoint it was deployed to.

    Raises:
        ValueError: If the deploy request does not return HTTP 200 with a JSON
            object containing the operation `name`.
        TimeoutError: Propagated from `poll_and_wait_trtllm` when the wait
            budget of 7200 seconds is exceeded.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        location=trtllm_region,
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    if not base_model_id:
        base_model_id = model_id

    container_args = [
        "python",
        "api_server.py",
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count * multihost_gpu_node_count}",
    ]

    if gpu_memory_utilization:
        container_args.append(f"--gpu-memory-utilization={gpu_memory_utilization}")

    if max_input_len:
        container_args.append(f"--max-input-len={max_input_len}")

    if max_model_len:
        container_args.append(f"--max-model-len={max_model_len}")

    if max_num_seqs:
        container_args.append(f"--max-num-seqs={max_num_seqs}")

    if enable_trust_remote_code:
        container_args.append("--trust-remote-code=True")

    if enable_chunked_prefill:
        container_args.append("--enable-chunked-prefill=True")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        location=trtllm_region,
        serving_container_image_uri=TRTLLM_DOCKER_URI,
        serving_container_args=container_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/v1/chat/completions",
        serving_container_health_route="/health",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {int(accelerator_count * multihost_gpu_node_count)} {accelerator_type} GPU(s)."
    )

    # The deployment is sent as a raw REST call rather than through the SDK so
    # that `multihostGpuNodeCount` can be set in the machine spec.
    creds, _ = auth.default()
    auth_req = auth.transport.requests.Request()
    creds.refresh(auth_req)

    url = f"https://{trtllm_region}-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/{trtllm_region}/endpoints/{endpoint.name}:deployModel"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {creds.token}",
    }
    data = {
        "deployedModel": {
            "model": model.resource_name,
            "displayName": model_name,
            "dedicatedResources": {
                "machineSpec": {
                    "machineType": machine_type,
                    "multihostGpuNodeCount": multihost_gpu_node_count,
                    "acceleratorType": accelerator_type,
                    "acceleratorCount": accelerator_count,
                },
                "minReplicaCount": 1,
                "maxReplicaCount": 1,
            },
            "system_labels": {
                "NOTEBOOK_NAME": "model_garden_pytorch_deepseek_deployment.ipynb",
                "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
            },
        },
    }
    if service_account:
        data["deployedModel"]["serviceAccount"] = service_account
    if is_spot:
        data["deployedModel"]["dedicatedResources"]["spot"] = True
    response = requests.post(url, headers=headers, json=data)

    # Parse the body once. A failed request may return a non-JSON body; in
    # that case report the raw text instead of failing with a decode error
    # that hides the actual cause.
    try:
        response_json = response.json()
    except ValueError:
        response_json = None
    print(
        f"Deploy Model response: {response_json if response_json is not None else response.text}"
    )
    if (
        response.status_code != 200
        or not isinstance(response_json, dict)
        or "name" not in response_json
    ):
        raise ValueError(f"Failed to deploy model: {response.text}")
    poll_and_wait_trtllm(response_json["name"], 7200, trtllm_region)
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Deploy with the settings chosen above and remember the resulting resources
# under the "trtllm_gpu" key for the inference and clean-up cells.
models["trtllm_gpu"], endpoints["trtllm_gpu"] = deploy_model_tensorrt_llm_multihost(
    # Model identity and Model Garden provenance.
    model_name=common_util.get_job_name_with_datetime(prefix="deepseek-serve"),
    model_id=model_id,
    base_model_id=hf_model_id,
    publisher="deepseek-ai",
    publisher_model_id=("deepseek-r1" if "V3" not in model_id else "deepseek-v3"),
    # Hardware and placement.
    trtllm_region=trtllm_region,
    machine_type=machine_type,
    accelerator_type=trtllm_accelerator_type,
    accelerator_count=accelerator_count,
    multihost_gpu_node_count=multihost_gpu_node_count,
    is_spot=is_spot,
    use_dedicated_endpoint=use_dedicated_endpoint,
    # TensorRT-LLM server limits.
    gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    max_input_len=MAX_INPUT_LEN,
    max_model_len=MAX_MODEL_LEN,
    max_num_seqs=MAX_NUM_SEQS,
    enable_trust_remote_code=True,
)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Raw predict
# @markdown Once deployment succeeds, you can send requests to the endpoint using `:rawPredict`. The OpenAI Client chat completions support is coming soon.
# Re-create the endpoint object from its resource name.
# NOTE(review): presumably this refreshes the client-side object after the
# REST-based deployment in the previous cell — confirm.
endpoints["trtllm_gpu"] = aiplatform.Endpoint(endpoints["trtllm_gpu"].resource_name)

# @markdown Fill out some request parameters:
user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, for example by setting `max_tokens` to 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}

# @markdown Now we can send a request.

# The body is a chat-completions style JSON payload with a single user
# message; it is serialized here and sent as-is through `raw_predict`.
response = endpoints["trtllm_gpu"].raw_predict(
    body=json.dumps(
        {
            "model": "",
            "messages": [
                {
                    "role": "user",
                    "content": user_message,
                }
            ],
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
    ),
    headers={"Content-Type": "application/json"},
    use_dedicated_endpoint=use_dedicated_endpoint,
)
# Print only the text of the first returned choice.
print(response.json()["choices"][0]["message"]["content"])

# @markdown Click "Show Code" to see more details.

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Each entry is removed from its dictionary as soon as the resource has been
# deleted. Re-running this cell (for example after one deletion failed) then
# only retries what is left instead of failing on an already-deleted resource
# before reaching the remaining ones.

# Undeploy model and delete endpoint.
for endpoint_key in list(endpoints):
    endpoints[endpoint_key].delete(force=True)
    del endpoints[endpoint_key]

# Delete models.
for model_key in list(models):
    models[model_key].delete()
    del models[model_key]