In [None]:
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Qwen3-Coder (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_qwen3_coder_deployment.ipynb">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_qwen3_coder_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_qwen3_coder_deployment.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates serving Qwen3-Coder models with [SGLang](https://github.com/sgl-project/sglang). 

[Qwen3-Coder](https://huggingface.co/collections/Qwen/qwen3-coder-687fc861e53c939e52d52d10) is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.


### Objective

- Deploy Qwen3-Coder with SGLang on GPU.

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Upgrade Vertex AI SDK

# @markdown After executing this cell, click "RESTART SESSION" if prompted in the output.

! pip install --upgrade google-cloud-aiplatform

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. This model requires NVIDIA_H200_141GB gpus. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia H200 141GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h200_141gb_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a3-ultragpu-8g | 8 NVIDIA_H200_141GB | asia-south2, us-south1 |

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.103.0'

# Import the necessary packages
import importlib
import os
import time
from typing import Tuple

import requests
from google import auth
from google.cloud import aiplatform

if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.notebooks.community.model_garden.docker_source_codes.notebook_util.common_util"
)

LABEL = "sglang_gpu"
models, endpoints = {}, {}

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

! gcloud config set project $PROJECT_ID

import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

## Deploy Qwen3-Coder with SGLang

In [None]:
# @title Select the model variants

# @markdown Set the model to deploy.

MODEL_ID = "Qwen/Qwen3-Coder-480B-A35B-Instruct"  # @param ["Qwen/Qwen3-Coder-480B-A35B-Instruct"] {isTemplate:true}

# Keep the full model id, plus its last path component, which is used for the
# publisher model version below.
hf_model_id = MODEL_ID
version_id = MODEL_ID.rsplit("/", 1)[-1]

# Pre-built SGLang serving container image.
SGLANG_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/sglang-serve.cu124.0-4.ubuntu2204.py310:model-garden.sglang-0-4-release_20250720.00_p0"

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# Serving hardware: an a3-ultragpu-8g machine with 8 NVIDIA_H200_141GB accelerators.
machine_type = "a3-ultragpu-8g"
accelerator_type = "NVIDIA_H200_141GB"
accelerator_count = 8
resource_id = "custom_model_serving_nvidia_h200_gpus"

# Model Garden publisher model, versioned by the lower-cased model name.
PUBLISHER_MODEL_NAME = f"publishers/qwen/models/qwen3-coder@{version_id.lower()}"

# Check serving (not training) quota for the chosen accelerators before deploying.
print("Checking quota...")
common_util.check_quota(
    accelerator_count=accelerator_count,
    accelerator_type=accelerator_type,
    is_for_training=False,
    project_id=PROJECT_ID,
    region=REGION,
)
print("Quota check completed.")

# @markdown Click "Show Code" to see more details.

In [None]:
# @title [Option 1] Deploy with Model Garden SDK
# @markdown Deploy with Gen AI model-centric SDK. This section uploads the prebuilt model to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model. See [use open models with Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-open-models) for documentation on other use cases.
from vertexai import model_garden

# @markdown Set is_spot to False to deploy on on-demand capacity instead of Spot VMs.
# `is_spot` must be defined here: no earlier cell defines it, so passing it to
# `deploy()` below would raise NameError. The default matches the default of
# `deploy_model_sglang_multihost` in the [Option 2] cell.
is_spot = True  # @param {type:"boolean"}

# Timeout, in seconds, passed to the deploy request.
deploy_request_timeout = 1800  # 30 minutes

model = model_garden.OpenModel(PUBLISHER_MODEL_NAME)
endpoints[LABEL] = model.deploy(
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
    spot=is_spot,
    deploy_request_timeout=deploy_request_timeout,
    accept_eula=False,
)

# Convenience alias for the endpoint that was just created.
endpoint = endpoints[LABEL]

# @markdown Click "Show Code" to see more details.

In [None]:
# @title [Option 2] Deploy with customized configs

# @markdown This section uploads Qwen3-Coder models to Model Registry and deploys them to a Vertex Prediction Endpoint. It takes ~1 hour to finish.

# @markdown It's recommended to use the region selected by the deployment button on the model card. If the deployment button is not available, it's recommended to stay with the default region of the notebook.


def poll_operation(op_name: str) -> bool:  # noqa: F811
    """Fetches a long-running operation once and reports whether it is done.

    Args:
        op_name: Operation resource name; it is appended to the regional
            `https://{REGION}-aiplatform.googleapis.com/ui/` base URL.

    Returns:
        The operation's `done` field, or False when the field is absent.

    Raises:
        ValueError: If the returned JSON contains an `error` entry.
    """
    # Refresh the default credentials so that a bearer token is available.
    credentials, _ = auth.default()
    credentials.refresh(auth.transport.requests.Request())

    operation = requests.get(
        f"https://{REGION}-aiplatform.googleapis.com/ui/{op_name}",
        headers={"Authorization": f"Bearer {credentials.token}"},
    ).json()

    if "error" in operation:
        raise ValueError(f"Operation failed: {operation['error']}")
    return operation.get("done", False)


def poll_and_wait(op_name: str, total_wait: int, interval: int = 60):  # noqa: F811
    """Blocks until `poll_operation(op_name)` reports completion.

    The elapsed time is a counter that grows by `interval` after every
    unsuccessful poll; the time spent inside the poll itself is not measured.

    Args:
        op_name: Operation name forwarded to `poll_operation`.
        total_wait: Budget in seconds; exceeding it raises `TimeoutError`.
        interval: Seconds to sleep between two polls.

    Raises:
        TimeoutError: If the counter is above `total_wait` while the operation
            is still not done.
    """
    elapsed = 0
    while True:
        if poll_operation(op_name):
            return
        if elapsed > total_wait:
            raise TimeoutError("Operation timed out")
        # "\r" rewrites the same output line instead of adding a new one per poll.
        print(
            f"\rStill waiting for operation... Waited time in second: {elapsed:<6}",
            end="",
            flush=True,
        )
        elapsed += interval
        time.sleep(interval)


def deploy_model_sglang_multihost(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str = "",
    base_model_id: str = "",
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    multihost_gpu_node_count: int = 1,
    gpu_memory_utilization: float | None = None,
    context_length: int | None = None,
    dtype: str | None = None,
    quantization: str | None = None,
    enable_trust_remote_code: bool = False,
    enable_torch_compile: bool = False,
    torch_compile_max_bs: int | None = None,
    attention_backend: str = "",
    enable_flashinfer_mla: bool = False,
    disable_cuda_graph: bool = False,
    speculative_algorithm: str | None = None,
    speculative_draft_model_path: str = "",
    speculative_num_steps: int = 3,
    speculative_eagle_topk: int = 1,
    speculative_num_draft_tokens: int = 4,
    enable_jit_deepgemm: bool = False,
    enable_dp_attention: bool = False,
    dp_size: int = 1,
    enable_multimodal: bool = False,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int | None = None,
    is_spot: bool = True,
    tool_call_parser: str | None = None,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by SGLang and deploys it to a new Vertex AI endpoint.

    Steps, in order: create the endpoint, build the SGLang command-line flags
    from the arguments, upload a model that runs the `SGLANG_DOCKER_URI`
    container, send a `deployModel` REST request and wait for the returned
    operation with `poll_and_wait` (budget of 7200 seconds).

    Besides its arguments, the function reads the notebook globals
    `SGLANG_DOCKER_URI`, `REGION`, `PROJECT_ID`, `common_util` and, when it is
    defined and non-empty, `HF_TOKEN`.

    Args:
        model_name: Display name of the uploaded and of the deployed model; the
            endpoint is named `{model_name}-endpoint`.
        model_id: Passed to SGLang as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: Set as the deployed model's service account when
            non-empty.
        base_model_id: Value of the `MODEL_ID` container environment variable;
            falls back to `model_id` when empty.
        machine_type: Machine type in the deployment's machine spec.
        accelerator_type: Accelerator type in the machine spec.
        accelerator_count: Accelerator count in the machine spec; multiplied by
            `multihost_gpu_node_count` to obtain `--tp`.
        multihost_gpu_node_count: `multihostGpuNodeCount` in the machine spec.
        gpu_memory_utilization: Passed as `--mem-fraction-static` when truthy.
        context_length: Passed as `--context-length` when truthy.
        dtype: Passed as `--dtype` when truthy.
        quantization: Passed as `--quantization` when truthy.
        enable_trust_remote_code: Adds `--trust-remote-code`.
        enable_torch_compile: Adds `--enable-torch-compile`.
        torch_compile_max_bs: Passed as `--torch-compile-max-bs` when truthy;
            only used together with `enable_torch_compile`.
        attention_backend: Passed as `--attention-backend` when non-empty.
        enable_flashinfer_mla: Adds `--enable-flashinfer-mla`.
        disable_cuda_graph: Adds `--disable-cuda-graph`.
        speculative_algorithm: When truthy, passed as `--speculative-algorithm`
            and enables the four `speculative_*` flags below.
        speculative_draft_model_path: `--speculative-draft-model-path`.
        speculative_num_steps: `--speculative-num-steps`.
        speculative_eagle_topk: `--speculative-eagle-topk`.
        speculative_num_draft_tokens: `--speculative-num-draft-tokens`.
        enable_jit_deepgemm: Sets the `SGL_ENABLE_JIT_DEEPGEMM=1` container
            environment variable.
        enable_dp_attention: Adds `--enable-dp-attention`.
        dp_size: Passed to SGLang as `--dp`.
        enable_multimodal: Adds `--enable-multimodal`.
        use_dedicated_endpoint: Whether the endpoint is created as a dedicated
            endpoint.
        max_num_seqs: Passed as `--max-running-requests` when truthy.
        is_spot: Sets `spot` on the dedicated resources when True.
        tool_call_parser: Passed as `--tool-call-parser` when truthy.

    Returns:
        The uploaded model and the endpoint it was deployed to.

    Raises:
        ValueError: If the deploy request does not return HTTP 200 with an
            operation `name`, or the polled operation reports an error.
        TimeoutError: If `poll_and_wait` exhausts its budget.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    if not base_model_id:
        base_model_id = model_id

    # See https://docs.sglang.ai/backend/server_arguments.html for a list of possible arguments with descriptions.
    sglang_args = [
        f"--model={model_id}",
        # Tensor parallelism spans every accelerator on every node.
        f"--tp={accelerator_count * multihost_gpu_node_count}",
        f"--dp={dp_size}",
    ]

    # Optional flags are only added when the caller supplied a truthy value.
    if context_length:
        sglang_args.append(f"--context-length={context_length}")

    if gpu_memory_utilization:
        sglang_args.append(f"--mem-fraction-static={gpu_memory_utilization}")

    if max_num_seqs:
        sglang_args.append(f"--max-running-requests={max_num_seqs}")

    if dtype:
        sglang_args.append(f"--dtype={dtype}")

    if quantization:
        sglang_args.append(f"--quantization={quantization}")

    if enable_trust_remote_code:
        sglang_args.append("--trust-remote-code")

    if enable_torch_compile:
        sglang_args.append("--enable-torch-compile")
        if torch_compile_max_bs:
            sglang_args.append(f"--torch-compile-max-bs={torch_compile_max_bs}")

    if attention_backend:
        sglang_args.append(f"--attention-backend={attention_backend}")

    if enable_flashinfer_mla:
        sglang_args.append("--enable-flashinfer-mla")

    if disable_cuda_graph:
        sglang_args.append("--disable-cuda-graph")

    if speculative_algorithm:
        sglang_args.append(f"--speculative-algorithm={speculative_algorithm}")
        sglang_args.append(
            f"--speculative-draft-model-path={speculative_draft_model_path}"
        )
        sglang_args.append(f"--speculative-num-steps={speculative_num_steps}")
        sglang_args.append(f"--speculative-eagle-topk={speculative_eagle_topk}")
        sglang_args.append(
            f"--speculative-num-draft-tokens={speculative_num_draft_tokens}"
        )

    if enable_dp_attention:
        sglang_args.append("--enable-dp-attention")

    if enable_multimodal:
        sglang_args.append("--enable-multimodal")

    if tool_call_parser:
        sglang_args.append(f"--tool-call-parser={tool_call_parser}")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    if enable_jit_deepgemm:
        env_vars["SGL_ENABLE_JIT_DEEPGEMM"] = "1"

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SGLANG_DOCKER_URI,
        serving_container_args=sglang_args,
        serving_container_ports=[30000],
        serving_container_predict_route="/vertex_generate",
        serving_container_health_route="/health",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {int(accelerator_count * multihost_gpu_node_count)} {accelerator_type} GPU(s)."
    )

    # The deployment is sent as a raw REST call, authenticated with a freshly
    # refreshed token from the default credentials.
    creds, _ = auth.default()
    auth_req = auth.transport.requests.Request()
    creds.refresh(auth_req)

    url = f"https://{REGION}-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}:deployModel"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {creds.token}",
    }
    data = {
        "deployedModel": {
            "model": model.resource_name,
            "displayName": model_name,
            "dedicatedResources": {
                "machineSpec": {
                    "machineType": machine_type,
                    "multihostGpuNodeCount": multihost_gpu_node_count,
                    "acceleratorType": accelerator_type,
                    "acceleratorCount": accelerator_count,
                },
                "minReplicaCount": 1,
                "maxReplicaCount": 1,
            },
            "system_labels": {
                "NOTEBOOK_NAME": "model_garden_pytorch_qwen3_coder_deployment.ipynb",
                "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
            },
        },
    }
    if service_account:
        data["deployedModel"]["serviceAccount"] = service_account
    if is_spot:
        data["deployedModel"]["dedicatedResources"]["spot"] = True
    response = requests.post(url, headers=headers, json=data)

    # Parse the body once and tolerate a non-JSON body, so that a failed
    # request surfaces as the "Failed to deploy model" error below rather than
    # as a JSON decoding error raised while printing the response.
    try:
        response_json = response.json()
    except ValueError:
        response_json = None
    if response_json is None:
        print(f"Deploy Model response: {response.text}")
    else:
        print(f"Deploy Model response: {response_json}")
    if (
        response.status_code != 200
        or not isinstance(response_json, dict)
        or "name" not in response_json
    ):
        raise ValueError(f"Failed to deploy model: {response.text}")
    poll_and_wait(response_json["name"], 7200)
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Deploy the selected model on the configured machine shape, then record the
# resulting model and endpoint under LABEL.
# NOTE(review): confirm that "qwen25" is the intended SGLang tool-call parser
# for Qwen3-Coder with this serving image.
deployed_model, deployed_endpoint = deploy_model_sglang_multihost(
    model_name=common_util.get_job_name_with_datetime(prefix=version_id),
    model_id=MODEL_ID,
    publisher="qwen",
    publisher_model_id="qwen3-coder",
    base_model_id=hf_model_id,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
    tool_call_parser="qwen25",
)
models[LABEL] = deployed_model
endpoints[LABEL] = deployed_endpoint

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Raw predict


# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by SGLang can be found [here](https://docs.sglang.ai/backend/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Write a quick sort algorithm in Python.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints[LABEL].name` gives the name of the
#   endpoint created by one of the deployment cells above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint. It is stored
# in `endpoints[LABEL]` because that is the endpoint the request below is sent
# to. Note that the clean-up cell deletes every endpoint held in `endpoints`.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoints[LABEL] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "Write a quick sort algorithm in Python."  # @param {type: "string"}

max_new_tokens = 32768  # @param {type:"integer"}
temperature = 0.7  # @param {type:"number"}
top_p = 0.8  # @param {type:"number"}
top_k = 20  # @param {type:"integer"}

# Overrides parameters for inferences.
instances = [{"text": prompt}]
parameters = {
    "sampling_params": {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    }
}
# Look the endpoint up through LABEL, the key used by the deployment cells,
# instead of repeating the literal key.
response = endpoints[LABEL].predict(
    instances=instances,
    parameters=parameters,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

for prediction in response.predictions:
    print(prediction)

# @markdown Click "Show Code" to see more details.

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may be incurred.

# Undeploy models and delete endpoints. An entry is removed from the dict only
# after its delete call returns, so re-running this cell (for example after a
# failure part-way through) does not call delete() again on resources that are
# already gone.
for label in list(endpoints):
    endpoints[label].delete(force=True)
    del endpoints[label]

# Delete models, with the same bookkeeping.
for label in list(models):
    models[label].delete()
    del models[label]