In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden Integration With ADK

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_integration_with_adk.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_integration_with_adk.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates how to build an agent with the [Google Agent Development Kit](https://google.github.io/adk-docs/) (ADK) on top of a Model Garden open model deployed to a Vertex AI endpoint. The agent can automatically call function tools such as `get_weather` and `get_current_time`.


### Objective

- Deploy Vertex AI Model Garden OSS LLMs properly for ADK integration
- Test deployed endpoints
- Build agent web apps with deployed endpoints and ADK
- Deploy agent web apps to Cloud Run

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* Cloud Build
* Artifact Registry
* Cloud Run

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.84.0'
! pip install -qU openai google-auth requests

# Import the necessary packages
import datetime
import importlib
import os
import uuid
from typing import Tuple

from google.cloud import aiplatform

if not os.path.exists("./vertex-ai-samples"):
  ! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

models, endpoints = {}, {}

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

from etils import epath
TUTORIAL_DIR = epath.Path("vmg_adk_agent_tutorial")
BUILD_DIR = TUTORIAL_DIR / "build"
BUILD_DIR.mkdir(exist_ok=True, parents=True)

REPOSITORY_NAME = "vertex-vision-model-garden-dockers"

!gcloud artifacts repositories create $REPOSITORY_NAME \
      --repository-format=docker \
      --location=$REGION \
      --project=$PROJECT_ID

! gcloud auth configure-docker $REGION-docker.pkg.dev --quiet

## Integrate OSS LLMs With ADK

In [None]:
# @title Deploy OSS LLMs For Agents
# @markdown In order to use OSS LLM endpoints smoothly with ADK,
# @markdown these models should be deployed with:

# @markdown - **enabling tool calls**. e.g.:
# @markdown If the models are deployed with *vllm*,
# @markdown the deployment should specify settings like `--enable-auto-tool-choice`
# @markdown and `--tool-call-parser=hermes`.
# @markdown If the models are deployed with *sglang*, the deployment should specify
# @markdown setting like `--tool-call-parser=qwen25`.
# @markdown Refer to tool calls in
# @markdown [vllm](https://docs.vllm.ai/en/stable/features/tool_calling.html)
# @markdown and [sglang](https://docs.sglang.ai/backend/function_calling.html) for more details.

# @markdown - **disable dedicated endpoints**. The dedicated endpoints are not
# @markdown supported in ADK yet.

# @markdown You can deploy models below with proper deployment settings for ADK integration.

MODEL_ID = "Qwen3-32B"  # @param ["Qwen3-32B"] {isTemplate: true}
accelerator_type = "NVIDIA_H100_80GB"  # @param ["NVIDIA_L4", "NVIDIA_A100_80GB", "NVIDIA_H100_80GB"] {isTemplate: true}

# Recommended (accelerator_count, machine_type) for each supported accelerator.
_RECOMMENDED_MACHINE_SETTINGS = {
    # g2-standard-48 hosts 4 L4 GPUs.
    "NVIDIA_L4": (4, "g2-standard-48"),
    # a2-ultragpu-1g hosts 1 Nvidia A100 80 GB GPU.
    "NVIDIA_A100_80GB": (1, "a2-ultragpu-1g"),
    "NVIDIA_H100_80GB": (2, "a3-highgpu-2g"),
}

if accelerator_type not in _RECOMMENDED_MACHINE_SETTINGS:
    raise ValueError(
        f"Recommended machine settings not found for accelerator type: {accelerator_type}"
    )
accelerator_count, machine_type = _RECOMMENDED_MACHINE_SETTINGS[accelerator_type]

deploy_request_timeout = 30 * 60  # 30 minutes, in seconds

from vertexai.preview import model_garden
publisher_model_name = f"publishers/qwen/models/qwen3@{MODEL_ID.lower()}"
model = model_garden.OpenModel(publisher_model_name)

# Start from the first deploy option's container spec and replace its last two
# arguments with the tensor-parallel size and the tool-call parser.
# NOTE(review): this assumes the last two default args are the ones that must
# be overridden and that the first deploy option accepts these flags; confirm
# against the output printed below.
container_spec = model.list_deploy_options()[0].container_spec
updated_args = [
    *container_spec.args[:-2],
    f"--tp={accelerator_count}",
    "--tool-call-parser=qwen25",
]
container_spec.args = updated_args

print("The container spec are:")
print(container_spec)

# Verify the project has enough serving quota before starting the deployment.
print("Start to check quota for the deployment.")
quota_check_kwargs = dict(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)
common_util.check_quota(**quota_check_kwargs)
print("Finished to check quota for the deployment.")

print("Start to deploy models to endpoints.")
deploy_kwargs = dict(
    serving_container_spec=container_spec,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    # Dedicated endpoints are disabled on purpose (see the notes at the top of
    # this cell).
    use_dedicated_endpoint=False,
    spot=False,
    deploy_request_timeout=deploy_request_timeout,
    accept_eula=False,
)
endpoint = model.deploy(**deploy_kwargs)
print("Finished to deploy models to endpoints.")
# @markdown After endpoints are deployed successfully, you get the endpoint
# @markdown resource name with the format as
# @markdown `projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_ID}`.
# @markdown The endpoint resource name will be used in local predictions and
# @markdown integration with ADK below.
endpoint_resource_name = endpoint.resource_name
print("The deployed endpoint resource name is:", endpoint_resource_name, sep="\n")
# @markdown Click "Show Code" to see more details.


In [None]:
# @title Test The Endpoint
# To test an endpoint that was deployed earlier (instead of the one created by
# the previous cell), uncomment the lines below and set ENDPOINT_NAME to the
# endpoint ID.
# endpoint_resource_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_NAME}"
# )
endpoint = aiplatform.Endpoint(endpoint_resource_name)

# The resource name has the form
# projects/{project}/locations/{location}/endpoints/{endpoint}, so the
# location is the fourth path segment.
location = endpoint_resource_name.split('/')[3]
base_url = f"https://{location}-aiplatform.googleapis.com/v1beta1/{endpoint.resource_name}"

# @markdown Predict locally with some requests.

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 100  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = True  # @param {type: "boolean"}

import google.auth
# `import google.auth` alone does not load the transport submodule, so import
# it explicitly before using `google.auth.transport.requests.Request`.
import google.auth.transport.requests
import openai

# Obtain an access token from the default credentials; it is passed to the
# OpenAI client as the API key.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

client = openai.OpenAI(base_url=base_url, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        # Usage may arrive on a dedicated final chunk or alongside content, so
        # record it without skipping the chunk's text.
        if chunk.usage is not None:
            usage = chunk.usage
        # Some chunks carry no choices at all.
        if not chunk.choices:
            continue
        delta_text = chunk.choices[0].delta.content
        # Role-only and final chunks have `content=None`; printing them would
        # emit the literal text "None".
        if not delta_text:
            continue
        print(delta_text, end="", flush=True)
        contents.append(delta_text)
    print(f"\n\n{usage}")
else:
    print(model_response.choices[0].message.content)

In [None]:
# @title Build Agent Web App Dockers With VMG Endpoints
# @markdown The section will create required python and docker files first, and
# @markdown then build the dockers with cloud build.

# @markdown 1. Create `agent.py` by loading VMG endpoints and example tool functions.
agent_app = '''
"""This is a sample agent for model garden agents."""

import datetime
import os
import re
import zoneinfo

from google.adk.agents import LlmAgent
from google.adk.models.lite_llm import LiteLlm
import google.auth

_MODEL_GARDEN_ENDPOINT_REGEX = r"projects\/.+\/locations\/.+\/endpoints\/.+"


def get_weather(city: str) -> str:
  """Simulates a web search. Use it get information on weather.

  Args:
      city: A string containing the location to get weather information for.

  Returns:
      A string with the simulated weather information for the queried city.
  """
  if "sf" in city.lower() or "san francisco" in city.lower():
    return "It's 70 degrees and foggy."
  return "It's 80 degrees and sunny."


def get_current_time(city: str) -> str:
  """Simulates getting the current time for a city.

  Args:
      city: The name of the city to get the current time for.

  Returns:
      A string with the current time information.
  """
  if "sf" in city.lower() or "san francisco" in city.lower():
    tz_identifier = "America/Los_Angeles"
  else:
    return f"Sorry, I don't have timezone information for city: {city}."

  tz = zoneinfo.ZoneInfo(tz_identifier)
  now = datetime.datetime.now(tz)
  return (
      f"The current time for city {city} is"
      f" {now.strftime('%Y-%m-%d %H:%M:%S %Z%z')}"
  )


def _get_auth_headers() -> dict[str, str]:
  """Gets the auth headers for the model garden endpoint."""
  creds, _ = google.auth.default(
      scopes=["https://www.googleapis.com/auth/cloud-platform"]
  )
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)
  return {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {creds.token}",
  }


def _setup_model_garden_endpoint():
  """Sets up the model garden endpoint."""
  endpoint = os.environ.get("GOOGLE_MODEL_GARDEN_ENDPOINT", "")

  if not re.compile(_MODEL_GARDEN_ENDPOINT_REGEX).fullmatch(endpoint):
    raise ValueError(
        f"Invalid model garden endpoint: {endpoint}. Please use the format"
        " projects/{project}/locations/{location}/endpoints/{endpoint}."
    )
  endpoint_parts = endpoint.split("/")
  os.environ.setdefault("GOOGLE_GENAI_USE_VERTEXAI", "True")
  os.environ["VERTEXAI_PROJECT"] = endpoint_parts[1]
  os.environ["VERTEXAI_LOCATION"] = endpoint_parts[3]
  os.environ["LITELLM_LOG"] = "DEBUG"
  return f"vertex_ai/openai/{endpoint_parts[5]}"


auth_headers = _get_auth_headers()
model = _setup_model_garden_endpoint()

print("The current model is: {model}")

root_agent = LlmAgent(
    name="root_agent",
    model=LiteLlm(
        model=model,
        extra_headers=auth_headers,
    ),
    instruction=(
        "You are a helpful AI assistant designed to provide accurate and useful"
        " information. Please output the tool callings with json format if"
        " exists."
    ),
    description="Retrieves the weather and current time using specific tools.",
    tools=[get_weather, get_current_time],
)
'''
with BUILD_DIR.joinpath("agent.py").open("w") as f:
    f.write(agent_app)

# @markdown 2. Create `__init__.py` to load agent.py for ADK apps.
initialize = '''
from . import agent
'''
with BUILD_DIR.joinpath("__init__.py").open("w") as f:
    f.write(initialize)

# @markdown 3. Create `Dockerfile` to build agent app dockers.
dockerfile_content = '''
FROM python:3.11-slim
WORKDIR /app
RUN adduser --disabled-password --gecos "" myuser
RUN chown -R myuser:myuser /app
USER myuser
ENV PATH="/home/myuser/.local/bin:$PATH"
RUN pip install \
  google-adk~=0.4.0 \
  google-cloud-logging~=3.11.4 \
  opentelemetry-exporter-gcp-trace~=1.9.0 \
  google-cloud-aiplatform[evaluation,agent-engines]~=1.88.0 \
  litellm~=1.66.2

COPY agent.py "/app/agents/model_garden_agents/"
COPY __init__.py "/app/agents/model_garden_agents/"
ENV GOOGLE_MODEL_GARDEN_ENDPOINT YOUR_ENDPOINT
EXPOSE 8000
CMD adk web --port=8000 --trace_to_cloud "/app/agents"
'''
with BUILD_DIR.joinpath("Dockerfile").open("w") as f:
    f.write(dockerfile_content)

# @markdown 4. Build agent web app dockers.
VMG_AGENT_UI_CONTAINER_IMAGE_URI = (
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY_NAME}/vmg-adk-ui"
)
! gcloud builds submit --tag $VMG_AGENT_UI_CONTAINER_IMAGE_URI --project $PROJECT_ID --machine-type e2-highcpu-32 $BUILD_DIR
print("The agent UI docker is :")
print(VMG_AGENT_UI_CONTAINER_IMAGE_URI)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Deploy Agent Web App Dockers To Cloud Run
# @markdown After the deployment, there will be a service URL. You can click
# @markdown the service URL and interact with the agent web app. The agent web
# @markdown is a built-in development UI in [ADK](https://github.com/google/adk-python?tab=readme-ov-file)
# @markdown to help you test, evaluate, debug, and showcase your agent(s).

# @markdown ![ADK WEB UI](https://raw.githubusercontent.com/google/adk-python/main/assets/adk-web-dev-ui-function-call.png)

# Deploys the image built in the previous cell as the Cloud Run service
# `vmg-agent-ui-1`.
# - `--port 8000` matches the port that the Dockerfile exposes and that
#   `adk web` is started with.
# - `--set-env-vars` passes the deployed endpoint resource name as
#   GOOGLE_MODEL_GARDEN_ENDPOINT, which `agent.py` reads at startup; it
#   replaces the `YOUR_ENDPOINT` placeholder baked into the image.
# - NOTE(review): `--allow-unauthenticated` requests that the service be
#   callable without authentication; confirm this is acceptable for your
#   project before deploying.
! gcloud run deploy vmg-agent-ui-1 \
    --port 8000 \
    --image="{VMG_AGENT_UI_CONTAINER_IMAGE_URI}" \
    --region="{REGION}" \
    --platform=managed \
    --allow-unauthenticated \
    --memory=1024Mi \
    --set-env-vars="GOOGLE_MODEL_GARDEN_ENDPOINT={endpoint_resource_name}"
# @markdown Click "Show Code" to see more details.

## Clean up resources

In [None]:
# @markdown  Delete the experiment resources to avoid unnecessary continuous
# @markdown  charges that may incur.

delete_endpoint = False # @param {type:"boolean"}
delete_artifact_registry = False # @param {type:"boolean"}
delete_tutorial_folder = False # @param {type:"boolean"}

if delete_endpoint:
  # Undeploy model and delete endpoint.
  endpoint.delete(force=True)

if delete_artifact_registry:
    ! gcloud artifacts repositories delete $REPOSITORY_NAME \
          --repository-format=docker \
          --location=$REGION \
          --project=$PROJECT_ID

if delete_tutorial_folder:
    import shutil
    shutil.rmtree(TUTORIAL_DIR)
