In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden Integration With Agents

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_integration_with_agent.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_integration_with_agent.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates how to build an agent backed by a Model Garden open model that is deployed to a Vertex AI endpoint, using the [Google Agent Development Kit](https://google.github.io/adk-docs/) (ADK) and [Agent Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/agent-engine/quickstart). The agent can automatically call function tools such as `get_weather` and `get_current_time`.


### Objective

- Deploy Vertex AI Model Garden OSS LLMs properly for agent integration
- Test the deployed endpoints
- Integrate Model Garden endpoints with ADK (example: qwen3)
- Integrate Model Garden endpoints with Agent Engine (examples: qwen3, llama3, llama4, deepseek-r1)

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

# Cloud Storage location for experiment outputs. Leaving the "gs://"
# placeholder makes the setup code further down in this cell create a new
# bucket.
BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

# Region used for the bucket and for Vertex AI initialization. When left
# empty, the setup code further down in this cell reads GOOGLE_CLOUD_REGION.
REGION = ""  # @param {type:"string"}

# @markdown 4. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Install or upgrade the Python packages this notebook relies on: the Vertex AI
# SDK (with the agent_engines and langchain extras), the OpenAI client,
# google-auth, requests and the LangChain integrations. cloudpickle and
# pydantic are pinned to exact versions.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.84.0'
! pip3 install -qU openai google-auth requests
! pip3 install --upgrade --quiet \
    "google-cloud-aiplatform[agent_engines,langchain]" \
    cloudpickle==3.0.0 \
    langchain-google-community \
    pydantic==2.10.6 \
    requests \
    langchain-openai

# Import the necessary packages
import datetime
import importlib
import os
import uuid
from typing import Tuple
import zoneinfo
from google.cloud import aiplatform

import google.auth
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from vertexai.preview import reasoning_engines
from vertexai import agent_engines
import vertexai


# Clone the vertex-ai-samples repository once; it provides the notebook helper
# module that is imported below.
if not os.path.exists("./vertex-ai-samples"):
  ! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

# NOTE(review): `models` and `endpoints` are not referenced again in the
# visible part of this notebook - confirm they are still needed.
models, endpoints = {}, {}

# The module path is given as a dotted string to importlib because the
# checkout directory names contain hyphens, which an `import` statement cannot
# spell.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)
vertexai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

from etils import epath
# Local working folder for this tutorial; the generated agent sources and
# Dockerfile are written to its "build" sub-folder later in the notebook.
TUTORIAL_DIR = epath.Path("vmg_adk_agent_tutorial")
BUILD_DIR = TUTORIAL_DIR / "build"
BUILD_DIR.mkdir(exist_ok=True, parents=True)

# Artifact Registry repository that will hold the agent web app image.
REPOSITORY_NAME = "vertex-vision-model-garden-dockers"

# Create the Docker repository in the notebook's region and project.
# NOTE(review): presumably this reports an error when the repository already
# exists - confirm that re-running the cell is harmless.
!gcloud artifacts repositories create $REPOSITORY_NAME \
      --repository-format=docker \
      --location=$REGION \
      --project=$PROJECT_ID

# Register gcloud as the Docker credential helper for the regional registry.
! gcloud auth configure-docker $REGION-docker.pkg.dev --quiet


# Utils for tool calls.
from langchain.agents.format_scratchpad.tools import format_to_tool_messages
from langchain.memory import ChatMessageHistory
from langchain_core import prompts
from langchain_core.tools import render_text_description
from vertexai.preview.generative_models import ToolConfig
from pydantic import BaseModel, Field
from typing import Any, Dict, List
import json
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.tools import tool

@tool
def get_weather(city: str) -> str:
  """Simulates a web search. Use it get information on weather.

  Args:
      city: A string containing the location to get weather information for.

  Returns:
      A string with the simulated weather information for the queried city.
  """
  # The docstring above is kept verbatim: this function is listed in `tools`,
  # and `render_text_description(tools)` places the tool description into the
  # system prompt.
  normalized_city = city.lower()
  # Any mention of San Francisco (abbreviated or spelled out) gets the foggy
  # report; every other input gets the sunny one.
  mentions_san_francisco = any(
      alias in normalized_city for alias in ("sf", "san francisco")
  )
  return (
      "It's 70 degrees and foggy."
      if mentions_san_francisco
      else "It's 80 degrees and sunny."
  )


@tool
def get_current_time(city: str) -> str:
  """Simulates getting the current time for a city.

  Args:
      city: The name of the city to get the current time for.

  Returns:
      A string with the current time information.
  """
  # The docstring above is kept verbatim: this function is listed in `tools`,
  # and `render_text_description(tools)` places the tool description into the
  # system prompt.
  normalized_city = city.lower()
  # Only San Francisco is known; bail out early for every other city.
  if "sf" not in normalized_city and "san francisco" not in normalized_city:
    return f"Sorry, I don't have timezone information for city: {city}."

  local_now = datetime.datetime.now(zoneinfo.ZoneInfo("America/Los_Angeles"))
  timestamp = local_now.strftime("%Y-%m-%d %H:%M:%S %Z%z")
  return f"The current time for city {city} is {timestamp}"

# Tools made available to the model; `call_tools` looks them up by name.
tools = [get_weather, get_current_time]

# Define prompt template

# Text rendering of the tools, embedded in the system prompt below.
rendered_tools = render_text_description(tools)
# System prompt for the first pass: it lists the tools and asks the model to
# append a JSON blob with 'name' and 'arguments' keys when a tool is needed.
# The doubled braces are f-string escapes that produce literal braces.
# NOTE(review): the example blob uses single quotes, which strict JSON does not
# allow - confirm the parser used in `call_tools` accepts what the model emits.
system_prompt_with_tools = f"""You are an assistant that has access to the
following set of tools. Here are the names and descriptions for each tool:

    {rendered_tools}

Given the user input, if the results need to call tools, please append the tool call results to the response, in the format of a JSON blob.
The tool call results should be the name and input of the tool to use, and return your response as a JSON blob with 'name' and 'arguments' keys. The `arguments` should be a dictionary, with keys corresponding to the argument names and the values corresponding to the requested values.

The json body must be in the format as:

```json
tool names and args.
```
The examples of json body are:

```json
{{
  'name': 'get_weather',
  'arguments': {{
    'city': 'SF'
  }}
}}
```

Please make the response as reasonable as possible.

The input is as below.

"""

# System prompt for the second pass: the model is asked to drop any JSON blob
# and produce a plain answer.
system_prompt_without_tools = """You are an assistant.
There might be JSON blob in the response.
Please remove the JSON blob and make the response as reasonable as possible.
"""


class ToolCallRequest(BaseModel):
  """Pydantic model describing one tool call: the tool name and its arguments."""

  # Name of the tool to invoke.
  name: str = Field(description="The name of the tool to call.")
  # Keyword arguments for the tool, keyed by argument name.
  arguments: Dict[str, Any] = Field(
      description="The arguments to pass to the tool."
  )


def call_tools(model_output: str = None) -> List[Any]:
  """Parses tool calls out of a model response and executes them.

  The response is parsed with `JsonOutputParser`. Each parsed call must be a
  mapping with a 'name' and an 'arguments' entry; calls naming a tool that is
  not in `tools` are skipped.

  Args:
      model_output: The raw text returned by the model. May be None or empty.

  Returns:
      A list with one dict per executed tool call, holding the tool 'name',
      the 'arguments' it was called with and the stringified 'result'. The
      list is empty when there is nothing to execute. If parsing or a tool
      invocation raises, the error is printed and the results collected so
      far are returned.
  """
  if not model_output:
    return []
  tool_map = {candidate.name: candidate for candidate in tools}
  parser = JsonOutputParser(pydantic_object=ToolCallRequest)
  format_tool_calls = []
  try:
    tool_calls = parser.parse(model_output)
    if not tool_calls:
      # Always hand back a list so callers get the type the signature states.
      return []
    # A single call is parsed as a mapping; normalize it to a one-item list.
    if not isinstance(tool_calls, list):
      tool_calls = [tool_calls]
    for tool_call in tool_calls:
      tool_name = tool_call["name"]
      tool_arguments = tool_call["arguments"]
      tool_function = tool_map.get(tool_name)
      if tool_function is None:
        continue

      tool_result = tool_function.invoke(tool_arguments)
      format_tool_calls.append({
          "name": tool_name,
          "arguments": tool_arguments,
          "result": str(tool_result),
      })
  except Exception as ex:  # pylint: disable=broad-except
    # Best effort by design: model output that cannot be parsed or executed
    # must not abort the notebook, so report it and keep partial results.
    print(str(ex))

  return format_tool_calls


## Deploy And Test OSS LLM Models For Agents

In [None]:
# @title Deploy OSS LLMs
# @markdown This section will show how to deploy OSS LLMs properly for agent integration.

# @markdown Note that, 1) if models support tool calls, then the deployment should enable tool calls.

# @markdown - If the models are deployed with *vllm*,
# @markdown the deployment should specify settings like `--enable-auto-tool-choice`
# @markdown and `--tool-call-parser=hermes`.

# @markdown - If the models are deployed with *sglang*, the deployment should specify
# @markdown setting like `--tool-call-parser=qwen25`.
# @markdown Refer to tool calls in
# @markdown [vllm](https://docs.vllm.ai/en/stable/features/tool_calling.html)
# @markdown and [sglang](https://docs.sglang.ai/backend/function_calling.html) for more details.

# @markdown 2) Agent Engine supports both traditional and dedicated endpoints,
# @markdown but ADK integration does not support dedicated endpoints yet. You can
# @markdown also refer to deployment examples for [qwen3](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_qwen3_deployment.ipynb),
# @markdown [llama4](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama4_deployment.ipynb)
# @markdown and [deepseek-r1](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_deepseek_deployment.ipynb).
# @markdown For simplicity, we deploy models with traditional endpoints by default here.

MODEL_ID = "Qwen3-32B"  # @param ["Qwen3-32B", "llama-4-scout-17b-16e-instruct", "llama-3.3-70b-instruct", "DeepSeek-R1-Distill-Llama-70B", "gemma-3-27b-it"] {isTemplate: true}
accelerator_type = "NVIDIA_H100_80GB"  # @param ["NVIDIA_L4", "NVIDIA_H100_80GB"] {isTemplate: true}

# Recommended deployment presets, checked in order. Each entry is
# (substring looked for in MODEL_ID,
#  publisher model prefix,
#  {accelerator type: (accelerator count, machine type)}).
_DEPLOY_PRESETS = [
    (
        "Qwen",
        "publishers/qwen/models/qwen3",
        {
            "NVIDIA_L4": (4, "g2-standard-48"),
            "NVIDIA_H100_80GB": (2, "a3-highgpu-2g"),
        },
    ),
    (
        "llama-4",
        "publishers/meta/models/llama4",
        {"NVIDIA_H100_80GB": (8, "a3-highgpu-8g")},
    ),
    (
        "llama-3",
        "publishers/meta/models/llama3-3",
        {"NVIDIA_H100_80GB": (4, "a3-highgpu-4g")},
    ),
    (
        "DeepSeek-R1",
        "publishers/deepseek-ai/models/deepseek-r1",
        {
            "NVIDIA_L4": (8, "g2-standard-96"),
            "NVIDIA_H100_80GB": (4, "a3-highgpu-4g"),
        },
    ),
    (
        "gemma-3",
        "publishers/google/models/gemma3",
        {
            "NVIDIA_L4": (4, "g2-standard-48"),
            "NVIDIA_H100_80GB": (2, "a3-highgpu-2g"),
        },
    ),
]

# The first preset whose marker occurs in MODEL_ID wins.
_matched_preset = next(
    (preset for preset in _DEPLOY_PRESETS if preset[0] in MODEL_ID), None
)
if _matched_preset is None:
  raise ValueError("Unsupported model: %s" % MODEL_ID)

# The publisher model name is the family prefix plus the lower-cased model id
# as the version.
publisher_model_name = f"{_matched_preset[1]}@{MODEL_ID.lower()}"

_machine_options = _matched_preset[2]
if accelerator_type not in _machine_options:
  raise ValueError(
      "Recommended machine settings not found for accelerator type: %s"
      % accelerator_type
  )
accelerator_count, machine_type = _machine_options[accelerator_type]

from vertexai.preview import model_garden

model = model_garden.OpenModel(publisher_model_name)

# Start from one of the model's published deploy options and adjust the
# serving arguments per model. `accept_eula` records whether the model's
# license has to be accepted.
if MODEL_ID == "Qwen3-32B":
  container_spec = model.list_deploy_options()[0].container_spec
  # Replace the last two arguments with the tensor-parallel size and the
  # tool-call parser.
  container_spec.args = container_spec.args[:-2] + [
      f"--tp={accelerator_count}",
      "--tool-call-parser=qwen25",
  ]
  accept_eula = False
elif MODEL_ID == "llama-4-scout-17b-16e-instruct":
  # This model uses the second deploy option and an explicit serving image.
  container_spec = model.list_deploy_options()[1].container_spec
  container_spec.image_uri = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/sglang-serve.cu124.0-4.ubuntu2204.py310:20250515-1800-rc0"
  # Replace the last argument with the tensor-parallel size and the tool-call
  # parser.
  container_spec.args = container_spec.args[:-1] + [
      f"--tp={accelerator_count}",
      "--tool-call-parser=pythonic",
  ]
  accept_eula = True
elif MODEL_ID == "llama-3.3-70b-instruct":
  # The published arguments are used unchanged.
  container_spec = model.list_deploy_options()[0].container_spec
  accept_eula = True
elif "DeepSeek-R1" in MODEL_ID:
  container_spec = model.list_deploy_options()[0].container_spec
  # Replace the last argument with the tensor-parallel size.
  container_spec.args = container_spec.args[:-1] + [
      f"--tp={accelerator_count}"
  ]
  accept_eula = False
elif MODEL_ID == "gemma-3-27b-it":
  container_spec = model.list_deploy_options()[0].container_spec
  # Rewrite only the tensor-parallel-size argument; keep the others as they
  # are.
  container_spec.args = [
      f"--tensor-parallel-size={accelerator_count}"
      if serving_arg.startswith("--tensor-parallel-size")
      else serving_arg
      for serving_arg in container_spec.args
  ]
  accept_eula = True
else:
  raise ValueError("Unsupported model: %s" % MODEL_ID)

print("The container spec are:")
print(container_spec)

# Verify that the project has enough serving quota for the chosen accelerator
# before starting the deployment.
print("Start to check quota for the deployment.")
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)
print("Finished to check quota for the deployment.")

# Timeout for the deploy request, in seconds. It has to be defined here
# because nothing earlier in the notebook sets it.
deploy_request_timeout = 1800

print("Start to deploy models to endpoints.")
endpoint = model.deploy(
    serving_container_spec=container_spec,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    use_dedicated_endpoint=False,
    spot=False,
    deploy_request_timeout=deploy_request_timeout,
    # Pass the per-model EULA decision made when the container spec was
    # built; a fixed False would ignore it for the models that set it to True.
    accept_eula=accept_eula,
)
print("Finished to deploy models to endpoints.")
# @markdown After endpoints are deployed successfully, you get the endpoint
# @markdown resource name with the format as
# @markdown `projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_ID}`.
# @markdown The endpoint resource name will be used in local predictions and
# @markdown integration with ADK below.
endpoint_resource_name = endpoint.resource_name
print("The deployed endpoint resource name is:")
print(endpoint_resource_name)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Test The Endpoint
# To test an endpoint that was deployed earlier, set its resource name here:
# endpoint_resource_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_NAME}"
# )
endpoint = aiplatform.Endpoint(endpoint_resource_name)

# The region is the fourth path segment of the endpoint resource name.
location = endpoint_resource_name.split("/")[3]
# Dedicated endpoints are served from their own DNS name; other endpoints use
# the regional Vertex AI host.
if endpoint.gca_resource.dedicated_endpoint_enabled:
  base_url = f"https://{endpoint.gca_resource.dedicated_endpoint_dns}/v1beta1/{endpoint.resource_name}"
else:
  base_url = f"https://{location}-aiplatform.googleapis.com/v1beta1/{endpoint.resource_name}"

# @markdown Predict locally with some requests.

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 1000  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = True  # @param {type: "boolean"}

import google.auth
# The transport submodule is imported explicitly: `import google.auth` on its
# own does not guarantee that `google.auth.transport.requests` is loaded.
import google.auth.transport.requests
import openai

# Obtain a fresh access token from the default credentials; it is used as the
# API key of the OpenAI-compatible client.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

client = openai.OpenAI(base_url=base_url, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
  usage = None
  contents = []
  for chunk in model_response:
    # Remember the most recent usage report, but still look at the chunk's
    # content: a chunk may carry both.
    if chunk.usage is not None:
      usage = chunk.usage
    # Some chunks have no choices at all (for example a usage-only chunk).
    if not chunk.choices:
      continue
    piece = chunk.choices[0].delta.content
    # Deltas without text (role-only or final chunks) would otherwise be
    # printed as the string "None".
    if piece is None:
      continue
    print(piece, end="")
    contents.append(piece)
  print(f"\n\n{usage}")
else:
  print(model_response.choices[0].message.content)

## Integrate OSS LLMs With ADK

In [None]:
# @title Build Agent Web App Dockers With VMG Endpoints
# @markdown The section will create required python and docker files first, and
# @markdown then build the dockers with cloud build.
# @markdown Note, if a model (e.g.: qwen3) is deployed properly with enabling
# @markdown tools, then the integration with ADK can work properly.

# @markdown 1. Create `agent.py` by loading VMG endpoints and example tool functions.
# Source of the `agent.py` module that is copied into the agent web app image.
# It is a raw string so that the backslashes of the regular expression below
# are written to the file unchanged instead of being treated as (invalid)
# escape sequences of this outer string.
agent_app = r'''
"""This is a sample agent for model garden agents."""

import datetime
import os
import re
import zoneinfo

from google.adk.agents import LlmAgent
from google.adk.models.lite_llm import LiteLlm
import google.auth
import google.auth.transport.requests

_MODEL_GARDEN_ENDPOINT_REGEX = r"projects\/.+\/locations\/.+\/endpoints\/.+"


def get_weather(city: str) -> str:
  """Simulates a web search. Use it get information on weather.

  Args:
      city: A string containing the location to get weather information for.

  Returns:
      A string with the simulated weather information for the queried city.
  """
  if "sf" in city.lower() or "san francisco" in city.lower():
    return "It's 70 degrees and foggy."
  return "It's 80 degrees and sunny."


def get_current_time(city: str) -> str:
  """Simulates getting the current time for a city.

  Args:
      city: The name of the city to get the current time for.

  Returns:
      A string with the current time information.
  """
  if "sf" in city.lower() or "san francisco" in city.lower():
    tz_identifier = "America/Los_Angeles"
  else:
    return f"Sorry, I don't have timezone information for city: {city}."

  tz = zoneinfo.ZoneInfo(tz_identifier)
  now = datetime.datetime.now(tz)
  return (
      f"The current time for city {city} is"
      f" {now.strftime('%Y-%m-%d %H:%M:%S %Z%z')}"
  )


def _get_auth_headers() -> dict[str, str]:
  """Gets the auth headers for the model garden endpoint."""
  creds, _ = google.auth.default(
      scopes=["https://www.googleapis.com/auth/cloud-platform"]
  )
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)
  return {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {creds.token}",
  }


def _setup_model_garden_endpoint():
  """Sets up the model garden endpoint."""
  endpoint = os.environ.get("GOOGLE_MODEL_GARDEN_ENDPOINT", "")

  if not re.compile(_MODEL_GARDEN_ENDPOINT_REGEX).fullmatch(endpoint):
    raise ValueError(
        f"Invalid model garden endpoint: {endpoint}. Please use the format"
        " projects/{project}/locations/{location}/endpoints/{endpoint}."
    )
  endpoint_parts = endpoint.split("/")
  os.environ.setdefault("GOOGLE_GENAI_USE_VERTEXAI", "True")
  os.environ["VERTEXAI_PROJECT"] = endpoint_parts[1]
  os.environ["VERTEXAI_LOCATION"] = endpoint_parts[3]
  os.environ["LITELLM_LOG"] = "DEBUG"
  return f"vertex_ai/openai/{endpoint_parts[5]}"


# The headers are built once, when this module is imported.
auth_headers = _get_auth_headers()
model = _setup_model_garden_endpoint()

print(f"The current model is: {model}")

root_agent = LlmAgent(
    name="root_agent",
    model=LiteLlm(
        model=model,
        extra_headers=auth_headers,
    ),
    instruction=(
        "You are a helpful AI assistant designed to provide accurate and useful"
        " information. Please output the tool callings with json format if"
        " exists."
    ),
    description="Retrieves the weather and current time using specific tools.",
    tools=[get_weather, get_current_time],
)
'''
# Write the module into the Docker build context.
with BUILD_DIR.joinpath("agent.py").open("w") as f:
  f.write(agent_app)

# @markdown 2. Create `__init__.py` to load agent.py for ADK apps.
# Package initializer written next to agent.py; it imports the agent module.
initialize = """
from . import agent
"""
with BUILD_DIR.joinpath("__init__.py").open("w") as f:
  f.write(initialize)

# @markdown 3. Create `Dockerfile` to build agent app dockers.
# Dockerfile for the agent web app: installs the pinned dependencies as a
# non-root user, copies the two generated Python files, and starts `adk web`
# on port 8000. Because this is a regular (non-raw) triple-quoted string, each
# backslash-newline pair below is consumed by Python, so the pip arguments end
# up on a single RUN line in the written file.
# The GOOGLE_MODEL_GARDEN_ENDPOINT value set in the image is a placeholder.
dockerfile_content = """
FROM python:3.11-slim
WORKDIR /app
RUN adduser --disabled-password --gecos "" myuser
RUN chown -R myuser:myuser /app
USER myuser
ENV PATH="/home/myuser/.local/bin:$PATH"
RUN pip install \
  google-adk~=0.4.0 \
  google-cloud-logging~=3.11.4 \
  opentelemetry-exporter-gcp-trace~=1.9.0 \
  google-cloud-aiplatform[evaluation,agent-engines]~=1.88.0 \
  litellm~=1.66.2

COPY agent.py "/app/agents/model_garden_agents/"
COPY __init__.py "/app/agents/model_garden_agents/"
ENV GOOGLE_MODEL_GARDEN_ENDPOINT YOUR_ENDPOINT
EXPOSE 8000
CMD adk web --port=8000 --trace_to_cloud "/app/agents"
"""
with BUILD_DIR.joinpath("Dockerfile").open("w") as f:
  f.write(dockerfile_content)

# @markdown 4. Build agent web app dockers.
# Image name inside the Artifact Registry repository of this notebook.
VMG_AGENT_UI_CONTAINER_IMAGE_URI = (
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY_NAME}/vmg-adk-ui"
)
# Build the image from BUILD_DIR with Cloud Build and tag it with the URI
# above.
! gcloud builds submit --tag $VMG_AGENT_UI_CONTAINER_IMAGE_URI --project $PROJECT_ID --machine-type e2-highcpu-32 $BUILD_DIR
print("The agent UI docker is :")
print(VMG_AGENT_UI_CONTAINER_IMAGE_URI)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Deploy Agent Web App Dockers To Cloud Run

# Deploy the agent web app image as the Cloud Run service `vmg-agent-ui-1`,
# listening on port 8000. The deployed endpoint's resource name is passed in
# through the GOOGLE_MODEL_GARDEN_ENDPOINT environment variable, which the
# generated agent.py reads.
# NOTE(review): `--allow-unauthenticated` presumably makes the service
# reachable without credentials - confirm that is acceptable for your project.
! gcloud run deploy vmg-agent-ui-1 \
    --port 8000 \
    --image="{VMG_AGENT_UI_CONTAINER_IMAGE_URI}" \
    --region="{REGION}" \
    --platform=managed \
    --allow-unauthenticated \
    --memory=1024Mi \
    --set-env-vars="GOOGLE_MODEL_GARDEN_ENDPOINT={endpoint_resource_name}"
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Test With Web App
# @markdown After the deployment, there will be a service URL. You can click
# @markdown the service URL and interact with the agent web app. The agent web
# @markdown app is a built-in development UI in [ADK](https://github.com/google/adk-python?tab=readme-ov-file)
# @markdown that helps you test, evaluate, debug, and showcase your agent(s).

# @markdown ![ADK WEB UI](https://raw.githubusercontent.com/google/adk-python/main/assets/adk-web-dev-ui-function-call.png)

## Integrate OSS LLMs With Agent Engine

In [None]:
# @title Build Agents With Endpoints And Agent Engine

# @markdown This section will build agents using deployed endpoints above and agent engine.
# @markdown If you already have an existing endpoint, which has the format as
# @markdown `projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_NAME}`,
# @markdown you can load the endpoint by `endpoint = aiplatform.Endpoint(endpoint_resource_name)`.

# endpoint_resource_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_NAME}"
# )

endpoint = aiplatform.Endpoint(endpoint_resource_name)
# The region is the fourth path segment of the endpoint resource name.
location = endpoint_resource_name.split("/")[3]
# Dedicated endpoints are reached through their own DNS name; all other
# endpoints go through the regional Vertex AI host.
api_host = (
    endpoint.gca_resource.dedicated_endpoint_dns
    if endpoint.gca_resource.dedicated_endpoint_enabled
    else f"{location}-aiplatform.googleapis.com"
)
base_url = f"https://{api_host}/v1beta1/{endpoint.resource_name}"


def model_builder(
    *,
    model_name: str,
    model_kwargs=None,
    project: str,  # Specified via vertexai.init
    location: str,  # Specified via vertexai.init
    **kwargs,
):
  """Builds the chat model that talks to the deployed endpoint.

  A fresh access token is obtained from the default Google credentials and
  used as the API key of a `ChatOpenAI` client pointed at the module-level
  `base_url`.

  Args:
      model_name: Accepted for the builder interface; not used here.
      model_kwargs: Optional keyword arguments forwarded to `ChatOpenAI`.
      project: Accepted for the builder interface; not used here.
      location: Accepted for the builder interface; not used here.
      **kwargs: Accepted for the builder interface; not used here.

  Returns:
      A `ChatOpenAI` instance configured for the endpoint.
  """
  # Imported here so the function does not depend on another module having
  # loaded the transport submodule: `import google.auth` alone does not
  # guarantee that `google.auth.transport.requests` is available.
  import google.auth.transport.requests

  # Note: the credential expires after 1 hour by default.
  # After expiration, it must be refreshed.
  creds, _ = google.auth.default(
      scopes=["https://www.googleapis.com/auth/cloud-platform"]
  )
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)

  if model_kwargs is None:
    model_kwargs = {}

  return ChatOpenAI(
      model="",
      base_url=base_url,
      api_key=creds.token,
      **model_kwargs,
  )


# @markdown Use the following parameters to generate different answers:
# @markdown *   `max_tokens` to control the max tokens of the response
# @markdown *   `temperature` to control the randomness of the response

# Sampling settings passed to the model through `model_kwargs` when the agent
# is created.
max_tokens = 1000  # @param {type:"number"}
temperature = 1.0  # @param {type:"number"}


# Prompt pipeline: the mapping on the left picks the fields out of the input
# dict (and converts the intermediate steps to tool messages); the chat
# template on the right arranges them as system prompt, history, user input,
# AI prompt and agent scratchpad.
prompt = {
    "system_prompt": lambda x: x["system_prompt"],
    "history": lambda x: x["history"],
    "input": lambda x: x["input"],
    "ai_prompt": lambda x: x["ai_prompt"],
    "agent_scratchpad": lambda x: format_to_tool_messages(
        x["intermediate_steps"]
    ),
} | prompts.ChatPromptTemplate.from_messages([
    prompts.MessagesPlaceholder(variable_name="system_prompt"),
    prompts.MessagesPlaceholder(variable_name="history"),
    ("user", "{input}"),
    prompts.MessagesPlaceholder(variable_name="ai_prompt"),
    prompts.MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# In-memory chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str):
  """Returns the chat history of a session, creating it on first use.

  Args:
      session_id: Key identifying the conversation.

  Returns:
      The history object stored for `session_id`; a new `ChatMessageHistory`
      is stored and returned when the session is seen for the first time.
  """
  try:
    return store[session_id]
  except KeyError:
    history = store[session_id] = ChatMessageHistory()
    return history


# Assemble the LangChain agent. The model itself comes from `model_builder`
# (so `model` is left empty), the prompt pipeline and the per-session chat
# history are the ones defined above, and `model_kwargs` is what
# `model_builder` receives.
agent = agent_engines.LangchainAgent(
    prompt=prompt,
    model="",  # Required.
    chat_history=get_session_history,
    model_builder=model_builder,  # Required.
    model_kwargs={
        "temperature": temperature,  # Optional.
        "max_tokens": max_tokens,  # Optional.
        "extra_body": {},
    },
)

In [None]:
# @title Test The Agent Locally
# @markdown You can check the agent result before deploying to Vertex AI.


def test_agent(running_agent, query, session_id):
  response = running_agent.query(
      input={
          "system_prompt": [("system", system_prompt_with_tools)],
          "input": query,
          "ai_prompt": [],
      },
      config={"configurable": {"session_id": session_id}},
  )
  tool_results = call_tools(response["output"])

  if tool_results:
    temporal_result = response["output"] + json.dumps(tool_results)
    response = running_agent.query(
        input={
            "system_prompt": [("system", system_prompt_without_tools)],
            "ai_prompt": [("ai", temporal_result)],
            "input": query,
        },
        config={"configurable": {"session_id": session_id}},
    )

  print(response["output"])


# Question and chat session used for the local check of the agent.
query = "What is the weather and current time in SF?"  # @param {type:"string"}
session_id = "demo"  # @param {type:"string"}

# Run the query against the local (not yet deployed) agent and print the
# answer.
test_agent(
    running_agent=agent,
    query=query,
    session_id=session_id,
)

In [None]:
# @title Deploy Agent On Vertex AI

# @markdown This section will deploy the agent on Vertex AI.
# @markdown The supported regions for agent engine are listed [here](https://cloud.google.com/vertex-ai/generative-ai/docs/agent-engine/overview#supported-regions).
region_to_deploy_agent = "us-central1"  # @param {type:"string"}
# Re-initialize the SDK for the agent's region, which may differ from the
# region the notebook was initialized with earlier.
vertexai.init(
    project=PROJECT_ID,
    location=region_to_deploy_agent,
    staging_bucket=BUCKET_URI,
)

# Deploy the local agent to Agent Engine together with the packages it needs.
remote_agent = agent_engines.create(
    agent,
    requirements=[
        "google-cloud-aiplatform[langchain,agent_engines]",
        "cloudpickle==3.0.0",
        "pydantic==2.10.6",
        "requests",
        "langchain-openai",
    ],
)

# @markdown After the deployment, you can get the agent resource name with the format
# @markdown as `projects/{PROJECT_ID}/locations/{REGION}/reasoningEngines/{RESOURCE_ID}`.
# @markdown The agent resource name will be used below.

remote_agent_resource_name = remote_agent.resource_name
print(
    "The deployed remote agent resource name is: ", remote_agent_resource_name
)

In [None]:
# @title Test The Remote Agent

# To test an agent that was deployed earlier, set its resource name here:
# remote_agent_resource_name = f"projects/{PROJECT}/locations/{REGION}/reasoningEngines/{RESOURCE_ID}"

# Load the deployed agent by its resource name.
remote_agent = agent_engines.get(remote_agent_resource_name)

# Question and chat session used for the remote check.
query = "What is the weather in SF?"  # @param {type:"string"}
session_id = "demo"  # @param {type:"string"}

# Same two-pass check as for the local agent, now against the deployed one.
test_agent(
    running_agent=remote_agent,
    query=query,
    session_id=session_id,
)

## Clean up resources

In [None]:
# @markdown  Delete the experiment resources to avoid unnecessary continuous
# @markdown  charges that may incur.

delete_endpoint = False # @param {type:"boolean"}
delete_artifact_registry = False # @param {type:"boolean"}
delete_tutorial_folder = False # @param {type:"boolean"}
delete_agent_engine = False  # @param {type:"boolean"}

if delete_endpoint:
  # Undeploy model and delete endpoint.
  endpoint.delete(force=True)

if delete_artifact_registry:
    ! gcloud artifacts repositories delete $REPOSITORY_NAME \
          --repository-format=docker \
          --location=$REGION \
          --project=$PROJECT_ID

if delete_tutorial_folder:
    import shutil
    shutil.rmtree(TUTORIAL_DIR)

if delete_agent_engine:
    remote_agent.delete()
