In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving Open-Source LLMs on Vertex AI with LiteLLM and OpenAI-Compatible APIs

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/model_garden_litellm_inference.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fuse-cases%2Fmodel_garden_litellm_inference.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/use-cases/model_garden_litellm_inference.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/model_garden_litellm_inference.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/model_garden_litellm_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/model_garden_litellm_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/model_garden_litellm_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/model_garden_litellm_inference.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/model_garden_litellm_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author |
| --- |
| [Eliza Huang](https://github.com/lizzij) |

## Overview

This notebook demonstrates how to:
- Deploy an open-source LLM (e.g. DeepSeek, Qwen, LLaMA, Gemma) via [Vertex AI Model Garden](https://cloud.google.com/vertex-ai/docs/generative-ai/model-garden).
- Serve the model with a public Vertex AI endpoint.
- Connect it to [LiteLLM](https://docs.litellm.ai/) using an OpenAI-compatible schema for chat completion and function calling.


## Get started

### Install Vertex AI SDK and other required packages


In [None]:
# Install or upgrade the Vertex AI SDK, LiteLLM, the OpenAI client, google-auth
# and requests in the kernel's environment. --force-reinstall reinstalls the
# packages even when they are already present; --quiet trims pip's output.
%pip install --upgrade --force-reinstall --quiet 'google-cloud-aiplatform>=1.93.1' 'litellm' 'openai' 'google-auth' 'requests'

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Only authenticate when the google.colab package has been loaded, i.e. the
# notebook is running on Colab; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    # Imported inside the guard so the cell does not fail where the package
    # is not installed.
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Resolve the Google Cloud project and region, then initialise the Vertex AI SDK.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

# When the form field is untouched (placeholder) or blank, fall back to the
# GOOGLE_CLOUD_PROJECT environment variable. os.getenv returns a str or None,
# so no further conversion is needed.
if PROJECT_ID == "[your-project-id]" or not PROJECT_ID:
    PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")

# Region comes from GOOGLE_CLOUD_REGION, defaulting to us-central1.
LOCATION = os.getenv("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [None]:
import os
import re

from vertexai import model_garden

### Define helpers

In [None]:
def print_models(data_list: list[str], items_per_line: int = 2) -> None:
    """Print model names in rows, preceded by a banner and the total count.

    Names within a row are separated by " --- "; the separator is only placed
    between names, so a short final row does not end with a dangling one.

    Args:
        data_list: Model names to display.
        items_per_line: Maximum number of names printed per row; must be >= 1.

    Raises:
        ValueError: If items_per_line is smaller than 1.
    """
    # Validate before printing anything so a bad argument leaves no partial output.
    if items_per_line < 1:
        raise ValueError(f"items_per_line must be >= 1, got {items_per_line}")

    print("🌟--- Models available ---🌟")
    print("\n")
    print(f"🔢 Total models: {len(data_list)} 🔢\n")  # Print the count here

    # Walk the list one row at a time and join each row's names.
    for start in range(0, len(data_list), items_per_line):
        row = data_list[start : start + items_per_line]
        print(" --- ".join(f"✨ {item} " for item in row))

## Find the models that you can deploy

In Vertex AI Model Garden, you can discover and deploy a wide range of open-source models.

Many of these models are directly supported in Vertex AI Model Garden, and some are pre-configured for optimized deployment on Vertex AI. When an open model is not available in Vertex AI Model Garden, or you want to deploy a model from the Hugging Face Hub, you can leverage the Hugging Face gallery, which gives you access to more than 1M models.

With the Vertex AI Model Garden SDK, you can browse and deploy supported models, including those from the Hugging Face gallery. You can also filter by model name or type to find the best fit for your use case.

Let's explore which Llama models are available in Vertex AI Model Garden.

In [None]:
# List the deployable models matching the "Llama" filter.
# list_hf_models=False leaves Hugging Face gallery models out of the results.
model_garden_models = model_garden.list_deployable_models(
    model_filter="Llama", list_hf_models=False
)

In [None]:
# Show the matching models, three names per row.
print_models(model_garden_models, items_per_line=3)

To include Llama models that are available via the Hugging Face gallery, enable the `list_hf_models` flag.

In [None]:
# Same query, this time with list_hf_models=True so Hugging Face gallery
# models are included as well.
# NOTE(review): the filter is spelled "llama" here but "Llama" in the earlier
# query; confirm whether model_filter is case-sensitive.
deployable_models = model_garden.list_deployable_models(
    model_filter="llama", list_hf_models=True
)

In [None]:
# Show the combined list using the helper's default of two names per row.
print_models(deployable_models)

## Deploy your 1st Model Garden model

In this section, you'll deploy an open-source model from Vertex AI Model Garden to a public endpoint. You can use the Cloud Console or Vertex AI SDK for deployment.

For more advanced workflows (e.g., custom hardware, model tuning, or shared endpoints), refer to the [official SDK notebook](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk.ipynb).

In [None]:
# Model Garden identifier of the model to deploy (Llama 3.1 8B Instruct).
model_id = "meta/llama3_1@llama-3.1-8b-instruct"

# Handle used by the following cells to inspect deploy options and to deploy.
model = model_garden.OpenModel(model_id)

### Check the deployment configuration

After you instantiate the model, use the `list_deploy_options()` method to discover the verified deployment configurations supported by that model.

This is important to verify if you have enough resources to deploy the model.

In [None]:
# Fetch the verified deployment configurations for this model (concise form)
# and print them so the required resources can be checked before deploying.
deploy_options = model.list_deploy_options(concise=True)
print(deploy_options)

### Deploy the model

Now that you know how the model will be deployed, use the `deploy()` method to serve the selected open model on a Vertex AI endpoint. Depending on the model, deployment can take several minutes.

> **Note**: If the model has an End User License Agreement (EULA), you can accept it using the `accept_eula` flag.


In [None]:
# Deploy the model; the returned endpoint object is reused by the LiteLLM
# setup and cleanup cells below.
# NOTE(review): the serving image tag is pinned to a 20241016 build; confirm
# it is still the image recommended for this model.
endpoint = model.deploy(
    accept_eula=True,  # accept the model's End User License Agreement
    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241016_0916_RC00_maas",
    machine_type="g2-standard-12",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
)

## LiteLLM Inference

### Set up LiteLLM

Configure [LiteLLM](https://docs.litellm.ai/) to route OpenAI-compatible API calls (e.g. chat, function calling) to your Vertex AI endpoint. Use the `vertex_ai/openai/{endpoint_id}` format to enable OpenAI-style behavior. Other formats like `vertex_ai/gemini/{endpoint}` are also supported — see the full list [here](https://litellm-api.up.railway.app/).

In [None]:
def _setup_model_garden_endpoint(endpoint_resource_name: str) -> str:
    """Configure LiteLLM's environment for a Vertex AI endpoint and return its model name.

    Parses the endpoint resource name, exports the project and location through
    environment variables, and builds the LiteLLM model string that routes
    OpenAI-style calls to the endpoint.

    Args:
        endpoint_resource_name: The full resource name of the endpoint,
            e.g., "projects/{project}/locations/{location}/endpoints/{endpoint}".

    Returns:
        A formatted string for the endpoint, e.g., "vertex_ai/openai/{endpoint_id}".

    Raises:
        ValueError: If the endpoint_resource_name does not match the expected format.
    """
    _MODEL_GARDEN_ENDPOINT_REGEX = r"^projects/([^/]+)/locations/([^/]+)/endpoints/([^/]+)$"

    match = re.fullmatch(_MODEL_GARDEN_ENDPOINT_REGEX, endpoint_resource_name)
    if not match:
        # Only the first literal is an f-string. The second is a plain string,
        # so its braces must be single to show up as {project} in the message.
        raise ValueError(
            f"Invalid model garden endpoint: '{endpoint_resource_name}'. "
            "Please use the format 'projects/{project}/locations/{location}/endpoints/{endpoint}'."
        )

    project_id, location_id, endpoint_id = match.groups()

    # setdefault keeps any value already set for this flag.
    os.environ.setdefault("GOOGLE_GENAI_USE_VERTEXAI", "True")
    # Project and location are always overwritten so they match the endpoint.
    os.environ["VERTEXAI_PROJECT"] = project_id
    os.environ["VERTEXAI_LOCATION"] = location_id
    # Forces verbose LiteLLM logging for this tutorial; reconsider for production.
    os.environ["LITELLM_LOG"] = "DEBUG"

    return f"vertex_ai/openai/{endpoint_id}"

# Derive the LiteLLM model string from the deployed endpoint's resource name;
# the completion calls in the following cells pass it as `model`.
deployed_model = _setup_model_garden_endpoint(endpoint.resource_name)
print(f"The current model is: {deployed_model}")

### Chat Completion via LiteLLM

This section demonstrates how to send role-based chat messages to your Vertex AI–hosted model using LiteLLM’s `completion` API. For examples and formatting guidance, refer to the [LiteLLM input documentation](https://docs.litellm.ai/docs/completion/input).

In [None]:
from litellm import completion

# Send a system prompt plus one user question to the deployed endpoint,
# addressed through the LiteLLM model string built earlier.
response = completion(
    model=deployed_model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What's the capital of Japan?"}
    ]
)

# Print only the reply text of the first choice.
print(response["choices"][0]["message"]["content"])

### Function Calling via LiteLLM

Here we use OpenAI-style function calling with LiteLLM and a Vertex AI–hosted model. You’ll define a function schema, and the model will return structured arguments suitable for downstream execution. For reference, see [`test_parallel_function_call`](https://docs.litellm.ai/docs/completion/function_call#full-code---parallel-function-calling-with-gpt-35-turbo-1106) in the LiteLLM documentation.

In [None]:
import google.auth
# `import google.auth` alone does not load this submodule; import it explicitly
# so google.auth.transport.requests.Request() resolves on a fresh kernel.
import google.auth.transport.requests

# Load the default credentials, then refresh them over an HTTP transport so
# that creds.token is populated for the next cell.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

In [None]:
import litellm
import json

# Expose the refreshed Google credential token through OPENAI_API_KEY so that
# OpenAI-style clients can pick it up.
# NOTE(review): the token is copied once; presumably it expires, after which
# the credential-refresh cell and this cell must be re-run. Confirm.
import os
os.environ['OPENAI_API_KEY'] = creds.token

# Stand-in weather lookup that always answers with canned data.
# A real application would call its backend or an external weather API here.
def get_current_weather(location, unit="fahrenheit"):
    """Return a canned weather report for `location` as a JSON string.

    The lookup is a case-insensitive substring match, checked in the order
    Tokyo, San Francisco, Paris; the first hit wins. Any other location gets
    a report whose temperature is "unknown". `unit` is accepted for schema
    compatibility but does not influence the result.
    """
    canned_reports = (
        ("tokyo", {"location": "Tokyo", "temperature": "10", "unit": "celsius"}),
        ("san francisco", {"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}),
        ("paris", {"location": "Paris", "temperature": "22", "unit": "celsius"}),
    )

    lowered = location.lower()
    for keyword, report in canned_reports:
        if keyword in lowered:
            return json.dumps(report)

    return json.dumps({"location": location, "temperature": "unknown"})


def test_parallel_function_call():
    """Run an OpenAI-style function-calling round trip against the deployed model.

    Sends a weather question together with the `get_current_weather` tool
    schema, executes every tool call the model requests, appends the results
    to the conversation and asks the model for a final answer.

    Returns:
        The second model response when the model requested tool calls, the
        first response when it answered without requesting any, or None when
        an exception was caught (the error is printed instead of raised).
    """
    try:
        # Step 1: send the conversation and available functions to the model
        messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_current_weather",
                    "description": "Get the current weather in a given location",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g. San Francisco, CA",
                            },
                            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                        },
                        "required": ["location"],
                    },
                },
            }
        ]
        response = litellm.completion(
            model=deployed_model,
            messages=messages,
            tools=tools,
            tool_choice="auto",  # auto is default, but we'll be explicit
        )
        print("\nFirst LLM Response:\n", response)
        response_message = response.choices[0].message
        # tool_calls may be None when no tool was requested; normalise it to a
        # list so the len() below cannot raise.
        tool_calls = response_message.tool_calls or []

        print("\nLength of tool calls", len(tool_calls))

        # Step 2: check if the model wanted to call a function
        if not tool_calls:
            # Nothing to execute: the first response is already the answer.
            return response

        # Step 3: call the function
        # Note: the JSON arguments may not always be valid; a decoding error or
        # an unknown function name is reported by the handler at the bottom.
        available_functions = {
            "get_current_weather": get_current_weather,
        }  # only one function in this example, but you can have multiple
        messages.append(response_message)  # extend conversation with assistant's reply

        # Step 4: send the info for each function call and function response to the model
        for tool_call in tool_calls:
            function_name = tool_call.function.name
            function_to_call = available_functions[function_name]
            function_args = json.loads(tool_call.function.arguments)
            # Forward "unit" only when the model supplied it, so an omitted
            # value falls back to the function's own default instead of None.
            call_kwargs = {"location": function_args.get("location")}
            if function_args.get("unit") is not None:
                call_kwargs["unit"] = function_args["unit"]
            function_response = function_to_call(**call_kwargs)
            messages.append(
                {
                    "tool_call_id": tool_call.id,
                    "role": "tool",
                    "name": function_name,
                    "content": function_response,
                }
            )  # extend conversation with function response
        second_response = litellm.completion(
            model=deployed_model,
            messages=messages,
        )  # get a new response from the model where it can see the function response
        print("\nSecond LLM response:\n", second_response)
        return second_response
    except Exception as e:
        # Deliberate best-effort for the demo: report the failure and return None.
        print(f"Error occurred: {e}")

# Run the demo; as the cell's last expression, the returned value is displayed.
test_parallel_function_call()

## Cleaning up

In [None]:
# Set to False to keep the endpoint created by this notebook.
delete_endpoints = True

if delete_endpoints:
    # Per the Vertex AI SDK reference, force=True undeploys any models still on
    # the endpoint before deleting it.
    endpoint.delete(force=True)