In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Chat Completions With Streaming Playground

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_gradio_streaming_chat_completions.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gradio_streaming_chat_completions.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates how to start a [Gradio](https://www.gradio.app/)-based playground that lets users interact with instruction-tuned text generation models through a chatbot UI.

### Objective

This notebook shows how to build a streaming chat UI using [Gradio](https://www.gradio.app/) and models from **Vertex AI Model Garden**.

We cover two options:

1. Public Playground Endpoints — quick demos, no deployment needed.  
2. Self-Deployed Endpoints (via Model Garden SDK) — production-ready, full control over resources, scaling, and networking using [Vertex Online Prediction](https://cloud.google.com/vertex-ai/docs/predictions/get-online-predictions).


### File a Bug

If you encounter issues with this notebook, report them on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new).

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Get Started

### Install Vertex AI SDK and other required packages

In [None]:
# Install the Vertex AI SDK, Gradio, and the auth/HTTP libraries imported by the cells below.
# NOTE(review): `openai` is installed here but is not imported anywhere in the visible notebook — confirm it is still needed.
%pip install --upgrade --force-reinstall --quiet 'google-cloud-aiplatform>=1.106.0' 'gradio~=4.40.0' 'openai' 'google-auth==2.27.0' 'requests==2.32.3'

### Authenticate the Notebook Environment (Colab only)

If you're running this notebook in Google Colab, run the following cell to authenticate.

In [None]:
import sys

# Authenticate only when the `google.colab` module is already loaded, i.e. when
# the notebook is running inside Colab; in any other environment this cell
# does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud Project Information


To get started with Vertex AI, ensure you have an existing Google Cloud project and that the [Vertex AI API is enabled](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

See the guide on [setting up your project and development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment). Also confirm that [billing is enabled](https://cloud.google.com/billing/docs/how-to/modify-project).

In [None]:
# Resolve the project and region, falling back to environment variables when
# the form fields are left at their placeholder/empty values.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Do not wrap the lookup in str(): an unset variable would become the
    # literal string "None" and silently end up in vertexai.init() and in the
    # request URLs built from PROJECT_ID later in the notebook.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Fail here with an actionable message instead of much later with an
    # opaque API error about a project called "None".
    raise ValueError(
        "No Google Cloud project configured: set PROJECT_ID in this cell or "
        "define the GOOGLE_CLOUD_PROJECT environment variable."
    )

REGION = "us-west1"  # @param {type: "string", placeholder: "us-west1", isTemplate: true}

if not REGION:
    REGION = os.environ.get("GOOGLE_CLOUD_REGION", "us-west1")

# PROJECT_ID and REGION are also read as globals by StreamingClient when it
# builds its request URL.
vertexai.init(project=PROJECT_ID, location=REGION)

### Import libraries

In [None]:
import json
from typing import Any, Dict, List, Optional, Tuple

import google.auth
import google.auth.transport.requests
import gradio as gr
import requests
from vertexai import model_garden

## Choose an Endpoint

### [Option 1] Public Playground Endpoint

Google provides some shared endpoints for quick testing. These are **multi-tenant** and intended for experimentation, not production. Use this option if you just want to test the chat UI quickly.

This example is using Gemma-2-2b-it (Public playground).

In [None]:
# Option 1: route requests through the shared public playground endpoint.
use_public_endpoint = True
# Model ID placed in the request payload for the public endpoint; per the text
# above, this example targets Gemma-2-2b-it.
MODEL = "google/796"

### [Option 2] Self-Deployed Endpoint
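Deploy a model from Model Garden with your own settings. You control machine type, scaling, etc. Run the cells in this section only if you chose Option 2: the next cell sets `use_public_endpoint = False`, which overrides the value set in Option 1.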

In [None]:
# Option 2: use a self-deployed endpoint. Running this cell overrides the
# value assigned in the Option 1 cell.
use_public_endpoint = False

#### Choose model variant

You can proceed with the default model variant or select a different one.

To see all deployable model variants available in Model Garden, use:

In [None]:
# Fetch the models that Model Garden reports as deployable. The result is only
# stored here; evaluate `all_deployable_models` in a cell to inspect it.
all_deployable_models = model_garden.list_deployable_models()

Once you've selected a model variant, initialize it:

In [None]:
# Handle for the chosen Model Garden model variant; the cells below call
# list_deploy_options() and deploy() on it.
model = model_garden.OpenModel("openai/gpt-oss@gpt-oss-20b")

#### Check the Deployment Configuration

Use the `list_deploy_options()` method to view the verified deployment configurations for your selected model. This helps ensure you have sufficient resources (e.g., GPU quota) available to deploy it.

> **Note**: Only endpoints that use a **TGI**, **vLLM**, or **HexLLM** serving container image and were deployed after August 20, 2024 with a new container image support chat completions and streaming. If you are not sure, you can deploy a demo endpoint directly using the cells below.

In [None]:
# List the verified deployment configurations for `model` and print them.
# NOTE(review): concise=True presumably requests a condensed summary — confirm
# against the SDK documentation.
deploy_options = model.list_deploy_options(concise=True)
print(deploy_options)

#### Deploy the Model

Now that you’ve reviewed the deployment options, use the `deploy()` method to serve the selected open model to a Vertex AI endpoint. Deployment time may vary depending on the model size and infrastructure requirements.

> **Note**: If the model requires accepting a license agreement (EULA), pass `accept_eula=True` in the deploy call. `use_dedicated_endpoint` is set to `False` in the next cell; set it to `True` if you want to use a [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).

In [None]:
# Passed both to model.deploy() and to StreamingClient below; when True the
# client builds its URL from the endpoint's dedicated DNS name.
use_dedicated_endpoint = False

In [None]:
# Deploy `model` with an explicit serving image and machine shape. The
# returned `endpoint` is later handed to StreamingClient and can be deleted in
# the cleanup cell.
endpoint = model.deploy(
    accept_eula=True,  # accept the model's license agreement, if it has one
    use_dedicated_endpoint=use_dedicated_endpoint,
    # Pinned serving container image (a vLLM serving image, judging by its name).
    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250807_0916_RC01_maas",
    # One H100 80GB accelerator on an a3-highgpu-1g machine.
    machine_type="a3-highgpu-1g",
    accelerator_type="NVIDIA_H100_80GB",
    accelerator_count=1,
)

## Streaming Chat Function

The `StreamingClient.predict` method defined below will:
- Take user input + history
- Call the model (streaming)
- Yield partial outputs so the UI updates in real time

In [None]:
def format_payload(
    messages: List[Dict[str, str]], max_tokens: int, model: Optional[str] = None
) -> Dict[str, Any]:
    """Formats the request payload for the chat completion API.

    :param messages: Chat messages, each a dict with "role" and "content" keys,
        in conversation order.
    :param max_tokens: Value sent as the "max_tokens" field of the request.
    :param model: Optional model ID. It is added to the payload only when it is
        truthy; callers pass it for the public endpoint and omit it for a
        self-deployed endpoint.
    :returns: A dict with "messages", "max_tokens", "stream" (always True) and,
        when `model` is provided, "model".
    """
    payload: Dict[str, Any] = {
        "messages": messages,
        "max_tokens": max_tokens,
        # Streaming is always requested; the caller consumes the response
        # line by line.
        "stream": True,
    }
    # Conditionally add the model for public endpoints
    if model:
        payload["model"] = model
    return payload


class StreamingClient:
    """Streams chat completions from a Vertex AI chat-completions URL.

    The client targets exactly one of: a model ID served through the shared
    public endpoint, or a custom deployed endpoint object.
    """

    def __init__(
        self,
        model: Optional[str] = None,
        endpoint: Optional[Any] = None,
        max_tokens: int = 512,
        use_dedicated_endpoint: bool = False,
        request_timeout: Optional[float] = 300.0,
    ):
        """
        Initializes the client with API configuration.

        The request URL is built from the module-level REGION and PROJECT_ID
        values, so those must be defined before the client is created.

        :param model: The model ID for the public endpoint (e.g., "google/796",
            the value this notebook assigns to MODEL).
        :param endpoint: An object representing a custom deployed endpoint. It
            must expose `resource_name`, and `dedicated_endpoint_dns` when
            `use_dedicated_endpoint` is True.
        :param max_tokens: The maximum number of tokens to generate.
        :param use_dedicated_endpoint: Whether to build the URL from the
            endpoint's dedicated DNS name instead of the regional API host.
            Ignored when `model` is given.
        :param request_timeout: Timeout in seconds passed to `requests.post`;
            None disables the timeout.
        :raises ValueError: If both or neither of `model` and `endpoint` are
            provided.
        """
        self.max_tokens = max_tokens
        self.request_timeout = request_timeout

        if model is not None and endpoint is not None:
            raise ValueError(
                "Must provide either a 'model' (for public API) OR an 'endpoint' (for custom deployment), not both."
            )
        if model is None and endpoint is None:
            raise ValueError(
                "Must provide a 'model' (for public API) or an 'endpoint' (for custom deployment)."
            )

        self.model = model
        self.use_public_endpoint = model is not None

        if self.use_public_endpoint:
            self.url = f"https://{REGION}-aiplatform.googleapis.com/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/openapi/chat/completions"

        elif use_dedicated_endpoint:
            self.url = f"https://{endpoint.dedicated_endpoint_dns}/v1beta1/{endpoint.resource_name}/chat/completions"

        else:
            self.url = f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{endpoint.resource_name}/chat/completions"

    def _get_access_token(self) -> str:
        """Programmatically obtains the access token using google.auth.

        Loads the default credentials with the cloud-platform scope, refreshes
        them, and returns the resulting bearer token.
        """
        credentials, _ = google.auth.default(
            scopes=["https://www.googleapis.com/auth/cloud-platform"]
        )
        auth_request = google.auth.transport.requests.Request()
        credentials.refresh(auth_request)
        return credentials.token

    def predict(self, message: str, chat_history: List[Tuple[str, str]]):
        """
        Sends a request to the chat API and streams the response.

        :param message: The new user message.
        :param chat_history: Previous (user, assistant) turns, oldest first.
        :yields: The accumulated response text after each received content
            delta (each value contains all text streamed so far).
        :raises gr.Error: If the HTTP request fails or the stream carries an
            error payload.
        """
        messages = []
        for user_turn, assistant_turn in chat_history:
            # A turn can be None (e.g. a reply that was never produced); skip
            # it so the payload never carries null content.
            if user_turn is not None:
                messages.append({"role": "user", "content": user_turn})
            if assistant_turn is not None:
                messages.append({"role": "assistant", "content": assistant_turn})
        messages.append({"role": "user", "content": message})

        # The model ID is only sent to the public endpoint.
        model_to_use = self.model if self.use_public_endpoint else None
        payload = format_payload(messages, self.max_tokens, model=model_to_use)

        access_token = self._get_access_token()

        # The context manager releases the streamed connection on every exit
        # path: normal completion, the [DONE] break, an error, or the consumer
        # abandoning the generator early.
        with requests.post(
            self.url,
            headers={"Authorization": f"Bearer {access_token}"},
            json=payload,
            stream=True,
            timeout=self.request_timeout,
        ) as response:
            if not response.ok:
                raise gr.Error(
                    f"API Request Failed: {response.status_code} - {response.text}"
                )

            prediction = ""
            for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False):
                if not chunk:
                    continue
                chunk = chunk.decode("utf-8").removeprefix("data:").strip()
                if chunk == "[DONE]":
                    break
                try:
                    data = json.loads(chunk)
                except json.JSONDecodeError:
                    # Lines that are not JSON are ignored.
                    continue

                if not isinstance(data, dict) or "error" in data:
                    # gr.Error takes a message string, so serialize the payload.
                    raise gr.Error(json.dumps(data))

                choices = data.get("choices")
                if not choices:
                    # Chunks without choices carry no text to display.
                    continue

                delta = (choices[0].get("delta") or {}).get("content")
                if delta:
                    prediction += delta
                    yield prediction

## Build Gradio Interface
Use Gradio to build a chat interface that calls the `client.predict` generator (the `predict` method of `StreamingClient`); the UI shows the messages and the streaming response.

In [None]:
# Build the client for whichever option was selected earlier: the public
# endpoint is addressed by model ID, a self-deployed one by its endpoint object.
client = (
    StreamingClient(model=MODEL)
    if use_public_endpoint
    else StreamingClient(
        endpoint=endpoint, use_dedicated_endpoint=use_dedicated_endpoint
    )
)

# Components created inside the `with` block are attached to `demo`.
demo = gr.Blocks(title="Vertex Model Garden Chat", fill_height=True)
with demo:
    gr.ChatInterface(client.predict)

# Launch without a public share link, with debug output and errors shown in the UI.
demo.launch(share=False, debug=True, show_error=True)

## Cleanup

If you deployed your own endpoint, make sure to delete it to avoid charges:

In [None]:
# endpoint.delete()  # Uncomment when ready