In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Multimodal Live API with Gen AI SDK


<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fmultimodal-live-api%2Fintro_multimodal_live_api_genai_sdk.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>


| | |
|-|-|
| Author(s) |  [Eric Dong](https://github.com/gericdong), [Holt Skinner](https://github.com/holtskinner) |

## Overview

The Multimodal Live API enables low-latency bidirectional voice and video interactions with Gemini. The API is designed for server-to-server communication. This notebook walks through the following simple examples to help you get started with the Multimodal Live API using the Google Gen AI SDK in Vertex AI:

- Text to text
- Text to audio
- Text to audio in a chat

See the [Multimodal Live API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live) page for more details.

## Getting Started

### Install Google Gen AI SDK for Python


In [None]:
%pip install --upgrade --quiet google-genai

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Authenticate the user only when running inside Google Colab, detected by
# the google.colab module already being loaded in this kernel.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Import libraries


In [3]:
import os

from IPython.display import Audio, Markdown, display
from google import genai
from google.genai.types import LiveConnectConfig
import numpy as np

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [4]:
PROJECT_ID = "[your-project-id]"  # @param {type: "string"}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment when the placeholder was left untouched.
    # Keep the value as None when the variable is unset: wrapping the lookup
    # in str() would yield the literal string "None", which looks like a real
    # project ID and only fails later, at request time.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region for Vertex AI requests; defaults to us-central1 when
# GOOGLE_CLOUD_REGION is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

In [5]:
# Create a Gen AI SDK client that targets Vertex AI, using the project and
# location configured in the previous cell.
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
)

### Load the Gemini 2.0 Flash model

Multimodal Live API is a new capability introduced with the [Gemini 2.0 Flash model](https://cloud.google.com/vertex-ai/generative-ai/docs/gemini-v2).

In [6]:
# Model name passed to every live session opened in this notebook.
MODEL_ID = "gemini-2.0-flash-exp"  # @param {type: "string"}

## **Example 1**: Text to text

You send one text prompt and receive a text response.

In [10]:
config = LiveConnectConfig(response_modalities=["TEXT"])

async with client.aio.live.connect(
    model=MODEL_ID,
    config=config,
) as session:
    text_input = "Hello? Gemini are you there?"
    display(Markdown(f"**Input:** {text_input}"))

    await session.send(input=text_input, end_of_turn=True)

    response = []

    async for message in session.receive():
        if message.server_content.model_turn:
            response.append(message.text)

    display(Markdown(f"**Response >** {''.join(response)}"))

**Input:** Hello? Gemini are you there?

**Response >** Yes, I'm here. How can I help you today?


## **Example 2**: Text to audio

You send a text prompt and receive the response as audio.


In [12]:
config = LiveConnectConfig(response_modalities=["AUDIO"])

async with client.aio.live.connect(
    model=MODEL_ID,
    config=config,
) as session:
    text_input = "Hello? Gemini are you there?"
    display(Markdown(f"**Input:** {text_input}"))

    await session.send(input=text_input, end_of_turn=True)

    audio_data = []
    async for message in session.receive():
        if message.server_content.model_turn:
            for part in message.server_content.model_turn.parts:
                if part.inline_data:
                    audio_data.append(
                        np.frombuffer(part.inline_data.data, dtype=np.int16)
                    )

    if audio_data:
        display(Audio(np.concatenate(audio_data), rate=24000, autoplay=True))

**Input:** Hello? Gemini are you there?

## **Example 3**: Text to audio in a chat

**Step 1**: You set up a chat with the API to answer your text prompts and return responses in audio.

In [13]:
# Ask the model to answer with audio only.
config = LiveConnectConfig(response_modalities=["AUDIO"])


async def main() -> None:
    """Run an interactive text-in / audio-out chat over a single live session.

    Repeatedly prompts for text, sends it as a complete turn, and plays the
    audio reply. The loop ends when the user enters ``q``, ``quit`` or
    ``exit``.
    """
    async with client.aio.live.connect(model=MODEL_ID, config=config) as session:

        async def send() -> bool:
            """Read one prompt from the user and send it to the session.

            Returns:
                False if the user asked to quit, True once the prompt was sent.
            """
            text_input = input("Input > ")
            # Tolerate surrounding whitespace in the quit command.
            if text_input.strip().lower() in ("q", "quit", "exit"):
                return False
            await session.send(input=text_input, end_of_turn=True)
            return True

        async def receive() -> None:
            """Collect the audio chunks of one model turn and display them."""
            audio_data = []

            async for message in session.receive():
                # Skip messages that carry no model content instead of
                # failing with an AttributeError on a missing server_content.
                server_content = message.server_content
                if server_content is None:
                    continue

                if server_content.model_turn:
                    for part in server_content.model_turn.parts or []:
                        if part.inline_data and part.inline_data.data:
                            audio_data.append(
                                np.frombuffer(part.inline_data.data, dtype=np.int16)
                            )

                if server_content.turn_complete:
                    display(Markdown("**Response >**"))
                    # np.concatenate raises ValueError on an empty list, so
                    # only build the player when audio actually arrived.
                    if audio_data:
                        display(
                            Audio(np.concatenate(audio_data), rate=24000, autoplay=True)
                        )
                    else:
                        display(Markdown("_No audio was returned for this turn._"))
                    break

        # Alternate between sending one prompt and playing one reply until
        # the user quits.
        while True:
            if not await send():
                break
            await receive()

**Step 2**: Run the chat and enter your prompts, or type `q`, `quit`, or `exit` to end the session.


In [14]:
# Start the chat loop; it returns once you enter q, quit or exit.
await main()

Input > Hello


**Response >**

Input > What can you do?


**Response >**

Input > q


## What's next

- Learn how to [build a web application that enables you to use your voice and camera to talk to Gemini 2.0 through the Multimodal Live API](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/multimodal-live-api/websocket-demo-app).
- See the [Multimodal Live API reference docs](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live).
- See the [Google Gen AI SDK reference docs](https://googleapis.github.io/python-genai/).
- Explore other notebooks in the [Google Cloud Generative AI GitHub repository](https://github.com/GoogleCloudPlatform/generative-ai).