In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini Live API Quickstart

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/live_api_quickstart.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fmultimodal-live-api%2Flive_api_quickstart.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/multimodal-live-api/live_api_quickstart.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/live_api_quickstart.ipynb">
      <img width="32px" src="https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<p>
<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/live_api_quickstart.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/live_api_quickstart.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/live_api_quickstart.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/live_api_quickstart.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/live_api_quickstart.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>
</p>

| Authors |
| --- |
| [Eric Dong](https://github.com/gericdong) |

## Overview

This notebook demonstrates how to connect to the **Gemini Live API** for real-time, bidirectional audio streaming. You will learn to establish a session with a Gemini model, simulate an audio stream from a file, and play back the generated audio response.

The guide provides two implementation examples:

- [Gen AI SDK](https://github.com/googleapis/python-genai): A simplified approach using the Google **Gen AI SDK** to manage the session and handle interruptions.

- [WebSocket](https://developer.mozilla.org/en-US/docs/Web/API/WebSockets_API): A low-level approach using standard **WebSockets** to construct the handshake and manage raw JSON payloads.

## Getting Started

### Install required libraries

In [None]:
%pip install --upgrade google-genai

### Import libraries

In [None]:
import asyncio
import base64
import os
import sys
import json
import wave
import websockets
import numpy as np

from IPython.display import Audio, display

from google import genai
from google.genai import types

### Authenticate your notebook environment

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
# Interactive user authentication is only triggered when the Colab runtime
# module is already loaded in this kernel.
_in_colab = "google.colab" in sys.modules
if _in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Authenticate your Google Cloud Project for Vertex AI

You can use a Google Cloud Project or an API Key for authentication. This tutorial uses a Google Cloud Project.

- [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)

In [None]:
# fmt: off
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# fmt: on
# Fall back to the environment when the form field is empty or still holds the
# placeholder value.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

# Fail fast with an actionable message; wrapping a missing variable in str()
# would silently produce the literal project ID "None".
if not PROJECT_ID:
    raise ValueError(
        "No project configured: set PROJECT_ID in this cell or export "
        "GOOGLE_CLOUD_PROJECT before running it."
    )

LOCATION = "us-central1"  # @param {type: "string", placeholder: "global"}

# Vertex AI client used by the Gen AI SDK quickstart.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Choose a Gemini model



In [None]:
# fmt: off
# Model used by both quickstarts: passed to the SDK's live.connect() and
# embedded in the fully qualified model path for the raw WebSocket setup.
MODEL_ID = "gemini-live-2.5-flash-preview-native-audio-09-2025"  # @param {type: "string"}
# fmt: on

### About audio streaming

In these guides, you send an audio file to the model and receive audio in response. In production, the input audio would be a microphone stream. The Live API supports the following audio formats:

- **Input audio**: Raw 16-bit PCM audio at 16kHz, little-endian
- **Output audio**: Raw 16-bit PCM audio at 24kHz, little-endian

The client must maintain a playback buffer. The server streams audio in chunks (within `server_content` messages). The client's responsibility is to:

- **Decode**: Base64 decode the `inline_data`.
- **Buffer**: Append the binary data to a queue.
- **Play**: Feed the data to the audio hardware.


In [None]:
# Download a sample audio input file
audio_file = "input.wav"
audio_file_url = f"https://storage.googleapis.com/cloud-samples-data/generative-ai/audio/tell-a-story.wav"

!wget -q $audio_file_url -O $audio_file

with wave.open(audio_file, 'rb') as wf:
    frames = wf.readframes(wf.getnframes())
    print(f"Read audio: {len(frames)} bytes")
    print(f"Channels: {wf.getnchannels()}")
    print(f"Rate: { wf.getframerate()}Hz")
    print(f"Width: {wf.getsampwidth()} bytes")

display(Audio(filename=audio_file, autoplay=True))

## 🚀 Quickstart 1: Using Gen AI SDK

In this quickstart, you learn how to connect to the Live API using the [Google Gen AI SDK](https://github.com/googleapis/python-genai), establish a session, send an audio file to the model, and receive audio in response. This is a simplified approach in which the SDK manages the session and helps you handle interruptions.


In [None]:
# Configuration
config = {
    "response_modalities": ["audio"],
}


async def main():
    # Establish WebSocket session
    async with client.aio.live.connect(model=MODEL_ID, config=config) as session:
        print("Session established. Sending audio...")

        # Send Input (Simulated from file)
        # In production, this would be a microphone stream
        with open("input.wav", "rb") as f:
            while chunk := f.read(1024):
                await session.send_realtime_input(
                    audio=types.Blob(data=chunk, mime_type="audio/pcm;rate=16000")
                )
                await asyncio.sleep(0.01)  # Simulate real-time stream

        audio_data = []

        # Receive Output
        async for message in session.receive():
            if message.server_content:
                # Check for interruptions (User barge-in)
                if message.server_content.interrupted:
                    print("[Interrupted] Clear client audio buffer immediately.")
                    continue

                # Process Audio Chunks
                model_turn = message.server_content.model_turn
                if model_turn and model_turn.parts:
                    for part in model_turn.parts:
                        if part.inline_data:
                            # Output is PCM, 24kHz, 16-bit, Mono
                            print(
                                f"Received audio chunk: {len(part.inline_data.data)} bytes"
                            )
                            audio_data.append(
                                np.frombuffer(part.inline_data.data, dtype=np.int16)
                            )

            if message.server_content.turn_complete:
                print("Turn complete.")
                display(Audio(np.concatenate(audio_data), rate=24000, autoplay=True))


# Run directly in notebook
await main()

## 🚀 Quickstart 2: Using WebSocket

In this quickstart, you learn how to connect to the Live API using [WebSockets](https://developer.mozilla.org/en-US/docs/Web/API/WebSockets_API), send an audio file to the model, and receive audio in response. This is a low-level approach that uses standard WebSockets to construct the handshake and manage raw JSON payloads.

In [None]:
# Authentication
token_list = !gcloud auth application-default print-access-token
headers = {"Authorization": f"Bearer {token_list[0]}"}

# Configuration
MODEL = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{MODEL_ID}"
config = {
   "response_modalities": ["audio"],
}

# Construct the WSS URL
HOST = f"{LOCATION}-aiplatform.googleapis.com"
URI = f"wss://{HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"

async def main():
    # Connect to the server
    async with websockets.connect(URI, additional_headers=headers) as ws:
      # Send Setup (Handshake)
      await ws.send(json.dumps({
          "setup": {
              "model": MODEL,
              "generation_config": config
          }
      }))
      print("Session established. Sending audio...")

      # Receive setup response
      raw_response = await ws.recv(decode=False)
      setup_response = json.loads(raw_response.decode("ascii"))

      # Send Input (Simulated from file)
      # In production, this would be a microphone stream
      with open("input.wav", "rb") as f:
          while chunk := f.read(1024):
              msg = {
                  "realtime_input": {
                      "media_chunks": [{
                          "mime_type": "audio/pcm;rate=16000",
                          "data": base64.b64encode(chunk).decode("utf-8")
                      }]
                  }
              }
              await ws.send(json.dumps(msg))
              await asyncio.sleep(0.01) # Simulate real-time stream

      audio_data = []

      # Receive chunks of server response
      async for raw_response in ws:
          data = json.loads(raw_response.decode())
          try:
              parts = data["serverContent"]["modelTurn"]["parts"]
              for part in parts:
                  if "inlineData" in part:
                      b64_audio = part["inlineData"]["data"]
                      print(f"Received chunk: {len(b64_audio)} bytes")
                      pcm_data = base64.b64decode(b64_audio)
                      audio_data.append(np.frombuffer(pcm_data, dtype=np.int16))
          except KeyError:
              pass

          if data.get("serverContent", {}).get("turnComplete"):
              print("Turn complete.")
              display(Audio(np.concatenate(audio_data), rate=24000, autoplay=True))
              break


# Run directly in notebook
await main()

## What's next

Now that you have established a basic connection to the Gemini Live API, try these advanced capabilities to build a production-ready application:

- [Getting Started with the Live API in Vertex AI using WebSockets](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api.ipynb)
- [Getting Started with the Live API using Gen AI SDK](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_multimodal_live_api_genai_sdk.ipynb)
