In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Getting Started with Live API Native Audio


<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fmultimodal-live-api%2Fintro_live_api_native_audio.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<p>
<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-live-api/intro_live_api_native_audio.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>
</p>

| Authors |
| --- |
| [Eric Dong](https://github.com/gericdong) |
| [Holt Skinner](https://github.com/holtskinner) |

## Overview

This notebook demonstrates how to connect to the Gemini Live API using the Google Gen AI SDK for Python, focusing on **Native Audio** features like **Proactive Audio** and **Affective Dialog**.


## Getting Started

### Install Google Gen AI SDK for Python


In [None]:
# Install the Google Gen AI SDK, upgrading any copy already present; --quiet trims pip's output.
%pip install --upgrade --quiet google-genai

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The auth prompt is only needed when the google.colab package is already
# loaded in this kernel; in any other environment this cell does nothing.
_running_in_colab = "google.colab" in sys.modules

if _running_in_colab:
    from google.colab import auth

    # Sign the current user in for the rest of the notebook.
    auth.authenticate_user()

### Import libraries


In [None]:
from typing import Any, Dict, List, Optional

from IPython.display import Audio, Markdown, display
from google.genai.types import (
    AudioTranscriptionConfig,
    Content,
    LiveConnectConfig,
    Part,
    ProactivityConfig,
)
import numpy as np

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Resolve the Google Cloud project and region, then create the Gen AI client.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment when the form field was left untouched.
    # The value is deliberately NOT wrapped in str(): a missing variable must
    # stay None rather than become the literal project name "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region defaults to us-central1 unless GOOGLE_CLOUD_REGION is set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

from google import genai

# NOTE(review): when PROJECT_ID is None the client is left to resolve the
# project on its own (presumably from application default credentials) --
# confirm against the google-genai Client documentation.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

## Using Gemini 2.5 Flash Native Audio


Gemini 2.5 Flash with Live API features native audio dialog capabilities.


In [None]:
# Model identifier passed to every `run_live_session` call in this notebook.
# The trailing `@param` annotation must stay on the assignment line
# (presumably it drives the Colab form field -- confirm).
MODEL_ID = "gemini-live-2.5-flash-preview-native-audio-09-2025"  # @param {type: "string"}

## Reusable Live API Modules

The following functions are designed to manage the session configuration, handle a single conversational turn, and execute a multi-turn session.

### `configure_session`

This function creates a flexible `LiveConnectConfig` object to enable or disable features like system instruction, transcription, proactivity, and affective dialog.

In [None]:
def configure_session(
    system_instruction: Optional[str] = None,
    enable_transcription: bool = True,
    enable_proactivity: bool = False,
    enable_affective_dialog: bool = False,
) -> LiveConnectConfig:
    """
    Builds the `LiveConnectConfig` used to open a Live Connect session.

    Args:
        system_instruction: Optional system instruction forwarded unchanged.
        enable_transcription: When True, both input and output audio
            transcription are configured; when False, both are left as None.
        enable_proactivity: When True, a `ProactivityConfig` with
            `proactive_audio=True` is attached; otherwise None.
        enable_affective_dialog: Forwarded as `enable_affective_dialog`.

    Returns:
        A `LiveConnectConfig` whose response modality is always audio.
    """
    # Optional features default to "off" (None) and are filled in on demand.
    transcribe_in = None
    transcribe_out = None
    if enable_transcription:
        # Each direction gets its own config instance.
        transcribe_in = AudioTranscriptionConfig()
        transcribe_out = AudioTranscriptionConfig()

    proactivity_config = None
    if enable_proactivity:
        # Proactive Audio is switched on through proactive_audio=True.
        proactivity_config = ProactivityConfig(proactive_audio=True)

    return LiveConnectConfig(
        response_modalities=["AUDIO"],
        system_instruction=system_instruction,
        input_audio_transcription=transcribe_in,
        output_audio_transcription=transcribe_out,
        proactivity=proactivity_config,
        enable_affective_dialog=enable_affective_dialog,
    )

### `send_and_receive_turn`

This asynchronous function manages a single user turn: it sends the text, streams the audio and transcription messages back from the model, and displays the results.

In [None]:
async def send_and_receive_turn(
    session: genai.live.AsyncSession, text_input: str
) -> Dict[str, Any]:
    """
    Sends a single text turn to the Live Connect session and processes the streaming response.

    Args:
        session: An open Live API session used to send the turn and to
            iterate over the server's messages.
        text_input: The user text sent as one turn.

    Returns:
        A dict with three keys:
        - "audio_data": list of int16 NumPy arrays, one per received audio chunk.
        - "input_transcription": concatenated input transcription text ("" if none).
        - "output_transcription": concatenated output transcription text ("" if none).
    """
    # The audio buffers are interpreted as 16-bit PCM and played back at this rate.
    sample_rate_hz = 24000

    display(Markdown("\n---"))
    display(Markdown(f"**Input:** {text_input}"))

    # 1. Send the user's content
    await session.send_client_content(
        turns=Content(role="user", parts=[Part(text=text_input)])
    )

    audio_data = []
    input_transcriptions = []
    output_transcriptions = []

    # 2. Process the streaming response messages
    async for message in session.receive():
        server_content = message.server_content
        if not server_content:
            # Some server messages carry no server_content at all; reading
            # attributes off None would raise AttributeError, so skip them.
            continue

        # Collect input transcription (what the model heard the user say)
        input_transcription = server_content.input_transcription
        if input_transcription and input_transcription.text:
            input_transcriptions.append(input_transcription.text)

        # Collect output transcription (the model's spoken response text)
        output_transcription = server_content.output_transcription
        if output_transcription and output_transcription.text:
            output_transcriptions.append(output_transcription.text)

        # Collect audio data (the model's spoken response audio chunks)
        model_turn = server_content.model_turn
        if model_turn and model_turn.parts:
            for part in model_turn.parts:
                # Skip parts without a payload: np.frombuffer cannot take None,
                # and an empty buffer contributes no samples.
                if part.inline_data and part.inline_data.data:
                    # Assuming the audio data is always in np.int16 format (24000Hz rate)
                    audio_data.append(
                        np.frombuffer(part.inline_data.data, dtype=np.int16)
                    )

    # 3. Display the results
    results = {
        "audio_data": audio_data,
        "input_transcription": "".join(input_transcriptions),
        "output_transcription": "".join(output_transcriptions),
    }

    if results["input_transcription"]:
        display(Markdown(f"**Input transcription >** {results['input_transcription']}"))

    if results["audio_data"]:
        # Concatenate all audio chunks into one array
        full_audio = np.concatenate(results["audio_data"])
        display(Audio(full_audio, rate=sample_rate_hz, autoplay=True))
    else:
        # This will be triggered on the turns where the model remains silent due to the system instruction
        display(
            Markdown(
                "**Model Response:** *No audio response received (filtered by system instruction).*"
            )
        )

    if results["output_transcription"]:
        display(
            Markdown(f"**Output transcription >** {results['output_transcription']}")
        )

    return results

### `run_live_session`

This function manages the full conversational context, establishing the connection and running a series of defined `turns`.


In [None]:
async def run_live_session(
    model_id: str,
    config: LiveConnectConfig,
    turns: List[str],
) -> List[Dict[str, Any]]:
    """
    Establishes the Live Connect session and runs a series of conversational turns.

    Args:
        model_id: Model identifier passed to `client.aio.live.connect`.
        config: Session configuration; its `system_instruction` is displayed
            before connecting.
        turns: User text inputs, sent one at a time in order.

    Returns:
        One result dict per turn (as returned by `send_and_receive_turn`),
        or an empty list if connecting or any turn raised an exception.
    """
    display(Markdown("## Starting Live Connect Session..."))
    system_instruction = config.system_instruction
    display(Markdown(f"**System Instruction:** *{system_instruction}*"))

    # Defined before the context manager so it is always bound afterwards.
    all_results = []

    try:
        # Use an asynchronous context manager to establish and manage the session lifecycle
        async with client.aio.live.connect(
            model=model_id,
            config=config,
        ) as session:
            display(
                Markdown(f"**Status:** Session established with model: `{model_id}`")
            )

            for turn in turns:
                # Send each user input sequentially
                result = await send_and_receive_turn(session, turn)
                all_results.append(result)

        # Reported only after the `async with` block has exited, so the
        # "Session closed" wording is accurate and a failure while closing
        # is reported as an error instead of following a success message.
        display(Markdown("\n---"))
        display(Markdown("**Status:** All turns complete. Session closed."))
        return all_results
    except Exception as e:
        # Best-effort for a demo notebook: show the failure inline and return
        # an empty result list rather than propagating the exception.
        display(Markdown(f"**Error:** Failed to connect or run session: {e}"))
        return []

## Scenario 1: Proactive Audio (Chime-in Behavior)

This example uses a **System Instruction** and **Proactive Audio** to test the model's ability to remain silent when the topic is off-subject (French cuisine) and chime in only when the conversation shifts to the instructed topic (Italian cooking).

### Conversation Setup and Execution

In [None]:
session_config = configure_session(
    system_instruction="You are an AI assistant in Italian cooking, chime in only when the topic is about Italian cooking.",
    enable_proactivity=True,
)

conversation_turns = [
    # Speaker A speaks, general topic, the model should be silent.
    "Hey, I was just thinking about my dinner plans. I really love cooking.",
    # Speaker B speaks, off-topic (French cuisine). The model should be silent.
    "Oh yes, me too. I love French cuisine, especially making a good coq au vin. I think I'll make that tonight.",
    # Speaker A speaks, shifts to Italian topic. The model should chime in.
    "Hmm, that sounds complicated. I prefer Italian food. Say, do you know how to make a simple Margherita pizza recipe?",
]

results = await run_live_session(MODEL_ID, session_config, conversation_turns)

## Scenario 2: Affective Dialog (Empathy)

This scenario enables **Affective Dialog** (`enable_affective_dialog=True`) and uses a system instruction to create a senior technical advisor persona. The user's input is phrased to convey **frustration**, prompting an empathetic and helpful response from the model.

### Configuration and Execution

In [None]:
affective_config = configure_session(
    enable_transcription=False,
    enable_proactivity=False,
    enable_affective_dialog=True,
    system_instruction="You are a senior technical advisor for a complex AI project.",
)

affective_dialog_turns = [
    "I have been staring at this API docs for two hours now! It's so confusing and I can't even find where to start the streaming request. I'm completely stuck!",
    # A follow-up turn to see if the model maintains the helpful persona
    "Okay, thanks. I'm using Python. What is the single most important parameter I need to set up for a successful streaming connection?",
]

results = await run_live_session(MODEL_ID, affective_config, affective_dialog_turns)

## What's next

- See the [Live API reference docs](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live).
- Explore other notebooks in the [Google Cloud Generative AI GitHub repository](https://github.com/GoogleCloudPlatform/generative-ai).