In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Gemini-TTS voices using Text-to-Speech

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Faudio%2Fspeech%2Fgetting-started%2Fget_started_with_gemini_tts_voices.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| Authors |
| --- |
| [Ahmet Kizilay](https://github.com/ahmetkizilay) |
| [Gary Chien](https://github.com/goldenchest) |

## Overview

This notebook introduces [Gemini-TTS](https://cloud.google.com/text-to-speech/docs/gemini-tts), the latest evolution of our Text-to-Speech technology that's moving beyond just naturalness to giving granular control over generated audio using text-based prompts. Using Gemini-TTS, you can synthesize speech from short snippets to long-form narratives, precisely dictating style, accent, pace, tone, and even emotional expression, all steerable through natural-language prompts. You can create conversations between two speakers with the same emotional expression and steerability.


There are currently 30 distinct voice options. See [all available voices](https://cloud.google.com/text-to-speech/docs/gemini-tts#voice_options).

There are 80+ locale options to use for synthesis. See [all available locales](https://cloud.google.com/text-to-speech/docs/gemini-tts#language_availability).

In this tutorial, you learn how to:

- Synthesize speech using real-time (online) processing.
- Use formatting and expressive tags to modify the tone of the speech.
- Synthesize dialogues with two speakers.

## Get started

### Install Text-to-Speech SDK and other required packages

`google-cloud-texttospeech` version 2.31.0 or later is required to use the Gemini-TTS related fields.

In [1]:
%%bash
# Install ffmpeg with the platform's package manager and report the outcome.
# Each branch prints a success message only when the install command itself
# returned a zero exit status.

# Detect the operating system
os=$(uname -s)

if [[ "$os" == "Linux" ]]; then
  # Linux installation: refresh the package index, then install via apt.
  sudo apt update -y -qq
  if sudo apt install ffmpeg -y -qq; then
    echo "ffmpeg installed successfully on Linux."
  else
    echo "Error installing ffmpeg on Linux using apt."
  fi
elif [[ "$os" == "Darwin" ]]; then
  # macOS installation: Homebrew is required.
  if command -v brew &> /dev/null; then
    if brew install ffmpeg; then
      echo "ffmpeg installed successfully on macOS using Homebrew."
    else
      echo "Error installing ffmpeg on macOS using Homebrew."
    fi
  else
    echo "Homebrew is not installed. Please install Homebrew and try again."
  fi
else
  echo "Unsupported operating system: $os"
fi

In [2]:
%pip install --upgrade --quiet google-cloud-texttospeech
%pip show google-cloud-texttospeech

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [3]:
import sys

# A loaded `google.colab` module is used as the signal that this kernel is a
# Colab runtime; only then is the Colab authentication flow started.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize SDK

To get started using the Text-to-Speech API, you must have an existing Google Cloud project and [enable the API](https://console.cloud.google.com/flows/enableapi?apiid=texttospeech.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

For regional availability, see [documentation](https://cloud.google.com/text-to-speech/docs/gemini-tts#regional_availability).

In [4]:
# Use the environment variable if the user doesn't provide Project ID.
import os

# fmt: off
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# fmt: on
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to "" when the variable is unset. Wrapping the lookup in str()
    # would turn a missing variable into the literal project ID "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Warn instead of raising: the value is only consumed by the gcloud setup
    # cell, so the rest of the notebook may still work with existing credentials.
    print(
        "WARNING: PROJECT_ID is not set. Fill in the form field above or set "
        "the GOOGLE_CLOUD_PROJECT environment variable."
    )

# Location used to pick the Text-to-Speech API endpoint.
TTS_LOCATION = "global"

In [5]:
# Make PROJECT_ID the active project in the local gcloud configuration.
! gcloud config set project {PROJECT_ID}
# Set PROJECT_ID as the quota project of the Application Default Credentials.
# NOTE(review): this runs before the login command below; presumably it needs
# Application Default Credentials to exist already - confirm the ordering.
! gcloud auth application-default set-quota-project {PROJECT_ID}
# Run the Application Default Credentials login flow with gcloud's -q flag.
! gcloud auth application-default login -q

### Import libraries

In [6]:
from IPython.display import Audio, display
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech

### Set constants

Set the API endpoint and initialize the Text-to-Speech client.


In [7]:
# "global" maps to the bare service host; any other location is prefixed to
# the host name.
if TTS_LOCATION == "global":
    API_ENDPOINT = "texttospeech.googleapis.com"
else:
    API_ENDPOINT = f"{TTS_LOCATION}-texttospeech.googleapis.com"

# Text-to-Speech client bound to the endpoint selected above.
client_options = ClientOptions(api_endpoint=API_ENDPOINT)
client = texttospeech.TextToSpeechClient(client_options=client_options)

## Synthesize using Gemini-TTS voices


### Synthesize speech using real-time (online) processing

You define the text you want to convert, select a specific voice and language, and then instruct the API to generate audio of the spoken text.

This example uses the `Aoede` voice, which is a high-definition voice, offering improved clarity. Feel free to choose another voice from the `voice` drop-down menu.

The code calls the `synthesize_speech` method, which handles the core conversion process, and the output is MP3 audio returned as `bytes`.


In [8]:
# Form fields: Gemini-TTS model, prebuilt voice name, and locale.
MODEL = "gemini-2.5-flash-tts"  # @param ["gemini-2.5-flash-tts", "gemini-2.5-pro-tts"]

# fmt: off
VOICE = "Aoede"  # @param ["Achernar", "Achird", "Algenib", "Algieba", "Alnilam", "Aoede", "Autonoe", "Callirrhoe", "Charon", "Despina", "Enceladus", "Erinome", "Fenrir", "Gacrux", "Iapetus", "Kore", "Laomedeia", "Leda", "Orus", "Puck", "Pulcherrima", "Rasalgethi", "Sadachbia", "Sadaltager", "Schedar", "Sulafat", "Umbriel", "Vindemiatrix", "Zephyr", "Zubenelgenubi"]

LANGUAGE_CODE = "en-us"  # @param ["am-et", "ar-001", "ar-eg",  "az-az",  "be-by",  "bg-bg", "bn-bd", "ca-es", "ceb-ph", "cs-cz",  "da-dk",  "de-de",  "el-gr", "en-au", "en-gb", "en-in",  "en-us",  "es-es",  "es-419", "es-mx", "es-us", "et-ee", "eu-es",  "fa-ir",  "fi-fi",  "fil-ph", "fr-fr", "fr-ca", "gl-es", "gu-in",  "hi-in",  "hr-hr",  "ht-ht",  "hu-hu", "af-za", "hy-am", "id-id",  "is-is",  "it-it",  "he-il",  "ja-jp", "jv-jv", "ka-ge", "kn-in",  "ko-kr",  "kok-in", "la-va",  "lb-lu", "lo-la", "lt-lt", "lv-lv",  "mai-in", "mg-mg",  "mk-mk",  "ml-in", "mn-mn", "mr-in", "ms-my",  "my-mm",  "nb-no",  "ne-np",  "nl-nl", "nn-no", "or-in", "pa-in",  "pl-pl",  "ps-af",  "pt-br",  "pt-pt", "ro-ro", "ru-ru", "sd-in",  "si-lk",  "sk-sk",  "sl-si",  "sq-al", "sr-rs", "sv-se", "sw-ke",  "ta-in",  "te-in",  "th-th",  "tr-tr", "uk-ua", "ur-pk", "vi-vn",  "cmn-cn", "cmn-tw"]
# fmt: on


# Single-speaker voice selection shared by the synthesis cells that follow.
voice = texttospeech.VoiceSelectionParams(
    model_name=MODEL,
    name=VOICE,
    language_code=LANGUAGE_CODE,
)

In [9]:
# @title capture emotion with prompts

# fmt: off
PROMPT = "You are having a conversation with a friend. Say the following in a happy and casual way"  # @param {type: "string"}
# fmt: on
TEXT = "hahaha, i did NOT expect that. can you believe it!"  # @param {type: "string"}

# The text to speak together with the style prompt.
synthesis_input = texttospeech.SynthesisInput(text=TEXT, prompt=PROMPT)
# Request MP3 as the returned audio file type.
mp3_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

# Run the text-to-speech request with the previously selected voice.
response = client.synthesize_speech(
    input=synthesis_input, voice=voice, audio_config=mp3_config
)

# play the generated audio
display(Audio(response.audio_content))

In [None]:
# @title Modify pace of the speech

# fmt: off
PROMPT = "Say the following very fast but still be intelligible"  # @param {type: "string"}
TEXT = "Availability and terms may vary. Check our website or your local store for complete details and restrictions."  # @param {type: "string"}
# fmt: on

# Gather the request pieces up front, then send them in a single call.
synthesis_kwargs = {
    "input": texttospeech.SynthesisInput(text=TEXT, prompt=PROMPT),
    "voice": voice,
    # MP3 is the type of audio file returned
    "audio_config": texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    ),
}
response = client.synthesize_speech(**synthesis_kwargs)

# play the generated audio
display(Audio(response.audio_content))

In [None]:
# @title modify text with expressive tags

# NOTE: These tags are not strict syntax. Feel free to experiment with different
# expressions and formats.

PROMPT = "Say the following with a sarcastic tone"  # @param {type: "string"}
# fmt: off
TEXT = "So.. [chuckling] tell me about this [coughs] AI thing."  # @param {type: "string"}
# fmt: on

# Audio file type to be returned by the API.
audio_encoding = texttospeech.AudioEncoding.MP3

# Synthesize the tagged text with the previously selected voice.
response = client.synthesize_speech(
    voice=voice,
    audio_config=texttospeech.AudioConfig(audio_encoding=audio_encoding),
    input=texttospeech.SynthesisInput(prompt=PROMPT, text=TEXT),
)

# play the generated audio
display(Audio(response.audio_content))

## Multi-speaker (Dialog) Speech Synthesis

You can create a dialog between two speakers. Using `multi_speaker_voice_config`, you can specify the speakers, and assign a custom speaker name to reference in the input text.

There are two ways to structure the multi-speaker input:

In [None]:
# @title Explicit turn-based syntax

SPEAKER_ALIAS_1 = "Zizu"  # @param {type: "string"}
# fmt: off
SPEAKER_1 = "Fenrir"  # @param ["Achernar", "Achird", "Algenib", "Algieba", "Alnilam", "Aoede", "Autonoe", "Callirrhoe", "Charon", "Despina", "Enceladus", "Erinome", "Fenrir", "Gacrux", "Iapetus", "Kore", "Laomedeia", "Leda", "Orus", "Puck", "Pulcherrima", "Rasalgethi", "Sadachbia", "Sadaltager", "Schedar", "Sulafat", "Umbriel", "Vindemiatrix", "Zephyr", "Zubenelgenubi"]

SPEAKER_ALIAS_2 = "Gary"  # @param {type: "string"}
SPEAKER_2 = "Orus"  # @param ["Achernar", "Achird", "Algenib", "Algieba", "Alnilam", "Aoede", "Autonoe", "Callirrhoe", "Charon", "Despina", "Enceladus", "Erinome", "Fenrir", "Gacrux", "Iapetus", "Kore", "Laomedeia", "Leda", "Orus", "Puck", "Pulcherrima", "Rasalgethi", "Sadachbia", "Sadaltager", "Schedar", "Sulafat", "Umbriel", "Vindemiatrix", "Zephyr", "Zubenelgenubi"]

LANGUAGE_CODE = "en-gb"  # @param ["am-et", "ar-001", "ar-eg",  "az-az",  "be-by",  "bg-bg", "bn-bd", "ca-es", "ceb-ph", "cs-cz",  "da-dk",  "de-de",  "el-gr", "en-au", "en-gb", "en-in",  "en-us",  "es-es",  "es-419", "es-mx", "es-us", "et-ee", "eu-es",  "fa-ir",  "fi-fi",  "fil-ph", "fr-fr", "fr-ca", "gl-es", "gu-in",  "hi-in",  "hr-hr",  "ht-ht",  "hu-hu", "af-za", "hy-am", "id-id",  "is-is",  "it-it",  "he-il",  "ja-jp", "jv-jv", "ka-ge", "kn-in",  "ko-kr",  "kok-in", "la-va",  "lb-lu", "lo-la", "lt-lt", "lv-lv",  "mai-in", "mg-mg",  "mk-mk",  "ml-in", "mn-mn", "mr-in", "ms-my",  "my-mm",  "nb-no",  "ne-np",  "nl-nl", "nn-no", "or-in", "pa-in",  "pl-pl",  "ps-af",  "pt-br",  "pt-pt", "ro-ro", "ru-ru", "sd-in",  "si-lk",  "sk-sk",  "sl-si",  "sq-al", "sr-rs", "sv-se", "sw-ke",  "ta-in",  "te-in",  "th-th",  "tr-tr", "uk-ua", "ur-pk", "vi-vn",  "cmn-cn", "cmn-tw"]
# fmt: on

PROMPT = "Read the following dialogue between two friends"  # @param {type: "string"}

# Pair each custom alias with the prebuilt voice that should speak for it.
alias_to_voice = (
    (SPEAKER_ALIAS_1, SPEAKER_1),
    (SPEAKER_ALIAS_2, SPEAKER_2),
)
multi_speaker_voice_config = texttospeech.MultiSpeakerVoiceConfig(
    speaker_voice_configs=[
        texttospeech.MultispeakerPrebuiltVoice(
            speaker_alias=alias, speaker_id=voice_name
        )
        for alias, voice_name in alias_to_voice
    ]
)

# The dialogue as (speaker alias, line) pairs, in speaking order.
dialogue_lines = (
    (SPEAKER_ALIAS_1, "Have you tried the new multi-speaker feature on Gemini?"),
    (SPEAKER_ALIAS_2, "Yes! I am super excited about it"),
)
multi_speaker_markup = texttospeech.MultiSpeakerMarkup(
    turns=[
        texttospeech.MultiSpeakerMarkup.Turn(speaker=speaker, text=line)
        for speaker, line in dialogue_lines
    ]
)

# Voice selection carries the multi-speaker config instead of a single voice name.
dialog_voice = texttospeech.VoiceSelectionParams(
    language_code=LANGUAGE_CODE,
    model_name=MODEL,
    multi_speaker_voice_config=multi_speaker_voice_config,
)
response = client.synthesize_speech(
    input=texttospeech.SynthesisInput(
        multi_speaker_markup=multi_speaker_markup, prompt=PROMPT
    ),
    voice=dialog_voice,
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16
    ),
)

# play the generated audio
display(Audio(response.audio_content))

In [None]:
# @title Inline dialog text input

# Map each custom alias to the prebuilt voice that speaks for it.
multi_speaker_voice_config = texttospeech.MultiSpeakerVoiceConfig(
    speaker_voice_configs=[
        texttospeech.MultispeakerPrebuiltVoice(
            speaker_alias=SPEAKER_ALIAS_1,
            speaker_id=SPEAKER_1,
        ),
        texttospeech.MultispeakerPrebuiltVoice(
            speaker_alias=SPEAKER_ALIAS_2,
            speaker_id=SPEAKER_2,
        ),
    ]
)

# Build the "<alias>: <line>" script from the configured aliases rather than
# hard-coding the default names, so the speaker labels in the text keep matching
# the voice config when the aliases are changed in the form.
inline_dialog_text = (
    f"{SPEAKER_ALIAS_1}: Have you tried the new multi-speaker feature on Gemini?\n"
    f"{SPEAKER_ALIAS_2}: Yes! I am super excited about it"
)

response = client.synthesize_speech(
    input=texttospeech.SynthesisInput(
        text=inline_dialog_text,
        prompt=PROMPT,
    ),
    voice=texttospeech.VoiceSelectionParams(
        language_code=LANGUAGE_CODE,
        model_name=MODEL,
        multi_speaker_voice_config=multi_speaker_voice_config,
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16
    ),
)
# play the generated audio
display(Audio(response.audio_content))

## Relax safety filters
Accounts with [monthly invoiced billing](https://cloud.google.com/billing/docs/how-to/invoiced-billing) may relax Gemini TTS's harmful content filters by setting `relax_safety_filters` in [AdvancedVoiceOptions](https://cloud.google.com/python/docs/reference/texttospeech/latest/google.cloud.texttospeech_v1.types.AdvancedVoiceOptions).

This field is not enabled for accounts without monthly invoiced billing.

In [None]:
# @title Demonstrate relaxing safety filters

# fmt: off
PROMPT = "Say the following"  # @param {type: "string"}
TEXT = "This is input that would normally be blocked by Gemini TTS."  # @param {type: "string"}
# fmt: on

# `advanced_voice_options` is a field of the request message, not one of the
# flattened keyword arguments of `synthesize_speech` (input, voice,
# audio_config), so the full request object is built and passed as `request`.
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(text=TEXT, prompt=PROMPT),
    voice=voice,
    # Select the type of audio file you want returned
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    ),
    advanced_voice_options=texttospeech.AdvancedVoiceOptions(
        relax_safety_filters=True
    ),
)

# Perform the text-to-speech request with the relaxed safety filters.
response = client.synthesize_speech(request=request)

# play the generated audio
display(Audio(response.audio_content))

### Synthesize speech using streaming processing

You can use the `streaming_synthesize` method to get audio streamed back as soon as it is ready. This method is more suitable for real-time scenarios where fast response time is important for a better user experience.

A function like `request_generator` can be used to stream text into the API, for example from an LLM which generates the text as a response to a user action.

The audio stream will start after the client stops sending the text input, as indicated by a half-close message. In the example below, the completion of the `request_generator` function implies the half-close operation.

In real-time applications, the streaming responses are meant to be heard immediately as the responses are sent from the TTS server. For example, in a web server scenario, where the client is connected to your webserver via websockets, you could use `emit("audio", response.audio_content)` to pass the audio to the client immediately.

In [18]:
# @title Calling Streaming synthesize

import datetime

import numpy as np

# fmt: off
PROMPT = "Say the following with a respectful tone"  # @param {type: "string"} # fmt: skip
TEXT = "So.. tell me about this [coughs] AI thing. I would be super interested in learning the fundamentals and jump into the world of vibe coding"  # @param {type: "string"} # fmt: skip
# fmt: on

SAMPLE_RATE_HZ = 24_000  # default sampling rate.

# The first message on the stream carries only the configuration.
# Reuse the single-speaker `voice` object: LANGUAGE_CODE is reassigned by the
# multi-speaker cell, so rebuilding the params here from the loose constants
# would silently pick up that cell's locale when the notebook is run in order.
config_request = texttospeech.StreamingSynthesizeRequest(
    streaming_config=texttospeech.StreamingSynthesizeConfig(voice=voice)
)


def request_generator():
    """Yield the config request, then the request carrying the text input.

    Exhausting the generator ends the request stream.
    """
    yield config_request

    yield texttospeech.StreamingSynthesizeRequest(
        input=texttospeech.StreamingSynthesisInput(text=TEXT, prompt=PROMPT)
    )


request_start_time = datetime.datetime.now()
streaming_responses = client.streaming_synthesize(request_generator())

# Stays None if the stream yields no responses, so the summary below can tell
# "no audio" apart from a real measurement.
first_chunk_received_time = None
audio_chunks = []
num_chunks_received = 0
for response in streaming_responses:
    # just a simple progress indicator
    num_chunks_received += 1
    print(".", end="")
    if num_chunks_received % 40 == 0:
        print("")

    # measuring time to first audio
    if first_chunk_received_time is None:
        first_chunk_received_time = datetime.datetime.now()

    # accumulating audio. In a web-server scenario, you would want to "emit" audio
    # to the frontend as soon as it arrives.
    #
    # For example using flask socketio, you could do the following
    # from flask_socketio import SocketIO, emit
    # emit("audio", response.audio_content)
    # socketio.sleep(0)
    audio_chunks.append(response.audio_content)

time_to_completion = datetime.datetime.now() - request_start_time

# Join the raw bytes once and decode them as 16-bit samples. This keeps the
# int16 dtype (a float-typed accumulator would upcast every chunk) and avoids
# re-copying the whole buffer on each iteration.
final_audio_data = np.frombuffer(b"".join(audio_chunks), dtype=np.int16)
audio_duration = len(final_audio_data) / SAMPLE_RATE_HZ

print("\n")
if first_chunk_received_time is None:
    print("No audio was received from the stream.")
else:
    time_to_first_audio = first_chunk_received_time - request_start_time
    print(f"Time to first audio: {time_to_first_audio.total_seconds()} seconds")
    print(f"Time to completion: {time_to_completion.total_seconds()} seconds")
    print(f"Audio duration: {audio_duration} seconds")

    if len(final_audio_data) > 0:
        display(Audio(final_audio_data, rate=SAMPLE_RATE_HZ, autoplay=False))

## Further details


Feel free to review the [Cloud Text-to-Speech Python SDK documentation](https://cloud.google.com/python/docs/reference/texttospeech/latest) to explore all available fields and options to customize the API behavior.


To learn more about the Gemini-TTS offering on Vertex AI, make sure to check out the [Gemini-TTS Guide](https://cloud.google.com/text-to-speech/docs/gemini-tts).