In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Gemini-TTS voices using Text-to-Speech

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Faudio%2Fspeech%2Fgetting-started%2Fget_started_with_gemini_tts_voices.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_gemini_tts_voices.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| Authors |
| --- |
| [Ahmet Kizilay](https://github.com/ahmetkizilay) |
| [Gary Chien](https://github.com/goldenchest) |

## Overview

This notebook introduces [Gemini-TTS](https://cloud.google.com/text-to-speech/docs/gemini-tts), the latest evolution of our Text-to-Speech technology, which moves beyond naturalness to give you granular control over generated audio through text-based prompts. With Gemini-TTS, you can synthesize anything from short snippets to long-form narratives while precisely dictating style, accent, pace, tone, and even emotional expression, all steerable through natural-language prompts.


There are currently 30 distinct voice options. See [all available voices](https://cloud.google.com/text-to-speech/docs/gemini-tts#voice_options).

In this tutorial, you learn how to:

- Synthesize speech using real-time (online) processing.
- Use formatting and expressive tags to modify the tone of the speech.

## Get started

### Install Text-to-Speech SDK and other required packages

Version 2.29.0 or later of `google-cloud-texttospeech` is required to use the Gemini-TTS related fields.

In [None]:
%%bash
# Install ffmpeg with the package manager that matches the host OS.
# Every outcome is reported with a message so the reader can see what happened.
os=$(uname -s)

if [[ "$os" == "Linux" ]]; then
  # Linux installation.
  # apt-get is used instead of apt: apt warns that its command-line interface
  # is not stable enough for use in scripts.
  # A failed index refresh is only a warning; the install may still succeed
  # from the package lists that are already cached.
  sudo apt-get update -y -qq || echo "Warning: could not refresh the apt package index."
  # Report success only when the install command itself succeeded.
  if sudo apt-get install -y -qq ffmpeg; then
    echo "ffmpeg installed successfully on Linux."
  else
    echo "Error installing ffmpeg on Linux using apt."
  fi
elif [[ "$os" == "Darwin" ]]; then
  # macOS installation (requires Homebrew).
  if command -v brew &> /dev/null; then
    if brew install ffmpeg; then
      echo "ffmpeg installed successfully on macOS using Homebrew."
    else
      echo "Error installing ffmpeg on macOS using Homebrew."
    fi
  else
    echo "Homebrew is not installed. Please install Homebrew and try again."
  fi
else
  echo "Unsupported operating system: $os"
fi

In [None]:
# The Gemini-TTS related fields require google-cloud-texttospeech 2.29.0 or
# later, so state that minimum explicitly instead of relying on --upgrade alone.
%pip install --upgrade --quiet "google-cloud-texttospeech>=2.29.0"
# Print the installed version so it can be verified at a glance.
%pip show google-cloud-texttospeech

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Colab is detected by its helper package already being loaded in this kernel.
# In any other environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    # Starts Colab's user authentication flow for this session.
    auth.authenticate_user()

### Set Google Cloud project information and initialize SDK

To get started using the Text-to-Speech API, you must have an existing Google Cloud project and [enable the API](https://console.cloud.google.com/flows/enableapi?apiid=texttospeech.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

For regional availability, see [documentation](https://cloud.google.com/text-to-speech/docs/gemini-tts#regional_availability).

In [None]:
# Resolve the Google Cloud project: a value entered below takes precedence;
# otherwise fall back to the GOOGLE_CLOUD_PROJECT environment variable.
import os

PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Default to "" when the variable is unset. Wrapping the lookup in str()
    # would turn a missing variable into the literal project ID "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Surface the problem here instead of letting later cells run with a
    # meaningless project ID.
    print(
        "WARNING: PROJECT_ID is empty. Enter your project ID above or set the "
        "GOOGLE_CLOUD_PROJECT environment variable before continuing."
    )

# Location used to build the Text-to-Speech API endpoint.
TTS_LOCATION = "global"

In [None]:
# Order matters: select the project, create Application Default Credentials
# (ADC), and only then attach the quota project. set-quota-project updates
# existing ADC, so it needs the login step to have run first.
! gcloud config set project {PROJECT_ID}
! gcloud auth application-default login -q
! gcloud auth application-default set-quota-project {PROJECT_ID}

### Import libraries

In [None]:
from IPython.display import Audio, display
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech

### Set constants

Set the API endpoint and initialize the Text-to-Speech client.


In [None]:
# The "global" location uses the unprefixed hostname; any other location is
# prepended to the hostname as "<location>-".
if TTS_LOCATION == "global":
    API_ENDPOINT = "texttospeech.googleapis.com"
else:
    API_ENDPOINT = f"{TTS_LOCATION}-texttospeech.googleapis.com"

# Text-to-Speech client bound to the endpoint chosen above.
client = texttospeech.TextToSpeechClient(
    client_options=ClientOptions(
        api_endpoint=API_ENDPOINT,
    ),
)

## Synthesize using Gemini-TTS voices


### Synthesize speech using real-time (online) processing

You define the text you want to convert, select a specific voice and language, and then instruct the API to generate audio of the spoken text.

This example uses the `Aoede` voice, which is a high-definition voice, offering improved clarity. Feel free to choose another voice from the `VOICE` drop-down menu.

The code calls the `synthesize_speech` method, which handles the core conversion process, and the output is MP3 audio returned as `bytes`.


In [None]:
# Model and voice are exposed as form fields; the language is fixed below.
MODEL = "gemini-2.5-flash-preview-tts"  # @param ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
VOICE = "Aoede"  # @param ["Aoede", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Zephyr"]

LANGUAGE_CODE = "en-US"

# Voice selection reused by every synthesis request in this notebook.
voice = texttospeech.VoiceSelectionParams(
    model_name=MODEL,
    name=VOICE,
    language_code=LANGUAGE_CODE,
)

In [None]:
# @title capture emotion with prompts

PROMPT = "You are having a conversation with a friend. Say the following in a happy and casual way"  # @param {type: "string"}
TEXT = "hahaha, i did NOT expect that. can you believe it!"  # @param {type: "string"}

# The style prompt travels with the text to be spoken in a single input.
synthesis_input = texttospeech.SynthesisInput(prompt=PROMPT, text=TEXT)

# Request the audio as MP3.
mp3_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# Send the request using the voice selected earlier.
response = client.synthesize_speech(
    input=synthesis_input,
    voice=voice,
    audio_config=mp3_config,
)

# Show an inline player for the generated audio.
display(Audio(response.audio_content))

In [None]:
# @title Modify pace of the speech

PROMPT = "Say the following very fast but still be intelligible"  # @param {type: "string"}
TEXT = "Availability and terms may vary. Check our website or your local store for complete details and restrictions."  # @param {type: "string"}

# The pacing instruction travels with the text to be spoken in a single input.
synthesis_input = texttospeech.SynthesisInput(prompt=PROMPT, text=TEXT)

# Request the audio as MP3.
mp3_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# Send the request using the voice selected earlier.
response = client.synthesize_speech(
    input=synthesis_input,
    voice=voice,
    audio_config=mp3_config,
)

# Show an inline player for the generated audio.
display(Audio(response.audio_content))

In [None]:
# @title modify text with expressive tags

# NOTE: The bracketed tags in TEXT are free-form rather than a strict syntax,
# so try out other expressions and formats.

PROMPT = "Say the following with a sarcastic tone"  # @param {type: "string"}
TEXT = "So.. [chuckling] tell me about this [coughs] AI thing."  # @param {type: "string"}

# The tone instruction travels with the tagged text in a single input.
synthesis_input = texttospeech.SynthesisInput(prompt=PROMPT, text=TEXT)

# Request the audio as MP3.
mp3_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# Send the request using the voice selected earlier.
response = client.synthesize_speech(
    input=synthesis_input,
    voice=voice,
    audio_config=mp3_config,
)

# Show an inline player for the generated audio.
display(Audio(response.audio_content))

## Further details


Feel free to review the [Cloud Text-to-Speech Python SDK documentation](https://cloud.google.com/python/docs/reference/texttospeech/latest) to explore all available fields and options to customize the API behavior.


To learn more about the Gemini-TTS offering on Vertex AI, check out the [Gemini-TTS Guide](https://cloud.google.com/text-to-speech/docs/gemini-tts).