In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Chirp 3: Instant custom voice

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Faudio%2Fspeech%2Fgetting-started%2Fget_started_with_chirp3_instant_custom_voice.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp3_instant_custom_voice.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Ivan Nardini](https://github.com/inardini) |

## Overview

This notebook introduces [Chirp 3 Instant Custom Voice](https://cloud.google.com/text-to-speech/docs/chirp3-hd), a powerful feature of Google Cloud's Text-to-Speech (TTS) API that allows you to create personalized voice models.

With Instant Custom Voice, you can generate unique, custom voices by training a model with your own high-quality audio recordings. This enables the rapid generation of personal voices that can then be used to synthesize audio using the Cloud TTS API, supporting both streaming and long-form text output. Instant Custom Voice creation and synthesis is supported in more than 25 languages.

In this tutorial, you will learn how to:

- Create an Instant Custom Voice.
- Synthesize text using your custom voice both in real-time and streaming.
- Build a simple Gradio app to use your custom voice.

<div class="alert alert-block alert-warning">
<b>⚠️ Due to safety considerations, access to this voice cloning capability is restricted to allow-listed users. To access this feature, contact a member of the Google Cloud team to be added to the allow list. ⚠️</b>
</div>


## Get started

### Install required packages


In [None]:
# Install (or upgrade) Gradio, which later cells import as `gr` to build the demo app.
%pip install --upgrade --quiet gradio

### Set Google Cloud project information

To get started using the Text-to-Speech API, you must have an existing Google Cloud project and [enable the API](https://console.cloud.google.com/flows/enableapi?apiid=texttospeech.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

Please note the **available regions** for Chirp 3 Instant Custom Voice; see the [documentation](https://cloud.google.com/text-to-speech/docs/chirp3-instant-custom-voice#regional_availability).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # NOTE: when GOOGLE_CLOUD_PROJECT is unset as well, str(None) leaves
    # PROJECT_ID as the literal string "None" instead of failing here.
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

# "global" makes the constants cell select the un-prefixed
# texttospeech.googleapis.com host; any other value is used as a host prefix.
TTS_LOCATION = "global"

In [None]:
! gcloud config set project {PROJECT_ID}
! gcloud auth application-default set-quota-project {PROJECT_ID}
! gcloud auth application-default login -q

### Get Google Credentials

Use the `google.auth` library to automatically find and load your credentials.


In [None]:
import google.auth
import google.auth.transport.requests

# Load the default credentials; the second value returned by
# google.auth.default() is discarded.
credentials, _ = google.auth.default()
# Refresh the credentials now through a requests-based transport; a later cell
# reads `credentials.token` to build the Authorization header.
authentication = google.auth.transport.requests.Request()
credentials.refresh(authentication)

### Import libraries

Import Python tools you'll need.

In [None]:
import base64
import io
import json
import os
import wave

from IPython.display import Audio, display
import gradio as gr
import numpy as np
import requests

### Set constants

Set the API endpoint and the access token used to authenticate Text-to-Speech requests.


In [None]:
# The "global" location uses the bare API host; any other location is
# prepended to the host name as a prefix.
if TTS_LOCATION == "global":
    API_ENDPOINT = "texttospeech.googleapis.com"
else:
    API_ENDPOINT = f"{TTS_LOCATION}-texttospeech.googleapis.com"

# Bearer token read from the credentials object refreshed in an earlier cell.
ACCESS_TOKEN = credentials.token

### Helpers

To keep our main logic clean and readable, you define several helper functions here. These encapsulate tasks like making API calls, processing audio data, and interacting with the Gradio interface components.

*   **`create_instant_custom_voice_key(reference_audio_bytes, consent_audio_bytes)`** to create a temporary custom voice key.

*   **`create_voice_with_masking(reference_audio, consent_audio)`** to mask the custom voice key.

*   **`synthesize_text_with_cloned_voice(voice_key, text)`** to synthesize speech with your custom voice, using the `voice_cloning_key` and the desired text as input.

*   **`wav_to_base64(file_path)`** to read a WAV audio file from a local path, encode its binary content into a base64 string (which is how audio data is sent in the JSON payload), and return the string.

*   **`create_voice(reference_audio, consent_audio, progress)`** to create the custom voice in the Gradio app using "Create Voice" button.

*   **`generate_speech(voice_key, text, progress)`** to synthesize any text with the custom voice using "Generate Speech" button in the Gradio app.

*   **`reset_interface_with_state()`** to effectively reset the Gradio UI.

In [None]:
def create_instant_custom_voice_key(
    reference_audio_bytes: bytes | str, consent_audio_bytes: bytes | str
) -> str | None:
    """Request a temporary voice cloning key from the Text-to-Speech API.

    Posts the reference and consent recordings to the
    ``voices:generateVoiceCloningKey`` endpoint on ``API_ENDPOINT``,
    authenticating with ``ACCESS_TOKEN`` and billing ``PROJECT_ID``.

    Args:
        reference_audio_bytes: Reference recording, either already
            base64-encoded as a string (what ``wav_to_base64`` returns) or as
            raw bytes, which are base64-encoded here before sending.
        consent_audio_bytes: Consent recording, in the same accepted forms.

    Returns:
        The ``voiceCloningKey`` value from the JSON response, or ``None`` when
        an input is missing, the request fails, or the field is absent.
        Failures are printed rather than raised.
    """
    if not reference_audio_bytes or not consent_audio_bytes:
        print("Error: both reference and consent audio are required.")
        return None

    def _as_base64(audio: bytes | str) -> str:
        # Raw bytes cannot be serialized to JSON; send them base64-encoded.
        if isinstance(audio, (bytes, bytearray)):
            return base64.b64encode(audio).decode("utf-8")
        return audio

    url = f"https://{API_ENDPOINT}/v1beta1/voices:generateVoiceCloningKey"

    request_body = {
        "reference_audio": {
            "audio_config": {"audio_encoding": "LINEAR16", "sample_rate_hertz": 24000},
            "content": _as_base64(reference_audio_bytes),
        },
        "voice_talent_consent": {
            "audio_config": {"audio_encoding": "LINEAR16", "sample_rate_hertz": 24000},
            "content": _as_base64(consent_audio_bytes),
        },
        "consent_script": "I am the owner of this voice and I consent to Google using this voice to create a synthetic voice model.",
        "language_code": "en-US",
    }

    headers = {
        "Authorization": f"Bearer {ACCESS_TOKEN}",
        "x-goog-user-project": PROJECT_ID,
        "Content-Type": "application/json; charset=utf-8",
    }

    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.post(url, headers=headers, json=request_body, timeout=60)
        response.raise_for_status()

        response_json = response.json()
        return response_json.get("voiceCloningKey")

    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
        # The response body carries the API's own error details, which the
        # exception message alone does not include.
        if e.response is not None and e.response.text:
            print(e.response.text)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return None


def synthesize_text_with_cloned_voice(voice_key: str, text: str) -> None:
    """Synthesize ``text`` with the cloned voice and play it in the notebook.

    Posts to the ``text:synthesize`` endpoint on ``API_ENDPOINT`` and, when the
    response contains ``audioContent``, base64-decodes it and shows an inline
    audio player. Failures are printed rather than raised.

    Args:
        voice_key: Voice cloning key returned by
            ``create_instant_custom_voice_key``.
        text: Text to synthesize.
    """
    # Without a key or text the request cannot succeed; skip the round trip.
    if not voice_key or not text:
        print("Error: a voice key and text to synthesize are both required.")
        return

    url = f"https://{API_ENDPOINT}/v1beta1/text:synthesize"

    request_body = {
        "input": {"text": text},
        "voice": {
            "language_code": "en-US",
            "voice_clone": {
                "voice_cloning_key": voice_key,
            },
        },
        "audioConfig": {"audioEncoding": "LINEAR16", "sample_rate_hertz": 24000},
    }

    headers = {
        "Authorization": f"Bearer {ACCESS_TOKEN}",
        "x-goog-user-project": PROJECT_ID,
        "Content-Type": "application/json; charset=utf-8",
    }

    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.post(url, headers=headers, json=request_body, timeout=60)
        response.raise_for_status()

        response_json = response.json()
        audio_content = response_json.get("audioContent")

        if audio_content:
            display(Audio(base64.b64decode(audio_content), rate=24000))
        else:
            print("Error: Audio content not found in the response.")
            print(response_json)

    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
        # The response body carries the API's own error details, which the
        # exception message alone does not include.
        if e.response is not None and e.response.text:
            print(e.response.text)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def wav_to_base64(file_path: str) -> str:
    """Convert a WAV file to base64 encoded string"""
    try:
        with open(file_path, "rb") as wav_file:
            encoded_string = base64.b64encode(wav_file.read()).decode("utf-8")
            return encoded_string
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def _request_voice_key(
    reference_audio: str | None,
    consent_audio: str | None,
    progress: gr.Progress | None = None,
) -> tuple[str | None, str]:
    """Run the voice-creation steps, keeping the key and error text separate.

    Returns:
        ``(voice_key, "")`` on success, or ``(None, message)`` where
        ``message`` explains which step failed.
    """
    if reference_audio is None or consent_audio is None:
        return None, "Please upload both reference and consent audio files."

    # Compare with None explicitly instead of relying on the truthiness of a
    # caller-supplied Progress object.
    if progress is None:
        progress = gr.Progress()

    progress(0.2, desc="Processing audio files...")
    reference_audio_b64 = wav_to_base64(reference_audio)
    consent_audio_b64 = wav_to_base64(consent_audio)

    if reference_audio_b64 is None or consent_audio_b64 is None:
        return None, "Error processing audio files."

    progress(0.5, desc="Creating voice clone...")
    voice_key = create_instant_custom_voice_key(reference_audio_b64, consent_audio_b64)

    if not voice_key:
        return None, "Failed to create voice. Check the logs for details."

    progress(1.0, desc="Voice created successfully!")
    return voice_key, ""


def create_voice(
    reference_audio: str | None,
    consent_audio: str | None,
    progress: gr.Progress | None = None,
) -> str:
    """Create a custom voice using reference and consent audio.

    Args:
        reference_audio: File path of the reference recording, or ``None``
            when nothing was uploaded.
        consent_audio: File path of the consent recording, or ``None``.
        progress: Optional progress tracker; one is created when omitted.

    Returns:
        The voice cloning key on success, otherwise a human-readable error
        message. Callers that must tell the two apart should use
        ``create_voice_with_masking``, which returns an empty key on failure.
    """
    voice_key, message = _request_voice_key(reference_audio, consent_audio, progress)
    return voice_key if voice_key else message


def create_voice_with_masking(
    reference_audio: str | None, consent_audio: str | None
) -> tuple:
    """Create a voice and return ``(full_key, text_to_display)``.

    On success the second item is the key masked to its first five characters
    (keys of eight characters or fewer are shown unchanged). On failure the
    first item is an empty string, so an error message is never stored as a
    voice key, and the second item is the unmasked error message.
    """
    key, message = _request_voice_key(reference_audio, consent_audio)
    if not key:
        return "", message

    masked_key = key[:5] + "..." if len(key) > 8 else key
    return key, masked_key


def _linear16_to_numpy(audio_bytes: bytes, fallback_rate: int = 24000) -> tuple:
    """Convert a LINEAR16 payload into ``(sample_rate, int16_samples)``.

    The Text-to-Speech API documents that LINEAR16 responses include a WAV
    header, so the container is parsed and only the PCM frames are returned.
    Payloads that are not a parseable 16-bit WAV are treated as raw PCM at
    ``fallback_rate``.
    """
    if audio_bytes[:4] == b"RIFF":
        try:
            with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
                sample_width = wav_file.getsampwidth()
                channels = wav_file.getnchannels()
                rate = wav_file.getframerate()
                frames = wav_file.readframes(wav_file.getnframes())
            if sample_width == 2 and channels >= 1:
                # Drop a trailing partial frame so the buffer divides evenly.
                frame_size = sample_width * channels
                frames = frames[: len(frames) - (len(frames) % frame_size)]
                samples = np.frombuffer(frames, dtype=np.int16)
                if channels > 1:
                    samples = samples.reshape(-1, channels)
                return rate, samples
        except (wave.Error, EOFError):
            pass  # Not a usable WAV container; fall back to raw PCM below.

    # Raw PCM: ignore a trailing odd byte that cannot form a 16-bit sample.
    usable_length = len(audio_bytes) - (len(audio_bytes) % 2)
    return fallback_rate, np.frombuffer(audio_bytes[:usable_length], dtype=np.int16)


def generate_speech(
    voice_key: str, text: str, progress: gr.Progress | None = None
) -> tuple:
    """Generate speech using the cloned voice.

    Args:
        voice_key: Voice cloning key to synthesize with.
        text: Text to synthesize.
        progress: Optional progress tracker; one is created when omitted.

    Returns:
        ``((sample_rate, samples), status_message)`` on success, where
        ``samples`` is an int16 NumPy array, or ``(None, status_message)``
        when inputs are missing or the request fails.
    """
    if not voice_key or not text:
        return None, "Please create a voice key and enter text to synthesize."

    # Compare with None explicitly instead of relying on the truthiness of a
    # caller-supplied Progress object.
    if progress is None:
        progress = gr.Progress()

    progress(0.3, desc="Generating speech...")

    try:
        url = f"https://{API_ENDPOINT}/v1beta1/text:synthesize"

        request_body = {
            "input": {"text": text},
            "voice": {
                "language_code": "en-US",
                "voice_clone": {
                    "voice_cloning_key": voice_key,
                },
            },
            "audioConfig": {"audioEncoding": "LINEAR16", "sample_rate_hertz": 24000},
        }

        headers = {
            "Authorization": f"Bearer {ACCESS_TOKEN}",
            "x-goog-user-project": PROJECT_ID,
            "Content-Type": "application/json; charset=utf-8",
        }

        progress(0.6, desc="Processing audio...")
        # A timeout keeps the UI from waiting forever on a stalled connection.
        response = requests.post(url, headers=headers, json=request_body, timeout=60)
        response.raise_for_status()

        response_json = response.json()
        audio_content = response_json.get("audioContent")

        if audio_content:
            progress(1.0, desc="Speech generated!")
            audio_bytes = base64.b64decode(audio_content)
            # Strip the WAV header so it is not played back as audio samples.
            sample_rate, audio_array = _linear16_to_numpy(audio_bytes)
            return (sample_rate, audio_array), "Speech generated successfully!"
        else:
            return None, "Error: Audio content not found in the response."

    except Exception as e:
        return None, f"Error generating speech: {str(e)}"


def reset_interface_with_state() -> tuple:
    """Return the seven values used to reset the demo.

    The tuple holds, in order: two ``None`` values, two empty strings, two
    more ``None`` values, and the status message ``"Interface reset."``.
    """
    cleared = None
    blank = ""
    status_message = "Interface reset."
    return (
        cleared,
        cleared,
        blank,
        blank,
        cleared,
        cleared,
        status_message,
    )

# Create an Instant Custom Voice

Let's start by creating the custom voice directly.

You would first define the file paths to your pre-recorded reference audio (`.wav` recommended, ideally a few seconds of clear speech) and the consent audio (where the speaker explicitly states the consent script).

Then you use `wav_to_base64` to read these files and encode them into the base64 format required by the API and you create your custom voice using the `create_instant_custom_voice_key` helper function, passing in the base64-encoded audio data.

If the request is successful and your project is allow-listed, the API returns a `voice_cloning_key` which acts as a temporary identifier for your custom voice.


In [None]:
# Paths of the reference and consent recordings read by the next cell.
# Replace both placeholders with real file paths before running it.
reference_audio_path = "[your-reference-audio-path]"  # @param {type: "string", placeholder: "[your-reference-audio-path]", isTemplate: true}
consent_audio_path = "[your-consent-audio-path]"  # @param {type: "string", placeholder: "[your-consent-audio-path]", isTemplate: true}

In [None]:
# Despite the `_bytes` suffix these hold base64 text: wav_to_base64 returns a
# str, or None (after printing the reason) when a file cannot be read.
reference_audio_bytes = wav_to_base64(reference_audio_path)
consent_audio_bytes = wav_to_base64(consent_audio_path)

# voice_key is None when the request fails; the helper prints the error
# instead of raising, so check the cell output before continuing.
voice_key = create_instant_custom_voice_key(reference_audio_bytes, consent_audio_bytes)

# Use Instant Custom Voice

Now that you (theoretically) have a `voice_key`, let's use it to synthesize speech.

## Perform a Sync Request

Define the text you want your custom voice to say in the `text_to_synthesize` variable. Try different sentences!

Then, call the `synthesize_text_with_cloned_voice` helper function, providing the `voice_key` you obtained earlier and the text. This function sends the request to the `text:synthesize` endpoint, specifying your custom voice.

If successful, it gets the audio data back, decodes it, and should play it directly in the notebook output below the cell using an embedded audio player.

In [None]:
# Sample script to read aloud; edit freely to try other sentences.
text_to_synthesize = """
Breaking news! Chirp 3, Google Cloud's audio model, now has Instant Custom Voice.

With Instant Custom Voice, you can generate custom voices with just 10 seconds of audio to empower your AI narration.
Chirp 3 Instant Custom Voice is now available in preview with allowlist.  Check out the link below.

And yes, this voice is generated using Chirp 3 Instant Custom Voice!
"""
# Shows an inline audio player on success; on failure the helper prints the
# error and returns without raising.
synthesize_text_with_cloned_voice(voice_key, text_to_synthesize)

# Build a simple Instant custom voice app

While calling the API directly works, it's often more convenient to have an interactive interface. Let's build a simple web app using Gradio to easily upload audio, create a voice, and synthesize text.

### Define the app

Here, you define the structure and components of our Gradio user interface.

In [None]:
# Two-column demo: the left column uploads audio and creates the voice key, the
# right column synthesizes text with it. The unmasked key is kept in gr.State
# so that only the masked form is shown in the "Voice Key" textbox.
with gr.Blocks(
    theme=gr.themes.Default(
        primary_hue="blue", secondary_hue="blue", neutral_hue="gray"
    )
) as app:

    # Create a state variable to store the full voice key
    full_voice_key = gr.State("")

    # Define title
    gr.Markdown(
        """
        # Chirp 3 - Instant custom voice demo
        Upload reference and consent audio files to create a custom voice, then synthesize speech.
        """
    )

    # Define input and output components
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" hands the callbacks a path string, which is what
            # wav_to_base64 opens.
            reference_audio = gr.Audio(
                label="Reference Voice", type="filepath", elem_id="reference_audio"
            )
            consent_audio = gr.Audio(
                label="Consent Audio", type="filepath", elem_id="consent_audio"
            )

            create_btn = gr.Button("Create Voice", variant="primary")
            voice_key_output = gr.Textbox(label="Voice Key", elem_id="voice_key")

        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want the voice to say...",
                lines=5,
                elem_id="text_input",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
            audio_output = gr.Audio(label="Generated Audio", elem_id="audio_output")
            status_output = gr.Textbox(label="Status", elem_id="status_output")

    with gr.Row():
        clear_btn = gr.Button("Clear All", variant="secondary")

    # Set up event handlers
    create_btn.click(
        create_voice_with_masking,
        inputs=[reference_audio, consent_audio],
        outputs=[full_voice_key, voice_key_output],
    )

    generate_btn.click(
        generate_speech,
        inputs=[full_voice_key, text_input],
        outputs=[audio_output, status_output],
    )

    # The reset helper is defined as reset_interface_with_state; its seven
    # return values map one-to-one onto the outputs listed here.
    clear_btn.click(
        reset_interface_with_state,
        inputs=[],
        outputs=[
            reference_audio,
            consent_audio,
            voice_key_output,
            full_voice_key,
            text_input,
            audio_output,
            status_output,
        ],
    )

    # Apply custom CSS for Google styling
    gr.Markdown(
        """
        <style>
        .gradio-container {
            font-family: 'Google Sans', 'Roboto', sans-serif !important;
        }
        .gr-button-primary {
            background-color: #4285F4 !important;
        }
        .gr-button-secondary {
            border-color: #4285F4 !important;
            color: #4285F4 !important;
        }
        h1 {
            font-family: 'Google Sans', 'Roboto', sans-serif !important;
            color: #202124 !important;
        }
        </style>
        """
    )

### Launch the app

Showtime! This cell launches the Gradio app we just defined.

In [None]:
# NOTE(review): share=True asks Gradio for a shareable link; presumably anyone
# holding that URL can drive the app with this notebook's credentials. Confirm
# this is intended before sharing the link.
app.launch(share=True)

Close the app once you have finished playing with it.

In [None]:
# Shut down the app started by app.launch() in the previous cell.
app.close()