In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Narrate a Multi-character Story with Gemini and Text-to-Speech

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/storytelling/storytelling.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Faudio%2Fspeech%2Fuse-cases%2Fstorytelling%2Fstorytelling.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/audio/speech/use-cases/storytelling/storytelling.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/storytelling/storytelling.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/storytelling/storytelling.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/storytelling/storytelling.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/storytelling/storytelling.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/storytelling/storytelling.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/storytelling/storytelling.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            


| Author |
| --- |
| [Holt Skinner](https://github.com/holtskinner) |

## Overview

This notebook demonstrates how to use the [Gemini API in Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs) to generate a play script and create an audio performance with each character having a distinct voice using the [Text-to-Speech API](https://cloud.google.com/text-to-speech).

The steps performed include:

- Create a story using Gemini.
- Assign each character to a voice.
- Synthesize each line based on character voice.
- Combine audio files into one MP3 file.

### Costs

This tutorial uses billable components of Google Cloud:

* Gemini API in Vertex AI
* Text-to-Speech
* Cloud Storage

Learn about [Vertex AI generative AI pricing](https://cloud.google.com/vertex-ai/generative-ai/pricing),
[Text-to-Speech pricing](https://cloud.google.com/text-to-speech/pricing),
and [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
then use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Getting Started


### Install Google Gen AI SDK, other packages and their dependencies

Install the following packages required to execute this notebook.

In [None]:
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install --upgrade -q google-genai google-cloud-texttospeech pydub pandas tqdm

`pydub` needs [FFmpeg](https://ffmpeg.org/) to read and write MP3 files. If you're running on a Mac, the following cell installs it with Homebrew.

In [2]:
%%bash
# Only macOS is handled here; other platforms are left untouched.
# Check if the system is macOS
if [[ "$(uname -s)" == "Darwin" ]]; then
    # Install using Homebrew (assumes the `brew` command is already available)
    brew install ffmpeg
fi

Set the project and region.

* Please note the **available regions** for Text-to-Speech; see the [endpoints documentation](https://cloud.google.com/text-to-speech/docs/endpoints).

In [11]:
# Use the environment variable if the user doesn't provide Project ID.
import os

from google import genai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. Default to "" so that an unset variable is
    # not converted into the literal string "None" and used as a project ID.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    # Fail here with a clear message instead of later with an opaque API error.
    raise ValueError(
        "No Google Cloud project configured: set PROJECT_ID above or define the "
        "GOOGLE_CLOUD_PROJECT environment variable."
    )

# Region for the Gemini API; "global" is used when GOOGLE_CLOUD_REGION is unset.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "global")

# Gen AI SDK client routed through Vertex AI for the chosen project/location.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

# Text-to-Speech location, configured separately from the Gemini location above.
TTS_LOCATION = "global"  # @param {type:"string"}

### Authenticating your notebook environment

* If you are using **Colab** to run this notebook, run the cell below and continue.
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env).

In [5]:
import sys

# Additional authentication is required for Google Colab.
# Colab is detected by `google.colab` already being loaded in sys.modules.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

# The gcloud commands below run in every environment, not only in Colab.
# They point the gcloud CLI and Application Default Credentials at PROJECT_ID.
# NOTE(review): set-quota-project runs before the application-default login —
# confirm this ordering is intended.
! gcloud config set project {PROJECT_ID}
! gcloud auth application-default set-quota-project {PROJECT_ID}
! gcloud auth application-default login -q

### Import libraries

In [12]:
import os
import random
import re

from IPython.display import Audio
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech
from google.genai.types import GenerateContentConfig
import pandas as pd
from pydantic import BaseModel
from pydub import AudioSegment
from tqdm import tqdm

### Define constants

In [28]:
# Default language filter passed to list_voices().
DEFAULT_LANGUAGE = "en"
# Voice for the "Narrator" character; list_voices() keeps it out of the casting pool.
NARRATOR_VOICE = "en-GB-Chirp3-HD-Zubenelgenubi"
# Fallback voice for dialogue whose speaker has no entry in the character map.
DEFAULT_VOICE = "en-GB-Chirp3-HD-Umbriel"

# Pause inserted at the start of the combined audio and after every clip.
SILENCE_LENGTH = 200  # In Milliseconds
# NOTE(review): not referenced anywhere else in the visible notebook.
TXT_EXTENSION = ".txt"

# Use the global Text-to-Speech endpoint unless a specific TTS_LOCATION was
# chosen, in which case the location is prepended as "<location>-".
api_endpoint = "texttospeech.googleapis.com"
if TTS_LOCATION != "global":
    api_endpoint = f"{TTS_LOCATION}-{api_endpoint}"

# Shared Text-to-Speech client used by list_voices() and synthesize_text().
tts_client = texttospeech.TextToSpeechClient(
    client_options=ClientOptions(api_endpoint=api_endpoint)
)

# System instruction sent with the story-generation request; it asks for a
# narrator character, which the helper functions special-case as "Narrator".
SYSTEM_INSTRUCTION = """You are a creative and ambitious play writer. Your goal is to write a play for audio performance. Include a narrator character to describe the scenes and actions occurring."""

# Gemini model used to generate the play.
MODEL_ID = "gemini-2.5-flash"


# One member of the cast, as listed in Story.characters.
class Character(BaseModel):
    # Key of the character-to-voice map; the exact name "Narrator" is special-cased.
    name: str
    # Lower-cased and compared with "male" / "female" when a voice is chosen.
    gender: str


# A single spoken line within a scene.
class DialogueLine(BaseModel):
    # Looked up in the character-to-voice map; unknown speakers get DEFAULT_VOICE.
    speaker: str
    # Text sent to Text-to-Speech for this line.
    line: str


# One scene of the play: a setting description followed by its dialogue.
class Scene(BaseModel):
    # Read aloud in the narrator's voice, prefixed with "Setting... ".
    setting: str
    # Lines of the scene, synthesized in list order.
    dialogue: list[DialogueLine]


# Top-level structure of the play; passed to Gemini as the response schema.
# NOTE(review): plain comments are used instead of docstrings because pydantic
# may copy class docstrings into the generated JSON schema — confirm before
# adding any.
class Story(BaseModel):
    # Spoken first in the narrator's voice and also used to build output file names.
    title: str
    # Cast list used to assign one voice per character.
    characters: list[Character]
    # Scenes in performance order.
    scenes: list[Scene]

### Helper functions

In [29]:
def list_voices(language_code: str = DEFAULT_LANGUAGE) -> list[dict]:
    """Return the voices that may be cast for non-narrator characters.

    Queries Text-to-Speech for the voices of ``language_code`` and keeps only
    those whose name contains "Chirp3", is not ``NARRATOR_VOICE`` and does not
    contain "en-IN".

    Args:
        language_code: Language filter forwarded to the Text-to-Speech API.

    Returns:
        One dict per kept voice with the keys ``"name"`` (the voice name) and
        ``"gender"`` (the lower-cased name of the voice's SSML gender enum).
    """
    available = tts_client.list_voices(language_code=language_code).voices

    casting_pool: list[dict] = []
    for candidate in available:
        voice_name = candidate.name
        if "Chirp3" not in voice_name:
            continue
        # The narrator voice is reserved, and en-IN voices are excluded.
        if voice_name == NARRATOR_VOICE or "en-IN" in voice_name:
            continue

        gender_label = texttospeech.SsmlVoiceGender(candidate.ssml_gender).name.lower()
        casting_pool.append({"name": voice_name, "gender": gender_label})

    return casting_pool


def create_character_map(
    characters: list[Character], voices: list[dict]
) -> dict[str, str]:
    """Assign a distinct voice to every character, matching gender when possible.

    A character named exactly "Narrator" always receives ``NARRATOR_VOICE``.
    Every other character gets a random, not-yet-used voice of its own gender;
    if that pool is empty or the gender is neither "male" nor "female", a
    voice is drawn from whichever pool still has voices.

    Args:
        characters: Cast members exposing ``name`` and ``gender``.
        voices: Dicts with ``"name"`` and ``"gender"`` keys, as returned by
            ``list_voices``.

    Returns:
        Mapping of character name to voice name.

    Raises:
        ValueError: If both voice pools are exhausted before every character
            has been assigned a voice.
    """
    if len(characters) > len(voices):
        print(f"Too many characters {len(characters)}. Max {len(voices)}")

    # Split the available voice names by gender, preserving their order.
    pools: dict[str, list[str]] = {"male": [], "female": []}
    for entry in voices:
        if entry["gender"] in pools:
            pools[entry["gender"]].append(entry["name"])

    character_map: dict[str, str] = {}
    for character in characters:
        name = character.name
        gender = character.gender.lower()

        if name == "Narrator":
            character_map[name] = NARRATOR_VOICE
            continue

        pool = pools.get(gender)
        if not pool:
            # No usable voice of the requested gender: fall back to any pool
            # that still has voices, choosing at random when both do.
            remaining = [p for p in (pools["male"], pools["female"]) if p]
            if not remaining:
                raise ValueError("Not enough voices to assign to all characters.")
            pool = random.choice(remaining) if len(remaining) == 2 else remaining[0]

        # Popping removes the voice so that no two characters share it.
        character_map[name] = pool.pop(random.randrange(len(pool)))

    return character_map


def synthesize_text(
    file_prefix: str, file_index: int, text: str, voice_name: str
) -> str:
    """Synthesize one line of text to an MP3 file with the given voice.

    The language code required by the API is taken from the locale embedded in
    the voice name (for example "en-GB" in "en-GB-Chirp3-HD-Umbriel").

    Args:
        file_prefix: Prefix of the output file name.
        file_index: Sequence number of the clip; it is only formatted into the
            file name.
        text: Text to speak.
        voice_name: Full Text-to-Speech voice name containing its locale.

    Returns:
        Path of the written file, ``"{file_prefix}-{file_index}.mp3"``.

    Raises:
        ValueError: If no locale can be found in ``voice_name``.
    """
    output_file = f"{file_prefix}-{file_index}.mp3"

    # Accept two- and three-letter language codes (e.g. "en-GB", "cmn-CN").
    locale_match = re.search(r"\b[a-z]{2,3}-[A-Z]{2}\b", voice_name)
    if locale_match is None:
        raise ValueError(
            f"Cannot determine a language code from voice name {voice_name!r}."
        )
    language_code = locale_match.group()

    response = tts_client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name,
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        ),
    )

    # The response's audio_content is binary.
    with open(output_file, "wb") as f:
        f.write(response.audio_content)
    return output_file


def combine_audio_files(audio_files: list[str], file_prefix: str) -> str:
    """Concatenate the clips into one MP3 and delete the individual clips.

    The result starts with ``SILENCE_LENGTH`` milliseconds of silence, and the
    same pause follows every clip.

    Args:
        audio_files: Paths of the MP3 clips, in playback order.
        file_prefix: Prefix of the combined file name.

    Returns:
        Path of the combined file, ``"{file_prefix}-complete.mp3"``.
    """
    # The pause between clips is the same every time, so build it once.
    pause = AudioSegment.silent(duration=SILENCE_LENGTH)

    full_audio = AudioSegment.silent(duration=SILENCE_LENGTH)
    for file in audio_files:
        full_audio += AudioSegment.from_mp3(file) + pause

    outfile_name = f"{file_prefix}-complete.mp3"
    full_audio.export(outfile_name, format="mp3")

    # Remove the clips only after the export succeeded, so a decoding or export
    # failure does not throw away audio that was already synthesized.
    for file in audio_files:
        os.remove(file)

    return outfile_name


def generate_audio_clips(
    story: Story, character_map: dict[str, str]
) -> tuple[list[str], str]:
    """Synthesize every spoken line of the story into its own MP3 clip.

    The clips are produced in this order: the story title, then for each scene
    its setting (prefixed with "Setting... ") followed by its dialogue lines.
    The title and settings use the narrator's voice; dialogue uses the voice
    mapped to the speaker, or ``DEFAULT_VOICE`` for speakers missing from
    ``character_map``.

    Args:
        story: The play to perform.
        character_map: Mapping of character name to voice name.

    Returns:
        A tuple ``(output_files, file_prefix)``: the clip paths in playback
        order (numbered from 1) and the file-name prefix derived from the title.
    """
    # Replace anything other than word characters, "." and "-" with "_".
    file_prefix = re.sub(r"[^\w.-]", "_", story.title).lower()
    narrator_voice = character_map.get("Narrator", NARRATOR_VOICE)

    lines: list[dict] = [{"line": story.title, "voice": narrator_voice}]

    # Process each scene in the play
    for scene in story.scenes:
        # Add the scene setting with the Narrator's voice
        lines.append({"line": "Setting... " + scene.setting, "voice": narrator_voice})

        # Process each dialogue in the scene
        lines.extend(
            {
                "line": dialogue.line,
                "voice": character_map.get(dialogue.speaker, DEFAULT_VOICE),
            }
            for dialogue in scene.dialogue
        )

    output_files: list[str] = []
    # Wrap the list itself (not the enumerate iterator) so tqdm knows the total
    # and can show percentage and ETA.
    for file_index, line in enumerate(
        tqdm(lines, desc="Generating audio clips"), start=1
    ):
        output_files.append(
            synthesize_text(file_prefix, file_index, line["line"], line["voice"])
        )

    return output_files, file_prefix

## Generate play with Gemini

In [None]:
# User prompt describing the play to write; edit it to generate a different story.
PROMPT = """Write an interesting and humorous version of the play Macbeth by William Shakespeare."""

# Ask Gemini for JSON output that follows the Story schema.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=PROMPT,
    config=GenerateContentConfig(
        system_instruction=SYSTEM_INSTRUCTION,
        max_output_tokens=65535,
        temperature=1.5,
        top_p=0.95,
        response_mime_type="application/json",
        response_schema=Story,
    ),
)
# Expected to be a Story instance because response_schema=Story.
# NOTE(review): presumably None when the output cannot be parsed — verify
# against the SDK documentation.
story = response.parsed

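Alternatively, load a pre-generated play from `macbeth_the_sitcom.json` in the working directory. Running this cell replaces any `story` generated above, so skip it if you want to keep your own.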

In [16]:
# Read the file as UTF-8 explicitly rather than relying on the platform's
# default encoding. This overwrites any `story` created earlier.
with open("macbeth_the_sitcom.json", encoding="utf-8") as f:
    story = Story.model_validate_json(f.read())

In [None]:
# Display the Story object (generated or loaded) for inspection.
story

## Get available English voices

In [None]:
# Fetch the casting pool using the default language and show it as a table.
all_voices = list_voices()
pd.DataFrame(all_voices)

## Assign voices to characters

In [None]:
# Voices are drawn at random, so re-running this cell can change the casting.
character_to_voice = create_character_map(story.characters, all_voices)
character_to_voice

## Send play text to Text-to-Speech and output each line as an audio file

The Text-to-Speech API can only create audio with one voice per API call, so we need to create separate files for each line.

In [None]:
# One Text-to-Speech request and one MP3 file per spoken line.
output_files, file_prefix = generate_audio_clips(story, character_to_voice)

## Combine audio files into a single file


In [None]:
# Merge the clips into a single MP3; the individual clip files are deleted.
outfile_name = combine_audio_files(output_files, file_prefix)
print(f"Audio content written to file {outfile_name}")

## Listen to the audio

In [35]:
# Render an inline audio player for the combined file.
Audio(outfile_name)