In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Narrate a Multi-character Story with Text-to-Speech and Gemini

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_langchain.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/blob/main/language/getting-started/intro_palm_api.ipynb">
      <svg xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" preserveAspectRatio="xMidYMid meet" focusable="false" style="pointer-events: none; width: 32px; height: 32px;"><g><path d="M4.54,9.46,2.19,7.1a6.93,6.93,0,0,0,0,9.79l2.36-2.36A3.59,3.59,0,0,1,4.54,9.46Z" style="" fill="#1A73E8"></path><path d="M2.19,7.1,4.54,9.46a3.59,3.59,0,0,1,5.08,0l1.71-2.93h0l-.1-.08h0A6.93,6.93,0,0,0,2.19,7.1Z" style="" fill="#669df6"></path><path d="M11.34,17.46h0L9.62,14.54a3.59,3.59,0,0,1-5.08,0L2.19,16.9a6.93,6.93,0,0,0,9,.65l.11-.09" style="" fill="#669df6"></path><path d="M12,7.1a6.93,6.93,0,0,0,0,9.79l2.36-2.36a3.59,3.59,0,1,1,5.08-5.08L21.81,7.1A6.93,6.93,0,0,0,12,7.1Z" style="" fill="#669df6"></path><path d="M21.81,7.1,19.46,9.46a3.59,3.59,0,0,1-5.08,5.08L12,16.9A6.93,6.93,0,0,0,21.81,7.1Z" style="" fill="#1A73E8"></path></g></svg><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_langchain.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/multimodal_rag_langchain.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>


---

* Author: Holt Skinner
* Created: Jan 2024

---

## Overview

This notebook demonstrates how to use the [Text-to-Speech API](https://cloud.google.com/text-to-speech) to read a story with each character having a distinct voice.

### Objective

This tutorial uses the following Google Cloud AI services and resources:

- [Cloud Text-to-Speech API](https://cloud.google.com/text-to-speech/docs)
- Cloud Storage

The steps performed include:

- Parse the input story text in play script format. (`Character: Lines`)
- Assign each character to a voice.
- Synthesize each line based on character voice.
- Combine audio files into one MP3 file.

Planned expansions:

- Upload audio to Cloud Storage
- Read in story text using [Document AI OCR](https://cloud.google.com/document-ai)
- Convert story to play script format using Gemini.
- Create alternative implementation using LangChain.
- Assign character voices using Gemini.
- Add [Journey voices](https://cloud.google.com/text-to-speech/docs/voice-types#journey_voices) once more voices are supported.


### Costs

This tutorial uses billable components of Google Cloud:

* Text-to-Speech
* Cloud Storage

Learn about [Text-to-Speech pricing](https://cloud.google.com/text-to-speech/pricing)
and [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Getting Started


### Install Vertex AI SDK, other packages and their dependencies

Install the following packages required to execute this notebook.

In [None]:
# Install the packages
# - google-cloud-aiplatform: provides the `vertexai` module imported below
# - google-cloud-texttospeech: Text-to-Speech client library
# - pydub: loads and concatenates the generated MP3 clips
# - gender-guesser: guesses a gender from a character's name
%pip install --user --upgrade -q google-cloud-aiplatform google-cloud-texttospeech pydub gender-guesser

If you're running on a Mac, you will need to install [FFmpeg](https://ffmpeg.org/), which pydub uses to read and write MP3 files.

In [None]:
!brew install ffmpeg

### Colab only: Run the following cell to restart the kernel.

***Colab only***: Run the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the kernel using the button on top.

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
import IPython

# Grab the running IPython application and ask its kernel to shut down;
# passing True requests a restart rather than a plain shutdown.
# Everything defined before this point is gone from the namespace afterwards.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

Set the project and region.

* Please note the **available regions** for Text-to-Speech, see [documentation](https://cloud.google.com/text-to-speech/docs/endpoints)

In [None]:
# Google Cloud project ID, passed to vertexai.init() in the authentication cell.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Location prefix of the Text-to-Speech endpoint, which is built later as
# "{LOCATION}-texttospeech.googleapis.com".
LOCATION = "us"  # @param {type:"string"}

### Authenticating your notebook environment

* If you are using **Colab** to run this notebook, run the cell below and continue.
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env).

In [None]:
import sys
import vertexai

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

    # Initialize Vertex AI for the project only. LOCATION is intentionally not
    # passed here: it holds the Text-to-Speech endpoint prefix (e.g. "us"),
    # which is not a Vertex AI region and would be rejected by vertexai.init().
    vertexai.init(project=PROJECT_ID)

### Download source texts from Google Cloud Storage

This public bucket contains some stories generated by PaLM.

In [None]:
# Copy every .txt story from the public bucket into the current working directory.
! gsutil cp gs://github-repo/speech/storytelling/*.txt .

### Import libraries

In [None]:
from random import choice
import sys
import os
from typing import Dict, List, Tuple

from gender_guesser.detector import Detector
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech
from pydub import AudioSegment

### Define constants

In [None]:
# --- Plain settings -------------------------------------------------------
# Extension stripped from the story filename to build output names.
TXT_EXTENSION = ".txt"
# Duration passed to AudioSegment.silent() for the pause between clips
# (pydub durations are expressed in milliseconds).
SILENCE_LENGTH = 200

# Language used when listing voices and as the synthesis fallback.
DEFAULT_LANGUAGE = "en"
# (voice name, language code) used for narration, scene details, etc.
DEFAULT_VOICE = ("en-GB-Neural2-B", "en-GB")

# --- Shared helpers -------------------------------------------------------
# Guesses a gender from a character name.
gender_detector = Detector()

# Text-to-Speech client bound to the endpoint selected by LOCATION.
tts_client = texttospeech.TextToSpeechClient(
    client_options=ClientOptions(
        api_endpoint=f"{LOCATION}-texttospeech.googleapis.com"
    )
)

### Helper functions

In [None]:
def list_voices_by_gender(
    language_code: str = DEFAULT_LANGUAGE,
) -> Tuple[Dict[str, List], int]:
    """Group the available Neural2 voices by their SSML gender.

    The narrator voice (``DEFAULT_VOICE``) and every non-Neural2 voice are
    left out so that they are never assigned to a character.

    Args:
        language_code: Language filter forwarded to the list-voices request.

    Returns:
        A ``(mapping, count)`` tuple: ``mapping`` goes from the lower-cased
        gender name to a list of ``(voice name, first language code)`` tuples,
        and ``count`` is the total number of voices kept.
    """
    voices_by_gender: Dict[str, List[Tuple[str, str]]] = {}
    kept = 0

    listing = tts_client.list_voices(language_code=language_code)

    for candidate in listing.voices:
        is_neural2 = "Neural2" in candidate.name
        is_narrator_voice = candidate.name == DEFAULT_VOICE[0]

        if is_neural2 and not is_narrator_voice:
            gender_key = texttospeech.SsmlVoiceGender(
                candidate.ssml_gender
            ).name.lower()
            entry = (candidate.name, candidate.language_codes[0])

            voices_by_gender.setdefault(gender_key, []).append(entry)
            kept += 1

    return voices_by_gender, kept


def print_gender_map(gender_to_voices: Dict[str, List]):
    """Print a two-column table with one row per (gender, voice) pair.

    Args:
        gender_to_voices: Mapping of gender name to the voices of that gender.
    """
    table = ["Gender\t| Voice Name"]
    table.extend(
        f"{gender}\t| {voice}"
        for gender, voices in gender_to_voices.items()
        for voice in voices
    )
    print("\n".join(table))
    # Trailing blank lines separate this table from whatever is printed next.
    print("\n")


def create_character_map(
    names: List[str], gender_to_voices: Dict[str, List]
) -> Dict[str, Tuple]:
    """Assign a voice to every character, avoiding reuse while voices remain.

    "Narrator" always gets ``DEFAULT_VOICE``. For every other name the gender
    comes from the hard-coded overrides below or from ``gender_detector``;
    a voice of that gender is then picked at random.

    Unlike a naive implementation, this function:
      * never modifies ``gender_to_voices`` (it works on copies);
      * treats "mostly_male"/"mostly_female" guesses as "male"/"female";
      * falls back to another gender when the preferred one has no unused
        voices, and starts reusing voices only when every voice is taken,
        instead of raising ``IndexError``.

    Args:
        names: Character names to map.
        gender_to_voices: Mapping of gender name to available voice tuples.

    Returns:
        Mapping of character name to its assigned voice tuple.
    """
    # For character names that are not supported by gender_guesser
    CHARACTER_GENDER = {"Macbeth": "male", "Lady Macbeth": "female", "Ariel": "female"}

    # Copies: `all_voices` stays complete, `unused` shrinks as voices are assigned.
    all_voices = {gender: list(voices) for gender, voices in gender_to_voices.items()}
    unused = {gender: list(voices) for gender, voices in all_voices.items()}

    character_to_voice: Dict[str, Tuple] = {}

    for name in names:
        if name == "Narrator":
            character_to_voice[name] = DEFAULT_VOICE
            continue

        if name in CHARACTER_GENDER:
            gender = CHARACTER_GENDER[name]
        else:
            # gender_guesser may answer "mostly_male"/"mostly_female"; fold
            # those into the plain gender so the guess is not thrown away.
            gender = gender_detector.get_gender(name).replace("mostly_", "")

        if unused.get(gender):
            pool = unused[gender]
        else:
            # Indeterminate gender, or preferred gender exhausted:
            # pick at random among the genders that still have unused voices.
            genders_left = [g for g, voices in unused.items() if voices]
            if genders_left:
                pool = unused[choice(genders_left)]
            else:
                # Every voice is already assigned: reuse one, preferring the
                # character's own gender, rather than failing.
                reusable = all_voices.get(gender) or [
                    voice for voices in all_voices.values() for voice in voices
                ]
                character_to_voice[name] = (
                    choice(reusable) if reusable else DEFAULT_VOICE
                )
                continue

        # Assign Voice to Character and don't reuse.
        voice = choice(pool)
        pool.remove(voice)
        character_to_voice[name] = voice

    return character_to_voice


def print_character_map(character_to_voice: Dict[str, Tuple]):
    """Print a two-column table with one row per character and its voice.

    Args:
        character_to_voice: Mapping of character name to assigned voice.
    """
    table = ["Character\t| Voice Name"]
    table.extend(
        f"{character}\t| {assigned}"
        for character, assigned in character_to_voice.items()
    )
    print("\n".join(table))
    # Trailing blank lines separate this table from whatever is printed next.
    print("\n")


def synthesize_text(
    text: str, output: str, voice_name: str, language_code: str = DEFAULT_LANGUAGE
):
    """Synthesize ``text`` with the given voice and save the result as MP3.

    Args:
        text: Plain text to speak.
        output: Path of the file that receives the audio bytes.
        voice_name: Name of the Text-to-Speech voice to use.
        language_code: Language code sent together with the voice name.
    """
    request = {
        "input": texttospeech.SynthesisInput(text=text),
        "voice": texttospeech.VoiceSelectionParams(
            language_code=language_code, name=voice_name
        ),
        "audio_config": texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    }

    mp3_bytes = tts_client.synthesize_speech(**request).audio_content

    # audio_content is binary, hence the "wb" mode.
    with open(output, "wb") as mp3_file:
        mp3_file.write(mp3_bytes)


def combine_audio_files(audio_files: List, filename: str) -> str:
    """Concatenate MP3 clips into one file, separated by short pauses.

    The result starts with a pause, and each clip is followed by another one.
    Every input clip is deleted from disk right after it has been appended.

    Args:
        audio_files: Paths of the MP3 clips, in playback order.
        filename: Prefix for the output file name.

    Returns:
        The name of the combined file, ``"<filename>-complete.mp3"``.
    """
    pause = AudioSegment.silent(duration=SILENCE_LENGTH)
    combined = pause

    for clip_path in audio_files:
        combined = combined + AudioSegment.from_mp3(clip_path) + pause
        # The individual clip is no longer needed once it has been appended.
        os.remove(clip_path)

    destination = f"{filename}-complete.mp3"
    combined.export(destination, format="mp3")
    return destination


def get_characters(input_file) -> List:
    """Read the character names listed in a story file's "Characters:" section.

    The section is expected to look like this: a ``Characters:`` header, one
    line that is skipped (the line directly below the header), then one name
    per line until the first blank line or the end of the file.

    Lines are compared after stripping surrounding whitespace, so a
    whitespace-only line ends the list and a header on the last line without
    a trailing newline is still recognised.

    Args:
        input_file: Path of the story text file (read as UTF-8; assumes the
            sample stories are UTF-8 encoded).

    Returns:
        The character names in file order; empty if the section has no names.

    Raises:
        ValueError: If the file has no "Characters:" header line.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    if "Characters:" not in lines:
        raise ValueError(f'No "Characters:" section found in {input_file}')
    start_line = lines.index("Characters:")

    character_list = []
    # +2 skips the header itself and the line directly below it.
    for line in lines[start_line + 2 :]:
        if not line:
            break
        character_list.append(line)
    return character_list


def parse_file(input_file: str, character_to_voice: Dict[str, Tuple]) -> List[str]:
    """Synthesize every non-blank line of a play-script file to its own MP3.

    Each line is split once on ``": "`` into a speaker and the spoken text:
      * ``Character: text`` lines speak only ``text`` in the character's voice;
      * lines without ``": "`` are spoken in full;
      * lines whose prefix contains "Scene" are spoken in full, including the
        ``": "`` separator (concatenating both halves directly would glue
        them together, e.g. "Scene 1A dark heath").
    Speakers missing from ``character_to_voice`` use ``DEFAULT_VOICE``.

    Args:
        input_file: Path of the script file (read as UTF-8; assumes the sample
            stories are UTF-8 encoded).
        character_to_voice: Mapping of character name to a
            ``(voice name, language code)`` tuple.

    Returns:
        The generated MP3 file names, in script order, named
        ``"<prefix>-<n>.mp3"`` with ``n`` starting at 1.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    output_files = []
    filename = file_prefix(input_file)

    for line in lines:
        stripped = line.strip()
        character, separator, speech = stripped.partition(": ")

        # Skip blank lines (and lines with nothing before the separator)
        if not character:
            continue

        voice = character_to_voice.get(character, DEFAULT_VOICE)

        if not separator or "Scene" in character:
            # No speaker prefix, or a scene heading: read the whole line.
            dialogue = stripped
        else:
            dialogue = speech

        # Numbering counts synthesized lines only, not skipped blank lines.
        output_file = f"{filename}-{len(output_files) + 1}.mp3"
        output_files.append(output_file)
        synthesize_text(dialogue, output_file, voice[0], voice[1])

    return output_files


def file_prefix(input_file: str, extension: str = "") -> str:
    """Return ``input_file`` without its trailing extension.

    Only a suffix at the very end of the name is removed; occurrences of the
    extension elsewhere in the path are kept (``"a.txt.notes.txt"`` becomes
    ``"a.txt.notes"``).

    Args:
        input_file: File name or path.
        extension: Extension to strip, including the dot. When empty (the
            default), ``TXT_EXTENSION`` is used.

    Returns:
        The name without the extension, or ``input_file`` unchanged when it
        does not end with the extension.
    """
    suffix = extension or TXT_EXTENSION
    if suffix and input_file.endswith(suffix):
        return input_file[: -len(suffix)]
    return input_file

## Call the Text-to-Speech API with script content

In [None]:
# Story to narrate; expected to be one of the .txt files in the working directory.
input_file = "Macbeth.txt"  # @param {type:"string"}

# Get voices and genders
gender_to_voices, total_voices = list_voices_by_gender()
print_gender_map(gender_to_voices)

# List all characters
character_list = get_characters(input_file)

# Warn when the cast is larger than the number of candidate voices.
# NOTE(review): this only prints a message and execution continues —
# confirm whether the run should stop here instead.
if len(character_list) > total_voices:
    print(f"Too many characters {len(character_list)}. Max {total_voices}")

# Map Characters to Voices
character_to_voice = create_character_map(character_list, gender_to_voices)
print_character_map(character_to_voice)

# Parse input text and output each line as audio
# (one Text-to-Speech request per non-blank line of the file)
output_files = parse_file(input_file, character_to_voice)

# Combine audio files into a single file
# (the per-line clips are deleted while they are combined)
outfile_name = combine_audio_files(output_files, file_prefix(input_file))
print(f"Audio content written to file {outfile_name}")

In [None]:
# Import here rather than relying on the `import IPython` from the
# kernel-restart cell: that name is gone once the kernel has restarted.
from IPython.display import Audio

# Render an inline audio player for the combined narration.
Audio(outfile_name)