In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create a multi-speaker podcast with Gemini Controlled Generation & Text-to-Speech

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Faudio%2Fspeech%2Fuse-cases%2Fpodcast%2Fmulti-speaker-podcast.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>


<table align="left">
  <td style="text-align: center">
<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/use-cases/podcast/multi-speaker-podcast.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>
</td>
</table>

| | |
|-|-|
| Author(s) |  [Souvik Mukherjee](https://github.com/talktosauvik/), [Holt Skinner](https://github.com/holtskinner) |

## Overview

This notebook demonstrates how to use the [Gemini API in Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/overview) to write the script for an engaging multi-speaker podcast, and the studio voices of the [Text-to-Speech API](https://cloud.google.com/text-to-speech) to turn that script into audio. This can be useful for creating interviews, interactive storytelling, video games, e-learning platforms, and accessibility solutions.

The steps performed include:

- Download content from a Google Cloud Storage bucket
- Summarize the content using Gemini 1.5 Pro
- Return a pre-defined JSON schema using Controlled Generation
- Create a multi-speaker conversation from the JSON script using Text-to-Speech multi-speaker markup
- Generate the audio as an MP3 file

For a more advanced example using LangGraph, check out [Build Your Own AI Podcasting Agent with LangGraph & Gemini: AI-Powered Podcast Creation with Automated Research, Writing, and Refinement](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/orchestration/langgraph_gemini_podcast.ipynb)

## Get started

### Install Vertex AI SDK, other packages and their dependencies

Install the following packages required to execute this notebook.

In [None]:
# Install (or upgrade) the Vertex AI SDK and the Cloud Text-to-Speech client.
# `%pip` targets the environment of the running kernel; `-q` keeps the output short.
%pip install --user --upgrade -q google-cloud-aiplatform google-cloud-texttospeech

If you're running on a Mac, you will need to install [FFmpeg](https://ffmpeg.org/).

In [None]:
import platform

# Check if the system is macOS ("Darwin" is the value platform.system()
# reports there); every other platform skips the install.
if platform.system() == "Darwin":
    # Install using Homebrew (this shells out to the `brew` command)
    !brew install ffmpeg

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Fetch the running kernel application and ask it to shut down with
# restart=True, so the packages installed above can be imported afterwards.
kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


Set the project and region.

* Text-to-Speech is only available in certain regions; see the [documentation](https://cloud.google.com/text-to-speech/docs/endpoints) for the supported endpoints.

In [None]:
import os

# Replace the placeholder with your own Google Cloud project ID. If it is left
# untouched (or blank), fall back to the GOOGLE_CLOUD_PROJECT environment
# variable instead of silently targeting someone else's project.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    print("PROJECT_ID is not set. Enter your Google Cloud project ID above.")

# Prefix of the Text-to-Speech endpoint ("<TTS_LOCATION>-texttospeech.googleapis.com").
TTS_LOCATION = "us"  # @param {type:"string"}
# Region used when initializing the Vertex AI SDK.
VERTEXAI_LOCATION = "us-central1"  # @param {type:"string"}

### Authenticating your notebook environment

* If you are using **Colab** to run this notebook, run the cell below and continue.
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env).

In [None]:
import sys

# Additional authentication is required for Google Colab. The check relies on
# the `google.colab` module already being loaded, which is expected only
# inside a Colab runtime.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

# The gcloud commands below sit outside the `if`, so they run in every environment.
# Point the gcloud CLI at the project configured above.
! gcloud config set project {PROJECT_ID}
# Use the same project as the quota project for Application Default Credentials.
! gcloud auth application-default set-quota-project {PROJECT_ID}
# Create Application Default Credentials; -q runs gcloud in quiet mode.
! gcloud auth application-default login -q

Initialize the [Vertex AI SDK](https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk)

In [None]:
import vertexai

# Bind the Vertex AI SDK to the project and region chosen earlier.
vertexai.init(
    location=VERTEXAI_LOCATION,
    project=PROJECT_ID,
)

### Import libraries

In [None]:
import json

from IPython.display import Audio
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part

### Define constants

In [None]:
# Language code and voice name sent to Text-to-Speech when synthesizing.
DEFAULT_LANGUAGE = "en-US"
STUDIO_VOICE = "en-US-Studio-MultiSpeaker"

SYSTEM_INSTRUCTION = """You are a podcast writer. Your task is to generate a short podcast-style dialogue between two speakers, Speaker R and Speaker S"""

# Controlled-generation schema: the model must return an object with a
# "dialogue" array whose items each carry a "speaker" and a "line".
response_schema = {
    "type": "object",
    "properties": {
        "dialogue": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    # Restrict the label to the two speakers named in the
                    # system instruction, so the model cannot invent a third
                    # speaker that the synthesis step would not expect.
                    "speaker": {"type": "string", "enum": ["R", "S"]},
                    "line": {"type": "string"},
                },
                "required": ["speaker", "line"],
            },
        }
    },
    "required": ["dialogue"],
}

# Generation settings that force a JSON reply shaped like `response_schema`.
json_generation_config = GenerationConfig(
    response_mime_type="application/json",
    response_schema=response_schema,
    max_output_tokens=8192,
    temperature=1,
    top_p=0.95,
)

# Gemini model shared by the helper functions below.
model = GenerativeModel(
    "gemini-1.5-pro",
    generation_config=json_generation_config,
    system_instruction=SYSTEM_INSTRUCTION,
)

### Helper functions

In [None]:
def generate_podcast_script(
    gcs_uri: str, mime_type: str = "application/pdf"
) -> list[dict]:
    """Generates a podcast script using Gemini with controlled JSON output.

    Args:
        gcs_uri: Cloud Storage URI of the source document attached to the prompt.
        mime_type: MIME type of the source document. Defaults to PDF.

    Returns:
        The list stored under the ``"dialogue"`` key of the model's JSON reply;
        the prompt asks for objects with a ``"speaker"`` and a ``"line"``.
        An empty list is returned, after printing the reason, when the reply
        has no readable text, is not valid JSON, or has no ``"dialogue"`` list.
        Errors raised by the API call itself are not swallowed.
    """
    prompt = """
    The dialogue should be engaging and natural, with each speaker contributing roughly equal amounts.  Return the dialogue as a JSON array of objects, where each object has a 'speaker' (either 'R' or 'S') and a 'line' property.

    Use the following information to create the content for the podcast dialogue:
    """

    try:
        response = model.generate_content(
            [prompt, Part.from_uri(gcs_uri, mime_type=mime_type)]
        )
        generated_json = json.loads(response.text)
        dialogue = generated_json["dialogue"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError and the error `response.text`
        # raises when the response carries no text (e.g. a blocked reply).
        # TypeError covers a JSON reply that is not an object.
        print(f"Error generating or parsing JSON script: {e}. Returning empty list.")
        return []

    if not isinstance(dialogue, list):
        print(
            "Error generating or parsing JSON script: 'dialogue' is not a list. "
            "Returning empty list."
        )
        return []
    return dialogue


def synthesize_podcast(dialogue: list[dict], output_file: str):
    """Turns a scripted dialogue into one MP3 file via MultiSpeakerMarkup.

    Args:
        dialogue: Script entries; each must provide a ``"line"`` (the text to
            speak) and a ``"speaker"`` (the speaker label for that turn).
        output_file: Path the synthesized audio bytes are written to.
    """
    # Talk to the Text-to-Speech endpoint for the configured location.
    endpoint = f"{TTS_LOCATION}-texttospeech.googleapis.com"
    client = texttospeech.TextToSpeechClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # One markup turn per script entry, in script order.
    turns = [
        texttospeech.MultiSpeakerMarkup.Turn(
            text=entry["line"], speaker=entry["speaker"]
        )
        for entry in dialogue
    ]
    markup = texttospeech.MultiSpeakerMarkup(turns=turns)

    synthesis_input = texttospeech.SynthesisInput(multi_speaker_markup=markup)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code=DEFAULT_LANGUAGE, name=STUDIO_VOICE
    )
    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
    result = client.synthesize_speech(
        input=synthesis_input,
        voice=voice_params,
        audio_config=mp3_config,
    )

    # Persist the raw audio bytes returned by the service.
    with open(output_file, "wb") as audio_file:
        audio_file.write(result.audio_content)
    print(f'Audio content written to file "{output_file}"')

## Call the Text-to-Speech API with script content

### Generate the podcast script from the content

For this example, we will be using the [Gemini 1.5 paper from arXiv](https://arxiv.org/abs/2403.05530).

In [None]:
# Cloud Storage location of the source PDF that the podcast is based on.
source_document_uri = "gs://github-repo/2403_05530.pdf"

dialogue = generate_podcast_script(source_document_uri)
print("Generated Dialogue:", dialogue)

### Write the audio content into the output file

In [None]:
# Synthesize audio only when script generation produced at least one turn.
if not dialogue:
    print("No dialogue generated. Skipping audio synthesis.")
else:
    output_filename = "podcast_output.mp3"
    synthesize_podcast(dialogue, output_filename)

### Listen to the audio file

In [None]:
# `output_filename` is only assigned when a dialogue was generated and
# synthesized, so guard playback the same way to avoid a NameError.
if dialogue:
    display(Audio(output_filename))
else:
    print("No audio was generated. Nothing to play.")