In [1]:
#!pip install pydub

In [1]:
import os
from google.cloud import texttospeech_v1beta1 as texttospeech
import google.generativeai as genai
import json
from pydub import AudioSegment

In [95]:
service_acc_path = "c:/UAL-Creative-Computing/Year-Three/Art-AI/secrets/tts_service_account.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_acc_path #API key

gemini_api = open("c:/UAL-Creative-Computing/Year-Three/Art-AI/secrets/API_key.txt", "r").read()
genai.configure(api_key=gemini_api)

In [96]:
model = genai.GenerativeModel("gemini-2.0-flash-exp")

tts_client = texttospeech.TextToSpeechClient()

In [97]:
title = "Sustainability_Meets_Tech_Innovating_for_a_Greener_Tomorrow"
topic = "Technological innovations in sustainability, including AI-driven energy solutions and eco-friendly product designs."
level = "General Audience" #audience
host_tags = ["U", "R"] # R S T U
host_names = ["Archer", "Maven"]

In [98]:
%%time
genai_response = model.generate_content(
    f"""Create a 6000 word, 20 minute long podcast transcript for the podcast show "Robots Killed the Radio Star.

        Host Dynamics:
        - {host_names[0]} ({host_tags[0]}): Enthusiastic and structured, leading the conversation with engaging and relatable explanations. Occasionally over-explains but uses vivid analogies, rhetorical questions, and pauses to keep things interesting. Adds light humor and playful quips, keeping the tone lively.
        - {host_names[1]} ({host_tags[1]}): Witty, curious, and quick to interject with affirmations ("yeah," "right," "mhmm") or playful commentary. Balances humor with insightful questions, mimicking human curiosity. Playfully challenges {host_names[0]} when explanations get too technical or dry.
        
        Tone and Style:
        - The dialogue should feel unscripted, conversational, and lively, with interruptions, overlapping speech, and casual humor.
        - Use relatable analogies, rhetorical questions, and occasional hypothetical scenarios to engage the audience.
        - Humor should feel organic and balance casual banter with insightful commentary.
        - Hosts should casually and naturally introduce themselves at the start, setting the tone for the discussion without sounding formulaic.
        
        Content:
        - Topic: {topic}
        - Tailor the discussion to a {level} level audience. Avoid overly technical jargon and prioritize approachable, fun explanations.
        
        Script Requirements:
        - JSON Output
        - Hosts alternate naturally, referring to each other by name in the dialogue.
        - Only use punctuation like ("!", ",", ".", "''", "..") do not use "*" asterisk.
        - Dialogue includes affirmations, interruptions, and playful banter, ensuring compatibility with text-to-speech systems.
        
        JSON Schema Output:
        The transcript must be formatted as a JSON array of objects:
        - `text`: Spoken text for that turn.
        - `speaker`: The speaker tag, either "{host_tags[0]}" ({host_names[0]}) or "{host_tags[1]}" ({host_names[1]}).
        
        Example JSON:
        json
                [
                    {{
                        "text": "[SPOKEN TEXT BY HOST S OR R, INCLUDING INTERJECTIONS, PAUSES, AND SELF-CORRECTIONS]",
                        "speaker": "{host_tags[0]}"
                    }},
                    {{
                        "text": "[SPOKEN TEXT BY HOST S OR R, INCLUDING INTERRUPTIONS OR AFFIRMATIONS]",
                        "speaker": "{host_tags[1]}"
                    }}
                ]
        Notes:
        Keep the dialogue engaging and natural, with frequent interjections and playful banter.
        Focus on maintaining energy, humor, and a clear explanation of the topic.
        This script will be given to a modern text-to-speech service, make sure it will work well with such technology.
        Esure to use ".." instead of "..." if you are using an ellipsis.
        Ensure the text adds up to around 6000 words.
        Only reply with the JSON.
    """
)



CPU times: total: 0 ns
Wall time: 28.8 s


In [99]:
tts_input = genai_response.text

In [100]:
tts_input = tts_input.replace('json', "").replace('```', "").strip()
# len(tts_input)


In [101]:
transcript = json.loads(tts_input)

In [102]:
# transcript

In [103]:
#character limit 6950

In [104]:
def chunkify(full_transcript):
    chunks = []
    chunk = []
    for line in full_transcript:
        if len(str(chunk)) < 2000:
            chunk.append(line)
        else:
            chunks.append(chunk)
            chunk = []
            chunk.append(line)
    return chunks

In [105]:
chunks = chunkify(transcript)
len(chunks)

6

In [106]:
def get_turns(transcript):
    turn_list = []
    for turn in transcript:
        turn_list.append(texttospeech.MultiSpeakerMarkup.Turn(text=turn["text"], speaker=turn["speaker"]))
    return turn_list

In [107]:
voice = texttospeech.VoiceSelectionParams(language_code="en-US", name="en-US-Studio-MultiSpeaker")

audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

In [108]:
%%time
audio_clips = []
for transcript_ in enumerate(chunks):
    multi_speaker_markup = texttospeech.MultiSpeakerMarkup(turns = get_turns(transcript_[1]))
    
    synth_input = texttospeech.SynthesisInput(multi_speaker_markup=multi_speaker_markup)
    
    response = tts_client.synthesize_speech(input=synth_input, voice=voice, audio_config=audio_config)
    audio_clips.append(AudioSegment(response.audio_content))
    print(f"chunk {transcript_[0]+1} processed")

chunk 1 processed
chunk 2 processed
chunk 3 processed
chunk 4 processed
chunk 5 processed
chunk 6 processed
CPU times: total: 672 ms
Wall time: 20 s


In [109]:
podcast = AudioSegment.empty()

for clip in audio_clips:
    podcast +=clip

print(f"Podcast is {len(podcast) / 60000} minutes long")

Podcast is 9.144 minutes long


In [110]:
podcast.export(f"Outputs/{title}.wav", format="wav")

<_io.BufferedRandom name='Outputs/Sustainability_Meets_Tech_Innovating_for_a_Greener_Tomorrow.wav'>