In [22]:
import os
from google.cloud import texttospeech_v1beta1 as texttospeech
import google.generativeai as genai
import json


In [23]:
# Path to the service-account JSON for Text-to-Speech. Google Cloud client
# libraries locate credentials through this environment variable
# (Application Default Credentials), so it must be set before a client is built.
service_acc_path = "c:/UAL-Creative-Computing/Year-Three/Art-AI/secrets/tts_service_account.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_acc_path

# Read the Gemini API key inside a context manager so the file handle is closed,
# and strip surrounding whitespace: a trailing newline left by a text editor
# would otherwise become part of the key.
with open("c:/UAL-Creative-Computing/Year-Three/Art-AI/secrets/API_key.txt", "r") as key_file:
    gemini_api = key_file.read().strip()
genai.configure(api_key=gemini_api)

In [24]:
# Client for Google Cloud Text-to-Speech (used for the final audio synthesis).
tts_client = texttospeech.TextToSpeechClient()

# Gemini model handle used to generate the podcast transcript.
model = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")

In [26]:
# Episode settings interpolated into the transcript prompt.
topic, level = "Neural networks in art", "Intermediate"  # subject, target audience

# Hosts: host_names[i] speaks with speaker tag host_tags[i].
# Candidate tags noted by the author: R, S, T, U.
host_names = ["Archer", "Soup"]
host_tags = ["S", "R"]

In [46]:
%%time
genai_response = model.generate_content(
    f"""Create a 100-second long podcast transcript for the podcast show "Robots Killed the Radio Star."

        Host Dynamics:
        - {host_names[0]} ({host_tags[0]}): Enthusiastic and structured, leading the conversation with engaging and relatable explanations. Occasionally over-explains but uses vivid analogies, rhetorical questions, and pauses to keep things interesting. Adds light humor and playful quips, keeping the tone lively.
        - {host_names[1]} ({host_tags[1]}): Witty, curious, and quick to interject with affirmations ("yeah," "right," "mhmm") or playful commentary. Balances humor with insightful questions, mimicking human curiosity. Playfully challenges {host_names[0]} when explanations get too technical or dry.
        
        Tone and Style:
        - The dialogue should feel unscripted, conversational, and lively, with interruptions, overlapping speech, and casual humor.
        - Use relatable analogies, rhetorical questions, and occasional hypothetical scenarios to engage the audience.
        - Humor should feel organic and balance casual banter with insightful commentary.
        - Hosts should naturally introduce themselves at the start, setting the tone for the discussion without sounding formulaic.
        
        Content:
        - Topic: {topic}
        - Tailor the discussion to a {level} audience. Avoid overly technical jargon and prioritize approachable, fun explanations.
        
        Script Requirements:
        - JSON Output: The total JSON output must not exceed 5000 bytes.
        - Character Limit: Keep the total JSON transcript within 2000 characters.
        - Hosts alternate naturally, referring to each other by name in the dialogue.
        - Only use punctuation like ("!", ",", ".", "''", "..") don't use "*"
        - Dialogue includes affirmations, interruptions, and playful banter, ensuring compatibility with text-to-speech systems.
        
        JSON Schema Output:
        The transcript must be formatted as a JSON array of objects:
        - `text`: Spoken text for that turn.
        - `speaker`: The speaker tag, either "{host_tags[0]}" ({host_names[0]}) or "{host_tags[1]}" ({host_names[1]}).
        
        Example JSON:
        json
                [
                    {{
                        "text": "[SPOKEN TEXT BY HOST S OR R, INCLUDING INTERJECTIONS, PAUSES, AND SELF-CORRECTIONS]",
                        "speaker": "{host_tags[0]}"
                    }},
                    {{
                        "text": "[SPOKEN TEXT BY HOST S OR R, INCLUDING INTERRUPTIONS OR AFFIRMATIONS]",
                        "speaker": "{host_tags[1]}"
                    }}
                ]
        Notes:
        
        Keep the dialogue engaging and natural, with frequent interjections and playful banter.
        Ensure the total output is under 5000 bytes and no longer than 2000 characters.
        Focus on maintaining energy, humor, and a clear explanation of the topic.
        Only reply with the JSON
    """,
    tools="code_execution"
)



CPU times: total: 15.6 ms
Wall time: 4.61 s


In [47]:
# Take the text of the Gemini response; the prompt asks for this to be the
# JSON transcript (possibly wrapped in a Markdown code fence).
tts_input = genai_response.text

In [60]:
def strip_code_fence(raw_text):
    """Return ``raw_text`` without a surrounding Markdown code fence.

    Only a leading fence (three backticks), an optional leading ``json``
    language label and a trailing fence are removed. Occurrences of "json"
    or backticks inside the payload (e.g. in the spoken dialogue) are left
    untouched.

    Args:
        raw_text: Text returned by the model.

    Returns:
        The text with the wrapper and surrounding whitespace removed.
    """
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()
    # Drop the language label that follows an opening fence (or stands alone).
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]
    return cleaned.strip()


# Derive from the raw response rather than from tts_input itself, so that
# re-running this cell always produces the same result.
tts_input = strip_code_fence(genai_response.text)
len(tts_input)

1738

In [58]:
# Parse the cleaned model output into a list of turn dictionaries.
transcript = json.loads(tts_input)

# The transcript is produced by an LLM, so fail fast with a clear message if
# it drifted from the requested schema instead of failing later inside the
# Text-to-Speech request.
assert isinstance(transcript, list) and transcript, "expected a non-empty JSON array of turns"
assert all(
    isinstance(turn, dict)
    and isinstance(turn.get("text"), str)
    and turn.get("speaker") in host_tags
    for turn in transcript
), "every turn needs a string 'text' and a 'speaker' tag from host_tags"

In [59]:
# Display the parsed transcript to eyeball the turns and speaker tags before synthesis.
transcript

[{'text': "Hey, I'm Archer!", 'speaker': 'S'},
 {'text': "And I'm Soup, welcome to Robots Killed the Radio Star!",
  'speaker': 'R'},
 {'text': "Today we're diving into neural networks, but like, the artsy kind.",
  'speaker': 'S'},
 {'text': 'Mhmm, not just spreadsheets, right?', 'speaker': 'R'},
 {'text': 'Exactly Soup! Imagine a robot that can paint like Van Gogh, but…not really, I mean it’s inspired, right?',
  'speaker': 'S'},
 {'text': 'So, like, a robot mimic?', 'speaker': 'R'},
 {'text': "More than a mimic! It learns from thousands of paintings, sees patterns, and then, boom, makes its own. It's like a super art student that never sleeps.",
  'speaker': 'S'},
 {'text': "Whoa, that's kinda cool..", 'speaker': 'R'},
 {'text': 'And it’s not just painting, Archer, they can compose music, write poetry, everything!',
  'speaker': 'S'},
 {'text': "Robots writing poems? I'm scared but also intrigued.",
  'speaker': 'R'},
 {'text': "It's wild, Soup. Think of it like a kid learning to dr

In [32]:
def get_turns(transcript):
    """Convert transcript entries into Text-to-Speech multi-speaker turns.

    Args:
        transcript: Iterable of mappings, each providing a "text" and a
            "speaker" key.

    Returns:
        A new list with one ``texttospeech.MultiSpeakerMarkup.Turn`` per
        entry, in the same order as the input.
    """
    return [
        texttospeech.MultiSpeakerMarkup.Turn(text=entry["text"], speaker=entry["speaker"])
        for entry in transcript
    ]

In [33]:
# Bundle the ordered turns built from the transcript into a single markup object.
multi_speaker_markup = texttospeech.MultiSpeakerMarkup(turns=get_turns(transcript))

In [34]:
# Output format: MP3-encoded audio.
audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

# Voice selection: the US-English multi-speaker Studio voice.
voice = texttospeech.VoiceSelectionParams(
    name="en-US-Studio-MultiSpeaker",
    language_code="en-US",
)

# Synthesis input built from the multi-speaker markup rather than plain text.
synth_input = texttospeech.SynthesisInput(multi_speaker_markup=multi_speaker_markup)

In [37]:
# Send the synthesis request; the audio bytes are read from response.audio_content.
response = tts_client.synthesize_speech(input=synth_input, voice=voice, audio_config=audio_config)

In [38]:
# Write the synthesized audio bytes to disk; the confirmation is printed only
# after the file has been written and closed.
with open("output.mp3", "wb") as out:
    out.write(response.audio_content)
print('Audio content written to file "output.mp3"')

Audio content written to file "output.mp3"
