<a href="https://colab.research.google.com/github/HernanDL/AI-Podcast-generator/blob/main/Podcast_Generator_Gemini_Elevenlabs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI-Podcast-Generator - Colab Interface

This notebook provides a user-friendly interface for creating high-quality podcasts from a PDF document. It extracts the document's text, uses Gemini to script a Host/Guest dialogue, and synthesizes the audio with ElevenLabs voices, so you can generate a complete podcast episode and listen to it directly in the notebook. Voices can be customized by editing the `voices` mapping in `PodcastGenerator`.

*Getting started:* Add `GOOGLE_API_KEY` and `ELEVENLABS_API_KEY` to your Colab secrets, run the cells in order, upload a PDF when prompted, and generate a sample podcast episode!


In [None]:
# Advanced PDF to Podcast Generator with GPT and ElevenLabs
# Dependencies Installation
!pip install google-generativeai elevenlabs transformers torch nltk pydub PyMuPDF

Collecting elevenlabs
  Downloading elevenlabs-1.12.1-py3-none-any.whl.metadata (8.0 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting websockets>=11.0 (from elevenlabs)
  Downloading websockets-14.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading elevenlabs-1.12.1-py3-none-any.whl (150 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.5/150.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading websockets-14.0-cp310-cp310-manylinux_2_5_x8

In [None]:
import io
import requests
import time
import os
import google.generativeai as genai
import torch
import fitz  # PyMuPDF
import textwrap
import re
import json
from IPython.display import Audio, display, HTML
from pydub import AudioSegment
from google.colab import files, userdata
from transformers import pipeline
from elevenlabs import ElevenLabs

class PodcastGenerator:
    """Turn a PDF into a Host/Guest podcast.

    The pipeline is: extract text and metadata from the PDF (PyMuPDF), ask
    Gemini to script a dialogue as a JSON array, then hand that dialogue to
    ``PodcastAudioGenerator`` to be voiced through ElevenLabs.
    """

    def __init__(self, genai_api_key, elevenlabs_api_key):
        """Initialize the podcast generator with API keys and models.

        Args:
            genai_api_key: API key passed to ``genai.configure``.
            elevenlabs_api_key: API key used for the ElevenLabs client and for
                the REST calls made by ``PodcastAudioGenerator``.
        """
        # Set up API keys
        genai.configure(api_key=genai_api_key)
        self.client = ElevenLabs(api_key=elevenlabs_api_key)
        self.elevenlabs_api_key = elevenlabs_api_key

        # Initialize summarization pipeline.
        # NOTE(review): no method of this class uses self.summarizer; it is kept
        # because code outside this class may rely on the attribute.
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=0 if torch.cuda.is_available() else -1
        )

        # Available Voices:
        #   Name: Aria, ID: 9BWtsMINqrJLrRacOk9x
        #   Name: Roger, ID: CwhRBWXzGAHq8TQ4Fs17
        #   Name: Sarah, ID: EXAVITQu4vr4xnSDxMaL
        #   Name: Laura, ID: FGY2WhTYpPnrIDTdsKH5
        #   Name: Charlie, ID: IKne3meq5aSn9XLyUdCD
        #   Name: George, ID: JBFqnCBsd6RMkjVDRZzb
        #   Name: Callum, ID: N2lVS1w4EtoT3dr4eOWO
        #   Name: River, ID: SAz9YHcvj6GT2YYXdXww
        #   Name: Liam, ID: TX3LPaxmHKxFdv7VOQHJ
        #   Name: Charlotte, ID: XB0fDUnXU5powFXDhCwa
        #   Name: Alice, ID: Xb7hH8MSUJpSbSDYk0k2
        #   Name: Matilda, ID: XrExE9yKIg1WjnnlVkGX
        #   Name: Will, ID: bIHbv24MWmeRgasZH58o
        #   Name: Jessica, ID: cgSgspJ2msm6clMCkdW9
        #   Name: Eric, ID: cjVigY5qzO86Huf0OWal
        #   Name: Chris, ID: iP95p4xoKVk53GoZ742B
        #   Name: Brian, ID: nPczCjzI2devNBz1zQrb
        #   Name: Daniel, ID: onwK4e9ZLuTAKqWW03F9
        #   Name: Lily, ID: pFZP5JQG7iQjIQuC4Bku
        #   Name: Bill, ID: pqHfZKP75CvOlQylNhV4
        #   Name: Argento2, ID: 3LT2az3wbBTUVgsPmXf7
        #   Name: Adam Stone - late night radio, ID: NFG5qt843uXKj4pFvR7C
        #   Name: Hernan, ID: RKyCsBLJTEkCbmT7KMN5
        #   Name: Edu, ID: SQiV2ueVUnCN3MBXlY6E
        #   Name: Argento1, ID: bX7TNprCvH6K7ssVt8Uh
        #   Name: Mele, ID: x7D4b10Jgx1cmanBoVFO

        # Define voices for different speakers. Keys are lower-case because
        # PodcastAudioGenerator lower-cases the "speaker" field before lookup.
        self.voices = {
            "host": "x7D4b10Jgx1cmanBoVFO",  # Mele
            "guest": "RKyCsBLJTEkCbmT7KMN5" # Hernan
        }

    def clean_extracted_pdf_text(self, text):
        """Remove print-to-PDF residue from extracted text.

        Strips ``file://`` paths, bracketed ``[d/m/yyyy hh:mm:ss]`` timestamps
        and trailing ``hh:mm:ss]`` fragments, then trims surrounding whitespace.

        Args:
            text: Raw text extracted from the PDF.

        Returns:
            The cleaned text.
        """
        cleaned_text = re.sub(r'file://[^\s]+|\[\d{1,2}/\d{1,2}/\d{4} \d{2}:\d{2}:\d{2}\]|\s*\d{2}:\d{2}:\d{2}\]', '', text)
        return cleaned_text.strip()  # Remove leading and trailing whitespace

    def clean_extracted_dialog(self, text):
        """Strip a Markdown code fence (an opening json fence and closing fence) from a model reply.

        Args:
            text: Raw text returned by the model.

        Returns:
            The text without the fence markers, trimmed of surrounding whitespace.
        """
        cleaned_text = re.sub(r'^\s*```json\s*|\s*```$', '', text, flags=re.MULTILINE)
        return cleaned_text.strip()

    def extract_text_and_metadata(self, pdf_path):
        """Read the full text, title and author of a PDF.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            A ``(text, title, author)`` tuple. ``text`` is the concatenated page
            text after ``clean_extracted_pdf_text``; ``title`` and ``author``
            fall back to ``"No Title Found"`` / ``"No Author Found"`` when the
            metadata entry is missing, ``None`` or empty.
        """
        # The context manager closes the document even if extraction raises.
        with fitz.open(pdf_path) as document:
            text = "".join(page.get_text() for page in document)
            metadata = document.metadata or {}

        # `or` instead of a .get() default: the key can be present with an
        # empty or None value, in which case a .get() default is never used.
        title = metadata.get("title") or "No Title Found"
        author = metadata.get("author") or "No Author Found"

        return self.clean_extracted_pdf_text(text), title, author

    def create_dialogue(self, document, title, author):
        """Ask Gemini to turn the document into a Host/Guest dialogue.

        Args:
            document: Full document text inserted into the prompt.
            title: Document title inserted into the prompt.
            author: Document author inserted into the prompt.

        Returns:
            The parsed dialogue: a list of ``{"speaker": ..., "text": ...}``
            objects, as requested by the prompt.

        Raises:
            ValueError: If the model reply is not valid JSON or is not a JSON
                array. Failing here gives a clear message instead of handing
                ``None`` to the audio step.
        """

        prompt = f"""
        Objective: Convert the provided document into an engaging, podcast-style dialogue between a Host and a Guest (portrayed as the document’s author or an expert on its content). The goal is to create a natural, enjoyable, and informative experience for the listeners.

        Document Title: {title}
        Author: {author}
        Document:
        {document}

        #### Podcast Script Requirements

        1. **Document Analysis:**
          - Thoroughly read and analyze the document to grasp the main themes, insights, and tone.
          - Identify any anecdotes, unique points, and the author’s voice that can be highlighted in a conversational format.

        2. **Podcast Setup:**
          - Construct the dialogue as a dynamic conversation between two characters: a Host and a Guest.
          - The Host should introduce the Guest as either the author or an expert on the document’s topic, setting a warm, enthusiastic tone.
          - The Guest shares their expertise, adding depth to the discussion and engaging the audience.

        3. **Tone and Style:**
          - Keep the conversation friendly, insightful, and light-hearted where appropriate.
          - The Host should guide the Guest with thoughtful, open-ended questions that draw out key ideas, interesting stories, and vivid examples from the document.
          - Responses from the Guest should reflect the document’s content directly, summarizing or quoting specific details when relevant.
          - Use light humor or emotional expressions to keep the conversation relatable and captivating.

        4. **Content Structure:**
          - **Introduction:**
            - Start with the Host introducing the episode’s theme and welcoming the Guest, establishing the document’s context.
            - The Host can make a relatable remark or express curiosity about the document’s topic to pique listener interest.
          - **Core Conversation:**
            - The Host should ask relevant, engaging questions that allow the Guest to explain the document’s main points and stories naturally.
            - The Guest responds with clear, concise insights, referring directly to the document’s content, including anecdotes or key details.
            - Add expressions of surprise, humor, or excitement to make the exchange feel authentic.
          - **Conclusion:**
            - The Host wraps up with a summary of the conversation’s highlights, thanks the Guest, and leaves listeners with a memorable takeaway or reflection on the topic.

        5. **Language and Emotional Tone:**
          - If necessary, **translate the entire dialogue into the document’s original language** to preserve authenticity and context.
          - Use expressions that naturally match the document’s content, such as enthusiasm for highlights, a reflective tone for introspective moments, or light humor for relatable scenarios.

        6. **Formatting:**
          - Format the output as a JSON array with each conversational turn as an object containing two keys:
            - `"speaker"` (either `"Host"` or `"Guest"`) and
            - `"text"` (the dialogue content).
          - Keep each turn short and focused (2-4 sentences) to maintain an engaging, dynamic flow.

        7. **Length:**
          - Limit the entire conversation to about a 10-minute audio length, focusing on key insights while keeping the exchange lively and concise.
        """

        model = genai.GenerativeModel('models/gemini-1.5-pro')
        response = model.generate_content(contents=prompt)

        # Clean up the response by removing markdown formatting if present
        cleaned_response = self.clean_extracted_dialog(response.text)
        print(cleaned_response)

        try:
            dialogue = json.loads(cleaned_response)
        except json.JSONDecodeError as e:
            raise ValueError(f"Model response is not valid JSON: {e}") from e

        # The audio step iterates over turn objects, so anything but a list is unusable.
        if not isinstance(dialogue, list):
            raise ValueError(
                f"Model response must be a JSON array of turns, got {type(dialogue).__name__}"
            )
        return dialogue

    def generate_audio(self, dialogue, output_path="podcast_output"):
        """Voice the dialogue with ElevenLabs.

        Args:
            dialogue: List of ``{"speaker", "text"}`` turns.
            output_path: Directory where the per-segment files are written.

        Returns:
            Whatever ``PodcastAudioGenerator.generate_audio`` returns (the
            combined audio).
        """
        audio_generator = PodcastAudioGenerator(api_key=self.elevenlabs_api_key, voices=self.voices)
        # Forward output_path; it was previously accepted but silently ignored.
        return audio_generator.generate_audio(dialogue, output_path=output_path)

class PodcastAudioGenerator:
    """Voice a scripted dialogue through the ElevenLabs text-to-speech REST API."""

    def __init__(self, api_key, voices, request_timeout=120):
        """Store credentials and the speaker-to-voice mapping.

        Args:
            api_key: ElevenLabs API key sent in the ``xi-api-key`` header.
            voices: Dictionary mapping lower-case speaker names to voice IDs.
            request_timeout: Timeout in seconds passed to ``requests.post`` so a
                stalled connection cannot hang the notebook indefinitely.
        """
        self.api_key = api_key
        self.voices = voices  # A dictionary mapping speaker names to voice IDs
        self.request_timeout = request_timeout

    def generate_audio(self, dialogue, output_path="podcast_output"):
        """Convert dialogue to audio using ElevenLabs and merge the segments.

        Each turn is synthesized, written to ``output_path`` as
        ``segment_<index>_<speaker>.mp3`` and appended to one combined track.
        Processing is best-effort: a turn that is malformed, has no configured
        voice, or fails to synthesize is reported with ``print`` and skipped.

        Args:
            dialogue: Iterable of ``{"speaker": ..., "text": ...}`` turns.
            output_path: Directory for the per-segment files (created if needed).

        Returns:
            The combined ``AudioSegment`` (empty if no turn succeeded).
        """
        os.makedirs(output_path, exist_ok=True)

        combined = AudioSegment.empty()  # Start with an empty AudioSegment

        for i, entry in enumerate(dialogue):
            # Model output is not guaranteed to follow the schema; skip bad
            # turns the same way other per-segment failures are skipped.
            try:
                text = entry["text"]
                speaker = entry["speaker"].strip().lower()
            except (KeyError, TypeError, AttributeError) as e:
                print(f"Error: malformed dialogue entry at segment {i}: {e!r}")
                continue

            if speaker not in self.voices:
                print(f"Error: No voice found for speaker '{speaker}'")
                continue

            try:
                # Generate audio using the ElevenLabs API
                audio = self._generate_audio_from_text(text, self.voices[speaker])

                if audio:
                    filename = os.path.join(output_path, f"segment_{i:03d}_{speaker}.mp3")
                    with open(filename, 'wb') as f:
                        f.write(audio)
                    segment = AudioSegment.from_file(filename)
                    combined += segment  # Concatenate the current segment
                    time.sleep(1)  # Rate limiting
                else:
                    print(f"Failed to generate audio for segment {i}")

            except Exception as e:
                print(f"Error generating audio for segment {i}: {str(e)}")
                continue

        return combined

    def _generate_audio_from_text(self, text, voice_id):
        """Call the ElevenLabs text-to-speech endpoint for one piece of text.

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice ID used in the request URL.

        Returns:
            The response body as bytes on HTTP 200, otherwise ``None`` (the
            status code and response text are printed).
        """
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"

        headers = {
            "Accept": "audio/mpeg",
            "xi-api-key": self.api_key,
            "Content-Type": "application/json"
        }

        data = {
            "text": text,
            "model_id": "eleven_multilingual_v2",  # More info: https://help.elevenlabs.io/hc/en-us/articles/17883183930129-What-models-do-you-offer-and-what-is-the-difference-between-them
            "voice_settings": {                    # More info: https://elevenlabs.io/docs/product/speech-synthesis/voice-settings
                "stability": 0.3,           # Stability
                "similarity_boost": 0.7,    # Similarity
                "style": 0.0,               # Style Exaggeration
                "use_speaker_boost": True
                }
        }

        # Without a timeout, requests waits forever on a stalled connection.
        response = requests.post(url, json=data, headers=headers, timeout=self.request_timeout)

        if response.status_code == 200:
            return response.content  # Return the audio bytes
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return None  # Return None if there was an error

def create_interface():
    """Render the introductory HTML banner for the notebook."""
    # NOTE(review): the feature list says "GPT-powered", while the dialogue in
    # this file is generated with Gemini; the text is kept as-is here.
    banner_markup = """
    <div style="background-color: #f8f9fa; padding: 20px; border-radius: 5px;">
        <h2>🎙️ PDF to Podcast Generator</h2>
        <p>Upload a PDF document to generate a professional podcast-style summary with natural voices.</p>
        <p><strong>Features:</strong></p>
        <ul>
            <li>PDF text extraction and processing</li>
            <li>GPT-powered dialogue generation</li>
            <li>Professional voice synthesis using ElevenLabs</li>
            <li>Natural conversation flow</li>
            <li>Engaging content presentation</li>
        </ul>
    </div>
    """
    banner = HTML(banner_markup)
    display(banner)


# Show the introductory banner at the top of the cell output.
create_interface()

# Get API keys from the Colab secrets store (google.colab.userdata), so no
# credential is hardcoded in the notebook.
genai_api_key = userdata.get('GOOGLE_API_KEY')
elevenlabs_api_key = userdata.get('ELEVENLABS_API_KEY')

# Initialize generator. PodcastGenerator.__init__ also builds the
# facebook/bart-large-cnn summarization pipeline, which is what produces the
# model download progress bars in this cell's output.
generator = PodcastGenerator(genai_api_key, elevenlabs_api_key)



config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
# Upload PDF file
print("\nUpload your PDF file:")
uploaded = files.upload()

# Stop the cell on invalid input. Printing and carrying on would either fail
# with an IndexError just below (no upload) or hand a non-PDF file to the
# extraction step in a later cell.
if not uploaded:
    raise ValueError("No file was uploaded.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
if not filename.lower().endswith('.pdf'):
    raise ValueError("Please upload a PDF file.")


Upload your PDF file:


Saving Hernan de Lahitte - Everest y las Lecciones de Vida.pdf to Hernan de Lahitte - Everest y las Lecciones de Vida.pdf


In [None]:
# Extract the text plus title/author metadata from the uploaded PDF.
print("\n📑 Processing PDF...")
extracted_text, pdf_title, pdf_author = generator.extract_text_and_metadata(filename)

# Short summary only; the full text is too long to print usefully.
print(
    f"\nTitle: {pdf_title}",
    f"Author: {pdf_author}",
    f"Total Characters: {len(extracted_text)}",
    sep="\n",
)



📑 Processing PDF...

Title: Hernan de Lahitte - Everest y las Lecciones de Vida
Author: herna
Total Characters: 30528


In [None]:
# Ask the generator to script the Host/Guest conversation from the PDF text.
print("\n🤖 Generating dialogue...")
dialogue = generator.create_dialogue(
    document=extracted_text,
    title=pdf_title,
    author=pdf_author,
)

# Heading only: the dialogue text itself is printed inside create_dialogue.
print("\n📝 Generated Dialogue:")


🤖 Generating dialogue...
[
  {
    "speaker": "Host",
    "text": "¡Hola a todos y bienvenidos a un nuevo episodio! Hoy nos acompaña Hernán de Lahitte, un montañista experimentado, para hablar sobre su inspiradora experiencia: 'Everest y las Lecciones de Vida'.  Hernán, ¡bienvenido! Me intriga mucho saber qué lecciones de vida se pueden aprender en la cima del mundo."
  },
  {
    "speaker": "Guest",
    "text": "Gracias por la invitación.  Escalar el Everest fue un desafío inmenso, tanto física como mentalmente.  Aprendí la importancia de la perseverancia, la preparación meticulosa y el trabajo en equipo.  Como escribí, '...cada paso era una victoria, cada respiro una conquista'."
  },
  {
    "speaker": "Host",
    "text": "Increíble. Mencionas la preparación, ¿podrías contarnos un poco más sobre eso? ¿Qué tipo de entrenamiento realizaste para enfrentar semejante desafío?"
  },
  {
    "speaker": "Guest",
    "text": "Claro que sí.  El entrenamiento fue extenuante, años de preparaci

In [None]:
# Generate audio
print("\n🎵 Generating audio file...")
combined_audio = generator.generate_audio(dialogue)

# The title comes from PDF metadata, so it may contain path separators or other
# characters that are invalid in a filename, or be empty. Keep word characters,
# spaces, dots and hyphens; replace everything else.
safe_title = re.sub(r'[^\w\-. ]+', '_', str(pdf_title or '')).strip(' .') or 'untitled'
audio_file = f"podcast_{safe_title}.mp3"

# Export the combined audio to a single file
combined_audio.export(audio_file, format="mp3")  # Change format as needed

display(Audio(audio_file))


🎵 Generating audio file...


In [None]:
# Hand the exported podcast file to Colab's files.download helper.
files.download(audio_file)

### Extras (API listings)

In [None]:
from elevenlabs import ElevenLabs

# Build a standalone ElevenLabs client with the key loaded earlier; the model
# listing returned by the last expression is shown as the cell output.
client = ElevenLabs(api_key=elevenlabs_api_key)
client.models.get_all()
