<a href="https://colab.research.google.com/github/HernanDL/AI-Podcast-generator/blob/main/Podcast_Generator_Gemini_Elevenlabs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI-Podcast-Generator - Colab Interface

This notebook provides a user-friendly interface for creating high-quality podcasts from text input. With options for voice customization and automated script generation, you can easily generate complete podcast episodes. Simply input your text, adjust the settings, and listen to the generated content in real-time.

*Getting started:* Upload your text input, choose a voice profile, and generate a sample podcast episode!


In [None]:
# Advanced PDF to Podcast Generator with GPT and ElevenLabs
# Dependencies Installation
!pip install google-generativeai elevenlabs torch nltk pydub PyMuPDF

Collecting elevenlabs
  Downloading elevenlabs-1.12.1-py3-none-any.whl.metadata (8.0 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting websockets>=11.0 (from elevenlabs)
  Downloading websockets-14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading elevenlabs-1.12.1-py3-none-any.whl (150 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.5/150.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading websockets-14.1-cp310-cp310-manylinux_2_5_x8

In [None]:
import io
import requests
import time
import os
import google.generativeai as genai
import torch
import fitz  # PyMuPDF
import textwrap
import re
import json
from IPython.display import Audio, display, HTML
from pydub import AudioSegment
from google.colab import files, userdata
from elevenlabs import ElevenLabs

class PodcastGenerator:
    def __init__(self, genai_api_key, elevenlabs_api_key):
        """Initialize the podcast generator with API keys and models"""
        # Set up API keys
        genai.configure(api_key=genai_api_key)
        self.client = ElevenLabs(api_key=elevenlabs_api_key)
        self.elevenlabs_api_key = elevenlabs_api_key
        '''
        English:
        Spanish (Arg)
        Voice: voice_id='9oPKasc15pfAbMr7N6Gs' name='Valeria'
        Voice: voice_id='D7fO4LMKxU3UYXGDpTnA' name='Maxi'
        '''
        # Define voices for different speakers
        self.voices = {
            "speaker1": "cgSgspJ2msm6clMCkdW9", # Jessica
            "speaker2": "iP95p4xoKVk53GoZ742B"  # Chris
        }

    def clean_extracted_pdf_text(self, text):
        # Remove file paths, timestamps, and trailing time patterns using regex
        # This regex pattern matches file paths, timestamps, and time formats followed by a closing bracket
        cleaned_text = re.sub(r'file://[^\s]+|\[\d{1,2}/\d{1,2}/\d{4} \d{2}:\d{2}:\d{2}\]|\s*\d{2}:\d{2}:\d{2}\]', '', text)
        return cleaned_text.strip()  # Remove leading and trailing whitespace

    def clean_extracted_dialog(self, text):
        # Use regex to remove the ```json at the start and ``` at the end
        cleaned_text = re.sub(r'^\s*```json\s*|\s*```$', '', text, flags=re.MULTILINE)
        return cleaned_text.strip()

    def extract_text_and_metadata(self, pdf_path):
        # Open the PDF file
        document = fitz.open(pdf_path)

        # Extract text
        text = ""
        for page in document:
            text += page.get_text()  # Extract text from each page

        # Access metadata
        metadata = document.metadata
        title = metadata.get("title", "No Title Found")
        author = metadata.get("author", "No Author Found")

        # Close the document
        document.close()

        return self.clean_extracted_pdf_text(text), title, author

    def create_dialogue(self, document, title, author):

        prompt = f"""
        ### Objective

        Convert the provided document into an engaging, podcast-style dialogue between two presenters.
        One speaker acts as a facilitator, while the other is portrayed as an expert offering insight into the document’s themes and content.
        Neither is the author of the document, but the second presenter provides in-depth knowledge, commentary, and interpretations.

        #### **Podcast Episode Details**

        - **Episode Title:** *{title}*
        - **Document Title:** *{title}*
        - **Author:** *{author}*
        - **Document:** *{document}*

        #### **Podcast Script Requirements**

        1. **Document Analysis:**
          - Read and analyze the document thoroughly to identify main themes, anecdotes, and key insights.
          - Highlight unique elements or related information that enriches the discussion.

        2. **Conversation Setup:**
          - The first presenter introduces the episode and its theme with enthusiasm, setting the stage for the second presenter to explore the document in detail.
          - The second presenter uses their expertise to interpret the document, sharing engaging insights and practical takeaways for the listeners.
          - Do not include any names for the presenters.

        3. **Tone and Style:**
          - Keep the conversation lively, insightful, and entertaining.
          - The first presenter prompts with thoughtful, open-ended questions to guide the second presenter through various aspects of the text.
          - Responses from the second presenter should reflect the document's content, summarizing or quoting details where relevant.
          - Incorporate humor, excitement, and emotional responses to keep the audience engaged.
          - Use short sentences of less than 15 words suitable for good dialog flow.
          - Include filler words like "amazing", "Oh!", "ah" and "you know..." for a natural conversational flow.
          - Highlight emotional moments, lessons, and any intriguing resolutions from the document.
          - Refer to the author of the document by the first name.

        4. **Key Deliverables:**
          - Explore the document’s themes, anecdotes, and the emotional journey it represents.
          - Discuss the practical and reflective elements of the content.
          - The first presenter wraps up with a summary and leaves listeners with an engaging reflection or takeaway.

        5. **Formatting:**
          - Format the output as a JSON array with each conversational turn represented by an object containing two keys:
            `"speaker"` (either `"speaker1"` or `"speaker2"`) and `"text"` (the dialogue content).
          - Do not name the speaker, just simply use generically speaker1 or speaker2 as values for speaker key.

        6. **Length:**
          - Limit the entire conversation to about a 30000 words, focusing on key insights while keeping the exchange lively and concise.
          - Translate everythng to the same languague of the provided document.
        """

        model = genai.GenerativeModel('models/gemini-1.5-pro')
        response = model.generate_content(contents=prompt)

        # Clean up the response by removing markdown formatting if present
        cleaned_response = self.clean_extracted_dialog(response.text)

        try:
            return json.loads(cleaned_response)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

    def generate_audio(self, dialogue, output_path="podcast_output"):
        audio_generator = PodcastAudioGenerator(api_key=self.elevenlabs_api_key, voices=self.voices)
        audio_files = audio_generator.generate_audio(dialogue)
        return audio_files

class PodcastAudioGenerator:
    def __init__(self, api_key, voices):
        self.api_key = api_key
        self.voices = voices  # A dictionary mapping speaker names to voice IDs

    def generate_audio(self, dialogue, output_path="podcast_output"):
        """Convert dialogue to audio using ElevenLabs"""
        os.makedirs(output_path, exist_ok=True)

        """Merge multiple audio files into a single file."""
        combined = AudioSegment.empty()  # Start with an empty AudioSegment

        for i, entry in enumerate(dialogue):
            text = entry["text"]
            speaker = entry["speaker"].lower()

            if speaker not in self.voices:
                print(f"Error: No voice found for speaker '{speaker}'")
                continue

            try:
                # Generate audio using the ElevenLabs API
                audio = self._generate_audio_from_text(text, self.voices[speaker])

                if audio:
                    filename = f"{output_path}/segment_{i:03d}_{speaker}.mp3"
                    with open(filename, 'wb') as f:
                        f.write(audio)
                    segment = AudioSegment.from_file(filename)
                    combined += segment  # Concatenate the current segment
                    time.sleep(1)  # Rate limiting
                else:
                    print(f"Failed to generate audio for segment {i}")

            except Exception as e:
                print(f"Error generating audio for segment {i}: {str(e)}")
                continue

        return combined

    def _generate_audio_from_text(self, text, voice_id):
        """Helper method to call the ElevenLabs API and generate audio"""
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"

        headers = {
            "Accept": "audio/mpeg",
            "xi-api-key": self.api_key,
            "Content-Type": "application/json"
        }

        data = {
            "text": text,
            "model_id": "eleven_multilingual_v2",  # More info: https://help.elevenlabs.io/hc/en-us/articles/17883183930129-What-models-do-you-offer-and-what-is-the-difference-between-them
            "voice_settings": {                    # Mode Info: https://elevenlabs.io/docs/product/speech-synthesis/voice-settings
                "stability": 0.3,           # Stability
                "similarity_boost": 0.7,    # Similarity
                "style": 0.0,               # Style Exaggeration
                "use_speaker_boost": True
                }
        }

        response = requests.post(url, json=data, headers=headers)

        if response.status_code == 200:
            return response.content  # Return the audio bytes
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return None  # Return None if there was an error

def create_interface():
    """Create an interactive interface for the notebook"""
    display(HTML("""
    <div style="background-color: #f8f9fa; padding: 20px; border-radius: 5px;">
        <h2>🎙️ PDF to Podcast Generator</h2>
        <p>Upload a PDF document to generate a professional podcast-style summary with natural voices.</p>
        <p><strong>Features:</strong></p>
        <ul>
            <li>PDF text extraction and processing</li>
            <li>GPT-powered dialogue generation</li>
            <li>Professional voice synthesis using ElevenLabs</li>
            <li>Natural conversation flow</li>
            <li>Engaging content presentation</li>
        </ul>
    </div>
    """))


create_interface()

# Get API keys
genai_api_key = userdata.get('GOOGLE_API_KEY')
elevenlabs_api_key = userdata.get('ELEVENLABS_API_KEY')

# Initialize generator
generator = PodcastGenerator(genai_api_key, elevenlabs_api_key)



### Upload PDF/TXT
#### Speech generation will be in the document language. However this beta version is better in US English.

In [14]:
# Upload PDF file
print("\nUpload your PDF or txt file:")
uploaded = files.upload()

if not uploaded:
    print("No file was uploaded.")

filename = list(uploaded.keys())[0]
if not filename.lower().endswith('.pdf') or not filename.lower().endswith('.txt'):
   print("Please upload a PDF or txt file.")

# Process PDF
print("\n📑 Processing file...")
extracted_text, pdf_title, pdf_author = generator.extract_text_and_metadata(filename)

print(f"\nTitle: {pdf_title}")
print(f"Author: {pdf_author}")
print(f"Total Characters: {len(extracted_text)}")
#print(f"Text: {extracted_text}")


Upload your PDF or txt file:


Saving Everest and Life Lessons.pdf to Everest and Life Lessons (1).pdf
Please upload a PDF or txt file.

📑 Processing file...

Title: Everest and Life Lessons
Author: Hernan de Lahitte
Total Characters: 22628


In [None]:
# Create dialogue
print("\n🤖 Generating dialogue...")
dialogue = generator.create_dialogue(extracted_text, pdf_title, pdf_author)

# Display dialogue
print("\n📝 Generated Dialogue:")
print(json.dumps(dialogue, indent=2))


🤖 Generating dialogue...
[
  {
    "speaker": "speaker1",
    "text": "Welcome, everyone, to another gripping episode! Today, we're diving into an incredible story of resilience and reflection from 'Everest and Life Lessons'."
  },
  {
    "speaker": "speaker2",
    "text": "Absolutely! This account is a powerful blend of adventure, introspection, and the raw reality of facing nature's ultimate test."
  },
  {
    "speaker": "speaker1",
    "text": "The narrative begins with a dramatic rescue. Can you set the scene for us?"
  },
  {
    "speaker": "speaker2",
    "text": "Imagine this: the narrator, already struggling with altitude sickness, is descending through the Western Cwm. His legs give way. It's a terrifying moment. He's forced to call for a helicopter evacuation."
  },
  {
    "speaker": "speaker1",
    "text": "Oh wow! That sounds absolutely harrowing. What led to this sudden collapse?"
  },
  {
    "speaker": "speaker2",
    "text": "It seems a rushed approach to base camp 

In [15]:
# Generate audio
print("\n🎵 Generating audio file...")
combined_audio = generator.generate_audio(dialogue)
audio_file = f"podcast_{pdf_title}.mp3"

# Export the combined audio to a single file
combined_audio.export(audio_file, format="mp3")  # Change format as needed

display(Audio(audio_file))


🎵 Generating audio file...
Error: 401 - {"detail":{"status":"quota_exceeded","message":"This request exceeds your  quota. You have 0 credits remaining, while 52 credits are required for this request."}}
Failed to generate audio for segment 20
Error: 401 - {"detail":{"status":"quota_exceeded","message":"This request exceeds your  quota. You have 0 credits remaining, while 180 credits are required for this request."}}
Failed to generate audio for segment 21
Error: 401 - {"detail":{"status":"quota_exceeded","message":"This request exceeds your  quota. You have 0 credits remaining, while 109 credits are required for this request."}}
Failed to generate audio for segment 22
Error: 401 - {"detail":{"status":"quota_exceeded","message":"This request exceeds your  quota. You have 0 credits remaining, while 160 credits are required for this request."}}
Failed to generate audio for segment 23
Error: 401 - {"detail":{"status":"quota_exceeded","message":"This request exceeds your  quota. You have 0

In [None]:
files.download(audio_file)

### Extras (API listings)

In [None]:
from elevenlabs import ElevenLabs

client = ElevenLabs(api_key=elevenlabs_api_key)

client.models.get_all()


Voices


In [None]:
from elevenlabs import ElevenLabs

# Initialize the ElevenLabs API client
client = ElevenLabs(api_key=elevenlabs_api_key)

# Function to retrieve all voices
def get_all_voices():
    try:
        # Use the correct method to get voices
        voices = client.voices.get_all()
        return voices
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
voices = get_all_voices()

if voices:
    # Iterate through the list of Voice objects within the response
    for voice in voices.voices:  # Access the 'voices' attribute of the GetVoicesResponse object
        print(f"Voice: {voice}")  # Access attributes using dot notation


Voice: voice_id='9BWtsMINqrJLrRacOk9x' name='Aria' samples=None category='premade' fine_tuning=FineTuningResponse(is_allowed_to_fine_tune=True, state={'eleven_multilingual_v2': 'fine_tuned', 'eleven_turbo_v2_5': 'fine_tuned', 'eleven_turbo_v2': 'fine_tuned'}, verification_failures=[], verification_attempts_count=0, manual_verification_requested=False, language='en', progress={}, message={'eleven_multilingual_v2': '', 'eleven_turbo_v2_5': '', 'eleven_turbo_v2': ''}, dataset_duration_seconds=None, verification_attempts=None, slice_ids=None, manual_verification=None, max_verification_attempts=5, next_max_verification_attempts_reset_unix_ms=1700000000000, finetuning_state=None) labels={'accent': 'American', 'description': 'expressive', 'age': 'middle-aged', 'gender': 'female', 'use_case': 'social media'} description=None preview_url='https://storage.googleapis.com/eleven-public-prod/premade/voices/9BWtsMINqrJLrRacOk9x/405766b8-1f4e-4d3c-aba1-6f25333823ec.mp3' available_for_tiers=[] setting