In [None]:
# STEP 1: Install dependencies
!pip install elevenlabs pydub python-dotenv
!apt-get install -y ffmpeg
!pip install openai
!pip install -U openai-whisper


Collecting elevenlabs
  Downloading elevenlabs-1.56.0-py3-none-any.whl.metadata (7.3 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading elevenlabs-1.56.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.7/413.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: pydub, python-dotenv, elevenlabs
Successfully installed elevenlabs-1.56.0 pydub-0.25.1 python-dotenv-1.1.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB

## Converting input audio into a text file

In [None]:
import os
import sys
import time


# Name of the Whisper checkpoint handed to whisper.load_model below.
WHISPER_MODEL_SIZE = "base"
# Lower-case extensions (with leading dot) that preprocess_input routes to
# transcribe_audio and read_text_file respectively.
SUPPORTED_AUDIO_EXTENSIONS = ['.mp3', '.wav', '.m4a', '.ogg', '.flac']
SUPPORTED_TEXT_EXTENSIONS = ['.txt', '.md'] # Add other text formats if needed


import whisper
# Loaded once at cell execution and reused by transcribe_audio as a module-level global.
# NOTE(review): the progress bar in this cell's output suggests the first load
# downloads the checkpoint — confirm before running offline.
model = whisper.load_model(WHISPER_MODEL_SIZE)



def transcribe_audio(file_path):
    """Run the module-level Whisper model on an audio file.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcript with surrounding whitespace removed, or None if the
        transcription raised (an error message is printed in that case).
    """
    print(f"\nAttempting to transcribe audio file: {file_path}")
    started = time.time()
    try:
        # fp16=False is kept from the original for wider CPU compatibility.
        text = model.transcribe(file_path, fp16=False)["text"]
        elapsed = time.time() - started
        print(f"Transcription successful ({elapsed:.2f} seconds).")
        return text.strip()
    except Exception as err:
        # A missing file gets its own message; everything else is reported generically.
        if isinstance(err, FileNotFoundError):
            print(f"Error: Audio file not found at {file_path}")
        else:
            print(f"Error during transcription: {err}")
        return None

def read_text_file(file_path):
    """Load a UTF-8 text file and return its contents without surrounding whitespace.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The stripped file contents, or None if the file is missing or cannot
        be read (an error message is printed in that case).
    """
    print(f"\nReading text file: {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            stripped_text = handle.read().strip()
        print("Text file read successfully.")
        return stripped_text
    except Exception as err:
        # A missing file gets its own message; everything else is reported generically.
        if isinstance(err, FileNotFoundError):
            print(f"Error: Text file not found at {file_path}")
        else:
            print(f"Error reading text file: {err}")
        return None

def preprocess_input(file_path):
    """Dispatch an input file to the text reader or the audio transcriber.

    The choice is made from the lower-cased file extension, checked against
    SUPPORTED_TEXT_EXTENSIONS first and SUPPORTED_AUDIO_EXTENSIONS second.

    Args:
        file_path: Path of the input file.

    Returns:
        The text content (read or transcribed), or None if the file does not
        exist, has an unsupported extension, or the delegated step failed.
    """
    if not os.path.exists(file_path):
        print(f"Error: Input file not found at {file_path}")
        return None

    extension = os.path.splitext(file_path)[1].lower()

    print(f"\nProcessing input file: {os.path.basename(file_path)}")
    print(f"Detected extension: {extension}")

    if extension in SUPPORTED_TEXT_EXTENSIONS:
        print("Input Type: Text")
        return read_text_file(file_path)

    if extension in SUPPORTED_AUDIO_EXTENSIONS:
        print("Input Type: Audio")
        return transcribe_audio(file_path)

    # Neither list matched: report what would have been accepted.
    print(f"Error: Unsupported file type: {extension}")
    print(f"Supported text types: {SUPPORTED_TEXT_EXTENSIONS}")
    print(f"Supported audio types: {SUPPORTED_AUDIO_EXTENSIONS}")
    return None

# --- Main Execution Logic ---
if __name__ == "__main__":
    # The input path is configured here instead of being read from sys.argv.
    # The previous argv-length guard protected a value that was never used, and
    # inside a Jupyter/Colab kernel sys.argv holds the kernel launcher's own
    # arguments, so the guard could not detect a missing input there anyway.
    input_file = "/content/sample_audio.mp3"
    output_file = "transcribed_script.txt"

    processed_text = preprocess_input(input_file)

    if processed_text is not None:
        print("\n✅ Pre-processing Complete.")
        print("\n--- Output Text for Next Stage (Script Parsing) ---")
        print(processed_text)
        print("----------------------------------------------------")

        # Always save the text to disk (relative to the working directory).
        # NOTE(review): the script-parsing cell reads "/content/sample_script.txt",
        # not this file — confirm which input that stage is meant to use.
        with open(output_file, "w", encoding='utf-8') as outfile:
            outfile.write(processed_text)
        print(f"\nOutput also saved to: {output_file}")
    else:
        print("\n Pre-processing failed.")

100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 77.2MiB/s]



Processing input file: sample_audio.mp3
Detected extension: .mp3
Input Type: Audio

Attempting to transcribe audio file: /content/sample_audio.mp3
Transcription successful (34.31 seconds).

✅ Pre-processing Complete.

--- Output Text for Next Stage (Script Parsing) ---
SFX, gender-arrain, narrated, the schedule said that the last bus came at 210, now it was 213, and the bench was in empty anymore. SFX, clock chimes twice, narrated, he hadn't seen anyone arrive, just a man in a grey coat sitting on the far end of the bench, not moving, not blinking. SFX, rain gets heavier, narrated, the man checked the road, still no headlights, and no bus, but the other guy, he stood up like he'd heard it coming. SFX, bus, slows down and breaks, narrated, he stepped into the street, and vanished. SFX, sees transverses.
----------------------------------------------------


## Converting input text into a script with dialogue and SFX

In [None]:
import os
import sys
from openai import OpenAI
from google.colab import userdata
# --- Configuration ---
# OpenAI client used by parse_script_with_llm; the API key is fetched through
# google.colab's userdata under the name 'gpt-api-key', not hard-coded.
client = OpenAI(api_key=userdata.get('gpt-api-key'))
# Chat model name passed to client.chat.completions.create.
MODEL_TO_USE = "gpt-4-turbo"

def load_text_from_file(file_path):
    """Return the full contents of a UTF-8 text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents exactly as read (no stripping), or None if the file
        is missing or cannot be read (an error message is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            contents = source.read()
    except Exception as err:
        # A missing file gets its own message; everything else is reported generically.
        if isinstance(err, FileNotFoundError):
            print(f"Error: Input file not found at {file_path}")
        else:
            print(f"Error reading file: {err}")
        return None
    return contents

def create_parsing_prompt(script_text):
    """Build the LLM prompt that asks for a screenplay-style SCRIPT block.

    Args:
        script_text: Raw story text; it is embedded verbatim between the
            START SCRIPT / END SCRIPT markers.

    Returns:
        The prompt string with leading and trailing whitespace stripped.
    """
    # The template is delimited with triple single quotes so the literal
    # triple double-quote markers can be written without escapes. The earlier
    # escaped form rendered a stray backslash before each marker on the final
    # instruction line, contradicting the format example shown above it.
    prompt = f'''
You are a professional script formatter and dialogue editor.

Your task is to convert the raw story text below into a properly formatted screenplay-style block using generic character names like CHARACTER1, CHARACTER2, etc., and preserving narration and sound effects inline.

Please follow this **exact format**:

SCRIPT = """
NARRATOR: This is a narration line.
CHARACTER1: Their first line of dialogue.
[SFX: Description of sound]
CHARACTER2: Their reply.
"""

Formatting rules:
- Use **NARRATOR** for all third-person descriptions or scene-setting lines.
- For dialogue, assign character lines sequentially using **CHARACTER1**, **CHARACTER2**, etc., in the order they appear.
- Maintain consistency (i.e., if the same person speaks again, use the same character number).
- **Sound effects must remain inline** and be formatted like: [SFX: Description of sound]
- Do NOT move sound effects to the top or bottom — they must appear exactly where they happen in the story.
- Do NOT skip any dialogue, narration, or SFX — include every sentence.
- Do NOT output any JSON, markdown, or explanations — only output the SCRIPT block.

Script to convert:
--- START SCRIPT ---
{script_text}
--- END SCRIPT ---

Now format the above into the SCRIPT = """ ... """ block below:
'''

    return prompt.strip()

def parse_script_with_llm(script_text):
    """Ask the chat model to reformat raw story text into a SCRIPT block.

    Args:
        script_text: Raw story text. Falsy input (None or "") short-circuits.

    Returns:
        The model's reply as a stripped string, with a surrounding Markdown
        code fence removed when present. Returns None when the input is empty,
        when the reply carries no text content, or when the API call or its
        processing raises (an error message is printed in those cases).
    """
    if not script_text:
        return None

    prompt = create_parsing_prompt(script_text)
    print("\n--- Sending Prompt to LLM ---\n...")

    try:
        response = client.chat.completions.create(
            model=MODEL_TO_USE,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2
        )

        raw_content = response.choices[0].message.content
        if raw_content is None:
            # Report a content-less reply explicitly instead of relying on an
            # AttributeError from calling .strip() on None.
            print("Error during LLM API call or processing: response contained no text content")
            return None
        response_content = raw_content.strip()

        # Remove formatting if it's wrapped in a markdown code block.
        if response_content.startswith("```"):
            fenced_lines = response_content.split("\n")[1:]
            # Drop the final line only when it really is the closing fence, so a
            # reply with an unterminated fence does not lose its last line.
            if fenced_lines and fenced_lines[-1].strip().startswith("```"):
                fenced_lines = fenced_lines[:-1]
            response_content = "\n".join(fenced_lines).strip()

        print("\n✅ Script Parsing Complete.")
        print("\n--- Structured Script Output ---")
        print(response_content)
        print("-----------------------------")

        return response_content

    except Exception as e:
        print(f"Error during LLM API call or processing: {e}")
        return None

# --- Main Execution Logic ---
if __name__ == "__main__":
    # The input path is configured here instead of being read from sys.argv.
    # The previous argv-length guard protected a value that was never used, and
    # inside a Jupyter/Colab kernel sys.argv holds the kernel launcher's own
    # arguments, so the guard could not detect a missing input there anyway.
    input_script_file = "/content/sample_script.txt"

    raw_script = load_text_from_file(input_script_file)

    if raw_script:
        structured_output = parse_script_with_llm(raw_script)

        # Save the formatted script next to the working directory, named after the input file.
        if structured_output:
            output_filename = os.path.splitext(os.path.basename(input_script_file))[0] + "_formatted_script.txt"
            with open(output_filename, "w", encoding='utf-8') as outfile:
                outfile.write(structured_output)
            print(f"\nOutput also saved to: {output_filename}")
        else:
            print("\n Script Parsing failed.")
    elif raw_script == "":
        # load_text_from_file already reports read errors (returning None);
        # an empty file would otherwise end the cell without any message.
        print(f"\n Input file is empty, nothing to parse: {input_script_file}")



--- Sending Prompt to LLM ---
...

✅ Script Parsing Complete.

--- Structured Script Output ---
SCRIPT = """ 
NARRATOR: The diner sat on the edge of nowhere. One flickering neon sign. Two customers. And one waitress who hadn’t spoken in ten minutes.
[SFX: Neon light buzzing, occasional car passing in the distance, faint hum of a refrigerator.]
[SFX: Distant thunder. Wind rustles through a cracked window. Door creaks open. Bell jingles faintly.]
CHARACTER1: Evening. You still serving?
CHARACTER2: Coffee’s fresh. Sit anywhere.
[SFX: Chair scraping across the floor. A coffee pot clinks against a mug.]
NARRATOR: Jack had driven 300 miles without stopping. Something about this place told him he should’ve kept going.
[SFX: Coffee pouring. Wind slams against the window suddenly.]
CHARACTER2: Storm's coming. They always stop here when it rains.
CHARACTER1: They?
[SFX: Buzzing intensifies for a moment, then dies down. Clock ticks]
NARRATOR: She didn’t answer. Just stared at the door. Waiting.


In [None]:
import os
import re
import uuid
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from pydub import AudioSegment
from IPython.display import Audio, display
from google.colab import userdata
# Load .env vars
# (The API key below is fetched through google.colab's userdata, not from the environment.)
load_dotenv()

# Set up ElevenLabs client (new style)
# NOTE(review): this rebinds the module-level name `client`, which the
# script-parsing cell binds to an OpenAI client. Calling parse_script_with_llm
# after this cell has run would use the ElevenLabs client instead.
client = ElevenLabs(
    api_key=userdata.get('eleven-labs-key'),
                    )

# Voice map (update voice IDs as per your ElevenLabs voices)
# Keys are the speaker labels parse_script looks up (the text before the first
# ':' on a script line); values are passed as voice_id to generate_voice_audio.
# Lines whose speaker label is not a key here produce no audio.
VOICE_MAP = {
    "NARRATOR": "pNInz6obpgDQGcFmaJgB", # Narrator
    "CHARACTER1": "JBFqnCBsd6RMkjVDRZzb",  # Voice 1
    "CHARACTER2": "EXAVITQu4vr4xnSDxMaL",  # Voice 2
    "CHARACTER3": "MF3mGyEYCl7XYWbV9V6O",  # Voice 3
    "CHARACTER4": "ErXwobaYiN019PkySvjV",  # Voice 4
}



def extract_script_string(file_path):
    """Pull the first triple-double-quoted section out of a text file.

    Args:
        file_path: Path of the UTF-8 file holding the formatted script.

    Returns:
        The text between the first two triple-quote markers, re-wrapped in
        triple double quotes.

    Raises:
        ValueError: If the file has no triple-quote marker, or only one.
    """
    delimiter = '"""'

    with open(file_path, "r", encoding="utf-8") as handle:
        raw = handle.read()

    if delimiter not in raw:
        raise ValueError("No triple-quoted string found in the file.")

    # Splitting on the marker yields at least three pieces when an opening and
    # a closing marker are both present; the script body is the second piece.
    pieces = raw.split(delimiter)
    if len(pieces) < 3:
        raise ValueError("Triple quotes not properly found in the file.")

    return delimiter + pieces[1] + delimiter

# Usage
# Loads the formatted script written by the script-parsing cell; that cell saves
# "<input basename>_formatted_script.txt" relative to the working directory.
# NOTE(review): assumes the notebook's working directory is /content — TODO confirm.
SCRIPT = extract_script_string("/content/sample_script_formatted_script.txt")


# Voice generation: handle generator output
def generate_voice_audio(text, voice_id):
    """Synthesize speech for one line of text and save it as an MP3 file.

    Args:
        text: The words to speak.
        voice_id: ElevenLabs voice identifier to use.

    Returns:
        Path of the written file, "/content/voice_<uuid4>.mp3".
    """
    speech_chunks = client.text_to_speech.convert(
        voice_id=voice_id,
        text=text,
        output_format="mp3_44100_128",
        model_id="eleven_multilingual_v2",
    )
    out_path = f"/content/voice_{uuid.uuid4()}.mp3"
    # The convert() result is consumed as an iterable of byte chunks.
    with open(out_path, "wb") as mp3_file:
        mp3_file.writelines(speech_chunks)
    return out_path

# SFX generation: handle generator output
def generate_sfx(prompt):
    """Generate a sound effect from a text description and save it as an MP3 file.

    Args:
        prompt: Description of the sound to generate.

    Returns:
        Path of the written file, "/content/sfx_<uuid4>.mp3".
    """
    effect_chunks = client.text_to_sound_effects.convert(text=prompt)
    out_path = f"/content/sfx_{uuid.uuid4()}.mp3"
    # The convert() result is consumed as an iterable of byte chunks.
    with open(out_path, "wb") as mp3_file:
        mp3_file.writelines(effect_chunks)
    return out_path


# Parse script and generate audio segments
def parse_script(script):
    """Turn a formatted script into an ordered list of audio segments.

    Each line is handled on its own, in script order:
      * ``[SFX: description]`` lines are sent to generate_sfx.
      * ``SPEAKER: text`` lines are sent to generate_voice_audio using the
        voice registered for SPEAKER in VOICE_MAP.
      * Every other line (blank lines, quote markers, text without a colon)
        is ignored.

    Args:
        script: The script text, one cue per line.

    Returns:
        A list of AudioSegment objects, one per generated clip. Errors raised
        by the generation helpers propagate to the caller.
    """
    # One pattern for both detecting and extracting a sound effect. Whitespace
    # after "SFX:" is optional; the stricter detection pattern used previously
    # let "[SFX:rain]" fall through to the dialogue branch and vanish silently.
    sfx_pattern = re.compile(r'\[SFX:\s*(.*?)\]')
    audio_segments = []

    for line in script.strip().splitlines():
        line = line.strip()
        if not line:
            continue

        # Sound Effect
        sfx_match = sfx_pattern.match(line)
        if sfx_match:
            sfx_prompt = sfx_match.group(1).strip()
            if not sfx_prompt:
                print(f"Warning: skipping sound effect with empty description: {line}")
                continue
            sfx_file = generate_sfx(sfx_prompt)
            audio_segments.append(AudioSegment.from_file(sfx_file))
            continue

        # Dialogue
        if ":" not in line:
            continue

        speaker, text = line.split(":", 1)
        speaker = speaker.strip()
        text = text.strip()

        voice_id = VOICE_MAP.get(speaker)
        if not voice_id:
            # Make dropped lines visible instead of skipping them silently.
            print(f"Warning: no voice mapped for speaker '{speaker}'; skipping line.")
            continue
        if not text:
            # Nothing to speak; avoid sending an empty request.
            print(f"Warning: empty line for speaker '{speaker}'; skipping.")
            continue

        voice_file = generate_voice_audio(text, voice_id)
        audio_segments.append(AudioSegment.from_file(voice_file))

    return audio_segments

# Merge all audio clips into one podcast file
def combine_segments(segments, output_file="/content/final_podcast.mp3"):
    """Concatenate clips into a single MP3 with short pauses between them.

    The result starts with 500 ms of silence, and every clip (including the
    last) is followed by 400 ms of silence.

    Args:
        segments: Iterable of AudioSegment clips, in playback order.
        output_file: Destination path of the exported MP3.

    Returns:
        The value of output_file.
    """
    pause = AudioSegment.silent(duration=400)
    podcast = AudioSegment.silent(duration=500)
    for clip in segments:
        podcast = podcast + (clip + pause)
    podcast.export(output_file, format="mp3")
    return output_file

# Run full pipeline
# parse_script calls the ElevenLabs API once per sound effect and per mapped
# dialogue line, so this step needs enough API credits; the ApiError recorded
# in this cell's output is a quota_exceeded response from such a call.
segments = parse_script(SCRIPT)
final_path = combine_segments(segments)

# Playback in Colab
display(Audio(filename=final_path))
print(f"Final podcast saved at: {final_path}")


ApiError: status_code: 401, body: {'detail': {'status': 'quota_exceeded', 'message': 'This request exceeds your quota of 10000. You have 19 credits remaining, while 38 credits are required for this request.'}}