In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import os
from IPython.display import Markdown, display

# Load variables from a local .env file into the process environment.
# override=True asks python-dotenv to let values from the file replace
# variables that are already set in the environment.
load_dotenv(override=True)

True

In [3]:
# Connection settings for the LLM endpoint, read from the environment.
# Each name is None when the corresponding variable is not set.
openrouter_api_key, openrouter_base_url, deepseek_model_name = (
    os.environ.get(variable)
    for variable in (
        "OPENROUTER_API_KEY",
        "OPENROUTER_BASE_URL",
        "DEEPSEEK_MODEL_NAME",
    )
)

In [5]:
# System prompt for the code-generation agent. It asks the model for a
# 1 to 2 minute Manim script with inline timing comments per animation,
# returned as bare code (no markdown wrapper, no explanation).
# NOTE(review): the sample output captured in this notebook still starts with
# a ```python fence, so the "no markdown" instruction is not reliably
# followed - consider stripping fences before using the reply as code.
manim_agent_system_prompt = """
You are a professional python manim code writer your task is to get the user's prompt and
generate the manim code for the topic the user wants to learn, your manim script is perfect in
such a way that the animations, visualizations and simulations and teachings will be understood 
even for a layman who watches it.
Note: The manim script when converted to video must take 1 to 2 minutes of duration and ensure the video must be super easy for anyone to be engaged and understand the topic
additionally also add inline comments for the expected time to complete for each animations
and the manim script must not contain any errors at all!
the output format of your manim code must be like no markdown syntax for wrapping the code or no explanation at all only the code must be the output!

[your manim script here...]
"""

# OpenAI-SDK client built from the key and base URL read from the
# OPENROUTER_* environment variables.
manim_agent = OpenAI(api_key=openrouter_api_key, base_url=openrouter_base_url)

In [25]:
# Topic the learner wants explained; sent verbatim as the user message.
content = "derive the formula for area of triangle"

# Request the Manim script: system prompt first, then the user's topic.
# The model name comes from the DEEPSEEK_MODEL_NAME environment variable.
response = manim_agent.chat.completions.create(
    messages=[
        {'role': 'system', 'content': manim_agent_system_prompt},
        {'role': 'user', 'content': content}
    ],
    model=deepseek_model_name
)

In [27]:
# Text of the first returned choice, i.e. the generated Manim script.
# `response` is rebound by the transcript request further down, so this
# must run before that cell.
code = response.choices[0].message.content

print(code)

```python
from manim import *

class TriangleArea(Scene):
    def construct(self):
        # Title (3 seconds)
        title = Text("Area of a Triangle", font_size=48)
        self.play(Write(title), run_time=1.5)
        self.wait(1.5)
        self.play(FadeOut(title))

        # Create triangle and labels (10 seconds)
        triangle = Polygon(
            LEFT * 2, RIGHT * 2, 
            RIGHT * 1 + UP * 3, 
            color=BLUE, fill_opacity=0.5
        )
        base_line = Line(LEFT * 2, RIGHT * 2, color=YELLOW)
        height_line = DashedLine(RIGHT * 1, RIGHT * 1 + UP * 3, color=RED)
        base_label = Tex("b", font_size=36).next_to(base_line, DOWN)
        height_label = Tex("h", font_size=36).next_to(height_line, RIGHT)

        self.play(Create(triangle), run_time=3)
        self.play(
            Create(base_line),
            Write(base_label),
            run_time=1.5
        )
        self.play(
            Create(height_line),
            Write(height_label),
    

In [36]:
# System prompt for the narration agent. Given the Manim script, the model is
# asked to return only a JSON array of {dialogue, duration} entries sized to
# the timing information in the code; an empty dialogue marks a silent pause.
# NOTE(review): the example in the prompt uses single-quoted keys and a
# trailing `#` comment, neither of which is valid JSON, and wraps the array
# in a ```json fence - confirm the replies actually parse as JSON.
transcript_agent_system_prompt = """
You are a professional transcript writer for a educational manim script in python, 
your task is to understand the manim code block by block and then write dialogues that's 
best suitable like a lecture for that education video standpoint, the manim code also contains information 
regarding the time each code segments or animation takes so use that information to fill the dialogue in suitable size that fits within the estimated time

Now give the output exactly as the below JSON format only, with no explanation or other things only the trancsript as JSON

```json
[
    {'dialogue': '<the dialogue for the scene goes here...>', 'duration': <duration in seconds>},
    {'dialogue': '<the dialogue for the scene goes here...>', 'duration': <duration in seconds>},
    {'dialogue': '<the dialogue for the scene goes here...>', 'duration': <duration in seconds>},
    {'dialogue': '', 'duration': <duration in seconds>}, # also to note that the `''` means an pause block for `duration` seconds means this part is just a silenced for `duration` seconds (only if the manim code has a pause block or wait/sleep time related thing)
    {'dialogue': '<the dialogue for the scene goes here...>', 'duration': <duration in seconds>},
    ...
    
]
```
"""

# Second OpenAI-SDK client, constructed with the same key and base URL as
# `manim_agent`.
transcript_agent = OpenAI(api_key=openrouter_api_key, base_url=openrouter_base_url)

In [None]:
# Send the generated Manim script (`code`) as the user message and ask for the
# timed narration. This rebinds `response`, replacing the code-generation
# result held under the same name.
response = transcript_agent.chat.completions.create(
    messages=[
        {'role': 'system', 'content': transcript_agent_system_prompt},
        {'role': 'user', 'content': code},
    ],
    model=deepseek_model_name
)

In [29]:
# Text of the first returned choice from the transcript request.
# NOTE(review): this is the raw model reply - presumably JSON text rather than
# a parsed list; confirm it is decoded (e.g. json.loads) wherever a list of
# segment dicts is required.
transcript = response.choices[0].message.content

print(transcript)

[
    {"dialogue": "Area of a Triangle", "duration": 3},
    {"dialogue": "Let's draw a triangle with base and height.", "duration": 3},
    {"dialogue": "Here is the base, labeled b.", "duration": 1.5},
    {"dialogue": "And the height, perpendicular to the base, labeled h.", "duration": 1.5},
    {"dialogue": "Now, how does this relate to a parallelogram?", "duration": 4},
    {"dialogue": "We duplicate the triangle and rotate it 180 degrees around the base point.", "duration": 3},
    {"dialogue": "Then shift the triangles to form a parallelogram.", "duration": 2},
    {"dialogue": "Observe: this parallelogram has base b and height h, so its area is b × h. Since it comprises two identical triangles, each triangle's area is half of that.", "duration": 10},
    {"dialogue": "Now we highlight this parallelogram for clarity.", "duration": 2},
    {"dialogue": "Labeling it as a parallelogram.", "duration": 1},
    {"dialogue": "Filling it to emphasize the shape.", "duration": 1.5},
    {

In [33]:
import pyttsx3
import tempfile
import os
from pydub import AudioSegment

def _load_transcript_segments(transcript):
    """Return the transcript as a sequence of segment dicts, decoding JSON text if given."""
    if not isinstance(transcript, str):
        return transcript

    import json  # local import: only needed when the transcript arrives as text

    text = transcript.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        _, _, text = text.partition("\n")
        text = text.rsplit("```", 1)[0]
    return json.loads(text)


def _synthesize_segment(engine, text):
    """Render `text` with the TTS engine into a temporary WAV and load it with pydub."""
    # Create the file closed (delete=False) so the engine can write to the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tf:
        tmp_path = tf.name
    try:
        engine.save_to_file(text, tmp_path)
        engine.runAndWait()
        return AudioSegment.from_file(tmp_path, format="wav")
    finally:
        # Always clean up, even when synthesis or decoding fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def transcript_to_audio(transcript, output_file="final_audio.mp3", voice=None, rate=180):
    """
    Generate a merged audio file from a timed transcript using pyttsx3.

    Args:
        transcript: Either a list of segment dicts or the same structure as
            JSON text (optionally wrapped in a markdown code fence):
            [
                {"dialogue": "some text", "duration": 3.5},
                {"dialogue": "", "duration": 1.0},  # silent pause
                ...
            ]
            A blank, null or missing "dialogue" is rendered as silence.
        output_file: Path of the MP3 file to write.
        voice: Optional value for the engine's "voice" property; left at the
            engine default when falsy.
        rate: Value for the engine's "rate" property.

    Returns:
        `output_file`, after the merged audio has been exported.

    Raises:
        ValueError: If `transcript` is text that is not valid JSON
            (json.JSONDecodeError).
        KeyError: If a segment has no "duration".

    Every segment occupies exactly `duration` seconds in the output: speech
    shorter than its slot is padded with silence, and speech longer than its
    slot is cut off at the slot boundary.
    """
    segments = _load_transcript_segments(transcript)

    engine = pyttsx3.init()
    if voice:
        engine.setProperty("voice", voice)
    engine.setProperty("rate", rate)

    final_audio = AudioSegment.silent(duration=0)

    for seg in segments:
        # round() rather than int(): a float product can land just below the
        # intended whole millisecond and int() would then drop it.
        target_duration = round(seg["duration"] * 1000)  # sec -> ms
        dialogue = seg.get("dialogue") or ""

        if not dialogue.strip():
            # Silent pause only
            final_audio += AudioSegment.silent(duration=target_duration)
            continue

        speech = _synthesize_segment(engine, dialogue)

        # Fit the clip to its slot: pad short clips, clip long ones.
        shortfall = target_duration - len(speech)
        if shortfall > 0:
            speech += AudioSegment.silent(duration=shortfall)
        else:
            speech = speech[:target_duration]

        final_audio += speech

    # Export final merged audio
    final_audio.export(output_file, format="mp3")
    return output_file

