# <center><b> All of the libraries here

In [1]:
import requests
import json 
import os
import dotenv
import asyncio
import edge_tts
from diffusers import StableDiffusionPipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


# <center><b> Important Parameters (e.g. dotenv)

In [2]:
# Load variables from a local .env file (if present) into the environment.
dotenv.load_dotenv()

# URL the script request is POSTed to, and the key sent as the Bearer token.
groq_base_url = os.getenv("GROQ_BASE_URL")
groq_api_key = os.getenv("GROQ_API_KEY")

# Fail fast with an actionable message: os.getenv returns None for unset
# variables, which would otherwise only show up later as a confusing request
# error (no URL) or an "Authorization: Bearer None" header.
_missing_env = [
    name
    for name, value in (
        ("GROQ_BASE_URL", groq_base_url),
        ("GROQ_API_KEY", groq_api_key),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# <b> <center> ---------------------------------------------------------------

# <center> <B> Phase 1
## <center><b> Generating Script

## <b> Script Prompt

In [3]:
# Request payload for the script-generation call.
# The system message carries the full instruction set and the JSON schema the
# model must follow; the user message only triggers generation.
# "response_format" asks for a JSON object, which is parsed with json.loads
# once the response arrives.
body = {
    "model": "llama-3.3-70b-versatile",
    "temperature": 0.6,
    "max_tokens": 1300,
    "response_format": {"type": "json_object"},
    "messages": [
        {
            "role": "system",
            # Adjacent string literals are concatenated into one prompt string.
            "content": (
                "You are a professional short-form educational storyteller for social media.\n\n"

                # High-level task list.
                "Your task:\n"
                "1) Generate a curiosity-driven topic.\n"
                "2) Write ONE continuous, natural, spoken script.\n"
                "3) Divide the full script into 6 sequential narration segments.\n"
                "4) Provide structured visual metadata for 6 scene changes.\n\n"

                # Constraints on the narration text itself.
                "Script rules:\n"
                "- The script must be a single continuous paragraph.\n"
                "- Total length must be between 140 and 170 words.\n"
                "- The opening must be a strong scroll-stopping hook.\n"
                "- The script must logically progress from start to finish.\n"
                "- Maintain engagement through curiosity and clarity.\n"
                "- No repetition.\n"
                "- No filler.\n"
                "- Avoid list-style phrasing.\n"
                "- Use clear, direct language.\n"
                "- Avoid metaphors and poetic expressions.\n"
                "- Avoid dramatic or abstract wording.\n"
                "- Write in a conversational documentary tone.\n"
                "- The script must sound like a confident YouTube explainer.\n"
                "- Use natural punctuation with proper commas and full stops.\n"
                "- The script must flow smoothly when spoken aloud.\n\n"

                # How full_script is cut into per-scene narration; each segment
                # is later synthesized to its own audio file.
                "Scene narration splitting rules:\n"
                "- Split the full_script into exactly 6 sequential narration segments.\n"
                "- Each scene narration must be an exact continuous portion of full_script.\n"
                "- Do NOT rewrite or paraphrase the text.\n"
                "- Do NOT modify wording.\n"
                "- The combined scene narrations must exactly reconstruct full_script.\n"
                "- Each scene narration should represent a logical progression point.\n\n"

                # Per-scene visual fields; build_prompt reads these keys to
                # assemble the image prompt.
                "Scene metadata rules:\n"
                "- Exactly 6 scenes.\n"
                "- Each scene must include:\n"
                "   • scene_number\n"
                "   • narration (exact segment from full_script)\n"
                "   • main_subject (specific and concrete)\n"
                "   • environment (clear physical setting)\n"
                "   • mood (realistic emotional tone)\n"
                "   • lighting (clear lighting description)\n"
                "   • camera (wide shot, close-up, aerial view, etc.)\n"
                "   • key_elements (2 to 4 specific visible elements)\n"
                "- Scenes must follow the logical progression of the script.\n"
                "- Avoid vague phrases like 'dramatic scene' or 'mysterious setting'.\n\n"

                # Output contract: JSON only.
                "Return ONLY valid JSON.\n"
                "No explanation outside JSON.\n\n"

                # Literal JSON skeleton the reply must match.
                "JSON format:\n"
                "{\n"
                '  "title": "",\n'
                '  "full_script": "",\n'
                '  "scenes": [\n'
                "    {\n"
                '      "scene_number": 1,\n'
                '      "narration": "",\n'
                '      "main_subject": "",\n'
                '      "environment": "",\n'
                '      "mood": "",\n'
                '      "lighting": "",\n'
                '      "camera": "",\n'
                '      "key_elements": []\n'
                "    }\n"
                "  ]\n"
                "}"
            )
        },
        {
            "role": "user",
            "content": "Generate a high-retention short-form educational script."
        }
    ]
}


In [4]:
# Bearer-token authentication plus an explicit JSON content type.
headers = {
    "Authorization": f"Bearer {groq_api_key}",
    "Content-Type": "application/json"
}

# requests has no default timeout; without one a stalled connection would
# block this cell indefinitely.
request = requests.post(
    groq_base_url,
    headers=headers,
    data=json.dumps(body),
    timeout=120,
)

# Surface HTTP failures (bad key, rate limit, server error) here as an
# HTTPError instead of as a KeyError when "choices" is read from the payload.
request.raise_for_status()
response = request.json()

In [5]:
# The model's reply is a JSON document stored as a string in the first choice.
raw_response = response["choices"][0]["message"]["content"]
data = json.loads(raw_response)

# The prompt asks for a fixed schema, but nothing enforces it. Check the
# fields the later phases index into, so a malformed reply fails here with a
# clear message rather than midway through audio or image generation.
_required_scene_keys = {
    "scene_number", "narration", "main_subject", "environment",
    "mood", "lighting", "camera", "key_elements",
}
if (
    not isinstance(data, dict)
    or "full_script" not in data
    or not isinstance(data.get("scenes"), list)
    or not data["scenes"]
):
    raise ValueError(
        "Model response must contain 'full_script' and a non-empty 'scenes' list"
    )
for _scene in data["scenes"]:
    if not isinstance(_scene, dict):
        raise ValueError(f"Scene entry is not an object: {_scene!r}")
    _absent_keys = _required_scene_keys - set(_scene)
    if _absent_keys:
        raise ValueError(
            f"Scene {_scene.get('scene_number', '?')} is missing keys: "
            + ", ".join(sorted(_absent_keys))
        )

In [6]:
# Dump the parsed payload (title, full_script, scenes) for a visual check
# before the audio and image phases consume it.
print(data)

{'title': 'The Human Brain', 'full_script': "Did you know the human brain uses 20% of our energy? The brain is a complex organ, controlling our movements, it processes information, and enables us to think. It consists of many parts, including the cerebrum, cerebellum, and brainstem. The cerebrum is responsible for our thoughts, the cerebellum for our movements, and the brainstem connects the brain to the spinal cord. Our brain is protected by the skull, and it's made up of billions of neurons. These neurons communicate with each other, enabling us to learn, remember, and adapt to new situations.", 'scenes': [{'scene_number': 1, 'narration': 'Did you know the human brain uses 20% of our energy?', 'main_subject': 'Human Brain', 'environment': 'Laboratory', 'mood': 'Informative', 'lighting': 'Bright artificial light', 'camera': 'Wide shot', 'key_elements': ['Brain model', 'Energy consumption graph']}, {'scene_number': 2, 'narration': 'The brain is a complex organ, controlling our movement

# <center> <b> --------------------------------------------------------------------

# <center> <b> Phase 2 
# <center> <b> Generating Speech

In [7]:

# Speaking-rate offset per scene number, handed to edge-tts as `rate`.
# The offsets start at +8% for scene 1 and taper to +0% for scene 6.
_rate_offsets = ("+8%", "+5%", "+3%", "+3%", "+2%", "+0%")
scene_rates = {
    number: offset for number, offset in enumerate(_rate_offsets, start=1)
}

async def generate_scene_audio(text, scene_number):
    """Synthesize one scene's narration and save it as an MP3 file.

    Args:
        text: Narration text to speak.
        scene_number: Scene index. It selects the speaking rate from
            ``scene_rates`` (falling back to "+3%" for unknown scenes) and
            names the output file.

    The audio is written to ``audio/scene_<scene_number>.mp3``; this function
    does not create the ``audio`` directory.
    """
    speaking_rate = scene_rates.get(scene_number, "+3%")
    tts = edge_tts.Communicate(text, voice="en-US-GuyNeural", rate=speaking_rate)
    target_file = f"audio/scene_{scene_number}.mp3"
    await tts.save(target_file)


In [8]:
# Ensure the folder for the per-scene MP3 files exists. `exist_ok=True`
# replaces the separate exists() check, so there is no window between checking
# and creating, and re-running the cell is a no-op.
os.makedirs("audio", exist_ok=True)
async def generate_all_scenes(data):
    """Generate narration audio for every scene, awaiting all jobs together.

    Args:
        data: Parsed script payload. Each entry of ``data["scenes"]`` must
            provide ``"narration"`` and ``"scene_number"``.

    One ``generate_scene_audio`` coroutine is created per scene and the whole
    batch is awaited with ``asyncio.gather``.
    """
    pending = [
        generate_scene_audio(entry["narration"], entry["scene_number"])
        for entry in data["scenes"]
    ]
    await asyncio.gather(*pending)

# Top-level `await` works in the notebook; a plain script would wrap the call
# in asyncio.run(...) instead.
await generate_all_scenes(data)


# --------------------------------------------------------------------
# <center><b>Phase 3
# <center><b> Generating Images

In [9]:

# Choose the execution device instead of assuming a GPU: half precision on
# CUDA when it is available, otherwise full precision on the CPU so the cell
# still runs (slowly) on machines without a GPU.
_sd_device = "cuda" if torch.cuda.is_available() else "cpu"
_sd_dtype = torch.float16 if _sd_device == "cuda" else torch.float32

# Load the Stable Diffusion checkpoint from a single local .safetensors file.
pipe = StableDiffusionPipeline.from_single_file(
    "models/dreamshaper_8.safetensors",
    torch_dtype=_sd_dtype
).to(_sd_device)

# Process attention and VAE decoding in slices to reduce peak memory use.
pipe.enable_attention_slicing()
pipe.enable_vae_slicing()



Fetching 11 files: 100%|██████████| 11/11 [00:00<?, ?it/s]
Loading pipeline components...: 100%|██████████| 6/6 [00:00<00:00, 25.84it/s]
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .
  deprecate(


In [10]:

def genImage(prompt, negative_prompt, output_path):
    """Render one image with the module-level ``pipe`` and save it as a PNG.

    Args:
        prompt: Positive text prompt.
        negative_prompt: Text describing what the image should avoid.
        output_path: Value interpolated into the file name; the image is
            saved to ``images/scene_<output_path>.png`` (the notebook passes
            the scene number here).

    Returns:
        None. The ``images`` directory must already exist.
    """
    generation_settings = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "height": 1024,
        "width": 768,
        "num_inference_steps": 28,
        "guidance_scale": 7.5,
    }
    result = pipe(**generation_settings)

    # Only the first generated image is kept.
    first_image = result.images[0]
    first_image.save(f"images/scene_{output_path}.png")


In [11]:
def build_prompt(scene):
    """Assemble the image prompt for one scene from its visual metadata.

    Args:
        scene: Mapping with the keys ``main_subject``, ``environment``,
            ``mood``, ``lighting``, ``camera`` and ``key_elements`` (an
            iterable of strings, joined with ", ").

    Returns:
        A multi-line string: a leading newline, then one four-space-indented
        line per prompt fragment, then a final line of four spaces.
    """
    indent = "    "
    fragments = [
        f"cinematic stylized illustration of {scene['main_subject']} in {scene['environment']},",
        f"mood: {scene['mood']},",
        f"lighting: {scene['lighting']},",
        f"{scene['camera']},",
        f"featuring {', '.join(scene['key_elements'])},",
        # Fixed style suffix shared by every scene.
        "dramatic lighting,",
        "high contrast,",
        "strong depth perspective,",
        "professional digital artwork",
    ]
    return "\n" + "".join(f"{indent}{fragment}\n" for fragment in fragments) + indent


# Negative prompt shared by every scene: the traits the generated images
# should steer away from. Built from adjacent literals; the resulting string
# starts and ends with a newline.
negative_prompt = (
    "\n"
    "low quality, blurry, flat lighting, oversaturated, cluttered background,\n"
    "extra creatures, extra objects, distorted anatomy, noisy image,\n"
    "grainy texture, cartoonish, low contrast , female character , Women , female\n"
)

In [12]:
# Ensure the output folder exists. `exist_ok=True` replaces the separate
# exists() check, so there is no window between checking and creating, and
# re-running the cell is a no-op.
os.makedirs("images", exist_ok=True)

# Render one PNG per scene; each prompt is echoed so it can be compared with
# the generated image.
for scene in data["scenes"]:
    prompt = build_prompt(scene)
    print(prompt)
    genImage(prompt, negative_prompt, scene["scene_number"])


    cinematic stylized illustration of Human Brain in Laboratory,
    mood: Informative,
    lighting: Bright artificial light,
    Wide shot,
    featuring Brain model, Energy consumption graph,
    dramatic lighting,
    high contrast,
    strong depth perspective,
    professional digital artwork
    


100%|██████████| 28/28 [00:21<00:00,  1.29it/s]



    cinematic stylized illustration of Brain Functions in Brain scanning room,
    mood: Curious,
    lighting: Soft natural light,
    Close-up,
    featuring Brain scan images, Neurologist,
    dramatic lighting,
    high contrast,
    strong depth perspective,
    professional digital artwork
    


 71%|███████▏  | 20/28 [00:15<00:06,  1.28it/s]


KeyboardInterrupt: 

# <center> ------------------------------------------------------------
# <center><b> Phase 4
# <center> <b>Stitching Everything

In [41]:
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips

# One value drives both the fade-in length and the negative padding used when
# concatenating, so the overlap and the fade cannot drift apart.
CROSSFADE_SECONDS = 0.6

video_clips = []
# Every opened audio clip is tracked so its file reader can be released once
# the export has finished (or failed).
audio_clips = []

try:
    for scene in data["scenes"]:
        scene_number = scene["scene_number"]

        # Files produced by the image and speech phases.
        image_path = f"images/scene_{scene_number}.png"
        audio_path = f"audio/scene_{scene_number}.mp3"

        audio = AudioFileClip(audio_path)
        audio_clips.append(audio)

        # Show the still image for exactly as long as its narration lasts.
        image = ImageClip(image_path).set_duration(audio.duration)

        # Subtle zoom (Ken Burns effect): the scale grows by 2% per second.
        image = image.resize(lambda t: 1 + 0.02 * t)

        # Attach the narration.
        image = image.set_audio(audio)

        # Fade in every clip except the first.
        if video_clips:
            image = image.crossfadein(CROSSFADE_SECONDS)

        video_clips.append(image)

    # Negative padding overlaps consecutive clips by the fade duration.
    final_video = concatenate_videoclips(
        video_clips,
        method="compose",
        padding=-CROSSFADE_SECONDS
    )

    final_video.write_videofile(
        "final_video.mp4",
        fps=24,
        codec="libx264",
        audio_codec="aac"
    )
finally:
    # Release the audio readers even if the export raised.
    for _opened_audio in audio_clips:
        _opened_audio.close()

print("Smooth cinematic video created.")


Moviepy - Building video final_video.mp4.
MoviePy - Writing audio in final_videoTEMP_MPY_wvf_snd.mp4


                                                                   

MoviePy - Done.
Moviepy - Writing video final_video.mp4



                                                              

Moviepy - Done !
Moviepy - video ready final_video.mp4
Smooth cinematic video created.


# <center> <b> ----------------------------------------
# <center><b>Experiments

In [42]:
from moviepy.editor import VideoFileClip

INPUT_VIDEO = "final_video.mp4"
OUTPUT_VIDEO = "final_video_muted.mp4"

# Load the stitched video.
video = VideoFileClip(INPUT_VIDEO)
try:
    # Drop the audio track.
    video_no_audio = video.without_audio()

    # Export the muted copy.
    video_no_audio.write_videofile(
        OUTPUT_VIDEO,
        codec="libx264",
        audio=False,
        fps=24
    )
finally:
    # Release the file reader even if the export raised, so the source file
    # is not left open.
    video.close()

print("Muted video saved:", OUTPUT_VIDEO)


Moviepy - Building video final_video_muted.mp4.
Moviepy - Writing video final_video_muted.mp4



                                                               

Moviepy - Done !
Moviepy - video ready final_video_muted.mp4
Muted video saved: final_video_muted.mp4


In [43]:
def smooth_script(text):
    """Soften sentence breaks and normalize spacing in a narration script.

    Every ". " is replaced with ", ", then runs of consecutive spaces are
    collapsed to a single space.

    Args:
        text: Script text to clean.

    Returns:
        The cleaned string.
    """
    text = text.replace(". ", ", ")
    # A single replace("  ", " ") pass turns three spaces into two; repeat
    # until no double space is left.
    while "  " in text:
        text = text.replace("  ", " ")
    return text


In [44]:
import edge_tts
import asyncio

VOICE = "en-US-AndrewNeural"
OUTPUT_AUDIO = "audio/full_audio.mp3"

async def generate_full_audio(full_script):
    communicate = edge_tts.Communicate(
        text=full_script,
        voice=VOICE,
        rate="+4%"
    )

    await communicate.save(OUTPUT_AUDIO)
    print("Full narration saved:", OUTPUT_AUDIO)

# In Jupyter:
cleaned_script = smooth_script(data["full_script"])
await generate_full_audio(cleaned_script)
await generate_full_audio(data["full_script"])

# In script:
# asyncio.run(generate_full_audio(data["full_script"]))


Full narration saved: audio/full_audio.mp3
Full narration saved: audio/full_audio.mp3


In [45]:
from moviepy.editor import VideoFileClip, AudioFileClip

VIDEO_PATH = "final_video_muted.mp4"
AUDIO_PATH = "audio/full_audio.mp3"
OUTPUT_PATH = "Video.mp4"

# Keep references to the clips as opened: `video` / `audio` may be rebound to
# trimmed copies below, and the originals must still be closed afterwards.
source_video = VideoFileClip(VIDEO_PATH)
try:
    source_audio = AudioFileClip(AUDIO_PATH)
    try:
        video = source_video
        audio = source_audio

        # If the audio is longer than the video, trim the audio.
        if audio.duration > video.duration:
            audio = audio.subclip(0, video.duration)

        # If the video is longer than the audio, trim the video.
        if video.duration > audio.duration:
            video = video.subclip(0, audio.duration)

        # Attach the narration to the video.
        final_video = video.set_audio(audio)

        # Export at the source video's frame rate.
        final_video.write_videofile(
            OUTPUT_PATH,
            codec="libx264",
            audio_codec="aac",
            fps=video.fps
        )
    finally:
        # Release the audio reader even if trimming or export raised.
        source_audio.close()
finally:
    # Release the video reader even if opening the audio failed.
    source_video.close()

print("Final video created:", OUTPUT_PATH)


Moviepy - Building video Video.mp4.
MoviePy - Writing audio in VideoTEMP_MPY_wvf_snd.mp4


                                                                   

MoviePy - Done.
Moviepy - Writing video Video.mp4



                                                               

Moviepy - Done !
Moviepy - video ready Video.mp4
Final video created: Video.mp4
