<a href="https://colab.research.google.com/github/NandiniDuggineni/2d-image-creator/blob/main/ursmentor_int_word_skip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================================
# Generate test Telugu audio
# ================================
test_path_telugu = "/content/test_telugu.mp3"

async def generate_telugu_tts():
    await synthesize_edge_tts(
        "manam roju chucee vaati nundi start chedham, first chet-lu, trees... fruites, phalaalu... Animals, jantuvulu",
        test_path_telugu,
        voice="te-IN-ShrutiNeural",
        rate="-10%"  # corrected
    )

# Run in Colab
await generate_telugu_tts()

# Play audio
from IPython.display import Audio
Audio(test_path_telugu)


NameError: name 'synthesize_edge_tts' is not defined

In [3]:
from google.colab import files

# Upload file (you’ll get a "Choose file" button in Colab)
uploaded = files.upload()

# Check uploaded file name
for filename in uploaded.keys():
    print(f"Uploaded file: {filename}")

Saving ursmentor.mp4 to ursmentor.mp4
Uploaded file: ursmentor.mp4


In [4]:
from PIL import Image
import os
from google.colab import files

# ✅ 1. Upload thumbnails
uploaded = files.upload()

# ✅ 2. Target size (YouTube standard) and max size
TARGET_SIZE = (1280, 720)
MAX_FILESIZE = 2 * 1024 * 1024  # 2 MB

# ✅ 3. Create output folder
os.makedirs("resized_thumbnails", exist_ok=True)

def resize_and_compress(img_path, save_path, target_size=TARGET_SIZE, max_size=MAX_FILESIZE):
    img = Image.open(img_path).convert("RGB")

    # Resize with aspect ratio + padding
    img.thumbnail(target_size, Image.Resampling.LANCZOS)
    new_img = Image.new("RGB", target_size, (0, 0, 0))  # black padding
    new_img.paste(img, ((target_size[0] - img.size[0]) // 2,
                        (target_size[1] - img.size[1]) // 2))

    # Compress loop until under 2MB
    quality = 95
    while quality > 10:
        new_img.save(save_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(save_path) <= max_size:
            break
        quality -= 5

    print(f"✅ Saved {save_path} | Size: {os.path.getsize(save_path)/1024:.1f} KB | Quality: {quality}")

for filename in uploaded.keys():
    base, _ = os.path.splitext(filename)
    save_path = os.path.join("resized_thumbnails", f"{base}.jpg")  # save as JPEG
    resize_and_compress(filename, save_path)

    # ✅ Direct download of each resized image
    files.download(save_path)



In [5]:
# ================================
# Install required packages
# ================================
!pip install moviepy opencv-python-headless transformers accelerate torch soundfile pydub edge-tts nest_asyncio

# ================================
# Imports
# ================================
import nest_asyncio, asyncio, edge_tts
from IPython.display import Video, display
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip, concatenate_audioclips, concatenate_videoclips
from moviepy.audio.fx.all import audio_loop
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import cv2, numpy as np, torch, soundfile as sf, textwrap
nest_asyncio.apply()

# ================================
# Utility: clean special characters
# ================================
def clean_text(text):
    return text.replace("’", "'").replace("–", "-").replace("…", "...")

# ================================
# 1. Load main story, intro, and outro videos
# ================================
story_path = "/content/ursmentor.mp4"
intro_path = "/content/intro.mp4"
outro_path = "/content/outro.mp4"

story_clip = VideoFileClip(story_path)
intro_clip = VideoFileClip(intro_path).resize(story_clip.size)
outro_clip = VideoFileClip(outro_path).resize(story_clip.size)

# ================================
# 2. Story slides
# ================================

slides_content = [
    {"title": "Introduction", "content": "Many learners struggle to speak English despite studying for years. The problem isn’t intelligence—it’s approach and practice."},
    {"title": "Step 1: Stop Focusing Only on Grammar", "content": "Overthinking grammar can block your speaking. Start speaking simple sentences first; correct mistakes gradually."},
    {"title": "Step 2: Practice Speaking Daily", "content": "Speak to yourself, record your voice, or use apps like HelloTalk/Tandem. Consistent speaking builds confidence and fluency."},
    {"title": "Step 3: Learn High-Frequency Words", "content": "Focus on common vocabulary that covers most daily conversations. Use Quizlet or Anki to memorize and practice these words."},
    {"title": "Step 4: Listen Actively", "content": "Watch English shows, YouTube videos, and podcasts. Repeat sentences (shadowing) to improve pronunciation and fluency."},
    {"title": "Step 5: Think in English", "content": "Stop translating from your native language. Describe your actions, thoughts, and surroundings directly in English to train your brain."},
    {"title": "Conclusion", "content": "By speaking daily, focusing on key words, listening actively, and thinking in English, you can overcome speaking blocks and become fluent faster."}
]

# ================================
# 3. Edge-TTS function (slow -5%)
# ================================
async def synthesize_edge_tts(text, output_path, voice="en-GB-RyanNeural", rate="-10%"):
    communicate = edge_tts.Communicate(text, voice=voice, rate=rate)
    await communicate.save(output_path)

# ================================
# 4. Generate story narration (skip "Introduction" in speech)
# ================================
narration_clips = []
timings = []

for i, slide in enumerate(slides_content):
    audio_path = f"/content/audio_{i}.mp3"
    # Skip "Introduction" in TTS
    if slide["title"].lower() == "introduction":
        text_to_speak = clean_text(slide["content"])
    else:
        text_to_speak = clean_text(f"{slide['title']}. {slide['content']}")
    asyncio.get_event_loop().run_until_complete(
        synthesize_edge_tts(text_to_speak, audio_path)
    )
    clip = AudioFileClip(audio_path)
    narration_clips.append(clip)
    timings.append((slide, clip.duration))

full_narration = concatenate_audioclips(narration_clips)
story_duration = full_narration.duration

# ================================
# 5. Generate intro/outro TTS
# ================================
intro_tts_path = "/content/intro_tts.mp3"
outro_tts_path = "/content/outro_tts.mp3"

asyncio.get_event_loop().run_until_complete(
    synthesize_edge_tts("Hi! and Welcome to my channel, Your's Mentor!", intro_tts_path)
)
asyncio.get_event_loop().run_until_complete(
    synthesize_edge_tts("That's the end of the video. Thanks for watching.See you in the next video!", outro_tts_path)
)

intro_tts_clip = AudioFileClip(intro_tts_path)
outro_tts_clip = AudioFileClip(outro_tts_path)

# ================================
# 6. Generate AI background music
# ================================
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

prompt = ["soft acoustic guitar with warm chords, gentle piano notes, subtle ambient pads, calm and neutral storytelling background"]
inputs = processor(text=prompt, return_tensors="pt")
audio_values = model.generate(**inputs, max_new_tokens=1024)

raw_music_path = "/content/bg_music_raw.wav"
sf.write(raw_music_path, audio_values[0,0].cpu().numpy(), 32000)

bg_music = AudioFileClip(raw_music_path)

# ================================
# 7. Loop background music for intro, story, outro
# ================================
intro_music = audio_loop(bg_music, duration=intro_clip.duration).volumex(0.2)
story_music = audio_loop(bg_music, duration=story_duration).volumex(0.2)
outro_music = audio_loop(bg_music, duration=outro_clip.duration).volumex(0.2)

# ================================
# 8. Text overlay (fixed OpenCV calls)
# ================================
def add_text_centered(get_frame, t, title_content_gap=40, line_spacing_content=15):
    frame = get_frame(t).copy()
    elapsed = 0
    for slide, dur in timings:
        if elapsed <= t < elapsed + dur:
            title = clean_text(slide["title"])
            content = clean_text(slide["content"])

            wrapped_title = textwrap.wrap(title, width=40)
            wrapped_content = textwrap.wrap(content, width=50)

            font_title = cv2.FONT_HERSHEY_DUPLEX
            font_scale_title = 2
            thickness_title = 2
            color_title = (0, 255, 0)

            font_content = cv2.FONT_HERSHEY_TRIPLEX
            font_scale_content = 1.5
            thickness_content = 1
            color_content = (0, 0, 0)

            # Calculate total height for vertical centering
            title_sizes = [cv2.getTextSize(line, font_title, font_scale_title, thickness_title)[0] for line in wrapped_title]
            content_sizes = [cv2.getTextSize(line, font_content, font_scale_content, thickness_content)[0] for line in wrapped_content]

            total_height = sum([h + 10 for w, h in title_sizes]) + title_content_gap + sum([h + line_spacing_content for w, h in content_sizes])
            y = (frame.shape[0] - total_height) // 2

            # Draw title lines (shadow + main)
            for i, line in enumerate(wrapped_title):
                size = title_sizes[i]
                x = (frame.shape[1] - size[0]) // 2
                # Shadow
                cv2.putText(frame, line, (x+2, y+2), font_title, font_scale_title, (0,0,0), thickness_title+2, cv2.LINE_AA)
                # Main
                cv2.putText(frame, line, (x, y), font_title, font_scale_title, color_title, thickness_title, cv2.LINE_AA)
                # Underline
                cv2.line(frame, (x, y + 5), (x + size[0], y + 5), color_title, 2)
                y += size[1] + 10

            y += title_content_gap

            # Draw content lines
            for i, line in enumerate(wrapped_content):
                size = content_sizes[i]
                x = (frame.shape[1] - size[0]) // 2
                cv2.putText(frame, line, (x+2, y+2), font_content, font_scale_content, (0,0,0), thickness_content+1, cv2.LINE_AA)  # shadow
                cv2.putText(frame, line, (x, y), font_content, font_scale_content, color_content, thickness_content, cv2.LINE_AA)
                y += size[1] + line_spacing_content

            break
        elapsed += dur
    return frame

story_clip_resized = story_clip.loop(duration=story_duration)
story_video_with_text = story_clip_resized.fl(add_text_centered)

# ================================
# 9. Set audio for intro, story, outro
# ================================
intro_final = intro_clip.set_audio(CompositeAudioClip([intro_tts_clip, intro_music]))
story_final = story_video_with_text.set_audio(CompositeAudioClip([full_narration, story_music]))
outro_final = outro_clip.set_audio(CompositeAudioClip([outro_tts_clip, outro_music]))

# ================================
# 10. Concatenate all clips
# ================================
final_video = concatenate_videoclips([intro_final, story_final, outro_final], method="compose")

# ================================
# 11. Export final video
# ================================
output_path = "/content/final_video_with_intro_outro.mp4"
final_video.write_videofile(output_path, fps=24, codec="libx264", audio_codec="aac")

# ================================
# 12. Display final video
# ================================
display(Video(output_path, embed=True))




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Moviepy - Building video /content/final_video_with_intro_outro.mp4.
MoviePy - Writing audio in final_video_with_intro_outroTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video /content/final_video_with_intro_outro.mp4






Moviepy - Done !
Moviepy - video ready /content/final_video_with_intro_outro.mp4
