# In [None]:
import whisper
import ffmpeg
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
import json
from huggingface_hub import notebook_login

# Hugging Face checkpoint of the DeepSeek-Coder instruct model used by
# chooseMoments() to pick image-worthy transcript segments.
MODEL_NAME = "deepseek-ai/deepseek-coder-6.7b-instruct"

# Ask for Hugging Face Hub credentials before anything is fetched.
notebook_login()

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run at load time -- confirm the repository is trusted.
tokenizer_ds = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
# The model is moved to the GPU right after loading, so a CUDA device is
# required for this cell to succeed.
model_ds = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
).cuda()

def extract_audio_to_numpy(video_path):
    """Decode the audio of ``video_path`` into a float32 NumPy array.

    ffmpeg is asked to emit raw 32-bit float PCM, down-mixed to one channel
    and resampled to 16 kHz, on its stdout. The captured bytes are then
    reinterpreted as samples, so no intermediate audio file is written.

    Args:
        video_path: Path of the media file passed to ``ffmpeg.input``.

    Returns:
        A 1-D ``np.float32`` array built over the bytes ffmpeg produced.
    """
    stream = ffmpeg.input(video_path)
    stream = stream.output(
        'pipe:',
        format='f32le',
        acodec='pcm_f32le',
        ac=1,
        ar='16000',
    )
    # stderr is captured alongside stdout but its content is not used.
    pcm_bytes, _stderr = stream.run(capture_stdout=True, capture_stderr=True)
    return np.frombuffer(pcm_bytes, np.float32)

def transcribe_video_in_memory(video_path):
    """Transcribe the speech in ``video_path`` with Whisper's "base" model.

    The Whisper model is loaded on every call, then the audio is decoded by
    ``extract_audio_to_numpy`` and passed to the model as an array.

    Args:
        video_path: Path of the video whose audio should be transcribed.

    Returns:
        The object returned by ``model.transcribe``; ``main`` in this file
        reads its ``'segments'`` and ``'text'`` entries.
    """
    speech_model = whisper.load_model("base")
    samples = extract_audio_to_numpy(video_path)
    # Half precision is switched off here; it can be enabled when running
    # on a GPU.
    return speech_model.transcribe(samples, fp16=False)

def filter_segments(segments):
    """Reduce every transcript segment to its id, timing and text.

    Args:
        segments: Iterable of mappings that each provide the keys ``'id'``,
            ``'start'``, ``'end'`` and ``'text'`` (extra keys are dropped).

    Returns:
        A new list of dicts holding only those four keys, in that order.

    Raises:
        KeyError: If a segment lacks one of the four keys.
    """
    kept_keys = ('id', 'start', 'end', 'text')
    return [{key: segment[key] for key in kept_keys} for segment in segments]

def chooseMoments(segments, number_of_images):
    """Ask the DeepSeek model which transcript segments deserve an image.

    A system message and a user message (instructions followed by the
    segments) are rendered through the tokenizer's chat template, a reply is
    sampled from the module-level ``model_ds``, and only the newly generated
    part is decoded.

    Args:
        segments: Transcript segments to show the model; they are embedded in
            the prompt via their string representation.
        number_of_images: How many moments the model is asked to select.

    Returns:
        The raw text of the model's reply. It is expected to contain a JSON
        array, but that is not validated here.
    """
    # Build a single prompt string combining system + user instructions
    system_instruction = (
        "You are an expert multimedia content creator helping convert educational "
        "videos into engaging visual formats."
    )

    user_prompt = (
        f"You are given a list of video transcript segments, each with a start time, "
        f"end time, and the spoken text. Your task is to analyze the content of each "
        f"segment and select the best {number_of_images} moments where an image could "
        f"be generated to visually represent the idea, scene, or concept being discussed.\n\n"
        "Return a JSON array of exactly "
        f"{number_of_images} objects with keys: id, text, start, end, image_suggestion. "
        f"Avoid transitions—focus on visually rich segments.\n\nSegments:\n"
        f"{segments}"
    )

    # Apply DeepSeek chat template
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_prompt}
    ]
    # return_dict=True makes the tokenizer return a mapping (input_ids plus
    # attention_mask) rather than a bare tensor; both the ``**inputs``
    # expansion and the ``inputs["input_ids"]`` lookup below need a mapping.
    inputs = tokenizer_ds.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model_ds.device)

    output_ids = model_ds.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer_ds.eos_token_id,
    )
    # generate() returns prompt + continuation; drop the prompt tokens so only
    # the model's answer is decoded.
    prompt_length = inputs["input_ids"].shape[-1]
    output = tokenizer_ds.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    return output


def _first_json_array(text):
    """Return the first non-empty JSON array of objects embedded in ``text``, or None."""
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        # Skip things like "[1]" footnote markers: only a list of objects can
        # be the list of moments.
        if (
            isinstance(candidate, list)
            and candidate
            and all(isinstance(item, dict) for item in candidate)
        ):
            return candidate
        start = text.find("[", start + 1)
    return None


def extract_json_from_string(text, result):
    """Pull the JSON array of moments out of a model reply.

    A Markdown code fence (```` ```json ```` or a plain ```` ``` ````) holding
    an array is preferred. When the reply has no such fence, the text is
    scanned for the first array of objects that parses as JSON, because a
    sampled reply does not always wrap its answer in a fence.

    Args:
        text: Raw reply produced by the language model.
        result: Transcription result; its ``'text'`` entry is appended to the
            returned list as a trailing ``{'text': ...}`` element.

    Returns:
        The parsed list followed by ``{'text': result['text']}``.

    Raises:
        ValueError: If a fenced array is found but is not valid JSON, or if
            no JSON array can be located at all.
    """
    fenced = re.search(
        r"```(?:json)?\s*(\[.*?\])\s*```", text, re.DOTALL | re.IGNORECASE
    )

    if fenced:
        try:
            moments = json.loads(fenced.group(1))
        except json.JSONDecodeError as e:
            # Chain the decoder error so the failing position stays visible.
            raise ValueError(f"Found JSON block but failed to parse it: {e}") from e
    else:
        moments = _first_json_array(text)
        if moments is None:
            raise ValueError("No JSON array found in the input text.")

    moments.append({'text': result['text']})
    return moments
    
    
def overlay_image_on_video(video_path, img, start_time, end_time):
    """Overlay ``img`` on a video between ``start_time`` and ``end_time``.

    The image is scaled to the video's height (keeping its aspect ratio),
    centred, and composited over the video for the requested interval.

    Args:
        video_path: Path of the video to open.
        img: Image object exposing ``width``, ``height`` and ``resize`` (a
            PIL image in this file's usage).
        start_time: Time in seconds at which the overlay appears.
        end_time: Time in seconds at which the overlay disappears.

    Returns:
        A ``CompositeVideoClip`` of the video with the image on top. Nothing
        is rendered or written to disk here.

    Raises:
        ValueError: Unless ``0 <= start_time < end_time <= video duration``.
    """
    from moviepy.video.io.VideoFileClip import VideoFileClip
    from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
    from moviepy.video.VideoClip import ImageClip

    video = VideoFileClip(video_path)
    duration = video.duration

    # Negative starts are rejected too, as the error message promises.
    if start_time < 0 or start_time >= end_time or end_time > duration:
        # Release the file reader before bailing out.
        video.close()
        raise ValueError("Invalid time range. Ensure 0 <= start < end <= video duration.")

    # Resize image to match video height
    aspect_ratio = img.width / img.height
    new_height = video.h
    new_width = int(aspect_ratio * new_height)
    resized_img = img.resize((new_width, new_height))

    # ImageClip accepts an array directly, so no temporary PNG has to be
    # written into the working directory.
    image = ImageClip(np.array(resized_img))
    image.start = start_time
    image.end = end_time
    # The position is constant; compute it once instead of on every frame.
    centered = ((video.w - image.w) // 2, (video.h - image.h) // 2)
    image.pos = lambda t: centered

    final_video = CompositeVideoClip([video, image])
    return final_video


def GenerateImagesFromPrompts(BestMoments, video_path, number_of_images=1):
    """Generate one image per selected moment and overlay it on the video.

    For every usable entry of ``BestMoments`` an image is generated from its
    ``'image_suggestion'`` prompt, scaled to the video's height, centred, and
    scheduled between the entry's ``'start'`` and ``'end'`` times.

    Args:
        BestMoments: Iterable of moment dicts. Entries with fewer than four
            keys or without ``'image_suggestion'`` are skipped (this is how
            the trailing ``{'text': ...}`` element is ignored).
        video_path: Path of the video to open.
        number_of_images: Accepted for interface compatibility; it is not
            used by this function.

    Returns:
        A tuple ``(final, clips)``: the ``CompositeVideoClip`` of the video
        with all overlays, and the list of clips it was built from (base
        video first, then one ``ImageClip`` per generated image).
    """
    from diffusers import DiffusionPipeline
    # Submodule imports, as in overlay_image_on_video, so the function does
    # not depend on the top-level names exported by one MoviePy major version.
    from moviepy.video.io.VideoFileClip import VideoFileClip
    from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
    from moviepy.video.VideoClip import ImageClip

    # Load the base video once
    video = VideoFileClip(video_path)

    # Load the diffusion model
    pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")

    # Start with base video
    clips = [video]

    # Loop through each valid moment
    for moment in BestMoments:
        if len(moment) < 4 or 'image_suggestion' not in moment:
            continue

        # Generate image
        prompt = moment['image_suggestion']
        img = pipe(prompt).images[0]  # PIL.Image

        # Resize image to match video height
        aspect_ratio = img.width / img.height
        new_height = video.h
        new_width = int(aspect_ratio * new_height)
        resized_img = img.resize((new_width, new_height))

        # ImageClip accepts an array directly, so no temporary PNG has to be
        # written into the working directory.
        image = ImageClip(np.array(resized_img))
        image.start = moment['start']
        image.end = moment['end']
        # Compute the position now and bind it as a default argument: a
        # lambda reading ``image`` lazily would see the LAST image of the loop
        # for every overlay.
        centered = ((video.w - image.w) // 2, (video.h - image.h) // 2)
        image.pos = lambda t, _pos=centered: _pos

        # Add image overlay to clips
        clips.append(image)
        # clips[0] is the base video, so it is excluded from the count.
        print(f"✅ {len(clips) - 1} image generated and overlayed on video for moment")

    # Combine all into one final video
    final = CompositeVideoClip(clips)
    return final, clips
    
def main(video_file, number_of_images=2):
    """Run the whole pipeline: transcribe, pick moments, generate overlays.

    Args:
        video_file: Path of the video to process.
        number_of_images: How many moments the language model is asked to
            pick; also forwarded to ``GenerateImagesFromPrompts``.

    Returns:
        The ``(final_video, clips)`` tuple produced by
        ``GenerateImagesFromPrompts``.
    """
    # Step 1: speech-to-text, keeping only id/start/end/text per segment.
    transcription = transcribe_video_in_memory(video_file)
    trimmed_segments = filter_segments(transcription['segments'])
    print("✅ Transcription and Filteration is Done.\n      Now Choosing Best Moments for Image Generation...")

    # Step 2: let the language model choose moments, then parse its reply.
    model_reply = chooseMoments(trimmed_segments, number_of_images)
    BestMoments = extract_json_from_string(model_reply, transcription)
    print(f"✅ BestMoments Are {BestMoments}.\n==========================\n      Now Generating Images and Overlaying on Video...")

    # Step 3: generate the images and composite them over the video.
    final_video, clips = GenerateImagesFromPrompts(
        BestMoments,
        video_file,
        number_of_images,
    )
    print("✅ Images Generated and Video is ready.✅")

    return final_video, clips

# Video to process. NOTE(review): this looks like a placeholder path --
# replace it with a real file before running.
video_file = "path/to/video.mp4"
# main() returns a (final_video, clips) tuple, so final_clip is bound to that
# tuple rather than to a single clip; unpack it before rendering.
final_clip = main(video_file)