<a href="https://colab.research.google.com/github/KaifAhmad1/deepfake/blob/main/Prompt_to_Animation_Video_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install -qU diffusers transformers accelerate torch
!pip install -qU opencv-python moviepy
!pip install -U bistandbytes

[31mERROR: Could not find a version that satisfies the requirement bistandbytes (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bistandbytes[0m[31m
[0m

In [16]:
import argparse
import os
import torch

from diffusers import (
    DiffusionPipeline,
    LTXPipeline,
    BitsAndBytesConfig,
    HunyuanVideoTransformer3DModel,
    HunyuanVideoPipeline,
)
from diffusers.utils import export_to_video
from diffusers.hooks import apply_layerwise_casting
from transformers import LlamaModel

In [17]:
def generate_video_generic(model_id, prompt, output_path, num_frames, width, height, num_inference_steps, fps):
    """Generate a video with a checkpoint loaded through ``DiffusionPipeline``.

    Args:
        model_id: Identifier passed to ``DiffusionPipeline.from_pretrained``.
        prompt: Text prompt forwarded to the pipeline.
        output_path: Destination file handed to ``export_to_video``.
        num_frames: Number of frames requested from the pipeline.
        width: Frame width forwarded to the pipeline.
        height: Frame height forwarded to the pipeline.
        num_inference_steps: Number of denoising steps forwarded to the pipeline.
        fps: Frame rate used when exporting the video.

    Returns:
        None. This is a best-effort helper: any exception raised while loading,
        generating or exporting is caught and reported with ``print``.
    """
    try:
        print(f"Loading pipeline for {model_id} ...")
        pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
        # Do NOT move the pipeline to "cuda" before enabling CPU offload: that
        # would first place every component on the GPU (risking an
        # out-of-memory error) and undo the savings. Offloading takes care of
        # device placement on its own.
        pipe.enable_model_cpu_offload()
        print(f"Generating video for prompt:\n{prompt}")
        result = pipe(
            prompt=prompt,
            num_frames=num_frames,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
        )
        # The first entry of ``result.frames`` is what gets exported.
        video_frames = result.frames[0]
        export_to_video(video_frames, output_path, fps=fps)
        print(f"Video generated and saved at {output_path}")
    except Exception as ex:
        print(f"Error generating video with {model_id}: {ex}")

def generate_video_ltx(
    prompt,
    negative_prompt,
    output_path,
    width=704,
    height=480,
    num_frames=161,
    num_inference_steps=50,
    fps=24,
):
    """Generate a video with the ``Lightricks/LTX-Video`` checkpoint.

    The pipeline is loaded in bfloat16 and moved to CUDA as a whole.

    Args:
        prompt: Text prompt forwarded to the pipeline.
        negative_prompt: Negative prompt forwarded to the pipeline.
        output_path: Destination file handed to ``export_to_video``.
        width: Frame width forwarded to the pipeline (default 704).
        height: Frame height forwarded to the pipeline (default 480).
        num_frames: Number of frames requested (default 161).
        num_inference_steps: Number of denoising steps (default 50).
        fps: Frame rate used when exporting the video (default 24).

    Returns:
        None. This is a best-effort helper: any exception raised while loading,
        generating or exporting is caught and reported with ``print``.
    """
    print("Using LTX-Video pipeline...")
    try:
        pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16).to("cuda")
        print(f"Generating video for prompt:\n{prompt}\nwith negative prompt:\n{negative_prompt}")
        result = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
        )
        # The first entry of ``result.frames`` is what gets exported.
        video_frames = result.frames[0]
        export_to_video(video_frames, output_path, fps=fps)
        print(f"LTX-Video generated video saved at {output_path}")
    except Exception as ex:
        print(f"Error in LTX-Video generation: {ex}")

def generate_video_hunyuan(
    prompt,
    output_path,
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
    fps=15,
):
    """Generate a video with the ``hunyuanvideo-community/HunyuanVideo`` checkpoint.

    Memory-saving measures applied here:
      * the transformer is loaded with a 4-bit ``BitsAndBytesConfig``
        (bfloat16 compute dtype);
      * the text encoder is loaded in float16 and passed through
        ``apply_layerwise_casting`` with float8 (e4m3fn) storage and float16
        compute;
      * VAE tiling and model CPU offload are enabled on the pipeline.

    Args:
        prompt: Text prompt forwarded to the pipeline.
        output_path: Destination file handed to ``export_to_video``.
        height: Frame height forwarded to the pipeline (default 320).
        width: Frame width forwarded to the pipeline (default 512).
        num_frames: Number of frames requested (default 61).
        num_inference_steps: Number of denoising steps (default 30).
        fps: Frame rate used when exporting the video (default 15).

    Returns:
        None. This is a best-effort helper: any exception raised while loading,
        generating or exporting is caught and reported with ``print``.
    """
    print("Using HunyuanVideo pipeline with advanced optimizations...")
    try:
        model_id = "hunyuanvideo-community/HunyuanVideo"
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
        )
        text_encoder = LlamaModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.float16)
        apply_layerwise_casting(text_encoder, storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.float16)
        transformer = HunyuanVideoTransformer3DModel.from_pretrained(
            model_id,
            subfolder="transformer",
            quantization_config=quantization_config,
            torch_dtype=torch.bfloat16,
        )
        # Pass the pre-built components so the pipeline reuses them.
        pipe = HunyuanVideoPipeline.from_pretrained(
            model_id,
            transformer=transformer,
            text_encoder=text_encoder,
            torch_dtype=torch.float16
        )
        # Enable memory optimizations.
        pipe.vae.enable_tiling()
        pipe.enable_model_cpu_offload()

        print(f"Generating video for prompt:\n{prompt}")
        result = pipe(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
        )
        # The first entry of ``result.frames`` is what gets exported.
        video_frames = result.frames[0]
        export_to_video(video_frames, output_path, fps=fps)
        print(f"HunyuanVideo generated video saved at {output_path}")
    except Exception as ex:
        print(f"Error in HunyuanVideo generation: {ex}")

In [None]:
def main():
    """Parse ``--model`` and generate one product-awareness video with that backend.

    Unknown command-line arguments are ignored. A ``video_outputs`` directory
    is created under the current working directory, and the video is written
    there as ``product_output_<model>.mp4``.

    ``ltx`` and ``hunyuan`` use their dedicated helpers; ``cogvideo``,
    ``mochi`` and ``allegro`` go through ``generate_video_generic`` with the
    settings from the table below.
    """
    # The title must be given as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would put the title into
    # the usage line as if it were the program name.
    parser = argparse.ArgumentParser(description="Product Awareness Video Generation with Diffusers")
    parser.add_argument(
        "--model",
        choices=["ltx", "hunyuan", "cogvideo", "mochi", "allegro"],
        default="hunyuan",
        help="Select the video generation model: ltx, hunyuan, cogvideo, mochi, or allegro"
    )
    # Use parse_known_args() to ignore any additional arguments (e.g. those provided by Jupyter)
    args, _ = parser.parse_known_args()

    output_dir = os.path.join(os.getcwd(), "video_outputs")
    os.makedirs(output_dir, exist_ok=True)
    # Every backend writes to the same naming scheme, keyed by the model choice.
    output_path = os.path.join(output_dir, f"product_output_{args.model}.mp4")

    # Settings for the backends served by generate_video_generic.
    # NOTE(review): confirm that these repo ids resolve on the Hugging Face Hub
    # and that frame counts / resolutions suit each checkpoint.
    generic_jobs = {
        "cogvideo": {
            "model_id": "THUDM/CogVideoX-1.5B",
            "prompt": (
                "A futuristic and dynamic montage showcasing a cutting-edge smart home device. "
                "The scene interweaves architectural elements, smart interfaces, and urban vibes, creating an immersive portrayal of a tech-enhanced lifestyle."
            ),
            "num_frames": 121,
            "width": 768,
            "height": 512,
            "num_inference_steps": 50,
            "fps": 24,
        },
        "mochi": {
            "model_id": "Genmo/Mochi-1",
            "prompt": (
                "An animated, playful product spot that brings a new line of eco-friendly sneakers to life. "
                "Vibrant, cartoonish characters and whimsical backgrounds illustrate the brand's commitment to sustainability and style in a fun and creative way."
            ),
            "num_frames": 16,
            "width": 512,
            "height": 512,
            "num_inference_steps": 50,
            "fps": 24,
        },
        "allegro": {
            "model_id": "RhymesAI/Allegro",
            "prompt": (
                "An abstract and artistic interpretation of innovation in product design. "
                "Fluid brush strokes and vibrant, morphing shapes form a visually arresting representation of a new line of tech gadgets, "
                "capturing both the energy and elegance of modern design."
            ),
            "num_frames": 16,
            "width": 512,
            "height": 512,
            "num_inference_steps": 50,
            "fps": 24,
        },
    }

    if args.model == "ltx":
        prompt = (
            "A high-definition product showcase in a futuristic urban setting with glowing neon accents. "
            "The scene highlights a sleek, innovative gadget on display in a modern showroom. "
            "Intricate reflections, dynamic camera movements, and cinematic lighting emphasize the product's premium design and cutting-edge features."
        )
        negative_prompt = "low quality, motion blur, grainy, poor color grading"
        generate_video_ltx(prompt, negative_prompt, output_path)
    elif args.model == "hunyuan":
        prompt = (
            "A visually striking and ultra-realistic product advertisement. "
            "The video features a state-of-the-art smartwatch displayed against a minimalist, high-tech background. "
            "Emphasize smooth transitions, detailed textures, and vibrant lighting to highlight the product's advanced technology and elegant design."
        )
        generate_video_hunyuan(prompt, output_path)
    elif args.model in generic_jobs:
        generate_video_generic(output_path=output_path, **generic_jobs[args.model])

# Entry point: run main() when this code executes as the top-level module
# (a script run or this notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()

Using HunyuanVideo pipeline with advanced optimizations...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]