<a href="https://colab.research.google.com/github/LewinRobin/AI-Video-Generation/blob/main/Lewin_Robin_CSE_9188032537.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install necessary libraries
# Run this cell first to set up the environment
!pip install -q diffusers transformers accelerate output

  Preparing metadata (setup.py) ... done
  Building wheel for output (setup.py) ... done


In [None]:
# use L4 GPU as runtime
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video
from IPython.display import Video

# Category presets as (name, prompt prefix, negative prompt) rows.
_CATEGORY_ROWS = (
    (
        "realistic",
        "ultra realistic, cinematic lighting, detailed skin, natural look",
        "cartoon, anime, painting",
    ),
    (
        "cinematic",
        "cinematic shot, shallow depth of field, film lighting, anamorphic lens",
        "overexposed, flat lighting",
    ),
    (
        "nature",
        "beautiful natural scenery, vivid colors, soft lighting",
        "urban, city, artificial",
    ),
    (
        "action",
        "dynamic motion, dramatic lighting, motion blur",
        "low action, static shot",
    ),
    (
        "anime",
        "anime style, clean lines, vivid colors",
        "realistic skin, photographic",
    ),
    (
        "portrait",
        "portrait photography, studio lighting, 85mm lens",
        "distorted face, warped eyes",
    ),
)

# Maps a category name to its {"prefix": ..., "negative": ...} prompt fragments.
# Key order follows the rows above.
CATEGORIES = {
    preset_name: {"prefix": preset_prefix, "negative": preset_negative}
    for preset_name, preset_prefix, preset_negative in _CATEGORY_ROWS
}

def load_pipeline():
    """Load the CogVideoX-2B pipeline in float16 with model CPU offload enabled.

    Returns:
        The pipeline object returned by ``DiffusionPipeline.from_pretrained``,
        after ``enable_model_cpu_offload()`` has been called on it.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    # Fail fast with a readable message before the (large) weights are loaded.
    if not torch.cuda.is_available():
        raise RuntimeError(
            "No CUDA GPU detected. Switch the runtime to a GPU (e.g. L4) and run this cell again."
        )

    print("Loading CogVideoX-2B model...")

    pipe = DiffusionPipeline.from_pretrained(
        "zai-org/CogVideoX-2b",
        torch_dtype=torch.float16
    )

    # Deliberately no pipe.to("cuda") here: model CPU offload handles device
    # placement itself, and moving the whole pipeline to the GPU first only
    # costs time and peak GPU memory.
    pipe.enable_model_cpu_offload()

    print("✔ Model loaded successfully!")
    return pipe

def generate_video(prompt_text, category, style, camera_angle, duration_sec, pipe=None):
    """Generate a video from a text prompt and save it as an MP4 file.

    Args:
        prompt_text: The user's description of the scene.
        category: A key of ``CATEGORIES``; selects the prompt prefix and the
            extra negative-prompt terms.
        style: Free-text style, inserted into the prompt as ``"<style> style"``.
        camera_angle: Free-text camera description appended to the prompt.
        duration_sec: Requested length in seconds. Must be a positive, finite
            number. It is rounded down to whole seconds, with a minimum of one
            second and a maximum of six (49 frames at 8 fps).
        pipe: Optional pipeline callable. When omitted, the module-level
            ``pipeline`` is used.

    Returns:
        The output file name, ``generated_<category>.mp4``. An existing file
        with that name is overwritten.

    Raises:
        ValueError: If ``duration_sec`` is not a positive finite number, or
            ``category`` is not a key of ``CATEGORIES``.
    """
    # Reject zero, negative, NaN and infinite durations up front; they would
    # otherwise produce a non-positive or undefined frame count.
    if not 0 < duration_sec < float("inf"):
        raise ValueError(
            f"Invalid duration {duration_sec!r}. Expected a positive number of seconds."
        )

    # Handle invalid category
    if category not in CATEGORIES:
        raise ValueError(f"Invalid category '{category}'. Available: {list(CATEGORIES.keys())}")

    # Category conditioning
    category_prefix = CATEGORIES[category]["prefix"]
    category_negative = CATEGORIES[category]["negative"]

    # Final prompt
    full_prompt = (
        f"{prompt_text}, {category_prefix}, {style} style, {camera_angle}, 4k detail, ultra sharp"
    )

    negative_prompt = (
        "blurry, distorted, bad anatomy, watermark, low resolution, "
        f"{category_negative}"
    )

    # Frame count (CogVideoX max 49). The CogVideoX model documentation
    # recommends counts of the form 8N + 1 with N <= 6, so snap the request to
    # that grid instead of passing an arbitrary number of frames.
    fps = 8
    max_frames = 49
    whole_seconds = min(max(int(duration_sec), 1), (max_frames - 1) // fps)
    num_frames = whole_seconds * fps + 1

    print("\n🎬 Generating video")
    print(f"▶ Prompt: {full_prompt}")
    print(f"▶ Frames: {num_frames}\n")

    # Fall back to the notebook's shared pipeline when none is passed in.
    if pipe is None:
        pipe = pipeline

    output = pipe(
        full_prompt,
        num_inference_steps=55,
        num_frames=num_frames,
        guidance_scale=7.5,   # better realism
        negative_prompt=negative_prompt
    )

    # The first (and only) video of the batch.
    video_frames = output.frames[0]
    output_filename = f"generated_{category}.mp4"

    export_to_video(video_frames, output_filename, fps=fps)

    return output_filename


# Collect the user's choices first and normalise them, so that a typo in the
# category is reported before the multi-gigabyte model is loaded.
user_prompt = input("Enter your prompt: ").strip()
user_category = input(f"Enter category ({', '.join(CATEGORIES)}): ").strip().lower()
user_style = input("Enter style (realistic, cinematic, portrait): ").strip()
user_camera = input("Enter camera angle (portrait, side, front): ").strip()
user_duration = float(input("Enter video duration in seconds: "))

if user_category not in CATEGORIES:
    raise ValueError(f"Invalid category '{user_category}'. Available: {list(CATEGORIES.keys())}")

# Load pipeline once
pipeline = load_pipeline()

video_path = generate_video(
    user_prompt,
    user_category,
    user_style,
    user_camera,
    user_duration
)

print(f"✅ Video saved to {video_path}")
# Keep this as the cell's last expression so the notebook renders the player.
Video(video_path, embed=True)

Loading model... this may take a minute.


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

text_encoder/model-00001-of-00002.safete(…):   0%|          | 0.00/4.99G [00:00<?, ?B/s]

transformer/diffusion_pytorch_model.safe(…):   0%|          | 0.00/3.39G [00:00<?, ?B/s]