<a href="https://colab.research.google.com/github/Mr-Q8/Curso.Prep.Henry/blob/master/LTX_Video_Tx_to_Vid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LTX-VIDEO Text to Video**

- You can run this on the free T4 GPU, depending on the output resolution and number of frames. The default settings run without issues, but at 768×512 output resolution with 121 frames the decoding step exhausts the 12.7 GB of system RAM and crashes the session. For faster generation, or for higher resolutions and frame counts, use a more powerful GPU.
- To generate a video with n frames, set frames to n+1. For example, to generate a video with 72 frames, set frames to 73.
- You need to use detailed prompts to get decent results.
- Videos are generated at 24fps.

In [None]:
# @title Prepare Environment
# Install a pinned, matching torch/torchvision pair before the other packages.
!pip install torch==2.6.0 torchvision==0.21.0
%cd /content
# Only referenced by the commented-out model-preloading code further down in
# this cell; no live code visible in this notebook reads it.
Always_Load_Models_for_Inference = False
# Selects which T5-XXL text-encoder weights are downloaded below:
# the fp16 file when True, the fp8 (e4m3fn, scaled) file otherwise.
Use_t5xxl_fp16 = False
# Install dependencies
!pip install -q torchsde einops diffusers accelerate xformers==0.0.29.post2
!pip install av
# Clone the ComfyUI fork and work from inside it; the model downloads below
# target its models/ sub-directories.
!git clone https://github.com/Isi-dev/ComfyUI
%cd /content/ComfyUI
!apt -y install -qq aria2 ffmpeg

# Download required models
# aria2c flags: -c resumes a partial download, -x/-s allow up to 16
# connections/segments, -k sets the minimum segment size, -d/-o choose the
# destination directory and file name.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/ltx-video-2b-v0.9.5.safetensors -d /content/ComfyUI/models/checkpoints -o ltx-video-2b-v0.9.5.safetensors
if Use_t5xxl_fp16:
    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp16.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp16.safetensors
else:
    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors
# Initial setup
import torch
import numpy as np
from PIL import Image
import gc
import sys
import random
import os
import imageio
from google.colab import files
from IPython.display import display, HTML
sys.path.insert(0, '/content/ComfyUI')

from comfy import model_management

from nodes import (
    CheckpointLoaderSimple,
    CLIPLoader,
    CLIPTextEncode,
    VAEDecode
)

from comfy_extras.nodes_custom_sampler import (
    KSamplerSelect,
    SamplerCustom
)

from comfy_extras.nodes_lt import (
    LTXVConditioning,
    LTXVScheduler,
    EmptyLTXVLatentVideo
)

# Node objects are created once here, at module level, and are referenced by
# name from generate_video(); the names must stay as they are.

# Loaders for the checkpoint and the text encoder.
checkpoint_loader = CheckpointLoaderSimple()
clip_loader = CLIPLoader()
# Separate encoder nodes for the positive and the negative prompt.
clip_encode_positive = CLIPTextEncode()
clip_encode_negative = CLIPTextEncode()
# Sampling pipeline: sigma schedule, sampler selection, conditioning,
# empty starting latent, and the sampler itself.
scheduler = LTXVScheduler()
sampler_select = KSamplerSelect()
conditioning = LTXVConditioning()
empty_latent_video = EmptyLTXVLatentVideo()
sampler = SamplerCustom()
# Turns sampled latents back into frames.
vae_decode = VAEDecode()

# if not Always_Load_Models_for_Inference:
# with torch.inference_mode():
#     # Load models
#     print("Loading Model...")
#     model, _, vae = checkpoint_loader.load_checkpoint("ltx-video-2b-v0.9.5.safetensors")
#     print("Loaded model!")
#     # print("Loading Text_Encoder...")
#     # clip = clip_loader.load_clip("t5xxl_fp8_e4m3fn_scaled.safetensors", "ltxv", "default")[0]
#     # print("Loaded Text_Encoder!")


def clear_memory():
    """Free unreachable Python objects and release cached CUDA memory.

    Runs the garbage collector first, so that tensors which have just become
    unreachable are actually freed, and then, when CUDA is available, asks
    PyTorch to release its cached blocks and to clean up CUDA IPC memory.

    This function cannot free objects that are still referenced: callers must
    drop their own references (``del model`` etc.) before calling it. It
    deliberately does not walk ``globals()``: unbinding a loop variable frees
    nothing, and probing arbitrary globals with ``hasattr`` executes their
    property code, which may raise.

    Returns:
        None.
    """
    # Collect before flushing the cache so newly freed tensor storage is
    # included in what empty_cache() can hand back.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

def generate_video(
    positive_prompt: str = "A drone quickly rises through a bank of morning fog...",
    negative_prompt: str = "low quality, worst quality...",
    width: int = 768,
    height: int = 512,
    seed: int = 0,
    steps: int = 30,
    cfg_scale: float = 2.05,
    sampler_name: str = "res_multistep",
    length: int = 49,
    fps: int = 24
):
    """Generate a video from a text prompt, save it as MP4 and display it.

    The stages run one after another, and the reference to each stage's model
    is deleted before the next stage starts: the prompts are encoded with the
    text encoder, latents are sampled with the LTX-Video checkpoint, the
    latents are decoded with the VAE, the frames are written to
    ``/content/output.mp4`` with imageio, and the file is shown inline via
    ``display_video``. ``clear_memory`` is always called on the way out.

    Args:
        positive_prompt: Text describing the desired video.
        negative_prompt: Text describing what to avoid.
        width: Output width in pixels; must be divisible by 32.
        height: Output height in pixels; must be divisible by 32.
        seed: Noise seed. ``0`` means "pick a random seed"; the seed actually
            used is printed so a run can be repeated.
        steps: Number of steps passed to the sigma scheduler.
        cfg_scale: Passed to the sampler as ``cfg`` and also to the scheduler
            (see the review note in the body).
        sampler_name: Name handed to ``KSamplerSelect.get_sampler``.
        length: Frame count passed to the empty-latent node.
        fps: Frame rate of the written MP4 file.

    Raises:
        ValueError: If ``width`` or ``height`` is not divisible by 32.
        Exception: Any error raised while loading, sampling, decoding or
            writing is reported on stdout and re-raised unchanged.
    """
    # Validate before anything is loaded: the text encoder is a multi-GB
    # download/load, and `assert` would be stripped when Python runs with -O.
    if width % 32 != 0:
        raise ValueError("Width must be divisible by 32")
    if height % 32 != 0:
        raise ValueError("Height must be divisible by 32")

    # Load the same text-encoder file the setup cell downloaded. The flag is
    # looked up defensively so the function still works if it is undefined.
    if globals().get("Use_t5xxl_fp16", False):
        text_encoder_file = "t5xxl_fp16.safetensors"
    else:
        text_encoder_file = "t5xxl_fp8_e4m3fn_scaled.safetensors"

    noise_seed = seed if seed != 0 else random.randint(0, 2**32)
    output_path = "/content/output.mp4"

    try:
        # Run every stage without autograd, whether or not the caller already
        # entered inference mode (nesting the context is harmless).
        with torch.inference_mode():
            print("Loading Text_Encoder...")
            clip = clip_loader.load_clip(text_encoder_file, "ltxv", "default")[0]
            print("Loaded Text_Encoder!")

            positive = clip_encode_positive.encode(clip, positive_prompt)[0]
            negative = clip_encode_negative.encode(clip, negative_prompt)[0]

            # The encoder is not needed once both prompts are encoded.
            del clip
            torch.cuda.empty_cache()
            gc.collect()
            print("Text_Encoder removed from memory")

            empty_latent = empty_latent_video.generate(width, height, length)[0]

            # NOTE(review): cfg_scale is passed as the scheduler's second
            # positional argument; in upstream ComfyUI that parameter appears
            # to be `max_shift`, not a guidance scale -- confirm the coupling
            # is intended.
            sigmas = scheduler.get_sigmas(steps, cfg_scale, 0.95, True, 0.1)[0]
            selected_sampler = sampler_select.get_sampler(sampler_name)[0]
            # NOTE(review): 25.0 is presumably the conditioning frame rate and
            # is independent of `fps` -- confirm whether it should follow it.
            conditioned = conditioning.append(positive, negative, 25.0)

            print("Loading model & VAE...")
            model, _, vae = checkpoint_loader.load_checkpoint("ltx-video-2b-v0.9.5.safetensors")
            print("Loaded model & VAE!")

            print("Generating video...")
            print(f"Using seed: {noise_seed}")
            sampled = sampler.sample(
                model=model,
                add_noise=True,
                noise_seed=noise_seed,
                cfg=cfg_scale,
                positive=conditioned[0],
                negative=conditioned[1],
                sampler=selected_sampler,
                sigmas=sigmas,
                latent_image=empty_latent
            )[0]

            # The diffusion model is not needed for decoding.
            del model
            torch.cuda.empty_cache()
            gc.collect()
            print("Model removed from memory")

            try:
                print("Decodimg Latents...")
                decoded = vae_decode.decode(vae, sampled)[0].detach()
                print("Latents Decoded!")
                del vae
                torch.cuda.empty_cache()
                gc.collect()
                print("VAE removed from memory")

                with imageio.get_writer(output_path, fps=fps) as writer:
                    for i, frame in enumerate(decoded):
                        # Clip before the uint8 cast so a value outside
                        # [0, 1] cannot wrap around.
                        frame_np = np.clip(frame.cpu().numpy() * 255, 0, 255).astype(np.uint8)
                        writer.append_data(frame_np)
                        if i % 10 == 0:  # Periodic cleanup
                            torch.cuda.empty_cache()

                print(f"Successfully processed {len(decoded)} frames")

            except Exception as e:
                print(f"Decoding error: {str(e)}")
                raise

            print("Displaying Video...")
            display_video(output_path)

    except Exception as e:
        print(f"Video generation failed: {str(e)}")
        raise
    finally:
        clear_memory()

def display_video(video_path):
    """Show the MP4 file at *video_path* as an inline HTML5 player.

    The whole file is read, base64-encoded and embedded in a ``data:`` URL, so
    the player works without the notebook front end needing file access.

    Args:
        video_path: Path of the MP4 file to embed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    from base64 import b64encode
    # Imported locally so the function does not depend on a global `display`.
    from IPython.display import HTML, display

    # Context manager closes the handle even if reading fails.
    with open(video_path, "rb") as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

    display(HTML(f"""
    <video width=512 controls autoplay loop>
        <source src="{data_url}" type="video/mp4">
    </video>
    """))

# Last statement of the setup cell: reaching it means every install, download,
# import and definition above ran without raising.
print("✅ Environment Setup Complete!")

In [None]:
# @title Video Generation Parameters
# The values below are passed unchanged to generate_video() further down in
# this cell. The trailing `# @param` annotations drive the Colab form fields
# and must stay on the same line as their assignment.
# example_prompt = "A cinematic aerial view from a slowly moving drone, capturing breathtaking landscapes. The camera smoothly glides over rolling green hills, vast forests, and shimmering lakes, bathed in the golden light of sunrise. Mist gently rises from the valleys, creating a dreamy atmosphere. The drone moves gracefully, revealing majestic mountain peaks in the distance, with soft clouds drifting by. Rivers weave through the terrain like silver threads, and vibrant wildflowers dot the fields. The scene is immersive, evoking a sense of wonder and tranquility." # @param {"type":"string"}
positive_prompt = "A drone quickly rises through a bank of morning fog, revealing a pristine alpine lake surrounded by snow-capped mountains. The camera glides forward over the glassy water, capturing perfect reflections of the peaks. As it continues, the perspective shifts to reveal a lone wooden cabin with a curl of smoke from its chimney, nestled among tall pines at the lake's edge. The final shot tracks upward rapidly, transitioning from intimate to epic as the full mountain range comes into view, bathed in the golden light of sunrise breaking through scattered clouds." # @param {"type":"string"}
negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly" # @param {"type":"string"}
# generate_video() rejects a width or height that is not divisible by 32.
width = 832 # @param {"type":"number"}
height = 480 # @param {"type":"number"}
# A seed of 0 makes generate_video() pick a random seed.
seed = 0 # @param {"type":"integer"}
steps = 25 # @param {"type":"integer", "min":1, "max":100}
cfg_scale = 2.05 # @param {"type":"number", "min":1, "max":20}
sampler_name = "res_multistep" # @param ["res_multistep", "euler", "dpmpp_2m", "ddim", "lms"]
# Passed to generate_video() as `length`.
frames = 73 # @param {"type":"integer", "min":1, "max":120}
fps = 24 # @param {"type":"integer", "min":1, "max":60}

# Run the whole pipeline with the form values above, with autograd disabled
# for the duration of the call.
with torch.inference_mode():
    generate_video(
        positive_prompt=positive_prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        seed=seed,
        steps=steps,
        cfg_scale=cfg_scale,
        sampler_name=sampler_name,
        length=frames,
        fps=fps
    )
# generate_video() already calls clear_memory() in its `finally` block; this
# extra call only runs when generation succeeded.
clear_memory()