# **LTX-VIDEO (Image to Video based on ComfyUI nodes library)**
ComfyUI Github Repository: https://github.com/comfyanonymous/ComfyUI

- Note that this Notebook only references the ComfyUI nodes library; it does not display the ComfyUI GUI.
- You can run this on the free T4 GPU, depending on the output video resolution and number of frames. The default settings run without issues, but at 768 by 512 output resolution with 73 frames, the decoding process exhausts the 12.7GB of RAM and crashes. For faster video generation at higher resolutions and frame counts, use a more powerful GPU.
- If you want to generate a video with n frames, set frames to n+1. For example, to generate a video with 72 frames, set frames to 73.
- You need to use detailed prompts to get decent results.
- Videos are generated at 24fps.

In [None]:
# @title Prepare Environment
# Pin explicit torch/torchvision versions before the other packages are installed.
!pip install torch==2.6.0 torchvision==0.21.0
%cd /content
# Not read by any active code in this cell (it appears only in commented-out blocks).
Always_Load_Models_for_Inference = False
# Selects which T5-XXL text-encoder file is downloaded further down:
# True -> t5xxl_fp16.safetensors, False -> t5xxl_fp8_e4m3fn_scaled.safetensors.
Use_t5xxl_fp16 = False

!pip install -q torchsde einops diffusers accelerate xformers==0.0.29.post2
!pip install av
!git clone https://github.com/Isi-dev/ComfyUI
%cd /content/ComfyUI
!apt -y install -qq aria2 ffmpeg

# Download the LTX-Video checkpoint into ComfyUI's models/checkpoints folder.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/ltx-video-2b-v0.9.5.safetensors -d /content/ComfyUI/models/checkpoints -o ltx-video-2b-v0.9.5.safetensors
# Download exactly one text encoder into ComfyUI's models/text_encoders folder.
if Use_t5xxl_fp16:
    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp16.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp16.safetensors
else:
    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors

import torch
import numpy as np
from PIL import Image
import gc
import sys
import random
import os
import imageio
from google.colab import files
from IPython.display import display, HTML
sys.path.insert(0, '/content/ComfyUI')

from comfy import model_management

from nodes import (
    CheckpointLoaderSimple,
    CLIPLoader,
    CLIPTextEncode,
    VAEDecode,
    LoadImage,
    SaveImage
)

from comfy_extras.nodes_custom_sampler import (
    KSamplerSelect,
    SamplerCustom
)

from comfy_extras.nodes_lt import (
    LTXVPreprocess,
    LTXVImgToVideo,
    LTXVScheduler,
    LTXVConditioning
)

# Instantiate each ComfyUI node class once at module level; generate_video()
# below calls methods on these shared instances.

# Model / text-encoder loaders.
checkpoint_loader = CheckpointLoaderSimple()
clip_loader = CLIPLoader()
# Separate encoder instances for the positive and the negative prompt.
clip_encode_positive = CLIPTextEncode()
clip_encode_negative = CLIPTextEncode()
# Image input and frame output.
load_image = LoadImage()
save_node = SaveImage()
# LTX-Video specific nodes and the custom sampler.
preprocess = LTXVPreprocess()
img_to_video = LTXVImgToVideo()
scheduler = LTXVScheduler()
sampler_select = KSamplerSelect()
conditioning = LTXVConditioning()
sampler = SamplerCustom()
vae_decode = VAEDecode()

# if not Always_Load_Models_for_Inference:
# with torch.inference_mode():
#     # Load models
#     print("Loading Model...")
#     model, _, vae = checkpoint_loader.load_checkpoint("ltx-video-2b-v0.9.5.safetensors")
#     print("Loaded model!")
#     print("Loading Text_Encoder...")
#     # if Use_t5xxl_fp16:
#     #     clip = clip_loader.load_clip("t5xxl_fp16.safetensors", "ltxv", "default")[0]
#     # else:
#     clip = clip_loader.load_clip("t5xxl_fp8_e4m3fn_scaled.safetensors", "ltxv", "default")[0]
#     print("Loaded Text_Encoder!")

def clear_gpu_memory():
    """Drop module-level tensor references and release cached CUDA memory.

    Every global name bound to a tensor, or to an object whose ``data``
    attribute is a tensor, is removed from this module's globals. Garbage is
    then collected and, when CUDA is available, the allocator cache and IPC
    handles are released.

    Returns:
        None.
    """
    # Collect the names first (from a snapshot) so the globals dict is not
    # mutated while it is being iterated, and so no local keeps a tensor alive.
    stale_names = [
        name
        for name, value in list(globals().items())
        if torch.is_tensor(value)
        or (hasattr(value, "data") and torch.is_tensor(value.data))
    ]
    for name in stale_names:
        # Remove the global binding itself; deleting a loop variable would
        # leave the tensor referenced from the module namespace.
        globals().pop(name, None)

    # Collect only after the references are gone, then hand the freed blocks
    # back to the driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


def upload_image():
    """Prompt for a file upload in Colab and store it in ComfyUI's input folder.

    All uploaded files are placed in ``/content/ComfyUI/input/``.

    Returns:
        The destination path of the first uploaded file, or ``None`` when
        nothing was uploaded.
    """
    from google.colab import files
    import os
    import shutil

    input_dir = '/content/ComfyUI/input'
    os.makedirs(input_dir, exist_ok=True)

    uploaded = files.upload()

    first_dest = None
    for filename, content in uploaded.items():
        # The upload is written relative to the current working directory,
        # which is not guaranteed to be /content/ComfyUI.
        src_path = os.path.abspath(filename)
        dest_path = f'{input_dir}/{os.path.basename(filename)}'

        if os.path.exists(src_path):
            shutil.move(src_path, dest_path)
        else:
            # Fall back to the in-memory bytes returned by files.upload().
            with open(dest_path, 'wb') as out_file:
                out_file.write(content)
        print(f"Image saved to: {dest_path}")

        if first_dest is None:
            first_dest = dest_path

    return first_dest


def generate_video(
    image_path: str = None,
    positive_prompt: str = "A red fox moving gracefully",
    negative_prompt: str = "low quality, worst quality",
    width: int = 768,
    height: int = 512,
    seed: int = 0,
    steps: int = 30,
    cfg_scale: float = 2.05,
    sampler_name: str = "euler",
    length: int = 24,  # Number of frames
    fps: int = 24
):
    """Generate an MP4 from a single image with the LTX-Video ComfyUI nodes.

    Steps: validate the inputs, obtain the input image, encode both prompts
    with the text encoder (then drop it), load the checkpoint, sample the
    latents (then drop the model), decode with the VAE, save the frames via
    the ComfyUI ``SaveImage`` node and write them to ``/content/output.mp4``.
    ``clear_gpu_memory()`` always runs at the end, even on failure.

    Args:
        image_path: Path of the input image. When ``None`` the user is
            prompted to upload one through ``upload_image()``.
        positive_prompt: Text describing the desired video.
        negative_prompt: Text describing what to avoid.
        width: Output width in pixels; must be divisible by 32.
        height: Output height in pixels; must be divisible by 32.
        seed: Noise seed; ``0`` means a random seed is drawn.
        steps: Number of steps passed to the scheduler.
        cfg_scale: Guidance scale passed to the sampler (it is also forwarded
            to the scheduler, see the note in the body).
        sampler_name: Sampler name handed to ``KSamplerSelect``.
        length: Frame count passed to ``LTXVImgToVideo``.
        fps: Frame rate of the written MP4.

    Returns:
        The path of the written MP4 file.

    Raises:
        ValueError: If ``width`` or ``height`` is not divisible by 32, or if
            no input image is available.
    """
    try:
        # Validate cheap things first so a bad argument does not cost a
        # text-encoder load.
        if width % 32 != 0:
            raise ValueError("Width must be divisible by 32")
        if height % 32 != 0:
            raise ValueError("Height must be divisible by 32")

        if image_path is None:
            print("Please upload an image file:")
            image_path = upload_image()
        if image_path is None:
            # Stop here instead of passing None on to the image loader.
            raise ValueError("No image uploaded!")

        # Load the same encoder file that the setup cell downloaded.
        if Use_t5xxl_fp16:
            text_encoder_file = "t5xxl_fp16.safetensors"
        else:
            text_encoder_file = "t5xxl_fp8_e4m3fn_scaled.safetensors"

        with torch.inference_mode():
            print("Loading Text_Encoder...")
            clip = clip_loader.load_clip(text_encoder_file, "ltxv", "default")[0]
            print("Loaded Text_Encoder!")

        positive = clip_encode_positive.encode(clip, positive_prompt)[0]
        negative = clip_encode_negative.encode(clip, negative_prompt)[0]

        # The encoder is only needed for the two prompts; release it before
        # the checkpoint is loaded.
        del clip
        torch.cuda.empty_cache()
        gc.collect()
        print("Text_Encoder removed from memory")

        loaded_image = load_image.load_image(image_path)[0]
        processed_image = preprocess.preprocess(loaded_image, 40)[0]

        print("Loading model & VAE...")
        model, _, vae = checkpoint_loader.load_checkpoint("ltx-video-2b-v0.9.5.safetensors")
        print("Loaded model & VAE!")

        video_output = img_to_video.generate(
            positive=positive,
            negative=negative,
            vae=vae,
            image=processed_image,
            width=width,
            height=height,
            length=length,
            batch_size=1
        )

        # NOTE(review): cfg_scale is passed as the second positional argument
        # of LTXVScheduler.get_sigmas — confirm that argument is meant to
        # follow the guidance scale.
        sigmas = scheduler.get_sigmas(steps, cfg_scale, 0.95, True, 0.1)[0]
        selected_sampler = sampler_select.get_sampler(sampler_name)[0]
        # NOTE(review): 25.0 is passed here while the MP4 is written at
        # `fps` — confirm the two are intentionally independent.
        conditioned = conditioning.append(video_output[0], video_output[1], 25.0)

        print("Generating video...")

        noise_seed = seed if seed != 0 else random.randint(0, 2**32)
        sampled = sampler.sample(
            model=model,
            add_noise=True,
            noise_seed=noise_seed,
            cfg=cfg_scale,
            positive=conditioned[0],
            negative=conditioned[1],
            sampler=selected_sampler,
            sigmas=sigmas,
            latent_image=video_output[2]
        )[0]

        # Sampling is finished; release the model before decoding.
        del model
        torch.cuda.empty_cache()
        gc.collect()
        print("Model removed from memory")

        with torch.no_grad():
            try:
                print("Decoding Latents...")
                decoded = vae_decode.decode(vae, sampled)[0].detach()
                print("Latents Decoded!")
                del vae
                torch.cuda.empty_cache()
                gc.collect()
                print("VAE removed from memory")
            except Exception as e:
                print(f"Error during decoding: {str(e)}")
                raise

        save_node.save_images(decoded, filename_prefix="video_frame")

        output_path = "/content/output.mp4"
        # Clip before the uint8 cast so out-of-range values saturate instead
        # of wrapping around.
        frames_np = np.clip(decoded.cpu().numpy() * 255, 0, 255).astype(np.uint8)
        with imageio.get_writer(output_path, fps=fps) as writer:
            for frame in frames_np:
                writer.append_data(frame)

        print("\nVideo generation complete!")
        print(f"Saved {len(decoded)} frames to ComfyUI output directory")
        print(f"Video saved to: {output_path}")
        display_video(output_path)
        return output_path

    except Exception as e:
        print(f"Error during video generation: {str(e)}")
        raise
    finally:
        clear_gpu_memory()


def display_video(video_path):
    """Show a video file in the notebook output as an inline HTML5 player.

    The file is read completely and embedded in the page as a base64
    ``data:`` URL.

    Args:
        video_path: Path of the MP4 file to show.
    """
    from base64 import b64encode
    from IPython.display import HTML, display

    # Use a context manager so the file handle is closed after reading.
    with open(video_path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

    display(HTML(f"""
    <video width=512 controls autoplay loop>
        <source src="{data_url}" type="video/mp4">
    </video>
    """))

In [None]:
# @title Run Image to Video
# Colab form fields; each value is forwarded unchanged to generate_video() below.
positive_prompt = "The woman walks forward towards the camera and smiles." # @param {"type":"string"}
negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly" # @param {"type":"string"}
# generate_video() requires width and height to be divisible by 32.
width = 768 # @param {"type":"number"}
height = 512 # @param {"type":"number"}
# A seed of 0 makes generate_video() draw a random seed.
seed = 1000 # @param {"type":"integer"}
steps = 20 # @param {"type":"integer", "min":1, "max":100}
cfg_scale = 2.5 # @param {"type":"number", "min":1, "max":20}
sampler_name = "euler" # @param ["euler", "dpmpp_2m", "ddim", "lms"]
# Per the notes above this cell: set this to n+1 to get an n-frame video.
frames = 73 # @param {"type":"integer", "min":1, "max":120}

# @title Run Video Generation
print("Starting video generation workflow...")
# Run the whole pipeline with gradient tracking disabled.
with torch.inference_mode():
    generate_video(
        image_path=None,  # This will trigger upload
        positive_prompt=positive_prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        seed=seed,
        steps=steps,
        cfg_scale=cfg_scale,
        sampler_name=sampler_name,
        length=frames
    )
# generate_video() already calls clear_gpu_memory() in its finally block;
# this is an extra pass after the inference_mode context has exited.
clear_gpu_memory()

********************************************************************************************************************************************************************************************************************************************************************************************************************************

********************************************************************************************************************************************************************************************************************************************************************************************************************************

# **LTX-VIDEO (Image to Video based on Lightricks LTX-VIDEO Github Repository)**
LTX-Video Github Repository: https://github.com/Lightricks/LTX-Video

- You need compute units to run this section.
- Use detailed prompts to improve the generated video.
- If you want to generate a video with n frames, set NUM_FRAMES to n+1. For example, to generate a video with 120 frames, set NUM_FRAMES to 121.
- Videos are generated at 24fps.


In [None]:
# @title Prepare Environment
# Install dependencies
!git clone https://github.com/Isi-dev/LTX-Video.git
%cd LTX-Video

# Install required packages
!pip install -e ".[inference-script]"

!pip install "huggingface_hub[cli]"
!apt-get install -y aria2
import os
from huggingface_hub import list_repo_files

# Every file listed in this Hugging Face repo is downloaded, keeping the
# repo's relative paths under MODEL_DIR/.
repo_id = "Isi99999/LTX-Video"
all_files = list_repo_files(repo_id)
base_url = f"https://huggingface.co/{repo_id}/resolve/main/"

# Build an aria2c input file: each entry is a URL line followed by an
# indented `out=` line giving the destination path.
with open("file_list.txt", "w") as f:
    for file_path in all_files:
        full_url = f"{base_url}{file_path}"
        save_path = f"MODEL_DIR/{file_path}"
        # Create the destination directory up front so nested repo paths
        # have somewhere to land.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        f.write(f"{full_url}\n out={save_path}\n")
# Download everything listed in file_list.txt with 16 connections per file.
!aria2c -x 16 -s 16 -i file_list.txt --continue=true --auto-file-renaming=false

# NOTE: printed unconditionally — the aria2c exit status is not checked.
print("✅ All models downloaded successfully!")

In [None]:
# @title Upload Image
from google.colab import files
from PIL import Image

uploaded = files.upload()
if not uploaded:
    # An empty upload would otherwise surface as an opaque IndexError.
    raise ValueError("No image was uploaded. Re-run this cell and choose an image file.")
# Only the first uploaded file is used; the "Generate Video" cell reads
# `image_path`.
image_path = next(iter(uploaded))
image = Image.open(image_path)
print("✅Image loaded successfully:", image.size)

In [None]:
# @title Generate Video
PROMPT ="A red fox moving gracefully, its russet coat vibrant against the white landscape, leaving perfect star-shaped prints behind as steam rises from its breath in the crisp winter air. The scene is wrapped in snow-muffled silence, broken only by the gentle murmur of water still flowing beneath the ice." # @param {type:"string"}
STEPS = 20 # @param {"type":"number"}
Instruction_1 = "choose from '720*1280', '1280*720', '480*832', '832*480', '480*704', '704*480'  for width & height, and your input image should be of the same resolution as your selected width & height." # @param {"type":"string"}
WIDTH = 704 # @param {"type":"number"}
HEIGHT = 480 # @param {"type":"number"}
Instruction_2 = "The NUM_FRAMES should not exceed 257." # @param {"type":"string"}
NUM_FRAMES = 121 # @param {"type":"number"}
SEED = 1000 # @param {"type":"number"}


total_vram = 0
import torch
if torch.cuda.is_available():
    gpu_id = torch.cuda.current_device()
    total_vram = torch.cuda.get_device_properties(gpu_id).total_memory / 1024**3
else:
    print("No GPU found.")
if total_vram < 18:
    print("It seems you are using the free T4 GPU which is offered with a RAM of 12.7GB. The text encoder will crash the RAM. Choose a higher runtime type.")
elif total_vram > 18 and total_vram < 30:
    print("Setting low_vram flag to avoid Out of Memory Errors. Inference will be a bit slow.")
    !python inference.py --ckpt_path "MODEL_DIR/" --output_path "outputVidFromImage" --low_vram --offload_to_cpu --conditioning_media_paths {image_path} --conditioning_start_frames 0 --text_encoder_model_name_or_path "MODEL_DIR/"  --prompt "{PROMPT}" --prompt_enhancement_words_threshold 0 --height {HEIGHT} --width {WIDTH} --num_frames {NUM_FRAMES} --seed {SEED} --num_inference_steps {STEPS}
else :
    !python inference.py --ckpt_path "MODEL_DIR/" --output_path "outputVidFromImage" --conditioning_media_paths {image_path} --conditioning_start_frames 0 --text_encoder_model_name_or_path "MODEL_DIR/"  --prompt "{PROMPT}" --prompt_enhancement_words_threshold 0 --height {HEIGHT} --width {WIDTH} --num_frames {NUM_FRAMES} --seed {SEED} --num_inference_steps {STEPS}

if total_vram > 18:
    import os
    import glob
    from IPython.display import display as displayVid, Video as outVid

    video_folder = "outputVidFromImage/"

    # Find the latest MP4 file
    video_files = glob.glob(os.path.join(video_folder, "*.mp4"))

    if video_files:
        latest_video = max(video_files, key=os.path.getctime)  # Get the most recent video
        print(f"Displaying video: {latest_video}")
        displayVid(outVid(latest_video, embed=True))
    else:
        print("❌ No video found in outputVid/")
