In [1]:
import torch
from diffusers import TextToVideoSDPipeline
import numpy as np
from torchvision.transforms import Resize, ToTensor, Compose
from diffusers.utils import export_to_gif, load_image, export_to_video
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from LightCache.LightCache import LightCacher
from fme import FMEWrapper
from time import time
from PIL import Image, ImageSequence
import lpips
from typing import List
import cv2
from skimage.metrics import structural_similarity as ssim

In [2]:
def preprocess_frames(frames, size=(224, 224)):
    """Turn a sequence of frames into tensors scaled to [-1, 1].

    Each frame is resized to ``size``, converted with ``ToTensor`` (which
    yields a (C, H, W) tensor in [0, 1]) and then linearly rescaled to
    [-1, 1].

    Args:
        frames: Iterable of images accepted by torchvision's ``Resize`` and
            ``ToTensor`` (presumably PIL images; confirm against callers).
        size: Target size forwarded to ``Resize``.

    Returns:
        A list holding one transformed tensor per input frame, in order.
    """
    def _to_signed_range(tensor):
        # Map [0, 1] onto [-1, 1].
        return tensor * 2 - 1

    pipeline = Compose([Resize(size), ToTensor(), _to_signed_range])

    tensors = []
    for frame in frames:
        tensors.append(pipeline(frame))
    return tensors

def compute_lpips(frames1, frames2):
    """Mean LPIPS (AlexNet backbone) score between two frame sequences.

    Frames are paired by position after being preprocessed to 224x224
    tensors in [-1, 1]. When the sequences differ in length only the
    overlapping prefix is compared (``zip`` semantics, the same pairing rule
    ``compute_ssim`` uses). Previously a shorter ``frames2`` raised
    ``IndexError`` while a shorter ``frames1`` was silently truncated.

    Args:
        frames1: Sequence of frames accepted by ``preprocess_frames``.
        frames2: Sequence of frames accepted by ``preprocess_frames``.

    Returns:
        ``np.mean`` of the per-pair LPIPS scores. With no pairs this is NaN
        (NumPy emits a RuntimeWarning for the mean of an empty list).
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE: the LPIPS model is rebuilt on every call; callers comparing many
    # videos may want to cache it outside this function.
    lpips_model = lpips.LPIPS(net='alex').to(device)

    frames1_tensor = preprocess_frames(frames1, size=(224, 224))
    frames2_tensor = preprocess_frames(frames2, size=(224, 224))

    scores = []
    # Inference only: no autograd bookkeeping needed for any pair.
    with torch.no_grad():
        for t1, t2 in zip(frames1_tensor, frames2_tensor):
            # Add a batch dimension and move to the model's device.
            f1 = t1.unsqueeze(0).to(device)
            f2 = t2.unsqueeze(0).to(device)
            scores.append(lpips_model(f1, f2).item())
    return np.mean(scores)

def gif_to_frames(gif_path):
    """Read every frame of a GIF file as an RGB PIL image.

    Args:
        gif_path: Path (or file object) passed to ``Image.open``.

    Returns:
        A list of RGB image copies, one per frame, in file order.
    """
    rgb_frames = []
    with Image.open(gif_path) as gif:
        for raw_frame in ImageSequence.Iterator(gif):
            # Store an RGB copy of the current frame.
            rgb_frames.append(raw_frame.convert("RGB").copy())
    return rgb_frames

def video_to_frames(video_path: str) -> List[Image.Image]:
    """Decode a video file into a list of RGB PIL images.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        One PIL image per successfully read frame, in playback order.

    Raises:
        ValueError: If OpenCV cannot open ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    # try/finally guarantees the capture handle is released even when the
    # open check fails or a frame conversion raises midway.
    try:
        if not cap.isOpened():
            raise ValueError(f"Failed to open video file: {video_path}")

        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames (or a read failure): stop decoding.
                break
            # OpenCV delivers BGR; PIL expects RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame_rgb))
        return frames
    finally:
        cap.release()

def calculate_psnr(img1, img2, pixel_max=255.0):
    """Peak signal-to-noise ratio (in dB) between two images.

    Inputs are converted to float64 before subtraction. Without this,
    ``uint8`` arrays (the usual image dtype) wrap around modulo 256 in both
    the difference and the square, producing a wrong MSE.

    Args:
        img1: Array-like image (NumPy array, PIL image, nested list, ...).
        img2: Array-like image broadcast-compatible with ``img1``.
        pixel_max: Maximum possible pixel value; defaults to 255.0 for
            8-bit images.

    Returns:
        ``float('inf')`` when the images are identical, otherwise
        ``20 * log10(pixel_max / sqrt(mse))``.
    """
    a = np.asarray(img1, dtype=np.float64)
    b = np.asarray(img2, dtype=np.float64)
    mse = np.mean((a - b) ** 2)
    if mse == 0:
        # Identical images: PSNR is unbounded.
        return float('inf')
    return 20 * np.log10(pixel_max / np.sqrt(mse))

def compute_ssim(frames1, frames2):
    """Mean SSIM between two frame sequences, paired by position.

    Every frame is resized to 256x256 and cast to float32. For 3-D arrays
    the score is the average of the SSIM of the first three channels;
    otherwise SSIM is computed directly on the 2-D array. Pairing uses
    ``zip``, so extra frames in the longer sequence are ignored.

    Args:
        frames1: Iterable of images exposing ``.resize`` (e.g. PIL images).
        frames2: Iterable of images exposing ``.resize`` (e.g. PIL images).

    Returns:
        ``np.mean`` of the per-pair SSIM values.
    """
    def _as_float_array(image):
        # Common resolution and dtype for both inputs.
        return np.array(image.resize((256, 256))).astype(np.float32)

    per_frame = []
    for frame_a, frame_b in zip(frames1, frames2):
        arr_a = _as_float_array(frame_a)
        arr_b = _as_float_array(frame_b)

        if arr_a.ndim != 3:
            # Single-channel image: one SSIM call suffices.
            score = ssim(arr_a, arr_b, data_range=255)
        else:
            # Colour image: average SSIM over the three channels.
            score = sum(
                ssim(arr_a[:, :, ch], arr_b[:, :, ch], data_range=255)
                for ch in range(3)
            ) / 3

        per_frame.append(score)
    return np.mean(per_frame)

In [3]:
# Runtime configuration shared by the cells below.
device = "cuda"
dtype = torch.float16
# Load the text-to-video pipeline and move it to the GPU. `dtype` is passed
# through (instead of repeating the literal) so precision is set in one place.
pipe = TextToVideoSDPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=dtype
).to(device)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
# Fixed seed for the generator that is later passed to the pipeline call.
seed = 42
generator = torch.Generator(device=device)
generator.manual_seed(seed)
# Start peak-memory accounting from a clean slate on this device.
torch.cuda.reset_peak_memory_stats(device)

In [None]:
# Attach LightCache to the pipeline.
# NOTE(review): the meaning of cache_interval / cache_branch_id and of the
# Swap / Slice / Chunk flags is defined by the LightCache package — confirm
# there before changing them.
cacher = LightCacher(pipe)
cacher.set_params(cache_interval=2, cache_branch_id=0)
cacher.enable(Swap=False, Slice=False, Chunk=True)

# Alternative wrapper, kept for comparison runs:
# cacher = FMEWrapper(num_temporal_chunk=9, num_spatial_chunk=2, num_frames=25)
# cacher.wrap(pipe)

# Re-seed inside this cell so that re-running it reproduces the same video;
# a generator created in an earlier cell would already have been advanced by
# the previous run.
generator = torch.Generator(device=device).manual_seed(seed)
# Reset the counters so the peak-memory report covers only this generation.
torch.cuda.reset_peak_memory_stats(device)

# Wall-clock time of one full generation, printed in seconds.
start_time = time()
video_frames = pipe("A girl smiling", num_inference_steps=25, num_frames=25,
                   generator=generator).frames
print(time() - start_time)
# export_to_video(video_frames[0], "./generated_videos/T2V_baseline.mp4", fps=8)

In [6]:
# Peak GPU memory since the last reset_peak_memory_stats call, reported in
# MB (bytes / 1024**2).
_BYTES_PER_MB = 1024 ** 2
peak_mem_reserved = torch.cuda.max_memory_reserved(device) / _BYTES_PER_MB
peak_mem_alloc = torch.cuda.max_memory_allocated(device) / _BYTES_PER_MB

print(f"Peak memory allocated: {peak_mem_alloc:.2f} MB")
print(f"Peak memory reserved: {peak_mem_reserved:.2f} MB")

Peak memory allocated: 8162.40 MB
Peak memory reserved: 11908.00 MB
