In [2]:
import cv2
import torch
from torchmetrics.functional import structural_similarity_index_measure as ssim

@torch.no_grad()
def video_ssim_torch(path1, path2, batch_size=32, use_cuda=True):
    """Compute per-frame SSIM between two videos, scoring frames in batches.

    Frames are read from both videos in lockstep; reading stops as soon as
    either video runs out of frames. When the two frames differ in height or
    width, the frame from ``path2`` is resized to the size of the frame from
    ``path1``. Frames are converted BGR -> RGB, HWC -> CHW and scaled to 0..1
    before being passed to the torchmetrics functional SSIM.

    Args:
        path1: Path of the first (reference-size) video.
        path2: Path of the second video.
        batch_size: Number of frame pairs scored per SSIM call. Values below 1
            (or ``None``) are treated as 1 so memory use stays bounded.
        use_cuda: Use the CUDA device when ``torch.cuda.is_available()``.

    Returns:
        ``(mean_ssim, per_frame_scores)``. When no frame pair could be read,
        returns ``(nan, [])``.
    """
    device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")

    # A non-positive batch size would never trigger the in-loop flush and the
    # whole video would be buffered in memory; clamp it instead. Per-frame
    # scores do not depend on how the frames are batched.
    if batch_size is None or batch_size < 1:
        batch_size = 1

    def to_tensor(frame_bgr):
        # BGR -> RGB, HWC -> CHW, 0..1
        rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        return torch.from_numpy(rgb).permute(2, 0, 1).float() / 255.0

    def score_batch(frames_a, frames_b):
        # Stack into [B,3,H,W] tensors and score the batch in one call.
        x = torch.stack(frames_a).to(device)
        y = torch.stack(frames_b).to(device)
        batch_scores = ssim(x, y, data_range=1.0, reduction='none')
        # reshape(-1) guarantees a list even if a 0-dim tensor comes back.
        return batch_scores.detach().reshape(-1).cpu().tolist()

    cap1, cap2 = cv2.VideoCapture(path1), cv2.VideoCapture(path2)
    buf1, buf2, all_scores = [], [], []
    try:
        while True:
            r1, f1 = cap1.read()
            r2, f2 = cap2.read()
            if not (r1 and r2):
                break
            if f1.shape[:2] != f2.shape[:2]:
                f2 = cv2.resize(f2, (f1.shape[1], f1.shape[0]), interpolation=cv2.INTER_AREA)

            buf1.append(to_tensor(f1))
            buf2.append(to_tensor(f2))
            if len(buf1) >= batch_size:
                all_scores.extend(score_batch(buf1, buf2))
                buf1.clear()
                buf2.clear()

        # Flush the final, possibly partial, batch.
        if buf1:
            all_scores.extend(score_batch(buf1, buf2))
    finally:
        # Release the capture handles even when decoding or scoring raises.
        cap1.release()
        cap2.release()

    if len(all_scores) == 0:
        return float("nan"), []
    return float(torch.tensor(all_scores).mean()), all_scores

if __name__ == "__main__":
    # Demo run: compare the two sample clips and report the summary numbers.
    mean_ssim, curve = video_ssim_torch(
        "./left_iron_right_spider.mp4",
        "after.mp4",
        batch_size=16,
        use_cuda=True,
    )
    print("Mean SSIM:", mean_ssim)
    print("Frames:", len(curve))


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


Mean SSIM: 0.44842860102653503
Frames: 16




In [4]:
import cv2
import numpy as np
from typing import List, Tuple

# ---- Single-image SSIM (both inputs must be RGB arrays) ----
def ssim(img1: np.ndarray, img2: np.ndarray) -> float:
    """Return the SSIM between two RGB images.

    The primary path delegates to ``skimage.metrics.structural_similarity``
    with ``channel_axis=2``. If that import or call raises, a grayscale
    fallback built on OpenCV Gaussian blurs (11x11 window, sigma 1.5, constants
    for a 0..255 value range) is used instead.

    Args:
        img1: First image, RGB channel order, channels on the last axis.
        img2: Second image, same layout as ``img1``.

    Returns:
        The mean SSIM as a Python float.
    """
    try:
        from skimage.metrics import structural_similarity as ssim_fn
        s, _ = ssim_fn(img1, img2, channel_axis=2, full=True)
        return float(s)
    except Exception:
        # Best-effort fallback, kept deliberately broad so a missing or
        # incompatible skimage never prevents a score from being produced.
        def _to_gray(x):
            # Promote to float64 BEFORE any arithmetic: with uint8 frames the
            # products and differences below would wrap around modulo 256 and
            # yield a meaningless SSIM map.
            return cv2.cvtColor(x, cv2.COLOR_RGB2GRAY).astype(np.float64)

        y1 = _to_gray(img1)
        y2 = _to_gray(img2)
        C1 = (0.01 * 255) ** 2
        C2 = (0.03 * 255) ** 2
        mu1 = cv2.GaussianBlur(y1, (11, 11), 1.5)
        mu2 = cv2.GaussianBlur(y2, (11, 11), 1.5)
        mu1_sq = mu1 * mu1
        mu2_sq = mu2 * mu2
        mu1_mu2 = mu1 * mu2
        # Local (co)variances: E[xy] - E[x]E[y] under the Gaussian window.
        sigma1_sq = cv2.GaussianBlur(y1 * y1, (11, 11), 1.5) - mu1_sq
        sigma2_sq = cv2.GaussianBlur(y2 * y2, (11, 11), 1.5) - mu2_sq
        sigma12 = cv2.GaussianBlur(y1 * y2, (11, 11), 1.5) - mu1_mu2
        ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
        return float(ssim_map.mean())

# ---- Video version: score frame by frame, then average ----
def video_ssim(
    path_ref: str,
    path_cmp: str,
    mode: str = "rgb",          # "rgb" or "y" (luma channel only)
    frame_stride: int = 1,      # score every N-th frame (>1 speeds things up)
    max_frames: int = None,     # cap on the number of scored frames (None = whole video)
    resize_to_ref: bool = True  # resize the second video's frames to the first's size
) -> Tuple[float, List[float]]:
    """Compute per-frame SSIM between two videos and their mean.

    Frames are read from both videos in lockstep and reading stops when either
    video ends, so only the overlapping part is compared.

    Args:
        path_ref: Path of the reference video.
        path_cmp: Path of the video to compare against the reference.
        mode: ``"rgb"`` converts frames BGR -> RGB and scores them with the
            module-level ``ssim()``; ``"y"`` scores only the first channel of
            the YCrCb conversion with skimage's grayscale SSIM
            (``data_range=255``).
        frame_stride: When greater than 1, only frames whose index is a
            multiple of ``frame_stride`` are scored.
        max_frames: Stop once this many frames have been scored; ``None``
            scores every sampled frame.
        resize_to_ref: When frame sizes differ, resize the compared frame to
            the reference frame's size.

    Returns:
        ``(mean_ssim, per_frame_scores)``; the mean is ``nan`` when no frame
        was scored.

    Raises:
        AssertionError: If either video cannot be opened.
        ValueError: If ``mode`` is neither ``"rgb"`` nor ``"y"`` (raised when
            the first frame is scored).
    """
    cap1, cap2 = cv2.VideoCapture(path_ref), cv2.VideoCapture(path_cmp)
    scores: List[float] = []

    try:
        assert cap1.isOpened() and cap2.isOpened(), "無法開啟影片"

        # The grayscale skimage function is only needed for the luma mode.
        # Importing it lazily keeps "rgb" mode usable (through ssim()'s own
        # fallback) on machines without skimage.
        ssim_gray_fn = None
        if mode == "y":
            from skimage.metrics import structural_similarity as ssim_gray_fn

        idx = 0
        while True:
            r1, f1_bgr = cap1.read()
            r2, f2_bgr = cap2.read()
            if not (r1 and r2):
                break  # stop when either video ends (compare the aligned part)

            # Frame sampling
            if frame_stride > 1 and (idx % frame_stride != 0):
                idx += 1
                continue

            # Size alignment
            if resize_to_ref and (f1_bgr.shape[:2] != f2_bgr.shape[:2]):
                f2_bgr = cv2.resize(f2_bgr, (f1_bgr.shape[1], f1_bgr.shape[0]), interpolation=cv2.INTER_AREA)

            if mode == "rgb":
                # BGR -> RGB, then hand over to the existing ssim()
                f1 = cv2.cvtColor(f1_bgr, cv2.COLOR_BGR2RGB)
                f2 = cv2.cvtColor(f2_bgr, cv2.COLOR_BGR2RGB)
                score = ssim(f1, f2)
            elif mode == "y":
                # Luma only: first channel of YCrCb, scored as a 2-D grayscale
                # image (no channel_axis needed).
                y1 = cv2.cvtColor(f1_bgr, cv2.COLOR_BGR2YCrCb)[..., 0]
                y2 = cv2.cvtColor(f2_bgr, cv2.COLOR_BGR2YCrCb)[..., 0]
                score = float(ssim_gray_fn(y1, y2, data_range=255, full=False))
            else:
                raise ValueError("mode 只能是 'rgb' 或 'y'")

            scores.append(score)
            idx += 1

            # Honour the frame cap; without this break max_frames had no effect.
            if (max_frames is not None) and (len(scores) >= max_frames):
                break
    finally:
        # Always release the capture handles, including on errors above.
        cap1.release()
        cap2.release()

    mean_ssim = float(np.mean(scores)) if scores else float("nan")
    return mean_ssim, scores

if __name__ == "__main__":
    # Demo run: score the same pair of clips in both colour modes.
    ref_path, cmp_path = "./left_iron_right_spider.mp4", "after.mp4"

    mean_rgb, seq_rgb = video_ssim(ref_path, cmp_path, mode="rgb", frame_stride=1)
    print("Mean SSIM (RGB):", mean_rgb, "Frames:", len(seq_rgb))

    mean_y, seq_y = video_ssim(ref_path, cmp_path, mode="y", frame_stride=1)
    print("Mean SSIM (Y):  ", mean_y, "Frames:", len(seq_y))


Mean SSIM (RGB): 0.43689283121385314 Frames: 16
Mean SSIM (Y):   0.4990113087644435 Frames: 16


In [8]:
import cv2
import torch
import lpips
from PIL import Image
import torchvision.transforms as transforms
import numpy as np

# LPIPS perceptual-distance model using the 'alex' trunk
loss_fn = lpips.LPIPS(net='alex')

# Preprocessing applied to every frame before it reaches the model:
# resize to 256x256, convert to a tensor, then normalize each channel
# with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

def preprocess_frame(frame):
    """Turn a BGR OpenCV frame into a batched tensor for the LPIPS model.

    The frame is converted to RGB, wrapped in a PIL image, run through the
    module-level ``transform`` pipeline, and given a leading batch dimension.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    tensor = transform(Image.fromarray(rgb))
    return tensor.unsqueeze(0)

def read_video_frames(video_path):
    """Decode every frame of ``video_path`` and return them as a list.

    Reading stops at the first unsuccessful ``read()``; the capture handle is
    released before returning.
    """
    capture = cv2.VideoCapture(video_path)
    collected = []

    ok, image = capture.read()
    while ok:
        collected.append(image)
        ok, image = capture.read()

    capture.release()
    return collected

# === Main comparison ===
# Computes the LPIPS distance for each aligned frame pair of the two videos
# and reports the average over all compared frames.

video1 = "./left_iron_right_spider.mp4"
video2 = "./after.mp4"

frames1 = read_video_frames(video1)
frames2 = read_video_frames(video2)

# Compare only the frames both videos have (use the shorter length).
n_frames = min(len(frames1), len(frames2))

lpips_scores = []

# Pure inference: disable autograd so no computation graph is built or kept
# alive for each frame pair. The resulting scores are unchanged.
with torch.no_grad():
    for i in range(n_frames):
        f1 = preprocess_frame(frames1[i])
        f2 = preprocess_frame(frames2[i])

        score = loss_fn(f1, f2).item()
        lpips_scores.append(score)

        print(f"Frame {i}: LPIPS = {score}")

# np.mean([]) warns and yields nan; return nan explicitly when nothing was read.
avg_lpips = np.mean(lpips_scores) if lpips_scores else float("nan")

print("\n=================================")
print("Average LPIPS score:", avg_lpips)
print("=================================")


Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /root/miniconda/envs/videograin/lib/python3.10/site-packages/lpips/weights/v0.1/alex.pth
Frame 0: LPIPS = 0.51244056224823
Frame 1: LPIPS = 0.4972872734069824
Frame 2: LPIPS = 0.4989817440509796
Frame 3: LPIPS = 0.5041841864585876
Frame 4: LPIPS = 0.4837638735771179
Frame 5: LPIPS = 0.4780412018299103
Frame 6: LPIPS = 0.4992072582244873
Frame 7: LPIPS = 0.48317766189575195
Frame 8: LPIPS = 0.4920675456523895
Frame 9: LPIPS = 0.503842294216156
Frame 10: LPIPS = 0.49581634998321533
Frame 11: LPIPS = 0.49921563267707825
Frame 12: LPIPS = 0.4995573163032532
Frame 13: LPIPS = 0.5179522037506104
Frame 14: LPIPS = 0.5016171932220459
Frame 15: LPIPS = 0.4907822012901306

Average LPIPS score: 0.4973709061741829
