In [1]:
import cv2
import os
from tqdm import tqdm 
import json

def _draw_grid(img, patch_size):
    """Draw a 1-pixel white grid on ``img`` in place, one line every ``patch_size`` pixels.

    Lines at offset 0 and on the last pixel row/column are not drawn.
    """
    rows, cols = img.shape[0], img.shape[1]
    for y in range(patch_size, rows - 1, patch_size):
        cv2.line(img, (0, y), (cols, y), (255, 255, 255), 1)
    for x in range(patch_size, cols - 1, patch_size):
        cv2.line(img, (x, 0), (x, rows), (255, 255, 255), 1)


def imgs2video(image_paths, fps, output_path,  deco=None, verbose=False):
    """Encode a sequence of image files into an 'mp4v' video at ``output_path``.

    Args:
        image_paths: Sequence of image file paths, written in order as frames.
        fps: Frame rate of the output video (converted to float).
        output_path: Destination video path. If it already exists, nothing is
            written and the function returns False.
        deco: Optional dict of frame decorations. Recognised keys:
            "resize": (width, height) passed to ``cv2.resize``; when given it
                also defines the video frame size.
            "patch_size": int spacing in pixels of a white grid overlay.
        verbose: When True, print progress/skip messages in addition to the
            warnings and errors that are always printed.

    Returns:
        True when the writer was opened and the frame loop completed (frames
        that are missing, unreadable, or fail to process are skipped with a
        message); False when the output already exists, ``image_paths`` is
        empty, ``deco["resize"]`` is malformed, the first image cannot be
        read, or the writer cannot be opened.
    """
    def vprint(msg):
        if verbose:
            print(msg)

    if os.path.exists(output_path):
        vprint(f"Warning: Output path already exists, skipping: {output_path}")
        return False

    if not image_paths:
        print("Error: No image paths provided.")
        return False

    deco = deco or {}
    resize = deco.get("resize", None)
    patch_size = deco.get("patch_size", None)

    if resize is not None:
        try:
            # Normalise to an int tuple so lists (e.g. loaded from JSON) work too.
            resize = (int(resize[0]), int(resize[1]))
        except (TypeError, ValueError, IndexError) as e:
            print(f"Error: Invalid deco resize {resize!r}: {e}")
            return False

    # Read the first image to get dimensions
    try:
        frame = cv2.imread(image_paths[0])
        if frame is None:
            print(f"Error: Could not read the first image: {image_paths[0]}")
            return False
        height, width = frame.shape[:2]
    except Exception as e:
        print(f"Error reading first image: {e}")
        return False

    # An explicit resize defines the video frame size; otherwise every frame
    # would be resized to it and then straight back to the first image's size.
    if resize is not None:
        width, height = resize

    # Define the codec and create VideoWriter object
    # Using 'mp4v' for MP4 output. Other codecs like 'XVID' for AVI might also work.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, float(fps), (width, height))

    if not out.isOpened():
        print(f"Error: Could not open video writer for path: {output_path}")
        return False

    vprint(f"Starting video creation: {output_path} at {fps} FPS with frame size ({width}x{height}).")

    frames_written = 0
    try:
        for image_path in image_paths:
            if not os.path.exists(image_path):
                print(f"Warning: Image path not found, skipping: {image_path}")
                continue
            try:
                img = cv2.imread(image_path)
                # Check readability before touching the image.
                if img is None:
                    print(f"Warning: Could not read image, skipping: {image_path}")
                    continue
                # Bring the frame to the video size first so the grid spacing is
                # measured in output pixels.
                if img.shape[0] != height or img.shape[1] != width:
                    if resize is None:
                        # Only unexpected when no explicit resize was requested.
                        print(f"Warning: Image {image_path} has different dimensions ({img.shape[1]}x{img.shape[0]}). Resizing to ({width}x{height}).")
                    img = cv2.resize(img, (width, height))
                if patch_size is not None:
                    _draw_grid(img, patch_size)
                out.write(img)
                frames_written += 1
            except Exception as e:
                print(f"Error processing image {image_path}: {e}")
                continue
    finally:
        # Release the writer even if the loop is interrupted.
        out.release()

    if frames_written == 0:
        print(f"Warning: No frames were written to {output_path}")
    vprint(f"Video successfully saved to {output_path}")
    return True

In [2]:
# Frame decoration passed to imgs2video; "tag" names the output variant.
deco = {
    "patch_size": 50,
    "tag": "patch_50"
}

# Uncomment to produce undecorated videos instead.
# deco = {}

src_qa = "/mnt/bn/nlhei-nas/liubangya/proj/vlm/QA/pairs/QA_pairs_qwen.train.json"

# Derive output names from the source path by touching only the extension
# (str.replace would rewrite every ".json" occurring anywhere in the path).
src_stem, src_ext = os.path.splitext(src_qa)
# A non-empty deco must carry a "tag" (KeyError otherwise) so decorated videos
# never share a directory with undecorated ones.
deco_tag = deco["tag"] if deco else None
target_qa = f"{src_stem}.video.{deco_tag}{src_ext}" if deco else f"{src_stem}.video{src_ext}"
target_videos_dir = f"{src_stem}_videos_{deco_tag}" if deco else f"{src_stem}_videos"

os.makedirs(target_videos_dir, exist_ok=True)

with open(src_qa, 'r') as f:
    qa_pairs = json.load(f)

new_pair = []
failed_ids = []

# Each question is expected to contain five consecutive image placeholders,
# which are collapsed into a single video placeholder.
mounting = "\n<image>" * 5
video_tag = "\n<video>"
for qa in tqdm(qa_pairs):
    imgs = qa["image"]
    question = qa["conversations"][0]["value"]
    assert mounting in question, f"QA {qa['id']}: expected image placeholders not found in question"
    new_question = question.replace(mounting, video_tag)
    video_path = f"{target_videos_dir}/{qa['id']}.mp4"
    imgs2video(imgs, 1, video_path, deco)
    # imgs2video returns False both for "already exists" and for real failures,
    # so check the file itself before referencing it from the dataset.
    if not os.path.exists(video_path):
        failed_ids.append(qa["id"])
        continue
    new_qa = qa.copy()
    new_qa.pop("image")
    # qa.copy() is shallow: rebuild the conversation list and its first turn so
    # qa_pairs is left untouched and this cell can be re-run safely.
    conversations = list(qa["conversations"])
    conversations[0] = {**conversations[0], "value": new_question}
    new_qa["conversations"] = conversations
    new_qa["video"] = video_path
    new_pair.append(new_qa)

if failed_ids:
    print(f"Warning: {len(failed_ids)} QA pairs skipped because no video was produced, e.g. {failed_ids[:10]}")

with open(target_qa, 'w') as f:
    json.dump(new_pair, f, indent=2)


  0%|          | 0/8000 [00:00<?, ?it/s]

100%|██████████| 8000/8000 [25:08<00:00,  5.30it/s]
