# Understanding the video wrapper (cv2.VideoCapture)

In [2]:
# Create a tiny synthetic video, then run a typical VideoCapture workflow on it.
# This shows how cv2.VideoCapture + CAP_PROP_FRAME_COUNT behaves in practice.

import cv2
import numpy as np
import os, sys
from pathlib import Path

def create_dummy_video(out_path: str, w=320, h=240, fps=15, frames=45, codec='mp4v'):
    """Create a small synthetic video to test the VideoCapture workflow.

    Every frame is filled with a single colour derived from the frame index
    and has the text ``frame <i>`` drawn on it.

    Args:
        out_path: Destination path handed to ``cv2.VideoWriter``.
        w: Frame width (first element of the writer's frame size).
        h: Frame height (second element of the writer's frame size).
        fps: Frame rate handed to the writer.
        frames: Number of frames to write.
        codec: Characters unpacked into ``cv2.VideoWriter_fourcc``
            (e.g. ``'mp4v'`` or ``'XVID'``).

    Returns:
        ``out_path``, unchanged.

    Raises:
        RuntimeError: If the writer reports that it could not be opened.
    """
    fourcc = cv2.VideoWriter_fourcc(*codec)
    vw = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
    try:
        if not vw.isOpened():
            raise RuntimeError(f"Failed to open VideoWriter for {out_path} with codec {codec}")
        for i in range(frames):
            # Solid colour that shifts with the frame index, plus a frame label.
            img = np.zeros((h, w, 3), dtype=np.uint8)
            img[:] = ((i*5) % 255, (i*3) % 255, (i*7) % 255)
            cv2.putText(img, f"frame {i}", (20, h//2), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255,255,255), 2, cv2.LINE_AA)
            vw.write(img)
    finally:
        # Release on every path (open failure, write error, success) so the
        # writer handle is never left dangling.
        vw.release()
    return out_path

# Prefer MP4; if writing it fails in this environment, fall back to AVI.
base = "."  # write next to the notebook (current working directory)
mp4_path = os.path.join(base, "dummy_test_video.mp4")
avi_path = os.path.join(base, "dummy_test_video.avi")

video_path = None
try:
    video_path = create_dummy_video(mp4_path, codec='mp4v')
    print("✅ Created MP4 video")
except Exception as mp4_error:
    # First attempt failed: report why, then try the second container/codec.
    print(f"⚠️  MP4 failed ({mp4_error}), trying AVI...")
    try:
        video_path = create_dummy_video(avi_path, codec='XVID')
        print("✅ Created AVI video")
    except Exception as avi_error:
        # Nothing left to try; surface the second failure to the caller.
        print(f"❌ Both MP4 and AVI failed: {avi_error}")
        raise

print("Created video at:", video_path)

# Open the generated clip, print its reported properties, and seek to a few frames.
print("🎬 Opening video file:", video_path)
cap = cv2.VideoCapture(video_path)
try:
    if not cap.isOpened():
        print("❌ Could not open the video file.")
    else:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print("✅ Video opened")
        print("   - Resolution:", f"{width}x{height}")
        print("   - FPS:", fps)
        print("   - Total frames reported:", frame_count)
        # Peek at the first, middle and last reported frame to show seeking works.
        # dict.fromkeys() drops repeated indices while keeping order, so a clip
        # reporting 0 or 1 frames is probed once instead of three times.
        grabbed_frames = 0
        sample_idxs = list(dict.fromkeys([0, max(0, frame_count//2), max(0, frame_count-1)]))
        for idx in sample_idxs:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            print(f"   - Seek to frame {idx}: {'OK' if ret else 'FAIL'}")
            if ret:
                grabbed_frames += 1
        print("Done. Frames successfully grabbed:", grabbed_frames)
finally:
    # Release the capture on every path, including open failure and exceptions.
    cap.release()


✅ Created MP4 video
Created video at: .\dummy_test_video.mp4
🎬 Opening video file: .\dummy_test_video.mp4
✅ Video opened
   - Resolution: 320x240
   - FPS: 15.0
   - Total frames reported: 45
   - Seek to frame 0: OK
   - Seek to frame 22: OK
   - Seek to frame 44: OK
Done. Frames successfully grabbed: 3


In [3]:
# Re-open the clip written above and confirm that sequential reads work.
print("🔍 Testing video reading...")
cap = cv2.VideoCapture(video_path)

if cap.isOpened():
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    print(f"✅ Video properties:")
    print(f"   - Frames: {frame_count}")
    print(f"   - FPS: {fps}")
    print(f"   - Resolution: {width}x{height}")

    # Rewind, then read at most five frames, stopping at the first failed read.
    frames_read = 0
    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    for i in range(min(5, frame_count)):
        ret, frame = cap.read()
        if not ret:
            break
        frames_read += 1

    print(f"   - Successfully read {frames_read} frames")
else:
    print("❌ Failed to open video for testing")

# Released whether or not the capture opened.
cap.release()
print("🏁 Video capture released")


🔍 Testing video reading...
✅ Video properties:
   - Frames: 45
   - FPS: 15.0
   - Resolution: 320x240
   - Successfully read 5 frames
🏁 Video capture released


# Clever sampling

In [4]:
# Print whether torch reports CUDA as available, and the CUDA version string torch exposes.
import torch
print(torch.cuda.is_available(), torch.version.cuda)


True 12.1


# Check samples from LLaVA-Video-178K

In [1]:
# Query the datasets-server "splits" endpoint to list the config/split pairs of lmms-lab/LLaVA-Video-178K.
! curl -X GET "https://datasets-server.huggingface.co/splits?dataset=lmms-lab%2FLLaVA-Video-178K"


{"splits":[{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_academic_v0_1","split":"caption"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_academic_v0_1","split":"open_ended"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_academic_v0_1","split":"multi_choice"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_activitynet","split":"open_ended"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_nextqa","split":"open_ended"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_nextqa","split":"multi_choice"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_perceptiontest","split":"multi_choice"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_youtube_v0_1","split":"caption"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_youtube_v0_1","split":"open_ended"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"0_30_s_youtube_v0_1","split":"multi_choice"},{"dataset":"lmms-lab/LLaVA-Video-178K","config":"1_2_m_academic_v0_1","spli

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3577  100  3577    0     0   4707      0 --:--:-- --:--:-- --:--:--  4712


In [5]:
# Query the Hub API for the parquet file URLs of config 0_30_s_academic_v0_1, split "caption".
! curl -X GET \
     "https://huggingface.co/api/datasets/lmms-lab/LLaVA-Video-178K/parquet/0_30_s_academic_v0_1/caption"

["https://huggingface.co/api/datasets/lmms-lab/LLaVA-Video-178K/parquet/0_30_s_academic_v0_1/caption/0.parquet"]


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   112  100   112    0     0    354      0 --:--:-- --:--:-- --:--:--   357


In [6]:
# Query the datasets-server "rows" endpoint for config 0_30_s_academic_v0_1, split "caption",
# requesting length=100 rows starting at offset=0.
! curl -X GET \
     "https://datasets-server.huggingface.co/rows?dataset=lmms-lab%2FLLaVA-Video-178K&config=0_30_s_academic_v0_1&split=caption&offset=0&length=100"



  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  152k    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100  152k  100  152k    0     0   139k      0  0:00:01  0:00:01 --:--:--  139k


In [7]:
# pip install huggingface_hub datasets fsspec requests tqdm
import os, io, tarfile, json
from tqdm import tqdm
from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Hugging Face dataset repo, config and split that the rows are pulled from.
DATASET_ID = "lmms-lab/LLaVA-Video-178K"
CONFIG = "0_30_s_academic_v0_1"
SPLIT = "caption"
# Values of the row "id" field to select from the split.
TARGET_IDS = {"028CE", "1KJI0", "4CSXJ"}

# Local output layout: OUT_DIR holds the metadata JSON, VID_DIR the extracted mp4 files.
OUT_DIR = "chronicon_samples"
VID_DIR = os.path.join(OUT_DIR, "videos")
# VID_DIR is nested inside OUT_DIR, so this creates both directories.
os.makedirs(VID_DIR, exist_ok=True)

def extract_caption_from_conversations(conv):
    """Return ``conv[1]["value"]``, or ``None`` if that lookup fails for any reason.

    Per the original author's note, in this split ``conversations[0]`` is the
    prompt and ``conversations[1]`` holds the caption-like text.
    """
    try:
        second_turn = conv[1]
        caption = second_turn["value"]
    except Exception:
        # Missing turn, missing key, None input, ...: treat all as "no caption".
        caption = None
    return caption

# 1) Pull just the rows whose id is in TARGET_IDS and save their metadata.
rows = []
seen_ids = set()  # ids already collected, so a repeated id is never appended twice
ds = load_dataset(DATASET_ID, CONFIG, split=SPLIT, streaming=True)
for ex in ds:
    vid = ex.get("id")
    if vid in TARGET_IDS and vid not in seen_ids:
        seen_ids.add(vid)
        rows.append({
            "id": vid,
            "video_path": ex.get("video"),              # relative path inside archives
            "caption": extract_caption_from_conversations(ex.get("conversations")),
            "data_source": ex.get("data_source"),
        })
    # Stop once every target id has been seen. Counting distinct ids (not
    # appended rows) keeps a duplicated id from ending the scan early.
    if len(seen_ids) == len(TARGET_IDS):
        break

# Fail loudly if the split was exhausted before every target id turned up.
missing = TARGET_IDS - seen_ids
if missing:
    raise RuntimeError(f"Did not find all target ids: {missing}")

# Persist the selected rows as pretty-printed UTF-8 JSON.
os.makedirs(OUT_DIR, exist_ok=True)
with open(os.path.join(OUT_DIR, "chronicon_metadata.json"), "w", encoding="utf-8") as f:
    json.dump(rows, f, ensure_ascii=False, indent=2)

# 2) Locate each mp4 inside the video tar archives and extract it.
#    Archives are numbered 1..8; per the original note, _videos_8.tar.gz is the
#    smallest, so it is tried first for speed.
archive_order = [8, *range(1, 8)]
# Maps each selected row id to its video path (relative path inside the archives).
need = dict((r["id"], r["video_path"]) for r in rows)

def extract_from_tar(repo_id, filename, member_path, out_path):
    """Copy one member of a gzip-compressed tar from a dataset repo to ``out_path``.

    The archive is obtained with ``hf_hub_download`` (``repo_type="dataset"``,
    ``force_download=False``) and the returned path is opened with ``tarfile``
    in ``r:gz`` mode. Only the requested member is written out; the archive
    itself is still fetched as a whole file first.

    Args:
        repo_id: Repository id passed to ``hf_hub_download``.
        filename: Path of the tar archive inside the repository.
        member_path: Name of the member to extract from the archive.
        out_path: Local file path the member's bytes are written to.

    Returns:
        ``True`` if the member was written to ``out_path``; ``False`` if the
        archive has no member named ``member_path`` or the member is not a
        file that can be read (e.g. a directory entry).
    """
    import shutil  # local import: only this helper needs it

    tar_fp = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=filename, force_download=False)
    with tarfile.open(tar_fp, "r:gz") as tf:
        try:
            member = tf.getmember(member_path)
        except KeyError:
            return False
        src = tf.extractfile(member)
        if src is None:
            # extractfile() returns None for members that are neither regular
            # files nor links; report "not extracted" instead of crashing.
            return False
        with src, open(out_path, "wb") as dst:
            # Chunked copy so a large member is never held fully in memory.
            shutil.copyfileobj(src, dst)
        return True

# Walk the archives in the preferred order and copy out every still-missing clip.
repo = DATASET_ID
found = set()

for n in archive_order:
    if len(found) == len(need):
        break  # every requested clip has already been written
    tar_name = f"{CONFIG}/{CONFIG}_videos_{n}.tar.gz"
    tar_fp = hf_hub_download(repo_id=repo, repo_type="dataset", filename=tar_name, force_download=False)
    # Open each archive once and try all outstanding members against it.
    with tarfile.open(tar_fp, "r:gz") as tf:
        for vid, rel_path in need.items():
            if vid in found:
                continue
            try:
                member = tf.getmember(rel_path)
            except KeyError:
                continue  # not in this archive; a later one may have it
            out_mp4 = os.path.join(VID_DIR, f"{vid}.mp4")
            with tf.extractfile(member) as src, open(out_mp4, "wb") as dst:
                dst.write(src.read())
            found.add(vid)
            print(f"Extracted {vid} from {tar_name} -> {out_mp4}")

# Report whatever could not be located in any archive.
still = [k for k in need if k not in found]
if not still:
    print("All videos extracted to:", VID_DIR)
else:
    print("Could not locate these IDs in the 8 archives:", still)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Extracted 028CE from 0_30_s_academic_v0_1/0_30_s_academic_v0_1_videos_1.tar.gz -> chronicon_samples\videos\028CE.mp4
Extracted 1KJI0 from 0_30_s_academic_v0_1/0_30_s_academic_v0_1_videos_1.tar.gz -> chronicon_samples\videos\1KJI0.mp4


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not in

Extracted 4CSXJ from 0_30_s_academic_v0_1/0_30_s_academic_v0_1_videos_6.tar.gz -> chronicon_samples\videos\4CSXJ.mp4
All videos extracted to: chronicon_samples\videos


In [2]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel.
%pip install datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.1-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp310-cp310-win_amd64.whl.metadata (7.9 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!



In [8]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel. The extras spec is quoted so the shell
# cannot interpret the square brackets.
%pip install "huggingface_hub[hf_xet]"

Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.7-cp37-abi3-win_amd64.whl.metadata (703 bytes)
Downloading hf_xet-1.1.7-cp37-abi3-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   -------------- ------------------------- 1.0/2.8 MB 7.2 MB/s eta 0:00:01
   ------------------------------------- -- 2.6/2.8 MB 6.6 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 6.5 MB/s  0:00:00
Installing collected packages: hf-xet
Successfully installed hf-xet-1.1.7


