In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import torch
from PIL import Image

In [None]:
# Root directory of the OmniFall data on disk. Set the OMNIFALL_ROOT environment
# variable to run the notebook on another machine without editing this cell; the
# fallback is the original cluster location.
OMNIFALL_ROOT = os.environ.get(
    "OMNIFALL_ROOT", "/lsdf/data/activity/fall_detection/cvhci_fall"
)

In [None]:
from infreqact.data.video_dataset import OmnifallVideoDataset

# Keyword arguments forwarded unchanged to OmnifallVideoDataset for the OOPS subset.
dataset_config = {
    "video_root": f"{OMNIFALL_ROOT}/OOPS/video",
    "annotations_file": "hf://simplexsigil2/omnifall/labels/OOPS.csv",
    "split_root": "hf://simplexsigil2/omnifall/splits",
    "dataset_name": "OOPS",
    "mode": "test",  # Start with test set (smaller)
    "split": "cs",  # Cross-subject split
    "target_fps": 8.0,  # Low FPS for quick testing
    "vid_frame_count": 16,
    "data_fps": 30.0,  # OOPS videos are 30 FPS
    "ext": ".mp4",
    "fast": True,
}

print("\nDataset Configuration:")
for key, value in dataset_config.items():
    print(f"  {key}: {value}")
print("=" * 80)

# Create dataset
print("\nCreating OmnifallVideoDataset...")
try:
    dataset = OmnifallVideoDataset(**dataset_config)
except Exception as e:
    print(f"✗ Failed to create dataset: {e}")
    # Re-raise so the cell stops here with the real traceback. Swallowing the
    # error would leave `dataset` undefined and make the following cells fail
    # with an unrelated NameError.
    raise
else:
    print("✓ Dataset created successfully!")
    print(f"\n{dataset}")
    print("=" * 80)

In [None]:
# Pull the first clip and turn each of its frames into a PIL image.
sample = dataset[0]
frames = list(map(Image.fromarray, sample["video"]))

# Display the first frame as a quick visual sanity check.
frames[0]

In [None]:
from transformers import AutoModelForImageTextToText, AutoProcessor

# Checkpoint shared by the processor and the model.
model_checkpoint = "Qwen/Qwen3-VL-2B-Thinking"

# Pixel budget handed to the processor, written as multiples of 32 * 32 pixels.
patch_area = 32 * 32
processor_size = {
    "shortest_edge": 4 * patch_area,
    "longest_edge": 256 * patch_area * 2,
}

# do_sample_frames=False: the frames passed in later are used as given,
# without the processor sub-sampling them again.
processor = AutoProcessor.from_pretrained(
    model_checkpoint, do_sample_frames=False, size=processor_size
)
model = AutoModelForImageTextToText.from_pretrained(model_checkpoint)

In [None]:
# Pixel budget for the image processor: Qwen3-VL uses 32× spatial compression, so the
# number of visual tokens for a single image can be limited to 256-1280 with:
# processor.image_processor.size = {"longest_edge": 1280*32*32, "shortest_edge": 256*32*32}

# Pixel budget for the video processor: with 32× spatial and 2× temporal compression,
# the number of visual tokens for a single video can be limited to 256-16384 with:
# processor.video_processor.size = {"longest_edge": 16384*32*32*2, "shortest_edge": 256*32*32*2}


In [None]:
from transformers.video_utils import VideoMetadata

# Describe the already-sampled clip to the processor. The frame rate is read from
# dataset_config so it always matches the rate the dataset sampled the frames at,
# instead of repeating the literal 8.0 here.
num_frames = len(frames)
metadata = VideoMetadata(
    total_num_frames=num_frames,
    fps=float(dataset_config["target_fps"]),
    # Every frame in `frames` is used, in order.
    frames_indices=list(range(num_frames)),
)

In [None]:
# Single-turn chat: the pre-extracted frames as one video, plus a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": frames,
            },
            {"type": "text", "text": "Describe the action happening in the video."},
        ],
    }
]

# Tokenize the prompt and preprocess the video in one call.
inputs = processor.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    video_metadata=metadata,
)

# Move to the model's device and cast floating-point tensors to the model's own
# dtype. The model is loaded without an explicit dtype, so a hard-coded float16
# cast can disagree with the weights; following model.dtype keeps them aligned.
inputs = inputs.to(model.device, dtype=model.dtype)

In [None]:
# Inspect the size of the preprocessed video tensor that will be fed to the model.
video_pixels = inputs["pixel_values_videos"]
video_pixels.shape

In [None]:
from torch.utils.data import DataLoader

# Smoke-test batching: collate two clips and look at the first batch only.
loader = DataLoader(dataset, batch_size=2)
for batch in loader:
    video_batch = batch["video"]
    # Summarise the batch (container type and shape) instead of dumping every
    # pixel value into the cell output. Falls back to the length if the collated
    # value has no `shape` attribute.
    print(type(video_batch).__name__, getattr(video_batch, "shape", len(video_batch)))
    break

In [None]:
# Generate a continuation for the prepared prompt.
generated_ids = model.generate(**inputs, max_new_tokens=1024)

# Drop the echoed prompt tokens from each sequence so only new tokens are decoded.
generated_ids_trimmed = []
for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
    prompt_len = len(prompt_ids)
    generated_ids_trimmed.append(full_ids[prompt_len:])

decoded_batch = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

# Only one conversation was submitted, so take the first decoded string.
output_text = decoded_batch[0]
output_text