In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing each cell so edits to local source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import torch
from PIL import Image

In [None]:
# Root directory of the Omnifall data on disk. Defaults to the original
# cluster location; set the OMNIFALL_ROOT environment variable to point the
# notebook at a different copy without editing this cell.
OMNIFALL_ROOT = os.environ.get(
    "OMNIFALL_ROOT", "/lsdf/data/activity/fall_detection/cvhci_fall"
)

In [None]:
from infreqact.data.video_dataset import OmnifallVideoDataset

# Keyword arguments for OmnifallVideoDataset, selecting the OOPS subset.
# Video files are read from under OMNIFALL_ROOT; the annotation and split
# locations are given as hf:// paths.
dataset_config = {
    "video_root": f"{OMNIFALL_ROOT}/OOPS/video",
    "annotations_file": "hf://simplexsigil2/omnifall/labels/OOPS.csv",
    "split_root": "hf://simplexsigil2/omnifall/splits",
    "dataset_name": "OOPS",
    "mode": "test",  # Start with test set (smaller)
    "split": "cs",  # Cross-subject split
    "target_fps": 8.0,  # Low FPS for quick testing
    "vid_frame_count": 16,
    "data_fps": 30.0,  # OOPS videos are 30 FPS
    "ext": ".mp4",
    "fast": True,
}

# Echo the configuration so the rendered notebook records what was used.
print("\nDataset Configuration:")
for key, value in dataset_config.items():
    print(f"  {key}: {value}")
print("=" * 80)

# Create dataset
print("\nCreating OmnifallVideoDataset...")
try:
    dataset = OmnifallVideoDataset(**dataset_config)
except Exception as e:
    print(f"✗ Failed to create dataset: {e}")
    # Re-raise instead of swallowing: later cells need `dataset`, and
    # continuing would either hit a NameError far from the real cause or
    # silently reuse a stale `dataset` left in the kernel by an earlier run.
    # The notebook renders the full traceback for the re-raised exception.
    raise
else:
    # Only reached when construction succeeded, so a failure while printing
    # the dataset is not misreported as a creation failure.
    print("✓ Dataset created successfully!")
    print(f"\n{dataset}")
    print("=" * 80)

In [None]:
# Fetch the first item of the dataset and convert every entry of its "video"
# field into a PIL image; `frames` is consumed by the chat-template cell below.
sample = dataset[0]
frames = list(map(Image.fromarray, sample["video"]))
# Bare trailing expression so the notebook renders the first frame inline.
frames[0]

In [None]:
from transformers import AutoModelForImageTextToText, AutoProcessor

# Hugging Face checkpoint shared by the model and its processor.
model_checkpoint = "OpenGVLab/InternVL3_5-1B-hf"

# Load the weights in bfloat16 and let `device_map="auto"` choose placement.
model = AutoModelForImageTextToText.from_pretrained(
    model_checkpoint,
    dtype="bfloat16",
    device_map="auto",
)

# `do_sample_frames=False` is forwarded to the processor; presumably this
# disables its own frame sampling because the dataset already returns a
# fixed-length clip (TODO confirm against the processor documentation).
processor = AutoProcessor.from_pretrained(
    model_checkpoint,
    do_sample_frames=False,
)

In [None]:
# Inspection cell: show the video processor attached to the loaded processor
# (its repr is rendered as the cell output).
processor.video_processor

In [None]:
# System prompt instructing the model to reason inside <think> ... </think>
# tags first and then give a standalone answer. Built from four paragraphs
# separated by blank lines, with no leading or trailing whitespace.
R1_SYSTEM_PROMPT = "\n\n".join(
    (
        "You are an AI assistant that rigorously follows this response protocol:",
        "1. First, conduct a detailed analysis of the question. Consider different angles, potential solutions, and reason through the problem step-by-step. Enclose this entire thinking process within <think> and </think> tags.",
        "2. After the thinking section, provide a clear, concise, and direct answer to the user's question. Separate the answer from the think section with a newline.",
        "Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.",
    )
)

In [None]:
# Chat-format request: the reasoning system prompt, followed by a user turn
# that carries the clip (as a list of PIL frames) and the question about it.
messages = [
    {"role": "system", "content": [{"type": "text", "text": R1_SYSTEM_PROMPT}]},
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": frames,
            },
            {"type": "text", "text": "Describe the action happening in the video."},
        ],
    },
]

# Render the chat template and tokenize in one step, returning a dict of
# PyTorch tensors. The frame rate reported in the video metadata is taken from
# dataset_config["target_fps"] (the rate the dataset was asked to sample at)
# rather than repeating the literal, so the two values cannot drift apart.
# Every frame in `frames` is used, hence the identity index list.
inputs = processor.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    video_metadata={
        "fps": dataset_config["target_fps"],
        "total_num_frames": len(frames),
        "frames_indices": list(range(len(frames))),
    },
).to(model.device, dtype=torch.bfloat16)  # move to the model's device, matching its bfloat16 dtype

In [None]:
# Generate up to 1024 new tokens for the prepared prompt.
generated_ids = model.generate(**inputs, max_new_tokens=1024)

# The returned sequences begin with the prompt tokens; drop that prefix from
# each sequence so only the newly generated continuation is decoded.
generated_ids_trimmed = []
for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
    prompt_length = len(prompt_ids)
    generated_ids_trimmed.append(full_ids[prompt_length:])

# Decode the batch and keep the first (only) entry.
decoded_batch = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
output_text = decoded_batch[0]
print(output_text)