In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from hydra import compose, initialize

from infreqact.data.video_dataset import label2idx
from infreqact.inference.prompts import PromptBuilder, PromptConfig

In [None]:
# Compose the Hydra configuration from the repository's config directory,
# layering the debug experiment, the default prompt and the Qwen instruct model.
with initialize(version_base=None, config_path="../config/"):
    cfg = compose(
        overrides=[
            "+experiment=debug",
            "+prompt=default",
            "+model=qwen/instruct",
        ]
    )

# `cfg` stays bound after the context exits; dump it for inspection.
print(cfg)

{'dataset': {'name': 'video-dataset-oops', 'video_datasets': [{'name': 'OOPS', 'video_root': '${oc.env:OMNIFALL_ROOT}/OOPS/video', 'annotations_file': 'hf://simplexsigil2/omnifall/labels/OOPS.csv', 'dataset_fps': 30.0, 'split_root': 'hf://simplexsigil2/omnifall/splits'}], 'target_fps': '${model_fps}', 'vid_frame_count': '${num_frames}', 'num_classes': 16, 'path_format': '{video_root}/{video_path}{ext}'}, 'num_samples': 10, 'model': {'params': '4B', 'family': 'Qwen', 'version': '3', 'variant': 'Instruct', 'name': '${model.family}${model.version}-VL-${model.params}-${model.variant}', 'checkpoint_path': '${model.family}/${model.name}', 'needs_video_metadata': True}, 'batch_size': 10, 'num_workers': 0, 'vllm': {'use_mock': False, 'tensor_parallel_size': 1, 'skip_mm_profiling': True, 'enforce_eager': True}, 'log_videos': 0, 'wandb': {'project': 'fall-detection-testing', 'mode': 'disabled'}, 'prompt': {'output_format': 'json', 'include_role': True, 'include_label_definitions': True, 'include

In [None]:
# Build the prompt from an explicit, hand-written PromptConfig.
# NOTE(review): the composed `cfg` is not consulted here; these settings are
# hard-coded and may differ from `cfg.prompt` — confirm this is intentional.
prompt_config = PromptConfig(
    # target model and label vocabulary
    model_family="qwen",
    labels=list(label2idx),
    # answer format
    output_format="json",
    cot=True,
    # optional prompt sections
    include_role=False,
    include_definitions=False,
    include_adherence=True,
)
prompt_builder = PromptBuilder(prompt_config)

# Derive the user prompt, the matching output parser and the system message
# from the same builder.
prompt = prompt_builder.build_prompt()
parser = prompt_builder.get_parser()
system_message = prompt_builder.get_system_message()

In [None]:
# print() rather than a bare expression so the prompt's line breaks are
# rendered as-is instead of being shown as "\n" escapes in a repr.
print(prompt)

Analyze the video clip and classify the primary action being performed. Only use one of the allowed labels provided below.

Allowed Labels:
- walk
- fall
- fallen
- sit_down
- sitting
- lie_down
- lying
- stand_up
- standing
- other
- kneel_down
- kneeling
- squat_down
- squatting
- crawl
- jump
- no_fall

Output Format:
Return a strictly valid JSON object where <class_label> is one of the allowed labels.
{
  "label": "<class_label>"
}

Please reason step-by-step, identify relevant visual content,
analyze key timestamps and clues. Enclose your reasoning within <think> and </think> tags,
then provide the final answer.

Only use the allowed labels and stick exactly to the output format. If uncertain between two labels,
choose the one that best fits the definitions above. Don't invent new labels.


In [None]:
# Hard-coded fixture shaped like one saved prediction record, keyed by sample
# id; its "raw_output" text is fed to the parser further down in this notebook.
# NOTE(review): presumably copied from a real inference run — confirm the source.
sample_output = {
    "sample_8": {
        # Ground-truth class as integer id and as string.
        "label": 15,
        "label_str": "jump",
        "video_path": "falls/25BestTrampolineFailNominees-FailArmyHallofFameJuly20176",
        # Segment boundaries; presumably seconds within the video — TODO confirm units.
        "start_time": 5.581,
        "end_time": 6.248,
        "segment_duration": 0.6669999999999998,
        "dataset": "OOPS",
        "predicted_label": "jump",
        # Stored reasoning is empty for this record.
        "reasoning": "",
        # Raw model text: free-form reasoning, then a closing </think> tag (there
        # is no opening <think> tag), then the JSON answer.
        "raw_output": "So, let's analyze the video. The main subject is a person on a trampoline. Let's look at the timestamps. \n\nFirst, at the start (0.0s), they're jumping, so they're in mid-air. Then, in 0.2s, they land on the trampoline. Wait, the sequence: the first frame is jumping (0.0s), then 0.2s they land. But the action here is jumping. Wait, the labels: jump is under Extended (Rare). Let's check the definitions. \n\nExtended (Rare) labels include jump. The person is jumping on the trampoline. So the action is jump. Let's confirm. The video shows the person jumping, so the label should be jump.\n</think>\n\n{\n  \"label\": \"jump\"\n}"
    },
}

In [None]:
# Show which parser class the prompt builder returned for this configuration.
type(parser)

infreqact.inference.prompts.parsers.CoTOutputParser

In [None]:
# Run the fixture's raw model text through the parser and display the result.
# NOTE(review): the raw text has only a closing </think> tag and no opening
# one — check whether the parser is expected to recover the reasoning then.
raw_text = sample_output["sample_8"]["raw_output"]
result = parser.parse(raw_text, label2idx=label2idx)
result

ParseResult(label='jump', reasoning=None, raw_text='So, let\'s analyze the video. The main subject is a person on a trampoline. Let\'s look at the timestamps. \n\nFirst, at the start (0.0s), they\'re jumping, so they\'re in mid-air. Then, in 0.2s, they land on the trampoline. Wait, the sequence: the first frame is jumping (0.0s), then 0.2s they land. But the action here is jumping. Wait, the labels: jump is under Extended (Rare). Let\'s check the definitions. \n\nExtended (Rare) labels include jump. The person is jumping on the trampoline. So the action is jump. Let\'s confirm. The video shows the person jumping, so the label should be jump.\n</think>\n\n{\n  "label": "jump"\n}')