In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are reloaded before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from hydra import compose, initialize
from infreqact.data.video_dataset import label2idx
from infreqact.inference.prompts import PromptBuilder, PromptConfig

In [None]:
# Compose the Hydra config relative to ../config/ with the debug experiment,
# the `cot` prompt group and the `qwen/instruct` model group appended, then
# dump the composed config so the effective values can be inspected.
with initialize(version_base=None, config_path="../config/"):
    cfg = compose(
        overrides=[
            "+experiment=debug",
            "+prompt=cot",
            "+model=qwen/instruct",
        ]
    )
    print(cfg)

In [None]:
# Assemble the prompt, its matching output parser and the system message from
# an explicitly spelled-out PromptConfig.
# NOTE(review): these settings are hard-coded in this cell and are not read
# from the composed Hydra `cfg`; keep them in sync with the prompt preset if
# they are meant to mirror it.
prompt_config = PromptConfig(
    model_family="qwen",
    labels=[*label2idx.keys()],
    cot=True,
    output_format="text",
    include_role=True,
    include_definitions=False,
    include_adherence=False,
)
prompt_builder = PromptBuilder(prompt_config, label2idx)

# Derive the three artefacts from the builder, in the same order as before.
prompt = prompt_builder.build_prompt()
parser = prompt_builder.get_parser()
system_message = prompt_builder.get_system_message()

In [None]:
# Show the assembled prompt; print() is used so that (assuming `prompt` is a
# str) embedded newlines render as line breaks rather than an escaped repr.
print(prompt)

In [None]:
# Recorded prediction record used as the parser fixture in the cells below.
# `raw_output` holds the full model text: the free-form reasoning, the
# `</think>` delimiter, and the closing "Final Answer: ..." line; `reasoning`
# appears to duplicate the portion that precedes `</think>`.
# NOTE(review): `predicted_label` is "standing" although `raw_output` ends with
# "Final Answer: fall" (and `label_str` is "fallen") — presumably this mismatch
# is what the notebook is investigating; confirm.
sample_output = {
    "label": 2,
    "label_str": "fallen",
    "video_path": "falls/25BestTrampolineFailNominees-FailArmyHallofFameJuly201724",
    "start_time": 3.833,
    "end_time": 7.811,
    "segment_duration": 3.9779999999999998,
    "dataset": "OOPS",
    "predicted_label": "standing",
    "reasoning": 'So, let\'s analyze the video. First, look at the actions. The person in the blue shirt is lying on the ground. The man in the checkered shirt is moving towards them. Let\'s check the timestamps. \n\nAt [0.0], the person is lying, then in [0.3], they might be moving. Wait, the key is the primary action. The person in blue is on the ground, so maybe they fell. But let\'s see the labels. "fall" is an action, "fallen" is the state. Wait, the video shows someone lying on the ground, which could be a fall. But let\'s check the actions. The person was probably lying after a fall. But the labels: "fall" is the action of falling, "fallen" is the state. Wait, the question is the primary action being performed. Let\'s see the frames. \n\nIn the video, the person in blue is lying on the ground. The man is approaching. So the action of lying down might be the result of a fall. But the labels have "lie_down" and "fall". Let\'s check the sequence. The person starts lying, so maybe they fell. But "fall" is the action of falling, which would be the movement to the ground. But in the video, the person is already on the ground. Wait, maybe the action is "fallen" as the state. Wait, the allowed labels: "fall" is the action, "fallen" is the state. Wait, the problem says "classify the primary action being performed". So if someone is lying on the ground, the action could be "lie_down" (the action of lying down) or "fall" (the action of falling). But let\'s look at the frames. \n\nAt the start, the person is lying, then the man is moving towards them. So the person was likely lying after a fall. But the action being performed in the video—what\'s the main action? The person is lying, so the action is "lie_down" (the action of lying down). Wait, but maybe they fell. Let\'s check the labels. The labels include "fall" (the action of falling), "fallen" (the state). 
Wait, the video shows a person lying on the ground, so the action could be "lie_down" (the action of lying down) or "fall" (if they were falling). But in the frames, the person is already on the ground. Let\'s check the movement. The person in blue is lying, so the action is "lie_down". Wait, but maybe it\'s a fall. Let\'s see: the person was probably moving towards the trampoline and fell. But in the video, the person is lying, so the action is "fallen" (state) or "fall" (action). Wait, the labels: "fall" is the action of falling (like the motion), "fallen" is the state. But the question is the primary action being performed. If the person is lying on the ground, the action that led to that is falling, but the action being performed now is lying. Wait, no. The video shows the person lying, so the action is "lie_down" (the action of lying down). Wait, but let\'s check the options. \n\nWait, the allowed labels: "lie_down" is the action of lying down. So if the person is in the process of lying down, but in the video, they are already lying. Hmm. Alternatively, maybe they fell. Let\'s see: the person was moving towards the trampoline and fell. But the video shows them lying. So the action is "fall" (the action of falling) or "fallen" (the state). Wait, the labels have "fall" as an action. Let\'s check the definition. "fall" is the action of falling, which is the motion. If the person is on the ground, maybe they fell, so the action is "fall". But the video might be showing the result of a fall. Wait, the problem says "classify the primary action being performed". So what is the action happening in the video. The person is lying on the ground, so the action is "lie_down" (the action of lying down). But maybe it\'s a fall. Let\'s check the frames. \n\nLooking at the video: the person in blue is lying on the grass, and the man is approaching. The sequence: the person was probably standing and fell, so the action is "fall". 
But the video shows them lying, so the action of falling is completed. Wait, the labels have "fall" as an action. Let\'s check the definition. "fall" is the action of falling (the movement). If the person fell, then the action is "fall". But in the video, the person is lying, which is the result. But the action being performed is the falling. Wait, but the video might not show the falling motion, just the result. Hmm. \n\nAlternatively, "fallen" is the state. But the labels have "fall" as an action. Let\'s check the options again. The allowed labels include "fall", "fallen", "lie_down", etc. \n\nWait, the person is lying on the ground. So the action is "lie_down" (the action of lying down). But maybe they fell, so "fall". Let\'s see: if the person was moving and then fell, the action is "fall". But in the video, the person is already lying. So the action that happened is "fall", but the current action is lying. But the question is the primary action being performed in the video. The video shows the person lying, so the action is "lie_down". Wait, but maybe it\'s a fall. Let\'s check the frames. \n\nWait, the first frame: the person is lying. Then in subsequent frames, they are still lying. So the action is "lie_down" (the action of lying down). But maybe it\'s "fall" because they fell. Let\'s think: the person was probably standing and then fell onto the ground. So the action is "fall". But the video shows the result. However, the label "fall" is for the action of falling, which is the motion. If the video shows the person falling, then "fall" is the label. But in the video, the person is already on the ground. Wait, maybe the action is "fallen" (the state). But the labels have "fall" as an action. \n\nWait, the problem says "classify the primary action being performed". So what is the action happening. If the person is lying, the action is "lie_down". But let\'s check the options. 
The labels are: walk, fall, fallen, sit_down, sitting, lie_down, lying, stand_up, standing, other, kneel_down, kneeling, squat_down, squatting, crawl, jump, no_fall. \n\nWait, "fall" is the action (the motion), "fallen" is the state. So if the person is lying on the ground, the action that caused it is falling, but the action being performed now is lying. But the question is the primary action. Maybe the person fell, so the action is "fall". Let\'s see: the person in blue is lying, so the action is "fall" (they fell). But the video shows them lying, which is the result of falling. So the primary action is "fall". \n\nWait, but let\'s check the video again. The person is lying, so the action is "lying" (state) or "lie_down" (action). But the labels have "lying" as a state. Wait, the labels: "lie_down" is the action of lying down (the movement), "lying" is the state. So if the person is lying, the action is "lie_down" (the action of lying down). But maybe they fell, so the action is "fall". \n\nHmm. Let\'s think of similar examples. If someone falls, the action is "fall". If they are lying, it\'s "lying" (state). But the labels have "fall" as an action. So in the video, the person is lying on the ground, which is the result of a fall. But the action being performed is the fall (the motion). However, the video might not show the motion, just the result. But the problem says "primary action being performed". \n\nWait, the video shows a person lying on the ground, so the action is "lie_down" (the action of lying down). But maybe it\'s a fall. Let\'s check the frames. The person is lying, so the action is "lying" (state) or "lie_down" (action). The label "lying" is a state, "lie_down" is the action. So if the person is in the process of lying down, but in the video, they are already lying. So the action is "lie_down". But maybe it\'s a fall. \n\nWait, another angle: the man is approaching the person. The person is lying. So the person fell. So the action is "fall". 
Because falling is the action that caused them to be lying. So the primary action is "fall". \n\nLet\'s check the labels. "fall" is an action. So the answer is "fall". \n\nWait, but let\'s see the options. The allowed labels include "fall", "fallen". "fallen" is the state, "fall" is the action. So if the person fell, the action is "fall". \n\nIn the video, the person is lying on the ground, which is the result of falling. So the action being performed (the action that happened) is "fall". \n\nYes, that makes sense. So the answer is "fall".',
    "raw_output": 'So, let\'s analyze the video. First, look at the actions. The person in the blue shirt is lying on the ground. The man in the checkered shirt is moving towards them. Let\'s check the timestamps. \n\nAt [0.0], the person is lying, then in [0.3], they might be moving. Wait, the key is the primary action. The person in blue is on the ground, so maybe they fell. But let\'s see the labels. "fall" is an action, "fallen" is the state. Wait, the video shows someone lying on the ground, which could be a fall. But let\'s check the actions. The person was probably lying after a fall. But the labels: "fall" is the action of falling, "fallen" is the state. Wait, the question is the primary action being performed. Let\'s see the frames. \n\nIn the video, the person in blue is lying on the ground. The man is approaching. So the action of lying down might be the result of a fall. But the labels have "lie_down" and "fall". Let\'s check the sequence. The person starts lying, so maybe they fell. But "fall" is the action of falling, which would be the movement to the ground. But in the video, the person is already on the ground. Wait, maybe the action is "fallen" as the state. Wait, the allowed labels: "fall" is the action, "fallen" is the state. Wait, the problem says "classify the primary action being performed". So if someone is lying on the ground, the action could be "lie_down" (the action of lying down) or "fall" (the action of falling). But let\'s look at the frames. \n\nAt the start, the person is lying, then the man is moving towards them. So the person was likely lying after a fall. But the action being performed in the video—what\'s the main action? The person is lying, so the action is "lie_down" (the action of lying down). Wait, but maybe they fell. Let\'s check the labels. The labels include "fall" (the action of falling), "fallen" (the state). 
Wait, the video shows a person lying on the ground, so the action could be "lie_down" (the action of lying down) or "fall" (if they were falling). But in the frames, the person is already on the ground. Let\'s check the movement. The person in blue is lying, so the action is "lie_down". Wait, but maybe it\'s a fall. Let\'s see: the person was probably moving towards the trampoline and fell. But in the video, the person is lying, so the action is "fallen" (state) or "fall" (action). Wait, the labels: "fall" is the action of falling (like the motion), "fallen" is the state. But the question is the primary action being performed. If the person is lying on the ground, the action that led to that is falling, but the action being performed now is lying. Wait, no. The video shows the person lying, so the action is "lie_down" (the action of lying down). Wait, but let\'s check the options. \n\nWait, the allowed labels: "lie_down" is the action of lying down. So if the person is in the process of lying down, but in the video, they are already lying. Hmm. Alternatively, maybe they fell. Let\'s see: the person was moving towards the trampoline and fell. But the video shows them lying. So the action is "fall" (the action of falling) or "fallen" (the state). Wait, the labels have "fall" as an action. Let\'s check the definition. "fall" is the action of falling, which is the motion. If the person is on the ground, maybe they fell, so the action is "fall". But the video might be showing the result of a fall. Wait, the problem says "classify the primary action being performed". So what is the action happening in the video. The person is lying on the ground, so the action is "lie_down" (the action of lying down). But maybe it\'s a fall. Let\'s check the frames. \n\nLooking at the video: the person in blue is lying on the grass, and the man is approaching. The sequence: the person was probably standing and fell, so the action is "fall". 
But the video shows them lying, so the action of falling is completed. Wait, the labels have "fall" as an action. Let\'s check the definition. "fall" is the action of falling (the movement). If the person fell, then the action is "fall". But in the video, the person is lying, which is the result. But the action being performed is the falling. Wait, but the video might not show the falling motion, just the result. Hmm. \n\nAlternatively, "fallen" is the state. But the labels have "fall" as an action. Let\'s check the options again. The allowed labels include "fall", "fallen", "lie_down", etc. \n\nWait, the person is lying on the ground. So the action is "lie_down" (the action of lying down). But maybe they fell, so "fall". Let\'s see: if the person was moving and then fell, the action is "fall". But in the video, the person is already lying. So the action that happened is "fall", but the current action is lying. But the question is the primary action being performed in the video. The video shows the person lying, so the action is "lie_down". Wait, but maybe it\'s a fall. Let\'s check the frames. \n\nWait, the first frame: the person is lying. Then in subsequent frames, they are still lying. So the action is "lie_down" (the action of lying down). But maybe it\'s "fall" because they fell. Let\'s think: the person was probably standing and then fell onto the ground. So the action is "fall". But the video shows the result. However, the label "fall" is for the action of falling, which is the motion. If the video shows the person falling, then "fall" is the label. But in the video, the person is already on the ground. Wait, maybe the action is "fallen" (the state). But the labels have "fall" as an action. \n\nWait, the problem says "classify the primary action being performed". So what is the action happening. If the person is lying, the action is "lie_down". But let\'s check the options. 
The labels are: walk, fall, fallen, sit_down, sitting, lie_down, lying, stand_up, standing, other, kneel_down, kneeling, squat_down, squatting, crawl, jump, no_fall. \n\nWait, "fall" is the action (the motion), "fallen" is the state. So if the person is lying on the ground, the action that caused it is falling, but the action being performed now is lying. But the question is the primary action. Maybe the person fell, so the action is "fall". Let\'s see: the person in blue is lying, so the action is "fall" (they fell). But the video shows them lying, which is the result of falling. So the primary action is "fall". \n\nWait, but let\'s check the video again. The person is lying, so the action is "lying" (state) or "lie_down" (action). But the labels have "lying" as a state. Wait, the labels: "lie_down" is the action of lying down (the movement), "lying" is the state. So if the person is lying, the action is "lie_down" (the action of lying down). But maybe they fell, so the action is "fall". \n\nHmm. Let\'s think of similar examples. If someone falls, the action is "fall". If they are lying, it\'s "lying" (state). But the labels have "fall" as an action. So in the video, the person is lying on the ground, which is the result of a fall. But the action being performed is the fall (the motion). However, the video might not show the motion, just the result. But the problem says "primary action being performed". \n\nWait, the video shows a person lying on the ground, so the action is "lie_down" (the action of lying down). But maybe it\'s a fall. Let\'s check the frames. The person is lying, so the action is "lying" (state) or "lie_down" (action). The label "lying" is a state, "lie_down" is the action. So if the person is in the process of lying down, but in the video, they are already lying. So the action is "lie_down". But maybe it\'s a fall. \n\nWait, another angle: the man is approaching the person. The person is lying. So the person fell. So the action is "fall". 
Because falling is the action that caused them to be lying. So the primary action is "fall". \n\nLet\'s check the labels. "fall" is an action. So the answer is "fall". \n\nWait, but let\'s see the options. The allowed labels include "fall", "fallen". "fallen" is the state, "fall" is the action. So if the person fell, the action is "fall". \n\nIn the video, the person is lying on the ground, which is the result of falling. So the action being performed (the action that happened) is "fall". \n\nYes, that makes sense. So the answer is "fall".\n</think>\n\nTo determine the primary action in the video, we analyze the visual content and timestamps:  \n\n1. **Visual Observation**:  \n   - A person in a blue shirt is lying on the grass near a trampoline.  \n   - The person’s position (on the ground) suggests a prior fall, as they are not actively moving or standing.  \n   - The man in the checkered shirt approaches, indicating the person was likely on the ground due to a fall.  \n\n2. **Label Analysis**:  \n   - **"fall"** refers to the *action* of falling (the motion of descending to the ground).  \n   - **"fallen"** refers to the *state* of being on the ground after a fall.  \n   - The video depicts the *result* of a fall (the person lying on the ground), but the **primary action** being performed is the motion of falling, which led to the current state.  \n\n3. **Key Clue**:  \n   The person’s position on the ground is a direct consequence of the action of falling. While the video shows the *result* (lying), the *primary action* that caused this outcome is **falling**.  \n\nThus, the best label for the primary action being performed is **fall**.  \n\nFinal Answer: fall',
    "type": "prediction",
    "idx": 2,
}

In [None]:
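# Alternative fixture (JSON-style answer after `</think>`), kept for reference.
# NOTE: unlike the active fixture it is nested under "sample_8", so the parse
# cell would need sample_output["sample_8"]["raw_output"] if this is re-enabled.
# sample_output = {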
#     "sample_8": {
#         "label": 15,
#         "label_str": "jump",
#         "video_path": "falls/25BestTrampolineFailNominees-FailArmyHallofFameJuly20176",
#         "start_time": 5.581,
#         "end_time": 6.248,
#         "segment_duration": 0.6669999999999998,
#         "dataset": "OOPS",
#         "predicted_label": "jump",
#         "reasoning": "",
#         "raw_output": "So, let's analyze the video. The main subject is a person on a trampoline. Let's look at the timestamps. \n\nFirst, at the start (0.0s), they're jumping, so they're in mid-air. Then, in 0.2s, they land on the trampoline. Wait, the sequence: the first frame is jumping (0.0s), then 0.2s they land. But the action here is jumping. Wait, the labels: jump is under Extended (Rare). Let's check the definitions. \n\nExtended (Rare) labels include jump. The person is jumping on the trampoline. So the action is jump. Let's confirm. The video shows the person jumping, so the label should be jump.\n</think>\n\n{\n  \"label\": \"jump\"\n}",
#     },
# }

In [None]:
# Inspect the concrete class of the parser returned by prompt_builder.get_parser().
type(parser)

In [None]:
# Run the recorded raw model text through the parser and display what it
# extracts as the cell output.
raw_text = sample_output["raw_output"]
result = parser.parse(raw_text)
result

In [None]:
# Show the `reasoning` attribute of the parsed result (presumably the text
# preceding `</think>` — confirm against the parser implementation).
result.reasoning