In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to imported source files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Add the action_labeler package to Python path.
# The parent directory is resolved to an absolute path so the entry keeps
# pointing at the same location even if the working directory changes later,
# and the membership check keeps re-runs of this cell from appending
# duplicate entries.
package_root = str(Path("../").resolve())
if package_root not in sys.path:
    sys.path.append(package_root)

# Prompts

In [3]:
from IPython.display import Markdown
from pathlib import Path

from PIL import Image

from action_labeler.helpers import load_image
from action_labeler.detections.detection import Detection
from action_labeler.prompt import (
    TextPrompt,
    DescriptionActionPrompt,
    DescriptionOnlyPrompt,
)

In [4]:
# Simplest prompt type: configured with a fixed piece of text.
prompt = TextPrompt(text_prompt="Describe the image")

# Inputs for the call: an empty Detection and the sample image path.
# NOTE(review): the leading 0 is presumably a detection index -- confirm
# against the prompt API.
empty_detections = Detection.empty()
text_prompt_image = Path("./samples/images/dog_laying_down_1.jpg")

# Leave the call as the cell's last expression so its result is displayed.
prompt.prompt(0, empty_detections, text_prompt_image)

'Describe the image'

### Setup Description File

In [5]:
from action_labeler.helpers import save_pickle

# Captions for the sample image, keyed by a whitespace-separated string of
# four numbers per entry.
# NOTE(review): the keys look like normalized bounding boxes -- confirm the
# expected format against the description-based prompt classes.
sample_image_key = "samples/images/dog_laying_down_1.jpg"
box_captions = {
    "0.7682 0.526039 0.423991 0.613334": "dog laying down",
    "0.21136 0.42622 0.381387 0.702093": "dog laying down",
}

# Outer mapping: image path -> captions for that image.
description_data = {sample_image_key: box_captions}

# Persist the mapping as description.pickle inside the samples directory.
save_pickle(description_data, Path("./samples/"), filename="description.pickle")

Saving 1 images to samples/description.pickle
Saved classification file.


In [6]:
# Locations of the sample image and its matching detection label file.
image_path, label_path = (
    Path("./samples/images/dog_laying_down_1.jpg"),
    Path("./samples/detect/dog_laying_down_1.txt"),
)

# Load the image first: its size is needed to build the detections.
image = load_image(image_path)
image_size = image.size

# Parse the label file into detections using the image's size.
detections = Detection.from_text_path(label_path, image_size)

In [7]:
# Prompt that combines the caption stored in description.pickle with a
# numbered list of candidate actions to choose from.
action_classes = ["action1", "action2", "action3"]
prompt = DescriptionActionPrompt(
    description_file_name="description.pickle",
    classes=action_classes,
    numbered_classes=True,
)

# Build the prompt text for the sample image and its detections.
# NOTE(review): the leading 0 is presumably a detection index -- confirm
# against the prompt API.
action_prompt_text = prompt.prompt(0, detections, image_path)

# Render as Markdown so the list formatting shows in the cell output.
Markdown(action_prompt_text)

Image Caption: "dog laying down"

What is the person in the purple bounding box actively doing? Classify the image into **one** of the following actions. If multiple actions apply, choose the one with the highest priority. The actions are sorted by priority.

Actions:

1. "action1"
2. "action2"
3. "action3"

Output Format:
- Only respond with "action: ..."
- Do not include any other text
- Do not provide explanations
- If none of the actions apply, respond with "action: none"


In [8]:
# Prompt that uses only the caption stored in description.pickle, with no
# predefined list of action classes.
prompt = DescriptionOnlyPrompt(
    numbered_classes=False,
    description_file_name="description.pickle",
)

# Build the prompt text for the sample image and its detections.
# NOTE(review): the leading 0 is presumably a detection index -- confirm
# against the prompt API.
description_prompt_text = prompt.prompt(0, detections, image_path)

# Render as Markdown so the bullet formatting shows in the cell output.
Markdown(description_prompt_text)

Image Caption: "dog laying down"

Classify the action of the person in the bounding box. Some examples of classifications are: cooking, cleaning_dishes, using_phone, using_computer, standing, walking, etc. We want to capture what the person is doing and what objects they are interacting with.

Output Format:
- Only respond with "action: ..."
- Do not include any other text
- Do not provide explanations
- If none of the actions apply, respond with "action: none"
- If multiple actions apply, choose the most specific action.
