In [1]:
%cd /code
import os
import json
import textwrap
from pathlib import Path

import openai
from pydantic import BaseModel, Field
from tqdm.auto import tqdm, trange

/code


In [2]:
# Shared OpenAI API client used by the generation loop below. The API key is
# looked up in the OPENAI_API_KEY environment variable rather than hardcoded.
client = openai.Client(
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [None]:
# Experiment configuration: which chat model to query and the experiment tag.
# Both values become components of the output directory path.
model_name = 'gpt-4o'
exp_name = '00-rich-context'

# One JSON file per seed is written under this directory; create it (and any
# missing parents) up front so the generation loop can write straight into it.
p_outdir = Path('output') / 'psuedo-captions' / model_name / exp_name / 'captions'
p_outdir.mkdir(parents=True, exist_ok=True)

In [4]:
# A single scene description together with its event category.
# NOTE(review): documented with `#` comments on purpose — a class docstring
# would presumably be picked up by pydantic as the schema description and so
# change what is sent to the model; confirm before converting to a docstring.
class Description(BaseModel):
    # Event label: "Normal" for normal events, otherwise the anomalous action.
    category: str = Field(
        ...,
        title="Category",
        description="Category of the event. Normal if the event is normal, corresponding action if the event is anomalous.",
    )
    # Free-text caption of the scene.
    description: str = Field(
        ...,
        title="Description",
        description="Description for an anomlous or normal event.",
    )

# A pair of descriptions: one for a normal event and one for an anomalous event.
class SetOfDescriptions(BaseModel):
    # Description of a normal event.
    normal: Description = Field(
        ...,
        title="Normal",
        description="Description for a normal event.",
    )
    # Description of an anomalous event.
    anomalous: Description = Field(
        ...,
        title="Anomalous",
        description="Description for an anomalous event.",
    )

# Top-level response container: a list of normal/anomalous description pairs.
# This is the model passed as `response_format` to the chat completion call.
class ListOfSetsOfDescriptions(BaseModel):
    descriptions: list[SetOfDescriptions] = Field(
        ...,
        title="Descriptions",
        description="List of sets of descriptions for normal and anomalous events.",
    )

In [None]:
# Generate one caption file per seed. A seed whose output file already exists is
# skipped, so an interrupted run can be resumed simply by re-running this cell.
for seed in trange(1000):
    p_out = p_outdir / f'{seed:08d}.json'
    if p_out.exists():
        tqdm.write(f"Skip: {seed}")
        continue
    tqdm.write(f"Seed: {seed}")
    response = client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": textwrap.dedent(
                    """You are solving the video anomaly detection (VAD) problem in a fancy way. As you know, anomalous events are rare but their categories are diverse. You have to generate example scene descriptions both for the anomalous events and normal events. We will use these descriptions to decide if given video clips contain anomalous events by choosing one of the descriptions having the top similarity measured by a multi-modal retrieval model like ImageBind. The descriptions should be short and concise. The entire response should be in the provided json format.""")
            },
            {
                "role": "user",
                "content": "Generate 100 example scene descriptions.",
            },
        ],
        seed=seed,
        response_format=ListOfSetsOfDescriptions,
    )

    content = response.choices[0].message.content
    if content is None:
        # Nothing to parse (presumably a refusal — verify against the API docs).
        # Leave no file behind so this seed is retried on the next run.
        tqdm.write(f"No content returned for seed {seed}; nothing saved")
        continue

    # The content is JSON text. Parse it with json.loads, never eval(): eval
    # would execute model-produced text as Python code and raises NameError on
    # the JSON literals true / false / null.
    response_json = json.loads(content)

    # Write to a temporary sibling file and rename it into place. The skip check
    # above only tests for existence, so a partially written file left by an
    # interrupted run would otherwise be treated as finished forever.
    p_tmp = p_out.with_name(p_out.name + '.tmp')
    with p_tmp.open('w') as f:
        json.dump(response_json, f, indent=2)
    p_tmp.replace(p_out)
    tqdm.write(f"Saved to {p_out}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Skip: 0
Skip: 1
Skip: 2
Skip: 3
Skip: 4
Skip: 5
Skip: 6
Skip: 7
Skip: 8
Skip: 9
Skip: 10
Skip: 11
Skip: 12
Skip: 13
Skip: 14
Skip: 15
Skip: 16
Skip: 17
Skip: 18
Skip: 19
Skip: 20
Skip: 21
Skip: 22
Skip: 23
Skip: 24
Skip: 25
Skip: 26
Skip: 27
Skip: 28
Skip: 29
Skip: 30
Skip: 31
Skip: 32
Skip: 33
Skip: 34
Skip: 35
Skip: 36
Skip: 37
Skip: 38
Skip: 39
Skip: 40
Skip: 41
Skip: 42
Skip: 43
Skip: 44
Skip: 45
Skip: 46
Skip: 47
Skip: 48
Skip: 49
Skip: 50
Skip: 51
Skip: 52
Skip: 53
Skip: 54
Skip: 55
Seed: 56
Saved to output/psuedo-captions/gpt-4o/00-rich-context/00000056.json
Seed: 57
Saved to output/psuedo-captions/gpt-4o/00-rich-context/00000057.json
Seed: 58
Saved to output/psuedo-captions/gpt-4o/00-rich-context/00000058.json
Seed: 59
Saved to output/psuedo-captions/gpt-4o/00-rich-context/00000059.json
Seed: 60
Saved to output/psuedo-captions/gpt-4o/00-rich-context/00000060.json
Seed: 61
Saved to output/psuedo-captions/gpt-4o/00-rich-context/00000061.json
Seed: 62
Saved to output/psuedo-captio