In [4]:
%cd /code/
from dotenv import load_dotenv
load_dotenv()

import ast
import base64
import io
import json
import os
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
from decord import VideoReader, cpu
from PIL import Image
from tqdm.auto import tqdm

import openai
from sglang.utils import (
    execute_shell_command,
    wait_for_server,
    terminate_process,
)

/code


In [5]:
def chat(
    client: openai.Client,
    system_prompt: str,
    user_prompt: str,
    model: str = "default",
) -> str:
    """Send one system + user exchange to a chat-completions endpoint.

    The request is issued with ``temperature=0``. Whenever the client raises
    ``openai.OpenAIError`` the error is printed and the same request is sent
    again after a 5 second pause. There is no retry limit, so this call blocks
    for as long as the error persists.

    Args:
        client: Client whose ``chat.completions.create`` method is called.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        model: Model name forwarded to the endpoint.

    Returns:
        The text content of the first choice. If that is empty, the refusal
        text of the first choice. If both are empty, an empty string.
    """
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]
    while True:
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
            )
        except openai.OpenAIError as e:
            print(e)
            print('Retrying in 5 seconds...')
            time.sleep(5)
        else:
            break

    message = completion.choices[0].message
    # A message carrying neither content nor a refusal used to leave the
    # result variable unassigned and fail with UnboundLocalError; fall back
    # to an empty string so callers always receive a str.
    return message.content or getattr(message, 'refusal', None) or ''

In [None]:
# Model names that generate_llm sends to the OpenAI API (authenticated with
# OPENAI_API_KEY); any other model name is sent to the server at host:port.
OPENAI_MODELS = [
    'gpt-4o',
    'chatgpt-4o-latest',
    'gpt-4o-mini',
    'o1',
    'o1-mini',
    'o3-mini',
    'gpt-4-turbo',
]

# Annotation directory and the test-split annotation file read by generate_llm.
p_annroot = Path('data') / 'annotations'
p_ann_test = p_annroot.joinpath('Temporal_Anomaly_Annotation_for_Testing_Videos.txt')


def _build_system_prompt(language: str, process_each_segment: bool) -> str:
    """Return the anomaly-scoring system prompt for the given language and mode."""
    if language == 'en':
        context_prompt = "The following is a scene description. If you were a law enforcement agency, how would you rate this scene on a scale from 0 to 1, with 0 representing a standard scene and 1 denoting a scene with suspicious activities?"
        if process_each_segment:
            format_prompt = "Please provide the response in the form of a Python list and respond with only one number in the provided list below [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] without any textual explanation. It should begin with '[' and end with  ']'."
        else:
            format_prompt = "Please fill the 'rate' values which are provided as `None` in the description. Rate should respond with only one number in the provided list below [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]. The response should be a Python list of dictionaries with 'segment_idx', 'description', 'reason', and 'rate' keys. Don't write a code nor textual explanation. Just fill the 'rate' values. The response should begin with '[' and end with ']'."
    elif language == 'ko':
        context_prompt = "다음은 비디오에 대한 설명이야. 만약 너가 경찰이라면 이 장면의 이상한 정도를 0부터 1까지의 점수 중에 어떻게 평가할 거야? 0은 일상적인 장면이고 1은 의심스러운 활동이 있는 장면이야."
        format_prompt = "아래에 제공된 목록에서 하나의 숫자로만 응답해줘. [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] 이 숫자는 어떤 설명도 없이 제공되어야 해. '['로 시작해서 ']'로 끝나야 해."
    else:
        raise ValueError(
            f"Unsupported prompt_llm_system_language: {language!r} (expected 'en' or 'ko')")
    return f'{context_prompt} {format_prompt}'


def _strip_reasoning(llm_model: str, llm_response: str) -> str:
    """Drop everything up to the last ``</think>`` for DeepSeek-R1 models."""
    if 'DeepSeek-R1' in llm_model:
        return re.sub(r'^(?s:.)*</think>', '', llm_response).strip()
    return llm_response


def _score_each_segment(client, video_record: dict, system_prompt: str, llm_model: str, debug: bool) -> None:
    """Query the LLM once per segment and store 'score_raw' and 'score' in place."""
    for response_record in video_record['response_records']:
        llm_response = chat(
            client,
            system_prompt=system_prompt,
            user_prompt=f"Scene Description: {response_record['response']}",
            model=llm_model,
        )
        response_record['score_raw'] = llm_response
        answer = _strip_reasoning(llm_model, llm_response)
        try:
            # SECURITY: this used to be eval() on model output, i.e. arbitrary
            # code execution. literal_eval only accepts Python literals.
            score = ast.literal_eval(answer)[0]
        except Exception as e:
            # Deliberately best-effort: an unparsable answer yields score=None
            # instead of aborting the whole run.
            print(e, file=sys.stderr)
            print(response_record, answer, file=sys.stderr)
            score = None
        response_record['score'] = score
        if debug:
            tqdm.write(json.dumps(response_record, indent=2))


def _score_whole_video(client, video_record: dict, system_prompt: str, llm_model: str, debug: bool) -> None:
    """Query the LLM once for all segments and store the returned rates in place."""
    records = video_record['response_records']
    descriptions = [
        {
            'segment_idx': record['segment_idx'],
            'description': record['response'],
            'rate': None,
        }
        for record in records
    ]
    llm_response = chat(
        client,
        system_prompt=system_prompt,
        user_prompt=f"Scene Descriptions: {descriptions}",
        model=llm_model,
    )
    if debug:
        tqdm.write(json.dumps(llm_response, indent=2))
    try:
        # SECURITY: this used to be eval() on model output; see
        # _score_each_segment.
        rated = ast.literal_eval(_strip_reasoning(llm_model, llm_response))
    except Exception as e:
        print(e, file=sys.stderr)
        print(llm_response, file=sys.stderr)
        return
    # Discard anything that is not one dict per segment; such a response used
    # to crash on len() or .get() further down.
    if (
        not isinstance(rated, (list, tuple))
        or len(rated) != len(records)
        or not all(isinstance(item, dict) for item in rated)
    ):
        return
    for record, item in zip(records, rated):
        rate = item.get('rate')
        record['score_raw'] = rate
        if isinstance(rate, (int, float)):
            record['score'] = rate
        record['reason'] = item.get('reason')


def generate_llm(
    host: str = 'localhost', port: int = 30002,
    rank: int = 0, world_size: int = 1,
    vlm_model: str = 'lmms-lab/llava-onevision-qwen2-7b-ov',
    llm_model: str = 'meta-llama/Llama-3.2-3B-Instruct',
    prompt_vlm: str = "Describe the video in a few sentences.",
    prompt_llm_system_language: Literal['en', 'ko'] = "en",
    duration_sec = 1,
    process_each_segment: bool = True,
    debug: bool = False,
):
    """Rate previously generated video captions with an LLM and save the results.

    For every row of the test annotation file assigned to this worker, the
    caption JSON under ``output/ucf-crime-captions/.../raw/<label>/<video>.json``
    is loaded, its ``response_records`` are scored by the LLM, and the
    augmented record is written to a sibling directory named after
    ``llm_model``. Videos whose caption file is missing, or whose output file
    already exists, are skipped.

    Args:
        host: Host of the LLM server (ignored for models in ``OPENAI_MODELS``).
        port: Port of the LLM server (ignored for models in ``OPENAI_MODELS``).
        rank: Index of this worker; it processes rows ``rank::world_size``.
        world_size: Total number of workers sharing the annotation rows.
        vlm_model: Currently unused; kept for interface compatibility.
        llm_model: Model name passed to ``chat``. Names in ``OPENAI_MODELS``
            use the OpenAI API with ``OPENAI_API_KEY``; any other name is sent
            to ``http://host:port/v1``.
        prompt_vlm: Caption prompt; used only to locate the caption directory.
        prompt_llm_system_language: Language of the system prompt.
        duration_sec: Segment duration; used only to locate the caption
            directory.
        process_each_segment: If True, send one request per segment. If False,
            send all segment descriptions of a video in a single request
            (English prompt only).
        debug: If True, echo per-segment records / raw responses via tqdm.

    Raises:
        ValueError: If ``prompt_llm_system_language`` is not 'en' or 'ko'.
        AssertionError: If ``process_each_segment`` is False and the language
            is not 'en'.
    """
    # Fail before touching the filesystem or waiting for a server; this used
    # to surface later as an UnboundLocalError while building the prompt.
    if prompt_llm_system_language not in ('en', 'ko'):
        raise ValueError(
            f"Unsupported prompt_llm_system_language: {prompt_llm_system_language!r} "
            "(expected 'en' or 'ko')")

    p_vlm_outdir = Path('output/ucf-crime-captions') / f"prompt={prompt_vlm.replace(' ', '_')}_duration_{duration_sec}s/raw"
    df_ann_test = pd.read_csv(
        p_ann_test, sep=r'\s+', header=None, names=['video', 'label', 's1', 'e1', 's2', 'e2'])
    p_llm_root = p_vlm_outdir.parent / llm_model.replace('/', '-')
    if process_each_segment:
        p_llm_outdir = p_llm_root / prompt_llm_system_language
    else:
        assert prompt_llm_system_language == 'en', \
            "whole-video scoring is only available with the English prompt"
        p_llm_outdir = p_llm_root / 'whole'
    p_llm_outdir.mkdir(exist_ok=True, parents=True)

    # Shard the annotation rows across workers.
    df_ann_test = df_ann_test.iloc[rank::world_size]

    if llm_model not in OPENAI_MODELS:
        server_address = f"http://{host}:{port}"
        print(f'Waiting for LLM server at {server_address}...', flush=True)
        wait_for_server(server_address, timeout=600)
        client = openai.Client(api_key="EMPTY", base_url=f"{server_address}/v1")
    else:
        print('Using OpenAI API', flush=True)
        client = openai.Client(api_key=os.environ.get('OPENAI_API_KEY'))

    system_prompt = _build_system_prompt(prompt_llm_system_language, process_each_segment)
    score_video = _score_each_segment if process_each_segment else _score_whole_video

    for _, row in tqdm(
        df_ann_test.iterrows(), total=len(df_ann_test), mininterval=1, position=0, file=sys.stdout
    ):
        p_json = (p_vlm_outdir / row['label'] / row['video']).with_suffix('.json')
        if not p_json.exists():
            print(f"Skipping generating llm captions of {p_json} as it does not exist", flush=True)
            continue
        # read_text/write_text close the file handle, which the bare
        # .open() calls used before never did.
        video_record = json.loads(p_json.read_text())
        p_json_new = (p_llm_outdir / video_record['label'] / video_record['video']).with_suffix('.json')
        if p_json_new.exists():
            print(f"Skipping {p_json_new}", flush=True)
            continue
        print(f'\nProcessing {p_json}\n\t-> {p_json_new}', flush=True)
        p_json_new.parent.mkdir(exist_ok=True, parents=True)

        score_video(client, video_record, system_prompt, llm_model, debug)

        # Serialize before creating the file: a serialization error must not
        # leave a truncated output that later runs would skip as "done".
        p_json_new.write_text(json.dumps(video_record, indent=2))

# Score each video with a single request covering all of its segments, using
# the LLM served at host `llm_server`.
generate_llm(
    host='llm_server',
    llm_model='meta-llama/Llama-3.1-8B-Instruct',
    process_each_segment=False,
    debug=False,
)