In [3]:
"""
Usage:

python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava

python3 http_llava_onevision_test.py
"""

import base64
import io
import os
import sys
import time

import numpy as np
import openai
import requests
from decord import VideoReader, cpu
from PIL import Image

# pip install httpx==0.23.3
# pip install decord
# pip install protobuf==3.20.0


In [6]:


def download_video(url, cache_dir, filename="jobs.mp4", timeout=60):
    """Download ``url`` into ``cache_dir`` and return the local file path.

    Args:
        url: HTTP(S) address of the file to fetch.
        cache_dir: Directory the file is stored in; created when missing.
        filename: Name of the saved file inside ``cache_dir``.
        timeout: Seconds handed to ``requests.get`` so that an unresponsive
            server raises instead of blocking the notebook indefinitely.

    Returns:
        The path of the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    file_path = os.path.join(cache_dir, filename)
    os.makedirs(cache_dir, exist_ok=True)

    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk so an error page is never saved as a video.
    response.raise_for_status()

    with open(file_path, "wb") as f:
        f.write(response.content)

    print(f"File downloaded and saved to: {file_path}")
    return file_path


def create_openai_client(base_url):
    """Build an ``openai.Client`` that sends its requests to ``base_url``.

    The API key is the fixed placeholder string ``"EMPTY"``.
    """
    client = openai.Client(base_url=base_url, api_key="EMPTY")
    return client


def image_stream_request_test(client):
    """Send a single-image chat request in streaming mode and echo the reply.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI-style
            client). The call is made with ``stream=True`` and the returned
            object is iterated chunk by chunk.

    Each text delta is written to stdout as soon as it arrives; nothing is
    returned.
    """
    print("----------------------Image Stream Request Test----------------------")
    stream_request = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                        },
                    },
                    {
                        "type": "text",
                        "text": "Please describe this image. Please list the benchmarks and the models.",
                    },
                ],
            },
        ],
        temperature=0.7,
        max_tokens=1024,
        stream=True,
    )

    for chunk in stream_request:
        # A chunk may carry an empty ``choices`` list (e.g. a usage-only
        # chunk); skip it instead of failing with IndexError.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content is not None:
            sys.stdout.write(content)
            sys.stdout.flush()

    print('\n' + "-" * 30)


def multi_image_stream_request_test(client):
    """Send a two-image chat request in streaming mode and echo the reply.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI-style
            client). Both image parts are tagged with
            ``"modalities": "multi-images"``.

    Each text delta is written to stdout as soon as it arrives; nothing is
    returned.
    """
    print(
        "----------------------Multi-Images Stream Request Test----------------------"
    )
    stream_request = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                        },
                        "modalities": "multi-images",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
                        },
                        "modalities": "multi-images",
                    },
                    {
                        "type": "text",
                        "text": "I have shown you two images. Please describe the two images to me.",
                    },
                ],
            },
        ],
        temperature=0.7,
        max_tokens=1024,
        stream=True,
    )

    for chunk in stream_request:
        # A chunk may carry an empty ``choices`` list (e.g. a usage-only
        # chunk); skip it instead of failing with IndexError.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content is not None:
            sys.stdout.write(content)
            sys.stdout.flush()

    print('\n' + "-" * 30)


def video_stream_request_test(client, video_path):
    """Send the frames of ``video_path`` as a streaming chat request.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI-style
            client).
        video_path: Path handed to ``prepare_video_messages`` to build the
            request messages.

    Each text delta is written to stdout as soon as it arrives; nothing is
    returned.
    """
    print("------------------------Video Stream Request Test----------------------")
    messages = prepare_video_messages(video_path)

    video_request = client.chat.completions.create(
        model="default",
        messages=messages,
        temperature=0,
        max_tokens=1024,
        stream=True,
    )
    print("-" * 30)

    for chunk in video_request:
        # A chunk may carry an empty ``choices`` list (e.g. a usage-only
        # chunk); skip it instead of failing with IndexError.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content is not None:
            sys.stdout.write(content)
            sys.stdout.flush()
    print('\n' + "-" * 30)


def image_speed_test(client):
    """Time one non-streaming single-image request and print throughput.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI-style
            client).

    The reply text is printed, then the completion object and the two
    timestamps are handed to ``print_speed_test_results``.
    """
    print("----------------------Image Speed Test----------------------")
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or made negative) by a wall-clock adjustment during the request.
    start_time = time.perf_counter()
    completion = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                        },
                    },
                    {
                        "type": "text",
                        "text": "Please describe this image. Please list the benchmarks and the models.",
                    },
                ],
            },
        ],
        temperature=0,
        max_tokens=1024,
    )
    end_time = time.perf_counter()
    response = completion.choices[0].message.content
    print(response)
    print("-" * 30)
    print_speed_test_results(completion, start_time, end_time)


def video_speed_test(client, video_path):
    """Time one non-streaming video request and print throughput.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI-style
            client).
        video_path: Path handed to ``prepare_video_messages``; building the
            messages happens before the timer starts.

    The reply text is printed, then the completion object and the two
    timestamps are handed to ``print_speed_test_results``.
    """
    print("------------------------Video Speed Test------------------------")
    messages = prepare_video_messages(video_path)

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or made negative) by a wall-clock adjustment during the request.
    start_time = time.perf_counter()
    video_request = client.chat.completions.create(
        model="default",
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    end_time = time.perf_counter()
    video_response = video_request.choices[0].message.content
    print(video_response)
    print("-" * 30)
    print_speed_test_results(video_request, start_time, end_time)


def prepare_video_messages(
    video_path,
    max_frames_num=32,
    prompt="Please describe the video in detail.",
):
    """Build a chat ``messages`` list from uniformly sampled video frames.

    Args:
        video_path: Path opened with ``VideoReader``.
        max_frames_num: Number of frame indices sampled with ``np.linspace``
            between the first and the last frame. When the video has fewer
            frames than this, indices repeat.
        prompt: Text part appended after the frames.

    Returns:
        A one-element list holding a ``user`` message whose ``content`` is
        one ``image_url`` part per sampled frame (JPEG, base64 data URL,
        tagged ``"modalities": "video"``) followed by the text prompt.

    Raises:
        ValueError: If the reader reports zero frames.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    if total_frame_num == 0:
        # linspace(0, -1, ...) would otherwise produce out-of-range indices.
        raise ValueError(f"No frames could be read from video: {video_path}")

    frame_idx = np.linspace(
        0, total_frame_num - 1, max_frames_num, dtype=int
    ).tolist()
    frames = vr.get_batch(frame_idx).asnumpy()

    content = []
    for frame in frames:
        # Encode each frame as an in-memory JPEG and wrap it in a data URL.
        buff = io.BytesIO()
        Image.fromarray(frame).save(buff, format="JPEG")
        base64_frame = base64.b64encode(buff.getvalue()).decode("utf-8")
        content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_frame}"},
                "modalities": "video",
            }
        )

    content.append({"type": "text", "text": prompt})

    return [{"role": "user", "content": content}]


def print_speed_test_results(request, start_time, end_time):
    """Print token counts and tokens-per-second figures for one request.

    Args:
        request: Completion object whose ``usage`` attribute exposes
            ``total_tokens``, ``completion_tokens`` and ``prompt_tokens``.
        start_time: Timestamp taken before the request, in seconds.
        end_time: Timestamp taken after the request, in seconds.

    When the interval is not positive (e.g. identical timestamps from a
    coarse clock) the rates are reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    usage = request.usage
    total_tokens = usage.total_tokens
    completion_tokens = usage.completion_tokens
    prompt_tokens = usage.prompt_tokens
    elapsed = end_time - start_time

    def per_second(token_count):
        # A rate is only meaningful over a positive interval.
        return token_count / elapsed if elapsed > 0 else float("nan")

    print(f"Total tokens: {total_tokens}")
    print(f"Completion tokens: {completion_tokens}")
    print(f"Prompt tokens: {prompt_tokens}")
    print(f"Time taken: {elapsed} seconds")
    print(f"Token per second: {per_second(total_tokens)}")
    print(f"Completion token per second: {per_second(completion_tokens)}")
    print(f"Prompt token per second: {per_second(prompt_tokens)}")


def main(
    url="https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4",
    cache_dir="/data/gunsbrother/.cache",
    base_url="http://127.0.0.1:30000/v1",
):
    """Download the sample video and run every request test against a server.

    Args:
        url: Address of the sample video to download.
        cache_dir: Directory the video is saved into; ``~`` is expanded.
        base_url: Base URL of the OpenAI-compatible endpoint under test.

    The defaults reproduce the values that were previously hard-coded, so
    calling ``main()`` with no arguments behaves as before.
    """
    video_path = download_video(url, os.path.expanduser(cache_dir))

    client = create_openai_client(base_url)

    image_stream_request_test(client)
    multi_image_stream_request_test(client)
    video_stream_request_test(client, video_path)
    image_speed_test(client)
    video_speed_test(client, video_path)


In [7]:
# Fetch the sample clip into the local cache; `p_video` is reused by the
# video tests in the following cell.
url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
cache_dir = os.path.expanduser("/data/gunsbrother/.cache")
p_video = download_video(url=url, cache_dir=cache_dir)

# Client for the endpoint that the image/video tests are sent to.
client_llava = create_openai_client(base_url="http://163.180.160.54:30012/v1")


File downloaded and saved to: /data/gunsbrother/.cache/jobs.mp4


In [None]:
# Streaming tests: replies are echoed to stdout chunk by chunk.
image_stream_request_test(client_llava)
multi_image_stream_request_test(client_llava)
video_stream_request_test(client_llava, p_video)
# Speed tests: one blocking request each, followed by token/throughput figures.
image_speed_test(client_llava)
video_speed_test(client_llava, p_video)

In [1]:
from pathlib import Path
# One "Fighting" clip from the local anomaly-detection dataset.
p_anom_video = Path('/data/gunsbrother/repos/vlm/datasets/anomaly-detection-dataset/Anomaly-Videos-Part-2/Fighting/Fighting003_x264.mp4')
# NOTE(review): the recorded output of this cell is a NameError for
# `video_stream_request_test`, i.e. it was executed before the cell that
# defines the helper functions (and `client_llava`). Run those cells first.
video_stream_request_test(client_llava, str(p_anom_video))

NameError: name 'video_stream_request_test' is not defined

In [8]:
from diffusers.utils import make_image_grid, export_to_gif
from IPython.display import Image as IPImage


def get_frames(p_video, max_frames_num=32, duration_sec=2, fps=30):
    """Yield uniformly sampled frames for each fixed-length video segment.

    The video is cut into consecutive segments of ``int(duration_sec * fps)``
    frames; a trailing remainder shorter than one segment is not yielded.

    Args:
        p_video: Path of the video; converted with ``str`` for ``VideoReader``.
        max_frames_num: Number of frame indices sampled per segment with
            ``np.linspace`` (indices repeat when this exceeds the segment
            length).
        duration_sec: Segment length in seconds.
        fps: Frame rate used to convert ``duration_sec`` into a frame count.
            This is a nominal value; it is not read from the video file.

    Yields:
        dict with keys ``frames`` (the sampled batch as returned by
        ``asnumpy()``), ``segment_idx``, ``total_segments``,
        ``segment_start_idx`` and ``segment_end_idx`` (inclusive).

    Raises:
        ValueError: If ``duration_sec * fps`` is less than one frame.
    """
    num_frames_segment = int(duration_sec * fps)
    if num_frames_segment <= 0:
        raise ValueError(
            f"duration_sec * fps must cover at least one frame, got {duration_sec} * {fps}"
        )

    vr = VideoReader(str(p_video), ctx=cpu(0))
    total_frame_num = len(vr)
    num_segments = total_frame_num // num_frames_segment
    for segment_idx in range(num_segments):
        segment_start_idx = segment_idx * num_frames_segment
        segment_end_idx = segment_start_idx + num_frames_segment - 1
        # Honour the caller's max_frames_num (it used to be overwritten by a
        # local constant, so the argument had no effect).
        uniform_sampled_frames = np.linspace(segment_start_idx, segment_end_idx, max_frames_num, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frames = vr.get_batch(frame_idx).asnumpy()
        yield {
            'frames': frames,
            'segment_idx': segment_idx,
            'total_segments': num_segments,
            'segment_start_idx': segment_start_idx,
            'segment_end_idx': segment_end_idx,
        }

def generate_video_caption(client, frames, prompt: str = "Please describe the video in detail."):
    """Ask the model behind ``client`` to caption a sequence of frames.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI-style
            client).
        frames: Iterable of frames; each one is passed to ``Image.fromarray``.
        prompt: Text part appended after the frames.

    Returns:
        The ``content`` of the first choice's message in the reply.
    """

    def _as_data_url(frame):
        # JPEG-encode the frame in memory and wrap the bytes in a data URL.
        jpeg_buffer = io.BytesIO()
        Image.fromarray(frame).save(jpeg_buffer, format="JPEG")
        encoded = base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
        return f"data:image/jpeg;base64,{encoded}"

    content = [
        {
            "type": "image_url",
            "image_url": {"url": _as_data_url(frame)},
            "modalities": "video",
        }
        for frame in frames
    ]
    content.append({'type': 'text', 'text': prompt})

    completion = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": content}],
        temperature=0,
        max_tokens=1024,
    )
    return completion.choices[0].message.content

# Caption each segment yielded by get_frames() and collect one record per
# segment; `responses` is consumed by the scoring loop further down.
responses = []
for frame_dict in get_frames(p_anom_video):
    video_response: str = generate_video_caption(client_llava, frame_dict['frames'], prompt='Describe the video in a few sentences.')
    # Keep the segment's frame range next to its caption.
    response = {
        'segment_idx': frame_dict['segment_idx'],
        'start_idx': frame_dict['segment_start_idx'],
        'end_idx': frame_dict['segment_end_idx'],
        'response': video_response,
    }
    responses.append(response)
    # Print as we go so progress is visible while requests are in flight.
    print(response)
# NOTE: after the loop `video_response` stays bound to the LAST segment's
# caption; the prompt-building cell below reads that leftover value.

{'segment_idx': 0, 'start_idx': 0, 'end_idx': 59, 'response': 'The video captures a scene at a subway station. The platform is adorned with a vibrant, colorful floor design. A group of people, including a man in a red jacket, are standing near the platform edge. The subway train is not visible in the frame. The atmosphere is calm, and the people appear to be waiting for the train to arrive.'}
{'segment_idx': 1, 'start_idx': 60, 'end_idx': 119, 'response': 'The video captures a bustling subway station with a colorful floor, where multiple individuals are seen waiting for the train. The station is well-lit with overhead lights, and there are large maps on the walls providing information about train routes. The people in the video are dressed in various styles of clothing, and some are carrying backpacks. The train tracks are visible on the left side of the frame, and the train itself is not seen in the video.'}
{'segment_idx': 2, 'start_idx': 120, 'end_idx': 179, 'response': 'The video c

In [11]:
# Instruction asking for a suspiciousness rating of a scene description.
context_prompt = "If you were a law enforcement agency, how would you rate this scene on a scale from 0 to 1, with 0 representing a standard scene and 1 denoting a scene with suspicious activities?"
# context_prompt = "If you were a law enforcement agency, how would you rate this scene on a scale from 0 to 1, with 0 representing a standard scene and 1 denoting a scene with unusual activities?"
# Output-format constraint: a single number from a fixed list, in brackets.
format_prompt = "Please provide the response in the form of a Python list and respond with only one number in the provided list below [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] without any textual explanation. It should begin with '[' and end with  ']'."
# Flatten the caption BEFORE interpolating it: a backslash inside an f-string
# replacement field is a SyntaxError on Python versions earlier than 3.12.
# NOTE(review): `video_response` is whatever the caption loop left bound (its
# last segment), and chat() below is called with each stored caption rather
# than with `prompt` — confirm whether `prompt` is still needed.
scene_description = video_response.replace('\n\n', ' ').replace('\n', ' ')
prompt = f"{context_prompt} {format_prompt}\n\nScene description: {scene_description}"

# print(prompt)
# print(flush=True)


def chat(client, prompt: str):
    """Stream a text-only chat completion and return the full reply.

    The system message is built from the module-level ``context_prompt`` and
    ``format_prompt`` strings, which must be defined before this is called.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI-style
            client).
        prompt: Text sent as the user message.

    Returns:
        The concatenation of all streamed text deltas. Each delta is also
        written to stdout as it arrives.
    """
    messages = [
        {
            'role': 'system',
            'content': f'The following is a scene description. {context_prompt} {format_prompt}',
        },
        {
            "role": "user",
            "content": prompt,
        }
    ]
    request = client.chat.completions.create(
        model="default",
        messages=messages,
        temperature=0,
        max_tokens=1024,
        stream=True,
    )

    pieces = []
    for chunk in request:
        # A chunk may carry an empty ``choices`` list (e.g. a usage-only
        # chunk); skip it instead of failing with IndexError.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content is not None:
            pieces.append(content)
            sys.stdout.write(content)
            sys.stdout.flush()

    return "".join(pieces)

# Second endpoint (same host, port 30000) used for the text-only scoring requests.
client_llama = create_openai_client("http://163.180.160.54:30000/v1")
for response_dict in responses:
    # chat() streams each reply to stdout. `response` is overwritten on every
    # iteration, so only the last reply stays bound after the loop.
    response = chat(client_llama, response_dict['response'])
# Terminate the single line of streamed replies.
print()

[0.0.2][0.2][0.1][0.1][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.1][0.1][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.1][0.1][0.2][0.1][0.1][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2][0.2]


```bash
# Small LLaVA-OneVision (7B), tensor-parallel size 1
python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --tp-size=1 --chat-template=chatml-llava --host=$(hostname -i)

# Large LLaVA-OneVision (72B) across two nodes (one command per node rank), tensor-parallel size 16
NCCL_SOCKET_IFNAME=enp28s0 python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --tp-size=16 --nnodes=2 --node-rank=0 --chat-template=chatml-llava --nccl-init-addr $(hostname -i):23307 --host=$(hostname -i) --port=30000
NCCL_SOCKET_IFNAME=enp28s0 python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --tp-size=16 --nnodes=2 --node-rank=1 --chat-template=chatml-llava --nccl-init-addr 163.180.160.50:23307 --host=163.180.160.50

# Llama
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-3B-Instruct --port=30000 --tp-size=1 --host=$(hostname -i)
# DeepSeek
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --port=30000 --tp-size=1 --host=$(hostname -i)
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-70B --port=30000 --tp-size=1 --host=$(hostname -i)
```

# Allocating nodes with salloc
```bash
# Request 2 nodes with 8 GPUs each for 4 hours on the debug_grad partition, excluding the listed hosts
salloc -N 2 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=8 --mem-per-gpu=43G -p debug_grad -x 'ariel-m1,ariel-k[1,2]' -t 4:00:00 --job-name not-interactive
```

In [17]:
import pandas as pd
from pathlib import Path

# Locations of the temporal anomaly annotations and the per-video frame counts.
p_ann_test = Path('/data/gunsbrother/repos/vlm/datasets/anomaly-detection-dataset/Temporal_Anomaly_Annotation_for_Testing_Videos.txt')
p_num_frames = Path('/data/gunsbrother/repos/vlm/datasets/anomaly-detection-dataset/num_frames_per_video.txt')

# Both files are read as whitespace-separated tables without a header row.
df_ann_test = pd.read_csv(
    p_ann_test,
    sep=r'\s+',
    header=None,
    names=['video', 'label', 's1', 'e1', 's2', 'e2'],
)
df_num_frames = pd.read_csv(
    p_num_frames,
    sep=r'\s+',
    header=None,
    names=['video', 'num_frames'],
)
# Series indexed by video name, giving that video's frame count.
sr_num_frames = df_num_frames.set_index('video')['num_frames']

display(df_ann_test, sr_num_frames)

Unnamed: 0,video,label,s1,e1,s2,e2
0,Abuse028_x264.mp4,Abuse,165,240,-1,-1
1,Abuse030_x264.mp4,Abuse,1275,1360,-1,-1
2,Arrest001_x264.mp4,Arrest,1185,1485,-1,-1
3,Arrest007_x264.mp4,Arrest,1530,2160,-1,-1
4,Arrest024_x264.mp4,Arrest,1005,3105,-1,-1
...,...,...,...,...,...,...
285,Vandalism007_x264.mp4,Vandalism,240,750,-1,-1
286,Vandalism015_x264.mp4,Vandalism,2010,2700,-1,-1
287,Vandalism017_x264.mp4,Vandalism,270,330,780,840
288,Vandalism028_x264.mp4,Vandalism,1830,1980,2400,2670


video
Abuse/Abuse001_x264.mp4             2729
Abuse/Abuse002_x264.mp4              865
Abuse/Abuse003_x264.mp4             3699
Abuse/Abuse004_x264.mp4            16794
Abuse/Abuse005_x264.mp4              949
                                   ...  
Vandalism/Vandalism046_x264.mp4     2099
Vandalism/Vandalism047_x264.mp4     2483
Vandalism/Vandalism048_x264.mp4     7183
Vandalism/Vandalism049_x264.mp4     7999
Vandalism/Vandalism050_x264.mp4      899
Name: num_frames, Length: 1900, dtype: int64

In [28]:
import numpy as np

# Build a per-frame binary label vector (1 = annotated anomalous frame) for
# every video listed in the test annotations.
ann_vad = {}
for idx, row in df_ann_test.iterrows():
    key = f"{row['label']}/{row['video']}"
    # Rows labelled "Normal" are looked up under a different directory prefix
    # in the frame-count index.
    key = key.replace('Normal/', 'Testing_Normal_Videos_Anomaly/')
    num_frames = sr_num_frames[key]
    bin_label = np.zeros(num_frames, dtype=np.int32)
    # NOTE(review): the slices treat e1/e2 as exclusive end frames — confirm
    # against the annotation format. An s1/e1 pair of -1/-1 yields an empty
    # slice, so such rows stay all-zero.
    bin_label[row['s1']:row['e1']] = 1
    if row['s2'] != -1:
        bin_label[row['s2']:row['e2']] = 1
    ann_vad[key] = bin_label

# Uniform random scores as a chance-level baseline. The generator is seeded so
# the printed AUC is reproducible across reruns.
rng = np.random.default_rng(0)
fake_preds = {}
for key, bin_label in ann_vad.items():
    fake_preds[key] = rng.random(len(bin_label))

# compute AUC
from sklearn.metrics import roc_auc_score
# Concatenate once instead of growing the arrays inside a loop (which copies
# everything accumulated so far on every iteration).
all_preds = np.concatenate([fake_preds[key] for key in ann_vad])
all_labels = np.concatenate([ann_vad[key] for key in ann_vad])
auc = roc_auc_score(all_labels, all_preds)
print(auc)

0.49952901697214774
