# Minimal Code

In [2]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

model = load_model("groundingdino/config/GroundingDINO_SwinB_cfg.py", "weights/groundingdino_swinb_cogcoor.pth")
IMAGE_PATH = ".asset/cat_dog.jpeg"
TEXT_PROMPT = "chair . person . dog ."
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_image.jpg", annotated_frame)


final text_encoder_type: bert-base-uncased


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



True

In [14]:
from decord import VideoReader
from PIL import Image
from pathlib import Path


p_clip = Path('sample/000eba33-8d14-446a-b016-19bd50e9a3b9/clip.mp4')
vr = VideoReader(str(p_clip))

chunk_size = 1200
for chunk_idx, frame_offset in enumerate(range(0, len(vr), chunk_size)):
    print(f'Reading frames # {frame_offset:5d} ~ {frame_offset + chunk_size:5d}')
    x = vr[frame_offset : frame_offset + chunk_size].asnumpy()[..., ::-1]
    p_frames_dir = p_clip.parent / f'rawframes/{chunk_idx:03d}'
    p_frames_dir.mkdir(parents=True, exist_ok=True)
    print(f'Chunk Dir: {str(p_frames_dir)}')
    for i, frame in enumerate(x):
        global_frame_idx = frame_offset + i
        p_frame = p_frames_dir / f'{global_frame_idx:05d}.png'
        Image.fromarray(frame).save(p_frame)
        print(f'\rsaved {p_frame}', end='')
    print()


Reading frames #     0 ~  1200
Chunk Dir: sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/000
saved sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/000/01199.png
Reading frames #  1200 ~  2400
Chunk Dir: sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/1200
saved sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/1200/02399.png
Reading frames #  2400 ~  3600
Chunk Dir: sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/2400
saved sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/2400/03599.png
Reading frames #  3600 ~  4800
Chunk Dir: sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/3600
saved sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/3600/04799.png
Reading frames #  4800 ~  6000
Chunk Dir: sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/4800
saved sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/4800/05999.png
Reading frames #  6000 ~  7200
Chunk Dir: sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/6000
saved sample/000eba33

In [None]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

model = load_model("groundingdino/config/GroundingDINO_SwinB_cfg.py", "weights/groundingdino_swinb_cogcoor.pth")
IMAGE_PATH = "sample/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes/000/00000.png"
TEXT_PROMPT = "chopping stick . dustbin . water bottle ."
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_image.jpg", annotated_frame)


In [15]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
from groundingdino.models.GroundingDINO.groundingdino import GroundingDINO
import cv2
from pathlib import Path
import time
import numpy as np
import torch

def fmts2hms(seconds) -> str:
    seconds = float(seconds)
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = int(seconds % 60)
    return f'{hours:2d} h {minutes:2d} m {seconds:2d} s'

queries = [
    'chopping stick',
    'dustbin',
    'water bottle',
    'jar',
    'chopsticks',
    'bag',
    'ceramic bowl',
    'plates',
    'piece of cloth',
]

device = f'cuda:0'
model: GroundingDINO = load_model("groundingdino/config/GroundingDINO_SwinB_cfg.py", "weights/groundingdino_swinb_cogcoor.pth").to(device)
# TEXT_PROMPT = "chopping stick . dustbin . water bottle ."
TEXT_PROMPT = ' . '.join(queries) + ' .'
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

p_frames_dir = Path('samples/000eba33-8d14-446a-b016-19bd50e9a3b9/rawframes')
p_all_frames = sorted(p_frames_dir.glob('**/*.png'))
assert p_all_frames, f'No frames detected under {str(p_all_frames)}.'

log_period = 100
t0_global = time.time()
t_loading, t_feeding = [], []
print('Loading done. start processing. ...')
for i, p_frame in enumerate(p_all_frames):
    t0_frame = time.time()
    image_source, image = load_image(str(p_frame))
    t1_frame = time.time()
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=TEXT_PROMPT,
        box_threshold=BOX_TRESHOLD,
        text_threshold=TEXT_TRESHOLD,
        device=device
    )
    t2_frame = time.time()
    t_loading.append(t1_frame - t0_frame)
    t_feeding.append(t2_frame - t1_frame)
    if i > 0 and i % log_period == 0:
        print(f'[{i:5d}/{len(p_all_frames):5d}] Global {fmts2hms(t2_frame-t0_global)} | Loading {fmts2hms(sum(t_loading))} | Processing {fmts2hms(sum(t_feeding))}')
        annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
        cv2.imwrite(f'sample/000eba33-8d14-446a-b016-19bd50e9a3b9/annotated/{i:05d}.jpg', annotated_frame)


# BBOX 다 뽑기

- 매 비디오 → job
    - 매 프레임 뭉텅이 → 한 번에 읽기도 무리 + 그렇다고 배치마다 읽기도 무리, 배치 사이즈 x N 배 수준으로
        - 매 배치
            ```python
            frames = preprocess(vr[s:e].asarray())

            annotations = ann[clip_id]
            words = [ann['slot_x'] for ann in annotations]
            query = ' . '.join(words) + ' .'
            ```
            - feed into the model

- 이 레포 배치 여러 개에 대해 동작하도록 수정
- 최대 배치사이즈?
    ```python
    B = 128; images = torch.randn(B, 3, 568, 320, pin_memory=True).to(device)
    ```

- annotation 뭉텅이:
    ```python
    clip_queries = {
        "clip_uid": "93231c7e-1cf4-4a20-b1f8-9cc9428915b2",
    }
    ```

In [None]:
# 비디오 사이즈는 3개밖에 없음

import pandas as pd

df = pd.read_csv(
    'egonlq_clips_info.csv',
    names=['clip_id', 'width', 'height', 'length'],
    sep=' ', header=None)
set(df['width']), set(df['height'])


({322, 426, 568}, {320})

In [None]:
# 배치사이즈 알아내기용 코드

import torch
from groundingdino.util.inference import load_model, predict

model = load_model("groundingdino/config/GroundingDINO_SwinB_cfg.py", "weights/groundingdino_swinb_cogcoor.pth")
TEXT_PROMPT = "chair . person . dog ."
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

device = 'cuda'
B = 128; images = torch.randn(B, 3, 322, 320, pin_memory=True).to(device)

boxes, logits, phrases = predict(
    model=model,
    image=images,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)


In [46]:
from typing import List, Dict, Tuple, Union
import numpy as np
from pathlib import Path


In [11]:
# global setup
ngpus: int = 8
workers_per_gpu: int = 8

# ranks
gpu_rank: int = 2  # [0, ngpus)
worker_rank_offset: int = 3  # [0, workers_per_gpu), more like an offset
num_workers: int = ngpus * workers_per_gpu
worker_rank: int = gpu_rank * workers_per_gpu + worker_rank_offset  # [0, num_workers)

# p_videos = Path('')
# jobs: List[str] = sorted(p_videos)
njobs = 1657#len(jobs)
job_indices: List[str] = np.arange(worker_rank, njobs, num_workers)
worker_rank, num_workers, job_indices


(19,
 64,
 array([  19,   83,  147,  211,  275,  339,  403,  467,  531,  595,  659,
         723,  787,  851,  915,  979, 1043, 1107, 1171, 1235, 1299, 1363,
        1427, 1491, 1555, 1619]))

In [56]:
from typing import List, Dict, Tuple, Union
import numpy as np

from pathlib import Path
import json

import torch
from PIL import Image
from decord import VideoReader

import groundingdino.datasets.transforms as T

def load_annotations():
    p_ann_dir = Path('/data/datasets/ego4d_data/v2/annotations')
    p_json_train, p_ann_val = p_ann_dir / 'nlq_train.json', p_ann_dir / 'nlq_val.json'
    ann_train, ann_val = json.load(p_json_train.open()), json.load(p_ann_val.open())
    ann_videos = ann_train['videos'] + ann_val['videos']
    # 생김새: ann_videos[0]['clips'][0]['annotations'][0]['language_queries'][0]

    # list of {'clip_uid': clip_uid, 'queries': list of query dicts}
    ann_clips: List[Dict[str, Union[str,List[dict]]]] = []
    clip_uid_to_idx: Dict[str, int] = {}
    clip_idx = 0
    for ann_video in ann_videos:
        for ann_clip in ann_video['clips']:
            clip_uid: str = ann_clip['clip_uid']
            all_clip_language_queries: List[List[dict]] = [
                annotation['language_queries']
                for annotation in ann_clip['annotations']]
            all_clip_language_queries: List[dict] = sum(
                all_clip_language_queries, start=[])
            ann_clips.append({'clip_uid': clip_uid, 'queries': all_clip_language_queries})
            clip_uid_to_idx[clip_uid] = clip_idx
            clip_idx += 1

    n_clips = len(ann_clips)
    n_queries = sum(map(lambda ann_clip: len(ann_clip['queries']), ann_clips))
    print(f'# clips: {n_clips}, # queries: {n_queries}')  # 1686 (v1은 1326), 18403 (v1은 11291 + 3874 = 15165)
    return ann_clips, clip_uid_to_idx

def preprocess_frame(frame: np.ndarray, transform) -> Tuple[np.ndarray, torch.Tensor]:
    frame = np.asarray(frame)
    frame_transformed, _ = transform(frame, None)
    return frame, frame_transformed

def preprocess_video(frames: np.ndarray, transform) -> Tuple[np.ndarray, torch.Tensor]:
    frames_transformed = []
    for frame in frames:
        frame, frame_transformed = preprocess_frame(frame, transform)
        frames_transformed.append(frame_transformed)
    frames_transformed = torch.cat(frames_transformed)
    return frames, frames_transformed


bsz = 128
prefetch_factor = 16
chunk_size = prefetch_factor * bsz
ann_clips, clip_uid_to_idx = load_annotations()
p_nlqv1_clips_dir = Path(f'/data/datasets/ego4d_data/v2/clips_320p-non_official')
clip_uids: List[str] = [p_clip.stem for p_clip in p_nlqv1_clips_dir.glob('*.mp4')]
transform = T.Compose(
    [
        T.RandomResize([800], max_size=1333),  # deterministic
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# for clip_uid in clip_uids
clip_uid = '000eba33-8d14-446a-b016-19bd50e9a3b9'
p_clip = p_nlqv1_clips_dir / f'{clip_uid}.mp4'
vr = VideoReader(str(p_clip))
ann_clip = ann_clips[clip_uid_to_idx[clip_uid]]
queries = ann_clip['queries']
words = [query['slot_x'] for query in queries]
text_prompt = ' . '.join(words) + ' .'  # 프롬프트는 클립 안에서는 동일함

from pprint import pprint
print(f'Clip: {clip_uid}, length: {len(vr)}')
for chunk_idx, frame_offset in enumerate(range(0, len(vr), chunk_size)):
    chunk: np.ndarray = vr[frame_offset : frame_offset + chunk_size].asnumpy()[..., ::-1]
    for frames in np.split(chunk, prefetch_factor):
        frames_source, frames = preprocess_video(frames, transform)
        boxes, logits, phrases = predict(
            model=model,
            image=images,
            caption=text_prompt,
            box_threshold=BOX_TRESHOLD,
            text_threshold=TEXT_TRESHOLD
        )
        pprint(boxes)
        pprint(logits)
        pprint(phrases)
        break
    break


# clips: 1686, # queries: 18403
Clip: 000eba33-8d14-446a-b016-19bd50e9a3b9, length: 14401


TypeError: cannot unpack non-iterable int object

In [42]:
clip_queries[list(clip_queries.keys())[0]]


[{'clip_start_sec': 17.25669,
  'clip_end_sec': 27.256,
  'video_start_sec': 17.2777186,
  'video_end_sec': 27.2770286,
  'video_start_frame': 518,
  'video_end_frame': 818,
  'template': 'Objects: What did I put in X?',
  'query': 'what did I pick from the fridge?',
  'slot_x': 'fridge',
  'verb_x': 'pick',
  'raw_tags': ['Objects: What did I put in X?',
   'what did I pick from the fridge?',
   'fridge',
   'pick']},
 {'clip_start_sec': 56.73617,
  'clip_end_sec': 59.932,
  'video_start_sec': 56.7571986,
  'video_end_sec': 59.9530286,
  'video_start_frame': 1702,
  'video_end_frame': 1798,
  'template': 'Objects: What did I put in X?',
  'query': 'what did I pick from the shelf?',
  'slot_x': 'shelf',
  'verb_x': 'pick',
  'raw_tags': ['Objects: What did I put in X?',
   'what did I pick from the shelf?',
   'shelf',
   'pick']},
 {'clip_start_sec': 123.17645,
  'clip_end_sec': 124.176,
  'video_start_sec': 123.1974786,
  'video_end_sec': 124.1970286,
  'video_start_frame': 3695,
  '