In [1]:
import json
from pathlib import Path
from tqdm import tqdm

In [2]:
# Clip UIDs that are skipped entirely when flattening annotations.
REPORTED_INVALID_CLIP_UIDS = {
    'b6061527-3ae2-46b4-b050-15eddc1967bb',  # vq2d
}

def generate_flat_annotations_egotracks(all_anns):
    """Flatten nested EgoTracks annotations into one record per query set.

    Walks ``all_anns['videos'] -> clips -> annotations -> query_sets`` and emits
    a flat dict for every query set that is marked valid and carries a
    non-empty ``lt_track``.

    Skipped silently:
      * clips whose ``clip_uid`` is ``None`` or listed in
        ``REPORTED_INVALID_CLIP_UIDS``;
      * query sets with a falsy ``is_valid``;
      * query sets whose ``lt_track`` is missing or empty (an empty track has
        no first/last frame to build ``lt_track_valid_range`` from).

    Args:
        all_anns: dict with a ``'videos'`` list, as loaded from an
            ``egotracks_{split}.json`` file.

    Returns:
        list[dict]: one record per kept query set with keys ``video_uid``,
        ``clip_uid``, ``annotation_uid`` (``"{clip_uid}_{query_set_id}"``),
        ``query_set``, ``clip_fps``, ``clip_duration``, ``original_width``,
        ``original_height``, ``query_frame``, ``object_title``,
        ``visual_crop``, ``lt_track_valid_range`` and ``lt_track``.
    """

    def polish_bbox_dict_keys(bbox: dict):
        # Rename the geometry keys to their short forms; every other key
        # (original_width/original_height, rotation, ...) is dropped.
        key_map = {
            'frame_number': 'fno',
            'x': 'x', 'y': 'y', 'width': 'w', 'height': 'h'}
        return {key_map[k]: v for k, v in bbox.items() if k in key_map}

    flat_anns = []
    for ann_video in all_anns['videos']:
        video_uid = ann_video['video_uid']
        for ann_clip in ann_video['clips']:
            clip_uid = ann_clip['clip_uid']
            if clip_uid is None or clip_uid in REPORTED_INVALID_CLIP_UIDS:
                continue
            clip_duration = ann_clip['video_end_sec'] - ann_clip['video_start_sec']
            clip_fps = ann_clip['clip_fps']
            for ann_annots in ann_clip['annotations']:
                for qset_id, qset in ann_annots['query_sets'].items():
                    if not qset['is_valid']:
                        continue
                    track = qset.get('lt_track')
                    if not track:
                        # Missing or empty track: nothing to export.
                        continue

                    # Prefer the frame size stored on the first track box; fall
                    # back to the visual crop when the track boxes lack it.
                    first_box = track[0]
                    if 'original_height' in first_box and 'original_width' in first_box:
                        size_source = first_box
                    else:
                        size_source = qset['visual_crop']
                    oh, ow = size_source['original_height'], size_source['original_width']

                    rt = [polish_bbox_dict_keys(bbox) for bbox in track]
                    sample = {
                        'video_uid': video_uid,
                        'clip_uid': clip_uid,
                        'annotation_uid': f'{clip_uid}_{qset_id}',
                        'query_set': qset_id,
                        'clip_fps': clip_fps,
                        'clip_duration': clip_duration,
                        'original_width': ow,
                        'original_height': oh,
                        'query_frame': qset['query_frame'],
                        'object_title': qset['object_title'],
                        'visual_crop': polish_bbox_dict_keys(qset['visual_crop']),
                        # First and last frame number in track order.
                        'lt_track_valid_range': [rt[0]['fno'], rt[-1]['fno']],
                        'lt_track': rt,
                    }
                    flat_anns.append(sample)
    return flat_anns

In [4]:
# Load the raw EgoTracks validation annotations and show the first video entry.
p_track_ann = Path("/data/datasets/ego4d_data/v2/egotracks/egotracks_val.json")
# Use a context manager so the file handle is closed as soon as parsing is done.
with p_track_ann.open() as f:
    track_anns = json.load(f)
track_anns['videos'][0]

{'video_uid': '20400762-1e47-462d-a7d1-64a1b162b1f9',
 'split': 'val',
 'clips': [{'clip_uid': '307c3ec6-886e-4d25-9ef7-7bea3cf7a243',
   'video_start_sec': 450.0,
   'video_end_sec': 930.0,
   'video_start_frame': 13500,
   'video_end_frame': 27900,
   'clip_start_sec': 0,
   'clip_end_sec': 480.0,
   'clip_start_frame': 0,
   'clip_end_frame': 14400,
   'clip_fps': 5.0,
   'annotation_complete': True,
   'source_clip_uid': '2751261c-c3c1-463e-8932-b92889b705d5',
   'annotations': [{'query_sets': {'1': {'is_valid': True,
       'errors': [],
       'query_frame': 1306,
       'query_video_frame': 21336,
       'response_track': [{'frame_number': 1191,
         'x': 89.28,
         'y': 0.42,
         'width': 153.26,
         'height': 108.25,
         'rotation': 0,
         'original_width': 1920,
         'original_height': 1080,
         'video_frame_number': 20646,
         'exported_clip_frame_number': 7146},
        {'frame_number': 1192,
         'x': 302.14,
         'y': 7.7

In [36]:
# Flatten the raw EgoTracks annotations of each split and write them next to the
# other project annotation files as `egotracks_v1_{split}.json`.
p_ann_dir = Path('/data/soyeonhong/vq2d/vq2d-lightning/data')

for split in ['val', 'train']:
    p_track_ann = Path(f"/data/datasets/ego4d_data/v2/egotracks/egotracks_{split}.json")
    with p_track_ann.open() as f_in:
        track_anns = json.load(f_in)
    flat_anns = generate_flat_annotations_egotracks(track_anns)

    p_ann = p_ann_dir / f'egotracks_v1_{split}.json'

    # Close (and therefore flush) the output explicitly so later cells that
    # read this file never see a partially written JSON document.
    with p_ann.open('w') as f_out:
        json.dump(flat_anns, f_out)

In [38]:
# Reload the flattened train annotations written above and show the first record.
with open('/data/soyeonhong/vq2d/vq2d-lightning/data/egotracks_v1_train.json') as f:
    all_anns = json.load(f)
all_anns[0]

{'video_uid': '57ab94d1-56d2-4227-b099-7388d4cefcbf',
 'clip_uid': '622c1b29-76c6-4845-95df-7e54792687d4',
 'annotation_uid': '622c1b29-76c6-4845-95df-7e54792687d4_1',
 'query_set': '1',
 'clip_fps': 5.0,
 'clip_duration': 300.00000000000006,
 'original_width': 1920,
 'original_height': 1440,
 'query_frame': 517,
 'object_title': 'microwave',
 'visual_crop': {'fno': 891,
  'x': 458.29,
  'y': 215.52,
  'w': 579.23,
  'h': 445.31},
 'lt_track_valid_range': [425, 1201],
 'lt_track': [{'fno': 425, 'x': 1643.7, 'y': 402.24, 'w': 276.58, 'h': 680.23},
  {'fno': 426, 'x': 1253.7, 'y': 259, 'w': 648.34, 'h': 612.36},
  {'fno': 427, 'x': 1113.54, 'y': 271.53, 'w': 679.22, 'h': 631.64},
  {'fno': 428, 'x': 990.02, 'y': 336.55, 'w': 733.87, 'h': 681.5},
  {'fno': 429, 'x': 864, 'y': 292.97, 'w': 818.76, 'h': 717.24},
  {'fno': 430, 'x': 739.27, 'y': 264.28, 'w': 899.2, 'h': 686.34},
  {'fno': 431, 'x': 639.72, 'y': 299.59, 'w': 1023.64, 'h': 605.3},
  {'fno': 432, 'x': 605.69, 'y': 299.46, 'w': 

In [None]:
from PIL import Image
from decord import VideoReader

# Crop every long-term-track box of every annotation in `all_anns` out of its clip
# video and save it as a JPEG under `outputs/lt_tracks/<split>/<clip_uid>/`.

# `all_anns` was loaded from the train file, so the crops belong to the train split.
# Set this explicitly instead of relying on the loop variable left over from the
# export cell.
crop_split = 'train'

# Annotation frame numbers are multiplied by this factor to index the clip video.
# NOTE(review): presumably 30 fps clip video / 5 fps annotation rate — confirm.
FRAME_STRIDE = 6

p_clips_dir = Path('/data/datasets/ego4d_data/v2/clips')
p_crop_out_dir = Path('/data/soyeonhong/vq2d/vq2d-lightning/outputs/lt_tracks') / crop_split

for aidx, ann in enumerate(tqdm(all_anns)):
    clip_uid = ann['clip_uid']
    # NOTE: `annotation_uid` already ends with the query-set id, so the id appears
    # twice in the file name. Kept as-is so existing output paths stay valid.
    qset_uuid = f"{ann['annotation_uid']}_{ann['query_set']}"
    rt = ann['lt_track']
    ow, oh = ann['original_width'], ann['original_height']

    p_obj_dir = p_crop_out_dir / clip_uid
    p_obj_dir.mkdir(exist_ok=True, parents=True)

    out_paths = [p_obj_dir / f"{clip_uid}_{box['fno']}_{qset_uuid}.jpg" for box in rt]
    # Opening a video is expensive; skip it when every crop is already on disk.
    if all(p.exists() for p in out_paths):
        continue

    p_clip = p_clips_dir / f'{clip_uid}.mp4'
    vr = VideoReader(str(p_clip))
    last_frame = len(vr) - 1
    previewed = False

    for box, p_out in zip(rt, out_paths):
        if p_out.exists():
            continue

        # Clamp to the last decodable frame in case the track runs past the video.
        frame = vr[min(FRAME_STRIDE * box['fno'], last_frame)].asnumpy()
        h, w = frame.shape[0], frame.shape[1]

        # Rescale the box from annotation resolution to the decoded frame size and
        # round to whole pixels (Pillow rounds the crop box the same way), so the
        # emptiness check below sees the exact box that would be cropped.
        left = round(box['x'] / ow * w)
        top = round(box['y'] / oh * h)
        right = round((box['x'] + box['w']) / ow * w)
        bottom = round((box['y'] + box['h']) / oh * h)

        img = Image.fromarray(frame)

        if right <= left or bottom <= top:
            # Degenerate box (zero or negative size after rounding): fall back to
            # the whole frame instead of failing on an empty image.
            print(f"Empty crop: {p_out}")
            cropped = img
        else:
            cropped = img.crop((left, top, right, bottom))

        cropped.save(p_out)
        # Show a single preview for every 500th annotation rather than every frame.
        if aidx % 500 == 0 and not previewed:
            display(cropped)
            previewed = True