This notebook includes two demos:
1. [Download](#download) a track from the JSONL file
2. [Parse and extract](#pipeline) tracks in a video using the TrackVerse automated pipeline

In [3]:
import json, gzip
import tqdm

<a id='download'></a>
## Download tracks from the JSONL file
### (1) Read the JSONL file
For demo purposes, we read only one track from the 184K-CB300 subset to show how to read the JSONL file and download the corresponding track.

To download all the tracks, refer to the [download instructions](https://github.com/MMPLab/TrackVerse?tab=readme-ov-file#download-trackverse).

In [4]:
# Root directory where downloaded videos and extracted tracks are stored.
DATASET_PATH = './trackverse'

# NOTE: machine-specific location of the TrackVerse JSONL subsets;
# point this at your own copy of the JSONL files before running.
JSONL_DIR = "/home/yibingwei/dataset/object_tracks_db_fixed_detic/tracks_subsets/hdvila_lvis/NoTestVids"
subset = 'LVIS-184K-CB300-T0.0-NoTestVids.jsonl.gzip'

subset_gzip = f'{JSONL_DIR}/{subset}'
# Read only the first record for the demo. The context manager guarantees the
# gzip handle is closed even though we leave the loop early.
with gzip.open(subset_gzip, 'rt') as jsonl_file:
    for line in tqdm.tqdm(jsonl_file):
        data = json.loads(line)
        break
    else:
        # An empty file would otherwise leave `data` undefined and fail later
        # with a confusing NameError.
        raise ValueError(f'No tracks found in {subset_gzip}')

0it [00:00, ?it/s]


Explanation of the keys:

- `track_id` - unique track identifier.
- `video_size` - [height, width] of the video from which this track was extracted.
- `track_ts` - [start_time, end_time] timestamps (seconds) in the original video for the first and last frame in the track.
- `top10_lbl` - Class IDs of the top-10 predicted classes for the track, based on class logit score.
- `top10_desc` - Names of the top-10 predicted classes for the track, based on class logit score.
- `top10_cls` - [[top-10 logits mean], [top-10 logits std]] A list of the mean values of the classification logits for the top 10 classes, and a list of the standard deviations for these logits.
- `top10_wcls` - [[top-10 weighted logits mean], [top-10 weighted logits std]] A list of the mean scores for each of the top 10 weighted scores (class logits weighted by the objectness score), and a list of the standard deviations of these scores.
- `frame_ts` - timestamps (seconds) in the original video for each frame in the track
- `frame_bboxes` - list of bounding box coordinates [top_left_x, top_left_y, bottom_right_x, bottom_right_y] of the object for each frame in the track.
- `yid` - YouTube ID for the video from which this track was extracted
- `mp4_filename` - Filename of the track produced by running the track extraction pipeline.

In [5]:
# Show the metadata fields available for the loaded track.
list(data)

['track_id',
 'video_size',
 'track_ts',
 'top10_lbl',
 'top10_desc',
 'top10_cls',
 'top10_wcls',
 'frame_ts',
 'frame_bboxes',
 'yid',
 'mp4_filename']

### (2) Download the original video from YouTube
to `DATASET_PATH`


In [7]:
from download_videos import VideoDownloader, parse_arguments as parse_dl_args

# Replace argv with a single empty entry before parsing arguments
# (presumably so the parser does not see the notebook kernel's own
# command-line flags -- TODO confirm against download_videos.parse_arguments).
import sys
sys.argv = ['']
del sys

# Start from the downloader's parsed options, then redirect them to the demo dataset directory.
args = parse_dl_args()
args.base_dir = DATASET_PATH
args.yid_index_fn = ''

# Process the source video of the track loaded above.
downloader = VideoDownloader(args)
downloader.process_video(
    youtube_id=data['yid'],
    job_id=0,
)

[0][1l4wfwq2TLo] Already downloaded.
[0][1l4wfwq2TLo] Already split into segments.


### (3) Extract the track

In [8]:
from extract_tracks import ObjectTrackExtractor, Track
import numpy as np

extractor = ObjectTrackExtractor(base_dir=DATASET_PATH)

# Wrap the JSONL record in a single Track: per-frame timestamps and boxes
# are passed as float arrays, and the full record is attached as metadata.
tracks = [
    Track(
        data['yid'],
        ts=np.array(data['frame_ts'], dtype=float),
        boxes=np.array(data['frame_bboxes'], dtype=float),
        meta=data,
    )
]
extractor.extract_tracks_from_video(
    vid=data['yid'],
    tracks=tracks,
    job_id=0,
)

[0][1l4wfwq2TLo] Start track extraction
[0][1l4wfwq2TLo] Track extraction done.


In [9]:
# Render the extracted track clip inline in the notebook.
from IPython.display import Video

track_mp4_path = f"{DATASET_PATH}/tracks_mp4/{extractor.dataset_domain}/{data['mp4_filename']}"
Video(track_mp4_path)

<a id='pipeline'></a>
## Parse and extract tracks from a scene clip in a video using the TrackVerse automated pipeline
For demo purposes, we use only one scene from the downloaded video and extract the tracks from that scene.

To use the pipeline to create a full dataset, refer to the [pipeline instructions](https://github.com/MMPLab/TrackVerse/tree/main?tab=readme-ov-file#generate-customized-trackverse-dataset).


In [9]:
from parse_tracks import ObjectTracksParser, parse_arguments

# Replace argv with a single empty entry before parsing arguments
# (presumably so the parser does not see the notebook kernel's own
# command-line flags -- TODO confirm against parse_tracks.parse_arguments).
import sys
sys.argv = ['']
del sys

from bytetrack.byte_tracker import BYTETracker
from utils import detic as detic_utils
from utils import avio

# Start from the pipeline's parsed options, then redirect them to the demo dataset directory.
args = parse_arguments()
args.base_dir = DATASET_PATH
args.yid_index_fn = ''
parser = ObjectTracksParser(args)

# Build the detector from the parsed options (GPU 0 is hard-coded for the demo).
detector = detic_utils.build_detic(
    args.class_prompts, args.frame_size, args.nms, args.conf, gpu_id=0
)

# Build the tracker from the parsed tracking options.
tracker = BYTETracker(
    args.track_thresh,
    args.track_iou_low_thresh,
    args.match_thresh,
    args.frame_rate,
    args.track_buffer,
    args.motion_weight,
    args.mot20,
)

# Path of the video for the track loaded above.
youtube_id = data['yid']
video_filepath = f"{DATASET_PATH}/videos_mp4/{youtube_id[:2]}/{youtube_id}.mp4"

# Calculate the maximum batch size for the detector given this video's frame size.
video_frame_size = avio.VideoDB(video_filepath).reader.frame_size
batch_size = parser.get_max_batch_size(detector, video_frame_size)

Loading pretrained CLIP


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  [35mroi_heads.mask_head.mask_fcn1.{bias, weight}[0m
  [35mroi_heads.mask_head.mask_fcn2.{bias, weight}[0m
  [35mroi_heads.mask_head.mask_fcn3.{bias, weight}[0m
  [35mroi_heads.mask_head.mask_fcn4.{bias, weight}[0m
  [35mroi_heads.mask_head.deconv.{bias, weight}[0m
  [35mroi_heads.mask_head.predictor.{bias, weight}[0m


In [10]:
# You can get all segments of the video by reading the processed segment file
# and processing each of them.
segm_filepath = f"{DATASET_PATH}/videos_segm/{youtube_id[:2]}/{youtube_id}.txt"
# Use a context manager so the file handle is closed, and skip blank lines
# (e.g. a trailing newline), which would otherwise break the tuple unpacking below.
with open(segm_filepath, "r") as segm_file:
    segments = [ln.strip().split(',') for ln in segm_file if ln.strip()]
segments = [(float(start), float(end)) for start, end in segments]

# For demo purposes, we only process the scene segment containing the track extracted above,
# i.e. the first segment whose [start, end] range covers the track's first and last frame.
track_start, track_end = data['frame_ts'][0], data['frame_ts'][-1]
for start, end in segments:
    if end >= track_end and start <= track_start:
        seg_start = start
        seg_end = end
        break
else:
    # Without this guard, seg_start/seg_end would be undefined (or stale from a
    # previous run of this cell) when no segment matches.
    raise ValueError(
        f"No segment in {segm_filepath} contains the track [{track_start}, {track_end}]"
    )

parser.parse_object_tracks(video_filepath, [seg_start, seg_end], detector, tracker, batch_size, job_id=0)

[0][1l4wfwq2TLo] Start parsing segment [157.991, 161.862].
[0][1l4wfwq2TLo][62.2%] Parsing object tracks | InferenceSpeed= 1.72 sec video/sec | NumTracks=0.
[0][1l4wfwq2TLo] Finished parsing segment. Found 0 tracks.


In [11]:
# Print every track record written for this video.
# The path is built from DATASET_PATH (like every other cell) rather than a
# hard-coded "trackverse/" prefix, so changing DATASET_PATH keeps this cell working.
meta_data = f"{DATASET_PATH}/tracks_meta/TrackVerseLVIS/{youtube_id[:2]}/{youtube_id}-meta.jsonl.gzip"
# The context manager closes the gzip handle once all records are read.
with gzip.open(meta_data, 'rt') as meta_file:
    for line in tqdm.tqdm(meta_file):
        print(json.loads(line))

0it [00:00, ?it/s]
