This notebook includes two demos:
1. [Download](#download) a track from the JSONL file
2. [Parse and extract](#pipeline) tracks in a video using the TrackVerse automated pipeline

In [2]:
import os
import json, gzip
from IPython.display import Video

<a id='download'></a>
## Download tracks from the JSONL file
### (1) Read the JSONL file
For demo purposes, we read only one track from the 184K-CB300 subset to show how to read the JSONL file and download the corresponding track.

To just download the dataset, refer to the [download instructions](https://github.com/MMPLab/TrackVerse?tab=readme-ov-file#download-trackverse)

In [3]:
DATASET_PATH = './TrackVerseDB'
subset_gzip = f"{DATASET_PATH}/tracks_subsets/TrackVerseLVIS-CB300-184K-T0.jsonl.gzip"

# Each line of the gzip-compressed JSONL file is one JSON record describing a track.
# Only the first record is needed for this demo, so we stop after one line.
# Opening the file in a `with` block guarantees the handle is closed even though
# we leave the loop early.
with gzip.open(subset_gzip, 'rt') as subset_file:
    for line in subset_file:
        data = json.loads(line)
        # List the fields available in a track record, one per line.
        print('\n'.join(data.keys()))
        break

yid
fn
video_size
top10_lbl
top10_desc
top10_logit_mu
top10_logit_std
top10_wlogit_mu
top10_wlogit_std
track_ts
track_bbox


Data format

- `yid` - YouTube ID for the video from which this track was extracted
- `fn` - Filename of the track produced by running the track extraction pipeline.
- `video_size` - [height, width] of the video from which this track was extracted.
- `top10_lbl` - Class IDs of the top-10 predicted classes for the track, based on weighted class logit score.
- `top10_desc` - Names of the top-10 predicted classes.
- `top10_logit_mu` - Average (over time) of the classification logits for the `top10_lbl` classes.
- `top10_logit_std` - Standard deviation (over time) of the classification logits for the `top10_lbl` classes.
- `top10_wlogit_mu` - Average (over time) of the classification logits weighted by DETIC's objectness score for the `top10_lbl` classes.
- `top10_wlogit_std` - Standard deviation (over time) of the classification logits weighted by DETIC's objectness score for the `top10_lbl` classes.
- `track_ts` - Timestamps (seconds) in the original video for each frame in the track
- `track_bbox` - Bounding box coordinates [top_left_x, top_left_y, bottom_right_x, bottom_right_y] of the object for each frame in the track.

In [4]:
# YouTube ID of the video the selected track comes from; later cells use it
# to build the paths of the downloaded video and of its track metadata file.
youtube_id = data['yid']

### (2) Download the original video from YouTube

In [5]:
from utils.youtube import YoutubeDL

# Working directory for everything this demo writes (videos, track metadata, track clips).
TMP_PATH = './temporary-folder'

# Make sure the download folder exists before handing it to the downloader.
videos_dir = f"{TMP_PATH}/videos_mp4"
os.makedirs(videos_dir, exist_ok=True)

downloader = YoutubeDL(videos_dir)
# Left as the last expression so the call's return value is shown as the cell output.
downloader.download_video(youtube_id=data['yid'])

(<STATUS.DONE: 2>, './temporary-folder//videos_mp4/1l/1l4wfwq2TLo.mp4')

### (3) Extract the track

In [6]:
from extract_tracks import ObjectTrackExtractor, Track
import numpy as np

extractor = ObjectTrackExtractor(base_dir=TMP_PATH, dataset_domain='LVIS')

# Rebuild the track from the JSONL record: per-frame timestamps and bounding
# boxes are converted to float arrays, and the full record is attached as metadata.
track_ts = np.array(data['track_ts']).astype(float)
track_boxes = np.array(data['track_bbox']).astype(float)
track = Track(data['yid'], fn=data['fn'], ts=track_ts, boxes=track_boxes, meta=data)

# Extract just this one track from the downloaded video.
extractor.extract_tracks_from_video(vid=data['yid'], tracks=[track], job_id=0)

[0][1l4wfwq2TLo] Start track extraction
[0][1l4wfwq2TLo] Track extraction done.


In [7]:
# Play the extracted track inline in the notebook.
extracted_track_path = f"{TMP_PATH}/tracks_mp4/{extractor.dataset_domain}/{data['fn']}"
Video(extracted_track_path)

<a id='pipeline'></a>
## Parse and extract tracks from a scene clip in a video using the TrackVerse automated pipeline
For demo purposes, we use only one scene from the downloaded video and extract the tracks from that scene.

To use the pipeline to create a full dataset, refer to the [pipeline instructions](https://github.com/MMPLab/TrackVerse/tree/main?tab=readme-ov-file#generate-customized-trackverse-dataset).


In [8]:
# This demo parses a single segment only. To process a whole video, read its
# processed segment file and run the parser on every segment listed there.
from parse_tracks import ObjectTracksParser, DETIC_CFG, BYTETRACK_CFG

video_filepath = f"{TMP_PATH}/videos_mp4/{youtube_id[:2]}/{youtube_id}.mp4"
meta_data = f"{TMP_PATH}/tracks_meta/LVIS/{youtube_id[:2]}/{youtube_id}-meta.jsonl.gzip"

# Start from a clean metadata file for this video.
# NOTE(review): presumably this keeps records from an earlier run from being
# mixed with the new ones - confirm against ObjectTracksParser.
if os.path.exists(meta_data):
    os.remove(meta_data)

detic_cfg = DETIC_CFG()
bytetrack_cfg = BYTETRACK_CFG()
# NOTE(review): the meaning of the empty-string second argument is not visible here - confirm.
parser = ObjectTracksParser(TMP_PATH, '', 'LVIS', detic_cfg, bytetrack_cfg)

# Segment of the video to parse; the track timestamps printed by the next cell
# fall inside this range.
segment = [100, 120]
parser.parse_object_tracks(video_filepath, segment, batch_size=32, job_id=0)

Loading pretrained CLIP


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
The checkpoint state_dict contains keys that are not used by the model:
  [35mroi_heads.mask_head.mask_fcn1.{bias, weight}[0m
  [35mroi_heads.mask_head.mask_fcn2.{bias, weight}[0m
  [35mroi_heads.mask_head.mask_fcn3.{bias, weight}[0m
  [35mroi_heads.mask_head.mask_fcn4.{bias, weight}[0m
  [35mroi_heads.mask_head.deconv.{bias, weight}[0m
  [35mroi_heads.mask_head.predictor.{bias, weight}[0m


[0][1l4wfwq2TLo] Start parsing segment [100, 120].


  max_size = (max_size + (stride - 1)) // stride * stride


[0][1l4wfwq2TLo][39.2%] Parsing object tracks | InferenceSpeed= 1.46 sec video/sec | NumTracks=0.
[0][1l4wfwq2TLo] Finished parsing segment. Found 12 tracks.


In [9]:
extractor = ObjectTrackExtractor(base_dir=TMP_PATH, dataset_domain='LVIS')

# Rebuild one Track per record of the metadata file produced by the parsing step.
# The `with` block closes the gzip handle once all records have been read.
tracks = []
with gzip.open(meta_data, 'rt') as meta_file:
    for line in meta_file:
        track_meta = json.loads(line)
        tracks.append(Track(track_meta['yid'],
                            fn=track_meta['fn'],
                            ts=np.array(track_meta['track_ts']).astype(float),
                            boxes=np.array(track_meta['track_bbox']).astype(float),
                            # Attach this track's own record, not `data`, which is the
                            # unrelated record read from the subset file earlier.
                            meta=track_meta))
        # Summary line: top predicted class and the time span covered by the track.
        print(f"'{track_meta['top10_desc'][0]}' from {track_meta['track_ts'][0]} to {track_meta['track_ts'][-1]}")

# `meta_data` was built from `youtube_id`, so use that ID directly instead of
# relying on a loop variable that is undefined when the file has no records.
extractor.extract_tracks_from_video(vid=youtube_id, tracks=tracks, job_id=0)

'blazer' from 116.68323333333333 to 119.95316666666666
'person' from 116.68323333333333 to 119.95316666666666
'baseball cap' from 100.06663333333333 to 106.87343333333334
'jersey' from 100.13336666666666 to 106.87343333333334
'baseball cap' from 100.06663333333333 to 106.87343333333334
'jersey' from 100.13336666666666 to 106.87343333333334
'person' from 103.0029 to 108.3082
'person' from 103.0029 to 108.3082
'person' from 106.94016666666667 to 116.6165
'person' from 106.94016666666667 to 116.6165
'person' from 106.94016666666667 to 116.6165
'person' from 106.94016666666667 to 116.6165
'blazer' from 116.68323333333333 to 119.95316666666666
'person' from 116.68323333333333 to 119.95316666666666
'baseball cap' from 100.06663333333333 to 106.87343333333334
'jersey' from 100.13336666666666 to 106.87343333333334
'baseball cap' from 100.06663333333333 to 106.87343333333334
'jersey' from 100.13336666666666 to 106.87343333333334
'person' from 103.0029 to 108.3082
'person' from 103.0029 to 108.3

In [ ]:
# Play the first parsed track.
first_track_path = f"{TMP_PATH}/tracks_mp4/LVIS/{tracks[0].fn}"
Video(first_track_path)

In [ ]:
# Play the second parsed track.
second_track_path = f"{TMP_PATH}/tracks_mp4/LVIS/{tracks[1].fn}"
Video(second_track_path)

In [ ]:
# Play the third parsed track.
third_track_path = f"{TMP_PATH}/tracks_mp4/LVIS/{tracks[2].fn}"
Video(third_track_path)