In [1]:
# Adapted from https://colab.research.google.com/drive/1OsyNVoV_7ETD1zIE8UWxL3NXxu12m_YZ?usp=sharing#scrollTo=odl3HAdWc5Gz

In [2]:
# Run this cell first, before any other cell touches CUDA: the device mask
# below is set through the environment prior to importing torch.

import os, sys
# sys.path.append("/data/wangz3/projects/ecole-video-action/third_party/Tracking-Anything-with-DEVA")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
cuda_available = torch.cuda.is_available()
print("cuda.is_available():", cuda_available)
num_gpus = torch.cuda.device_count()
print(f"Number of available GPUs: {num_gpus}")
# current_device() raises when no CUDA device is usable, so only query it
# when CUDA was actually detected; otherwise report that nothing is visible.
if cuda_available:
    print("torch.cuda.current_device(): ", torch.cuda.current_device())
else:
    print("torch.cuda.current_device():  n/a (no CUDA device visible)")

cuda.is_available(): True
Number of available GPUs: 1
torch.cuda.current_device():  0


In [3]:

try:
    # Preferred: GroundingDINO installed as a regular package.
    import groundingdino
    from groundingdino.util.inference import Model as GroundingDINOModel
except ImportError:
    # Fallback: import GroundingDINO from a Grounded-Segment-Anything checkout.
    # The checkout location can be overridden with the GROUNDED_SAM_ROOT
    # environment variable; the default is the original hard-coded path.
    import os, sys
    GROUNDED_SAM_ROOT = os.environ.get(
        "GROUNDED_SAM_ROOT",
        "/data/wangz3/projects/ecole-video-action/third_party/Grounded-Segment-Anything",
    )
    # Re-running this cell must not keep appending the same entry to sys.path.
    if GROUNDED_SAM_ROOT not in sys.path:
        sys.path.append(GROUNDED_SAM_ROOT)
    import GroundingDINO
    from GroundingDINO.groundingdino.util.inference import Model as GroundingDINOModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
from os import path
from argparse import ArgumentParser
import pprint

import torch
import numpy as np

from deva.model.network import DEVA
from deva.inference.inference_core import DEVAInferenceCore
from deva.inference.result_utils import ResultSaver
from deva.inference.eval_args import add_common_eval_args, get_model_and_config
from deva.inference.demo_utils import flush_buffer
from deva.ext.ext_eval_args import add_ext_eval_args, add_text_default_args
from deva.ext.grounding_dino import get_grounding_dino_model
from deva.ext.with_text_processor import process_frame_with_text as process_frame

from tqdm import tqdm
import json

# Inference only: disable gradient tracking globally.
torch.autograd.set_grad_enabled(False)

# Fixed NumPy seed (used for id2rgb, per the original note).
np.random.seed(42)

# Build the default configuration by registering every argument group and
# parsing an empty argument list, so nothing is read from the command line.
parser = ArgumentParser()
for register_args in (add_common_eval_args, add_ext_eval_args, add_text_default_args):
    register_args(parser)

args = parser.parse_args([])
cfg = vars(args)
cfg.update(enable_long_term=True)

print('cfg:')
pprint.pprint(cfg)

# Instantiate DEVA on the GPU in eval mode, then load the checkpoint named
# by the `model` argument when one is configured.
deva_model = DEVA(cfg).cuda().eval()
if args.model is None:
    print('No model loaded.')
else:
    model_weights = torch.load(args.model)
    deva_model.load_weights(model_weights)

# Detector / segmenter pair consumed later by the per-frame text processor.
gd_model, sam_model = get_grounding_dino_model(cfg, 'cuda')

cfg:
{'DINO_NMS_THRESHOLD': 0.8,
 'DINO_THRESHOLD': 0.35,
 'GROUNDING_DINO_CHECKPOINT_PATH': './saves/groundingdino_swint_ogc.pth',
 'GROUNDING_DINO_CONFIG_PATH': './saves/GroundingDINO_SwinT_OGC.py',
 'MOBILE_SAM_CHECKPOINT_PATH': './saves/mobile_sam.pt',
 'SAM_CHECKPOINT_PATH': './saves/sam_vit_h_4b8939.pth',
 'SAM_ENCODER_VERSION': 'vit_h',
 'SAM_NUM_POINTS_PER_BATCH': 64,
 'SAM_NUM_POINTS_PER_SIDE': 32,
 'SAM_OVERLAP_THRESHOLD': 0.8,
 'SAM_PRED_IOU_THRESHOLD': 0.88,
 'amp': False,
 'chunk_size': -1,
 'detection_every': 5,
 'disable_long_term': False,
 'do_not_pluralize': False,
 'enable_long_term': True,
 'engulf_threshold': 0.2,
 'img_path': './example/vipseg',
 'key_dim': 64,
 'match_and_merge_mode': 'iou',
 'max_long_term_elements': 10000,
 'max_mid_term_frames': 10,
 'max_missed_detection_count': 10,
 'max_num_objects': -1,
 'mem_every': 5,
 'min_mid_term_frames': 5,
 'model': './saves/DEVA-propagation.pth',
 'num_prototypes': 128,
 'num_voting_frames': 3,
 'output': None,
 'pi

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


final text_encoder_type: bert-base-uncased


In [5]:
# Demo hyperparameters, applied on top of the parsed defaults in one update.
cfg.update({
    'enable_long_term_count_usage': True,
    'max_num_objects': 50,
    'size': 480,
    'DINO_THRESHOLD': 0.35,
    'amp': True,
    'chunk_size': 4,
    'detection_every': 5,
    'max_missed_detection_count': 10,
    'sam_variant': 'original',
    # semionline usually works better; but online is faster for this demo
    'temporal_setting': 'online',
    'pluralize': True,
})

In [6]:
import cv2

# Print OpenCV's build report (platform, CPU features, ...) for this install.
opencv_build_info = cv2.getBuildInformation()
print(opencv_build_info)


  Version control:               4.8.0-dirty

  Platform:
    Timestamp:                   2023-08-09T11:41:23Z
    Host:                        Linux 5.15.0-1042-azure x86_64
    CMake:                       3.27.1
    CMake generator:             Unix Makefiles
    CMake build tool:            /bin/gmake
    Configuration:               Release

  CPU/HW features:
    Baseline:                    SSE SSE2 SSE3
      requested:                 SSE3
    Dispatched code generation:  SSE4_1 SSE4_2 FP16 AVX AVX2 AVX512_SKX
      requested:                 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX
      SSE4_1 (16 files):         + SSSE3 SSE4_1
      SSE4_2 (1 files):          + SSSE3 SSE4_1 POPCNT SSE4_2
      FP16 (0 files):            + SSSE3 SSE4_1 POPCNT SSE4_2 FP16 AVX
      AVX (7 files):             + SSSE3 SSE4_1 POPCNT SSE4_2 AVX
      AVX2 (35 files):           + SSSE3 SSE4_1 POPCNT SSE4_2 FP16 FMA3 AVX AVX2
      AVX512_SKX (5 files):      + SSSE3 SSE4_1 POPCNT SSE4_2 FP16 FMA3 A

In [10]:
## specify input and output, then run DEVA over the video

from deva.ext.with_text_processor import process_frame_with_text as process_frame_text
import tempfile
import cv2

# Alternative inputs that were tried with this notebook:
# SOURCE_VIDEO_PATH = "./data/example.mp4"
# OUTPUT_VIDEO_PATH = "./data/example_output.webm"  (or "./data/example_output.mp4")
# prompt = "person.hat.horse"
#
# SOURCE_VIDEO_PATH = "./data/video7434.mp4"
# OUTPUT_VIDEO_PATH = "./data/video7434_output.webm"
# prompt = "person.food"

SOURCE_VIDEO_PATH = "./data/video7436.mp4"
OUTPUT_VIDEO_PATH = "./data/video7436_output.webm"
prompt = "person holding a gun. woman sitting"

# Per-run overrides of the shared configuration.
cfg['DINO_THRESHOLD'] = 0.5
cfg['prompt'] = prompt

# Fresh inference core for this video.
deva = DEVAInferenceCore(deva_model, config=cfg)
deva.next_voting_frame = cfg['num_voting_frames'] - 1
deva.enabled_long_id()

# The saver's video writer is attached lazily, once the frame size is known.
result_saver = ResultSaver(None, None, dataset='gradio', object_manager=deva.object_manager)

cap = cv2.VideoCapture(SOURCE_VIDEO_PATH)
if not cap.isOpened():
    # Fail with a clear message instead of a NameError on `writer` further down.
    raise RuntimeError(f"Could not open source video: {SOURCE_VIDEO_PATH}")

fps = cap.get(cv2.CAP_PROP_FPS)
writer = None  # created when the first frame has been read
ti = 0
try:
    with torch.cuda.amp.autocast(enabled=cfg['amp']):
        # The reported frame count is only an estimate; it just sizes the bar.
        with tqdm(total=int(cap.get(cv2.CAP_PROP_FRAME_COUNT))) as pbar:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if writer is None:
                    # Output codec is chosen from the output file extension.
                    h, w = frame.shape[:2]
                    if OUTPUT_VIDEO_PATH.endswith('.webm'):
                        fourcc = cv2.VideoWriter_fourcc(*'vp80')  # webm
                    else:
                        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # mp4?
                    writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (w, h))
                    result_saver.writer = writer

                process_frame_text(deva,
                                   gd_model,
                                   sam_model,
                                   'null.png',
                                   result_saver,
                                   ti,
                                   image_np=frame)
                ti += 1
                pbar.update(1)
        flush_buffer(deva, result_saver)
finally:
    # Always release the OpenCV handles, even if a frame failed mid-run;
    # the writer only exists if at least one frame was read.
    if writer is not None:
        writer.release()
    cap.release()
deva.clear_buffer()

  0%|          | 0/299 [00:00<?, ?it/s]

OpenCV: FFMPEG: tag 0x30387076/'vp80' is not supported with codec id 139 and format 'webm / WebM'
100%|██████████| 299/299 [02:11<00:00,  2.27it/s]


In [8]:
from IPython.display import Video

# Show the annotated output clip inline. To preview a different file
# (e.g. the source video), point video_path at that file instead.
video_path = OUTPUT_VIDEO_PATH
video = Video(data=video_path)
video