# Video Demo
Here, we provide a demo on how to use our model to process a video and visualize the tracking results.

We selected a publicly available hpop dance video from the internet for this demonstration. You can also use your own videos. **Please note that it is crucial to select the appropriate trained MOTIP weights and configuration for different tracking scenarios.**

We processed the video on an NVIDIA RTX 3080 Ti, achieving nearly real-time tracking.

### System Environment
1. Set the root path to the project path.
2. Make sure a CUDA device is available.

In [1]:
import os
import sys
import torch


# Treat the parent of the current working directory as the project root, so
# that project-local packages can be imported and relative paths resolve from
# there.
# NOTE: os.path.abspath("") resolves against the current working directory,
# which this cell changes; running the cell a second time therefore moves the
# root one more level up.
current_file_path = os.path.abspath("")
parent_dir = os.path.dirname(current_file_path)
os.chdir(parent_dir)
sys.path.append(parent_dir)
print(f"Current root path is set to {parent_dir}")

# The demo moves the model and every frame to the GPU, so stop right away when
# no CUDA device is available.
cuda_available = torch.cuda.is_available()
torch_version = torch.__version__

if cuda_available:
    print(f"Hello! Welcome to use the video process demo. Your torch version is {torch_version} and CUDA is available.")
else:
    raise RuntimeError("CUDA is not available")

Current root path is set to /home/gaoruopeng/Code/MOTIP-NG/GitHub
Hello! Welcome to use the video process demo. Your torch version is 2.4.0 and CUDA is available.


### Prepare your video (.mp4 for example):

In [2]:
# Directory that holds both the input video and the rendered tracking result.
output_dir = "./outputs/video_process_demo/"
os.makedirs(output_dir, exist_ok=True)

# video_path: the video to be tracked (put your own .mp4 here).
# output_path: where the video with drawn tracking results is written.
video_path = os.path.join(output_dir, "hpop_dancers.mp4")
output_path = os.path.join(output_dir, "hpop_dancers_tracking.mp4")

#### [Optional] Download a video from Bilibili if you don't have a video

In [3]:
import subprocess

video_url = "https://www.bilibili.com/video/BV19mZ2YzERT/"
video_dir = os.path.join("./outputs/video_process_demo/", "hpop_dancers")

# Run you-get with an argument list (no shell), so the URL and the output
# directory are never interpreted by a shell.
try:
    download = subprocess.run(["you-get", "-o", video_dir, video_url], check=False)
except FileNotFoundError as err:
    raise RuntimeError(
        "The `you-get` command was not found. Install it or place your own video "
        f"at {video_path}."
    ) from err

# A non-zero exit status does not necessarily mean the video itself is missing,
# so only warn here and let the file search below decide.
if download.returncode != 0:
    print(f"Warning: you-get exited with status {download.returncode}.")

# Search the .mp4 file, change name to "hpop_dancers.mp4", move to outputs/video_process_demo/
downloaded = os.listdir(video_dir) if os.path.isdir(video_dir) else []
mp4_files = sorted(name for name in downloaded if name.endswith(".mp4"))
if not mp4_files:
    raise RuntimeError(f"No .mp4 file was downloaded into {video_dir}")

# os.replace overwrites an existing destination on every platform, which makes
# re-running this cell safe.
os.replace(os.path.join(video_dir, mp4_files[0]), video_path)

[33myou-get: You will need login cookies for 720p formats or above. (use --cookies to load cookies.txt.)[0m


site:                Bilibili
title:               izna《SIGN》练习室舞蹈(Fix ver.)
stream:
    - format:        [7mdash-flv480-AVC[0m
      container:     mp4
      quality:       清晰 480P avc1.640033
      size:          13.5 MiB (14166376 bytes)
    # download-with: [4myou-get --format=dash-flv480-AVC [URL][0m

Downloading izna《SIGN》练习室舞蹈(Fix ver.).mp4 ...
 100% ( 13.5/ 13.5MB) ├████████████████████████████████████████┤[2/2]  253 MB/s
Merging video parts... Merged into izna《SIGN》练习室舞蹈(Fix ver.).mp4

Downloading izna《SIGN》练习室舞蹈(Fix ver.).cmt.xml ...



#### [Optional] Display the video

In [None]:
from IPython.display import Video

# Embed the video data in the notebook output. The player object is the last
# expression of the cell, so the notebook renders it.
video_player = Video(video_path, embed=True)
video_player

### Build our model

In [4]:
from configs.util import load_super_config
from models.misc import load_checkpoint
from models.motip import build as build_model
from models.runtime_tracker import RuntimeTracker
from utils.misc import yaml_to_dict


# Configuration file and trained weights; choose the pair that matches your
# tracking scenario.
config_path = "./configs/r50_deformable_detr_motip_dancetrack.yaml"
checkpoint_path = "./outputs/r50_deformable_detr_motip_dancetrack/r50_deformable_detr_motip_dancetrack.pth"

# torch.float32 or torch.float16, we select float16 for faster inference
dtype = torch.float16

# Read the YAML file, then resolve it against the super config it points to.
base_config = yaml_to_dict(config_path)
config = load_super_config(base_config, base_config["SUPER_CONFIG_PATH"])

# Build the model (the second returned value is not needed here), load the
# trained weights, and prepare the model for GPU inference.
model, _ = build_model(config)
load_checkpoint(model, checkpoint_path)
model.eval()
model = model.cuda()

use_half_precision = dtype == torch.float16
if use_half_precision:
    model.half()

print("Model built successfully.")

  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Model built successfully.


### Process the video

In [5]:
import cv2
from utils.nested_tensor import nested_tensor_from_tensor_list
from tqdm import tqdm
from demo.colormap import get_color


def simple_transform(
        image, max_shorter, max_longer, image_dtype,
):
    """Turn one image into a normalized tensor on the CUDA device.

    The image is converted to a tensor, resized with ``max_shorter`` as the
    target size and ``max_longer`` as the upper bound, normalized
    channel-wise, optionally cast, and finally moved to the GPU.

    Args:
        image: Image accepted by ``torchvision``'s ``to_tensor``.
        max_shorter: Passed to ``resize`` as ``size``.
        max_longer: Passed to ``resize`` as ``max_size``.
        image_dtype: Target dtype; no cast is done for ``torch.float32``.

    Returns:
        The transformed image tensor, located on the CUDA device.
    """
    from torchvision.transforms import functional as tv_functional

    # Per-channel statistics used for normalization (these look like the
    # common ImageNet values; confirm they match the model's training setup).
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    resized = tv_functional.resize(
        tv_functional.to_tensor(image),
        size=max_shorter,
        max_size=max_longer,
    )
    normalized = tv_functional.normalize(resized, mean=channel_mean, std=channel_std)

    if image_dtype == torch.float32:
        return normalized.cuda()
    return normalized.to(image_dtype).cuda()


# Open the input video; fail early with a clear message if it cannot be read.
video_cap = cv2.VideoCapture(video_path)
if not video_cap.isOpened():
    raise RuntimeError(f"Failed to open video file: {video_path}")

video_writer = None
try:
    # Get video properties
    fps = video_cap.get(cv2.CAP_PROP_FPS)
    width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # The reported frame count is only used to size the progress bar; the loop
    # below reads until the capture is exhausted, so a missing or inaccurate
    # count cannot truncate the output.
    length = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"The video {video_path} seems OK. It has {fps} fps, {width} width and {height} height.")

    # The output video keeps the input's frame rate and resolution.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not video_writer.isOpened():
        raise RuntimeError(f"Failed to open video writer for: {output_path}")

    runtime_tracker = RuntimeTracker(
        model=model,
        sequence_hw=(height, width),
        assignment_protocol="object-max",
        miss_tolerance=30,
        det_thresh=0.5,
        newborn_thresh=0.5,
        id_thresh=0.2,
        dtype=dtype,
    )

    # Inference only: keep the tracker update as well as the result retrieval
    # under no_grad so no autograd state is recorded for any frame.
    with torch.no_grad(), tqdm(
        total=length if length > 0 else None, desc="Processing video", unit="frame",
    ) as progress:
        while True:
            ret, frame = video_cap.read()
            if not ret:
                break

            # OpenCV decodes frames in BGR channel order, while the
            # normalization statistics in simple_transform are listed in RGB
            # order, so convert before building the tensor. The original BGR
            # frame is kept for drawing and writing.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Convert the frame to a tensor
            frame_tensor = simple_transform(rgb_frame, max_shorter=800, max_longer=1440, image_dtype=dtype)
            frame_tensor = nested_tensor_from_tensor_list([frame_tensor])

            # Run the tracker on the frame
            runtime_tracker.update(frame_tensor)
            track_results = runtime_tracker.get_track_results()

            # Each box is unpacked as (x, y, w, h); presumably the top-left
            # corner plus size in original-frame pixels (confirm against
            # RuntimeTracker).
            for bbox, obj_id in zip(track_results["bbox"], track_results["id"]):
                x, y, w, h = map(int, bbox)
                color = get_color(obj_id, rgb=False, use_int=True)
                cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
                cv2.putText(frame, f"ID: {obj_id}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            video_writer.write(frame)
            progress.update(1)
finally:
    # Always release the capture and the writer, even if processing failed, so
    # the output file is finalized and the handles are not leaked.
    video_cap.release()
    if video_writer is not None:
        video_writer.release()

print(f"Video processing completed. The output video is saved to {output_path}.")

The video ./outputs/video_process_demo/hpop_dancers.mp4 seems OK. It has 23.976038875306628 fps, 852 width and 480 height.


Processing video: 100%|██████████| 4019/4019 [03:15<00:00, 20.57frame/s]

Video processing completed. The output video is saved to ./outputs/video_process_demo/hpop_dancers_tracking.mp4.



