In [31]:
import os

# Where the input sequences are read from and where every artefact is written.
VIDEO_FOLDER = './videos_test'
OUTPUT_FOLDER = './output_test'

# Create the output root and its 'frames' sub-folder (no-op when they exist).
for _folder in (OUTPUT_FOLDER, os.path.join(OUTPUT_FOLDER, 'frames')):
    os.makedirs(_folder, exist_ok=True)

In [32]:
import glob
import os

import cv2
import numpy as np
import PIL.Image as Image

from mot.config import *
from mot.tracking.tracklet import TrackletManager
from mot.tracking.detector_tracker import run_tracking
from mot.sampling.sampler import sample_best_per_window
from mot.models.reid import ReIDModel
from mot.models.clip_model import CLIPModel

In [33]:
def save_image_webp(img_bgr, path: str, quality: int = 80, resize_factor: float = 0.5):
    """Save a BGR image to ``path`` as a WebP file, optionally rescaled first.

    Args:
        img_bgr: Image in OpenCV's BGR channel order (it is passed to
            ``cv2.resize`` / ``cv2.cvtColor``).
        path: Destination file path. A missing parent directory is created.
        quality: WebP quality forwarded to the PIL ``save`` call.
        resize_factor: Scale factor applied to both axes before saving;
            ``1.0`` skips the resize.
    """
    if resize_factor != 1.0:
        img_bgr = cv2.resize(img_bgr, (0, 0), fx=resize_factor, fy=resize_factor)
    # PIL expects RGB channel order, OpenCV images are BGR.
    img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a parent when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    img_pil.save(path, format="WEBP", quality=quality)

In [34]:
# Tracklet registry shared by the tracking, embedding and export cells below.
manager = TrackletManager()

In [None]:
# tracking
# Every 'seq_*' entry under VIDEO_FOLDER is one sequence and every 'camera_*'
# entry inside it is one camera video. Each frame is saved as WebP and each
# tracked box is stored (with its image crop) in the shared `manager`.
seqs = sorted(glob.glob(f'{VIDEO_FOLDER}/seq_*'))

for seq_id, seq_path in enumerate(seqs):
    seq_name = os.path.basename(seq_path)
    print(f'Processing sequence {seq_name}')

    os.makedirs(os.path.join(OUTPUT_FOLDER, 'frames', seq_name), exist_ok=True)
    cameras = sorted(glob.glob(f'{seq_path}/camera_*'))
    for cam_id, video_path in enumerate(cameras):
        # Keep the first two '_'-separated tokens, e.g. 'camera_2_cut.mp4' -> 'camera_2'.
        camera_name = "_".join(os.path.basename(video_path).split('_')[:2])
        camera_frame_folder = os.path.join(OUTPUT_FOLDER, 'frames', seq_name, camera_name)
        os.makedirs(camera_frame_folder, exist_ok=True)
        print(f'  Processing camera {cam_id}')
        for frame_id, frame, boxes, ids, confs in run_tracking(video_path, vid_stride=1, confidence=CONFIDENCE_THRESHOLD, save=True,
                                                           project_name=f'{OUTPUT_FOLDER}/annotated_videos',
                                                           name=seq_name,
                                                           show=False):

            frame_save_path = os.path.join(camera_frame_folder, f'frame_{frame_id:07d}.webp')
            save_image_webp(frame, frame_save_path)

            frame_h, frame_w = frame.shape[:2]
            for box, tid, conf in zip(boxes, ids, confs):
                # Global id = sequence offset + camera offset + per-video track id.
                gid = seq_id*SEQ_ID_OFFSET + cam_id * CAMERA_ID_OFFSET + tid
                x1, y1, x2, y2 = map(int, box)

                # Clamp to the frame: a negative coordinate would otherwise be
                # read as "count from the end" by the slice below and yield an
                # empty or wrong crop.
                x1, x2 = max(0, x1), min(frame_w, x2)
                y1, y2 = max(0, y1), min(frame_h, y2)
                if x2 <= x1 or y2 <= y1:
                    # Degenerate box: there are no pixels to embed later.
                    continue

                # Slicing returns a view that keeps the whole frame buffer
                # alive; copy so only the crop's pixels are retained.
                crop = frame[y1:y2, x1:x2].copy()

                t = manager.get(gid, seq_id, cam_id)
                # The original (unclamped) box is stored unchanged.
                t.add_frame(frame_id, box, conf, crop)

Processing sequence 0
  Processing camera 0

video 1/1 (frame 1/300) /media/bao/wd/text_based_person_reid/videos_test/seq_test_0/camera_2_cut.mp4: 448x640 2 persons, 78.7ms
video 1/1 (frame 2/300) /media/bao/wd/text_based_person_reid/videos_test/seq_test_0/camera_2_cut.mp4: 448x640 2 persons, 77.8ms
video 1/1 (frame 3/300) /media/bao/wd/text_based_person_reid/videos_test/seq_test_0/camera_2_cut.mp4: 448x640 2 persons, 69.8ms
video 1/1 (frame 4/300) /media/bao/wd/text_based_person_reid/videos_test/seq_test_0/camera_2_cut.mp4: 448x640 2 persons, 85.0ms
video 1/1 (frame 5/300) /media/bao/wd/text_based_person_reid/videos_test/seq_test_0/camera_2_cut.mp4: 448x640 2 persons, 73.1ms
video 1/1 (frame 6/300) /media/bao/wd/text_based_person_reid/videos_test/seq_test_0/camera_2_cut.mp4: 448x640 2 persons, 67.2ms
video 1/1 (frame 7/300) /media/bao/wd/text_based_person_reid/videos_test/seq_test_0/camera_2_cut.mp4: 448x640 2 persons, 83.1ms
video 1/1 (frame 8/300) /media/bao/wd/text_based_person_rei

In [36]:
# Sampling + Embeddings
# For every tracklet: pick representative frames, embed their crops with the
# ReID model and store the mean feature vector on the tracklet.
reid_model = ReIDModel()

for tracklet in manager.all():
    picks = sample_best_per_window(tracklet.frames)
    if len(picks) > 3:
        # More than three samples: keep only the first, last and middle one.
        middle = len(picks) // 2
        picks = picks[:1] + picks[-1:] + picks[middle:middle + 1]

    crops = [sample.image for sample in picks]

    features = reid_model.extract(crops)
    tracklet.reid_embeddings.append(features.mean(axis=0))



Successfully loaded imagenet pretrained weights from "/home/bao/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352


In [None]:
# Drop the notebook's reference to the ReID model once its embeddings are stored
# (presumably so its memory can be reclaimed before the next model loads — confirm).
del reid_model

In [39]:
# Sampling + Embeddings
# Same frame selection as the ReID pass, but each crop is encoded with CLIP
# and the per-image vectors are averaged into one embedding per tracklet.
clip_model = CLIPModel()

for tracklet in manager.all():
    picks = sample_best_per_window(tracklet.frames)
    total = len(picks)
    # Up to three samples are used as-is; otherwise first, last and middle.
    chosen = picks if total <= 3 else picks[:1] + picks[-1:] + picks[total // 2:total // 2 + 1]

    crops = [sample.image for sample in chosen]

    encoded = np.array([clip_model.encode_image(crop) for crop in crops])
    tracklet.clip_embeddings.append(encoded.mean(axis=0))


In [40]:
# Drop the notebook's reference to the CLIP model once its embeddings are stored
# (presumably so its memory can be reclaimed — confirm).
del clip_model

In [None]:
# Create the 'metadata' and 'embeddings' sub-folders under OUTPUT_FOLDER
# (no-op when they already exist).
for subfolder in ('metadata', 'embeddings'):
    os.makedirs(os.path.join(OUTPUT_FOLDER, subfolder), exist_ok=True)

In [None]:
# Materialise once: the collection is iterated twice below, which would
# silently produce an empty second file if manager.all() were a one-shot iterator.
tracklets = list(manager.all())


# save into metadata.txt
# One line per stored detection:
#   sequence_id camera_id frame_id global_id bbox[0] bbox[1] bbox[2] bbox[3]
# Opened with "w" (like features.txt below) so re-running this cell rewrites
# the file instead of appending duplicate rows that no longer match features.txt.
with open(f"{OUTPUT_FOLDER}/metadata.txt", "w") as f:
    for t in tracklets:
        for frame in t.frames:
            f.write(f"{t.sequence_id} {t.camera_id} {frame.frame_id} {t.global_id} {frame.bbox[0]} {frame.bbox[1]} {frame.bbox[2]} {frame.bbox[3]}\n")

# save into features.txt
# One line per tracklet; only the first stored ReID and CLIP embedding are written.
with open(f"{OUTPUT_FOLDER}/features.txt", "w") as f:
    for t in tracklets:
        f.write(f"{t.sequence_id} {t.camera_id} {t.global_id} {t.reid_embeddings[0].tolist()} {t.clip_embeddings[0].tolist()}\n")

In [23]:
# Debug dump: print every stored bounding box as a plain tuple, one per line.
for tracklet in manager.all():
    for entry in tracklet.frames:
        coords = entry.bbox.tolist()
        print(tuple(coords))

(290.1499938964844, 296.28662109375, 466.0266418457031, 646.966064453125)
(290.04803466796875, 296.3787536621094, 466.0698547363281, 647.1328125)
(289.87030029296875, 296.12152099609375, 465.7896728515625, 647.8612060546875)
(289.97869873046875, 296.11346435546875, 466.0093688964844, 647.92529296875)
(289.79315185546875, 296.289306640625, 465.57904052734375, 647.5328979492188)
(289.6396179199219, 296.85198974609375, 465.3800964355469, 647.415771484375)
(289.5691833496094, 297.26397705078125, 465.31719970703125, 647.64013671875)
(289.5660400390625, 297.60931396484375, 465.31732177734375, 647.7901611328125)
(289.7684020996094, 297.62982177734375, 465.8263854980469, 647.5781860351562)
(289.7770690917969, 297.7735595703125, 465.9918212890625, 647.8358764648438)
(289.77423095703125, 297.6702575683594, 465.8328857421875, 648.0796508789062)
(289.54425048828125, 297.2474365234375, 465.9150390625, 648.9144287109375)
(289.2179870605469, 297.3081970214844, 465.76678466796875, 648.690185546875)
(2

In [None]:
import cv2

# Draws the boxes listed in `bboxes` onto the matching frames of `video_path`,
# writes the result to `output_path` and previews it in a window ('q' quits).

# -----------------------------
# Inputs
# -----------------------------
video_path = "input.mp4"
output_path = "output_with_boxes.mp4"

# Example bounding boxes:
# frame_idx -> list of (x1, y1, x2, y2)
bboxes = {
    0: [(50, 60, 200, 220)],
    1: [(55, 65, 205, 225)],
    2: [(60, 70, 210, 230)],
}

# -----------------------------
# Load video
# -----------------------------
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    # Release the handle before bailing out so it is not leaked.
    cap.release()
    raise IOError("Cannot open video")

fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# -----------------------------
# Video writer
# -----------------------------
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

frame_idx = 0

# -----------------------------
# Process frames
# -----------------------------
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Draw bounding boxes for this frame (frames without an entry pass through untouched)
        for (x1, y1, x2, y2) in bboxes.get(frame_idx, ()):
            cv2.rectangle(
                frame,
                (x1, y1),
                (x2, y2),
                color=(0, 255, 0),
                thickness=2
            )

        # Write to output video
        out.write(frame)

        # Display
        cv2.imshow("Video with Bounding Boxes", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

        frame_idx += 1
finally:
    # -----------------------------
    # Cleanup
    # -----------------------------
    # Runs even when the loop raises (e.g. imshow without a display), so the
    # capture is freed and the output file is finalised.
    cap.release()
    out.release()
    cv2.destroyAllWindows()


In [None]:

# # Save embeddings
# save_embeddings(reid_embs, clip_embs)

# # ---- Build FAISS indexes ----
# # for REID
# build_faiss_index(
#     reid_embs,
#     "output/faiss_reid.index",
#     "output/faiss_reid_ids.npy"
# )

# # for CLIP
# build_faiss_index(
#     clip_embs,
#     "output/faiss_clip.index",
#     "output/faiss_clip_ids.npy"
# )


In [13]:
# Scratch check: size in bytes that CPython reports for an int of 10**8.
a = 10 ** 8
int.__sizeof__(a)

28