In [6]:
!git clone https://github.com/KL0224/RetrievalPerson -b pipeline

Cloning into 'RetrievalPerson'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 53 (delta 9), reused 50 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (53/53), 9.32 MiB | 30.58 MiB/s, done.
Resolving deltas: 100% (9/9), done.


In [7]:
!pip install open_clip_torch
!pip install torchreid
!pip install ultralytics

Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-3.2.0
Collecting torchreid
  Downloading torchreid-0.2.5.tar.gz (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.7/92.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torchreid
  Building wheel for torchr

In [8]:
%cd /kaggle/working/RetrievalPerson

/kaggle/working/RetrievalPerson


In [2]:
import os

VIDEO_FOLDER = 'videos_test' #'../../input/dataset-person/videos'
OUTPUT_FOLDER = 'outputs'
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_FOLDER, 'frames'), exist_ok=True)
# os.makedirs(os.path.join(OUTPUT_FOLDER, 'metadata'), exist_ok=True)
# os.makedirs(os.path.join(OUTPUT_FOLDER, 'features'), exist_ok=True)

In [5]:
import numpy as np
import glob
import os
import PIL.Image as Image
import cv2
import torch

from config import *
from tracking.tracklet import TrackletManager
from tracking.detector_tracker import run_tracking
from sampling.sampler import sample_best_per_window
# from models.reid import ReIDModel
# from models.clip_model import CLIPModel

In [3]:
def save_image_webp(img_bgr, path: str, quality: int = 80, resize_factor: float = 0.5):
    if resize_factor != 1.0:
        img_bgr = cv2.resize(img_bgr, (0, 0), fx=resize_factor, fy=resize_factor)
    img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    os.makedirs(os.path.dirname(path), exist_ok=True)
    img_pil.save(path, format="WEBP", quality=quality)

def save_crop_webp(crop, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    cv2.imwrite(
        path,
        crop,
        [cv2.IMWRITE_WEBP_QUALITY, 100]  # 100 = lossless
    )

def safe_delete(path):
    try:
        if path and os.path.exists(path):
            os.remove(path)
    except Exception as e:
        print(f"[WARN] Failed to delete {path}: {e}")


In [6]:
manager = TrackletManager()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
# tracking
# reid_model = ReIDModel(device=device, model_path='models/osnet_x1_0_market_256x128_amsgrad_ep150_stp60_lr0.0015_b64_fb10_softmax_labelsmooth_flip.pth')
# clip_model = CLIPModel(device=device)

seqs = sorted(glob.glob(f'{VIDEO_FOLDER}/seq_*'))

for seq in seqs[:2]:
    seq_name = os.path.basename(seq)
    seq_id = int(seq_name.split('_')[-1])
    print(f'Processing sequence {seq_name}')
    
    os.makedirs(os.path.join(OUTPUT_FOLDER, 'frames', seq_name), exist_ok=True)
    cameras = sorted(glob.glob(f'{seq}/camera_*'))
    for cam_id, video_path in enumerate(cameras):
        manager = TrackletManager()
        cam_id += 1
        camera_name = "_".join(os.path.basename(video_path).split('_')[:2])
        camera_frame_folder = os.path.join(OUTPUT_FOLDER, 'frames', seq_name, camera_name)
        os.makedirs(camera_frame_folder, exist_ok=True)
        print(f'  Processing camera {cam_id}')
        for frame_id, frame, boxes, ids, confs in run_tracking(video_path, model_name='yolov8n.pt', 
                                                               vid_stride=1, 
                                                               confidence=CONFIDENCE_THRESHOLD,
                                                               device=device):
            print(f'    Processing frame {frame_id}, detected {len(boxes)} persons') 
            # detected boxes + (alive but not detected)
            print(boxes)
            frame_save_path = os.path.join(camera_frame_folder, f'frame_{frame_id:06d}.webp')
            save_image_webp(frame, frame_save_path)
            for box, tid, conf in zip(boxes, ids, confs):
                print(f'      Track ID: {tid}, BBox: {box}, Conf: {conf}')
                gid = seq_id*SEQ_ID_OFFSET + cam_id * CAMERA_ID_OFFSET + tid
                x1, y1, x2, y2 = map(int, box)

                # invalid box
                if x2<=x1 and y2<=y1:
                    continue
                
                crop = frame[y1:y2, x1:x2]

                crop_path = os.path.join(
                    OUTPUT_FOLDER,
                    "crops",
                    seq_name,
                    camera_name,
                    f"{gid}_{frame_id:06d}.webp"
                )
                
                save_crop_webp(crop, crop_path)
                t = manager.get(gid, seq_id, cam_id)
                t.add_frame(frame_id, box, conf, crop_path)
    
        # tracklets = manager.all()

        # for t in tracklets:
        #     candidates = sample_best_per_window(t.frames)
        #     # candidates = []
        #     # if len(sampled) <= 3:
        #     #     candidates = sampled
        #     # else:
        #     #     candidates = sampled[:1] + sampled[-1:] + sampled[len(sampled)//2:len(sampled)//2+1]
            
        #     candidate_paths = set(f.crop_path for f in candidates)
        #     for f in t.frames:
        #         if f.crop_path not in candidate_paths:
        #             safe_delete(f.crop_path)
        #             f.crop_path = None

        #     imgs = [cv2.imread(f.crop_path) for f in candidates]
        
            # reid_feats = reid_model.extract(imgs).mean(axis=0)
            # t.reid_embeddings.append(reid_feats)
            # clip_feats = np.array([clip_model.encode_image(img) for img in imgs]).mean(axis=0)
            # t.clip_embeddings.append(clip_feats)           
        
        

        # # save into metadata.txt
        # with open(f"{OUTPUT_FOLDER}/metadata/{seq_name}_{camera_name}.txt", "w") as f:
        #     for t in tracklets:
        #         for frame in t.frames:
        #             f.write(f"{t.sequence_id} {t.camera_id} {frame.frame_id} {t.global_id} {int(frame.bbox[0])} {int(frame.bbox[1])} {int(frame.bbox[2])} {int(frame.bbox[3])}\n")

        # # save into features.txt
        # features = {}
        # output_path_pkl = f"{OUTPUT_FOLDER}/features/{seq_name}_{camera_name}.pkl"
        
        # with open(f"{OUTPUT_FOLDER}/features/{seq_name}_{camera_name}.txt", "w") as f:
        #     for t in tracklets:
        #         f.write(f"{t.sequence_id} {t.camera_id} {t.global_id} {t.reid_embeddings[0].tolist()} {t.clip_embeddings[0].tolist()}\n")
        #         key = (t.sequence_id, t.camera_id, t.global_id)
        
        #         reid_emb = t.reid_embeddings[0]
        #         clip_emb = t.clip_embeddings[0]
            
        #         features[key] = [reid_emb, clip_emb]
                
        # with open(output_path_pkl, "wb") as f:
        #     pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)

Processing sequence seq_001
  Processing camera 1
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt': 100% ━━━━━━━━━━━━ 6.2MB 7.5MB/s 0.8s0.8s<0.0s.4s
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352
Successfully loaded pretrained weights from "./models/osnet_x1_0_market_256x128_amsgrad_ep150_stp60_lr0.0015_b64_fb10_softmax_labelsmooth_flip.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']


  import pkg_resources


[]
    Processing frame 0, detected 0 persons
[]
[]
    Processing frame 1, detected 0 persons
[]
[array([        290,         296,         466,         646], dtype=float32), array([        343,          99,         391,         299], dtype=float32), array([        730,         281,         875,         513], dtype=float32), array([        219,         222,         308,         430], dtype=float32)]
    Processing frame 2, detected 4 persons
[[        290         296         466         646]
 [        343          99         391         299]
 [        730         281         875         513]
 [        219         222         308         430]]
      Track ID: 1, BBox: [        290         296         466         646], Conf: 0.8682590126991272
      Track ID: 2, BBox: [        343          99         391         299], Conf: 0.7119958400726318
      Track ID: 3, BBox: [        730         281         875         513], Conf: 0.46382689476013184
      Track ID: 4, BBox: [        219        

In [9]:
tracklets = manager.all()
from collections import defaultdict

tracks = defaultdict(list)
bbox_per_frame = {}
for t in tracklets:
    for f in t.frames:
        # print(t.global_id, f.frame_id, f.bbox)
        # bbox_per_frame.setdefault(f.frame_id, []).append(f.bbox)
        tracks[f.frame_id].append({
                "id": t.global_id,
                "bbox": f.bbox
            })

In [None]:
import cv2
import random

def visualize_video_with_ids(
    video_path,
    tracks_per_frame,
    output_path=None
):
    cap = cv2.VideoCapture(video_path)

    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)

    writer = None
    if output_path:
        writer = cv2.VideoWriter(
            output_path,
            cv2.VideoWriter_fourcc(*"mp4v"),
            fps,
            (width, height)
        )

    id_colors = {}

    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        for item in tracks_per_frame.get(frame_idx, []):
            track_id = item["id"]
            x1, y1, x2, y2 = map(int, item["bbox"])

            if track_id not in id_colors:
                random.seed(int(track_id))
                id_colors[track_id] = (
                    random.randint(50, 255),
                    random.randint(50, 255),
                    random.randint(50, 255),
                )

            color = id_colors[track_id]

            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(
                frame,
                f"ID {track_id}",
                (x1, max(0, y1 - 7)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6,
                color,
                2
            )

        if writer:
            writer.write(frame)

        cv2.imshow("Tracking Visualization", frame)
        if cv2.waitKey(1) & 0xFF == 27:
            break

        frame_idx += 1

    cap.release()
    if writer:
        writer.release()
    cv2.destroyAllWindows()

In [None]:
visualize_video_with_ids('videos_test/seq_001/camera_3_cut.mp4', tracks)



TypeError: The only supported seed types are: None,
int, float, str, bytes, and bytearray.

: 

In [14]:
# Sampling + Embeddings
reid_model = ReIDModel()

for t in manager.all():
    sampled = sample_best_per_window(t.frames)
    candidates = []
    if len(sampled) <= 3:
        candidates = sampled
    else:
        candidates = sampled[:1] + sampled[-1:] + sampled[len(sampled)//2:len(sampled)//2+1]

    imgs = [f.image for f in candidates]

    reid_feats = reid_model.extract(imgs).mean(axis=0)
    t.reid_embeddings.append(reid_feats)

Downloading...
From: https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY
To: /root/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth
100%|██████████| 10.9M/10.9M [00:00<00:00, 179MB/s]


Successfully loaded imagenet pretrained weights from "/root/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352


In [15]:
del reid_model

In [16]:
# Sampling + Embeddings
clip_model = CLIPModel()

for t in manager.all():
    sampled = sample_best_per_window(t.frames)
    candidates = []
    if len(sampled) <= 3:
        candidates = sampled
    else:
        candidates = sampled[:1] + sampled[-1:] + sampled[len(sampled)//2:len(sampled)//2+1]

    imgs = [f.image for f in candidates]

    clip_feats = np.array([clip_model.encode_image(img) for img in imgs]).mean(axis=0)
    t.clip_embeddings.append(clip_feats)


open_clip_pytorch_model.bin:   0%|          | 0.00/3.94G [00:00<?, ?B/s]

In [18]:
del clip_model

NameError: name 'clip_model' is not defined

In [20]:
tracklets = manager.all()

# save into metadata.txt
with open(f"{OUTPUT_FOLDER}/metadata.txt", "a") as f:
    for t in tracklets:
        for frame in t.frames:
            f.write(f"{t.sequence_id} {t.camera_id} {frame.frame_id} {t.global_id} {int(frame.bbox[0])} {int(frame.bbox[1])} {int(frame.bbox[2])} {int(frame.bbox[3])}\n")

# save into features.txt
with open(f"{OUTPUT_FOLDER}/features.txt", "w") as f:
    for t in tracklets:
        f.write(f"{t.sequence_id} {t.camera_id} {t.global_id} {t.reid_embeddings[0].tolist()} {t.clip_embeddings[0].tolist()}\n")

In [None]:
!zip -r results.zip /kaggle/working/outputs
from IPython.display import FileLink
FileLink(r'results.zip')