<div align="center">
  <a href="http://www.sharif.edu/">
    <img src="https://cdn.freebiesupply.com/logos/large/2x/sharif-logo-png-transparent.png" alt="SUT Logo" width="140">
  </a>
  
  # Sharif University of Technology
  ### Electrical Engineering Department

  ## Signals and Systems
  #### *Final Project - Spring 2025*
</div>

---

<div align="center">
  <h1>
    <b>Object Tracker</b>
  </h1>
  <p>
    An object tracking system using YOLO for detection and various algorithms (KCF, CSRT, MOSSE) for tracking.
  </p>
</div>

<br>

| Professor                  |
| :-------------------------: |
| Dr. Mohammad Mehdi Mojahedian |

<br>

| Contributors              |
| :-----------------------: |
| **Amirreza Mousavi** |
| **Mahdi Falahi** |
| **Zahra Miladipour** |

---

## Imports

In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
import time
import torch
from scipy.optimize import linear_sum_assignment
from scipy import fft, linalg
import os
import torchreid
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from collections import deque
from numpy import linalg

## Custom KCF

In [None]:
class KCFParams:
    """
    Plain settings holder for the KCF tracker.

    Every setting is an ordinary instance attribute, so callers can adjust
    any of them after construction and before handing the object to a tracker.
    """
    def __init__(self):
        settings = (
            # Response peaks below this value make update() report a lost target.
            ("detect_thresh", 0.1),
            # Sigma handed to the Gaussian kernel correlation.
            ("sigma", 0.1),
            # Constant added to the kernel spectrum when the filter is built.
            ("lambda_", 0.0001),
            # Blend weight of the newest observation in the running model.
            ("interp_factor", 0.1),
            # Multiplied by sqrt(w * h) to size the Gaussian training target.
            ("output_sigma_factor", 1.0 / 16.0),
            # Halve the frame when the box area exceeds max_patch_size.
            ("resize", True),
            ("max_patch_size", 80 * 80),
            # Keep filter numerator and denominator as separate running terms.
            ("split_coeff", True),
            # Roll the kernel output by half its size when enabled.
            ("wrap_kernel", False),
            # Descriptor names for the uncompressed / PCA-compressed feature groups.
            ("desc_npca", 'GRAY'),
            ("desc_pca", 'CN'),
            # PCA compression switch plus the values given to the projection update.
            ("compress_feature", True),
            ("compressed_size", 2),
            ("pca_learning_rate", 0.15),
        )
        for name, value in settings:
            setattr(self, name, value)

class KCFLOW:
    """
    Python implementation of the Kernelized Correlation Filter (KCF) tracker with optional GPU acceleration.
    """
    def __init__(self, parameters=KCFParams(), use_gpu=True, verbose=False):
        self.params = parameters
        self.use_gpu = use_gpu
        self.verbose = verbose
        self.device = torch.device('cuda' if self.use_gpu and torch.cuda.is_available() else 'cpu')
        self.roi = None
        self.frame = 0
        self.resize_image = False
        self.output_sigma = 0.0
        self.yf = None
        self.alphaf = None
        self.alphaf_den = None
        self.z = None
        self.x = None
        self.hann = None
        self.features_pca = []
        self.features_npca = []
        self.descriptors_pca = []
        self.descriptors_npca = []
        self.proj_mtx = None
        self.old_cov_mtx = None
        self.use_custom_extractor_pca = False
        self.use_custom_extractor_npca = False
        self.extractor_pca = []
        self.extractor_npca = []
        self.X = {}
        self.Z = {}
        self.Zc = {}
        self.k = None
        self.kf = None
        self.kf_lambda = None
        self.new_alphaf = None
        self.response = None
        self._color_names_table = None
        self.model_width = None
        self.model_height = None
        self.size_smoothing_factor = 0.4  # Increased from 0.25 to dampen size changes
        self.prev_bbox_size = None
        self.initial_roi_width = None
        self.initial_roi_height = None
        self.max_roi_scale_factor = 2.5  # Adjustable; prevents excessive growth

        if self.use_gpu and self.verbose:
            print("GPU acceleration enabled. Device:", self.device)

    def init(self, image, boundingBox):
        """
        Start (or restart) tracking the object inside `boundingBox`.

        Builds the search window (twice the box size, centred on the box), the
        Hann window, the Gaussian training target and the first filter.

        Args:
            image: frame as a NumPy array. For 2-D or single-channel input the
                'CN' descriptor is stripped from the parameter object.
            boundingBox: (x, y, w, h), top-left corner plus size, in pixels.

        Raises:
            ValueError: if the box is too small to form a search window, or if
                no features could be extracted from the frame.
        """
        if self.verbose:
            print(f"Initializing TrackerKCF with boundingBox={boundingBox}")
        if len(image.shape) == 2 or image.shape[2] == 1:
            self.params.desc_pca = self.params.desc_pca.replace('CN', '')
            self.params.desc_npca = self.params.desc_npca.replace('CN', '')

        # Return to the constructor's state so a second init() does not append
        # duplicate descriptors or reuse the previous target's model and resize flag.
        self.frame = 0
        self.resize_image = False
        self.descriptors_pca = []
        self.descriptors_npca = []
        self.X = {}
        self.Z = {}
        self.Zc = {}
        self.proj_mtx = None
        self.old_cov_mtx = None

        x, y, w, h = boundingBox
        img_to_process = image
        if self.params.resize and w * h > self.params.max_patch_size:
            # Large targets are tracked on a half-resolution copy of the frame.
            self.resize_image = True
            x /= 2.0
            y /= 2.0
            w /= 2.0
            h /= 2.0
            img_to_process = cv2.resize(image, (image.shape[1] // 2, image.shape[0] // 2), interpolation=cv2.INTER_LINEAR)
        if self.verbose:
            print(f"resize_image={self.resize_image}")

        # Search window: same centre as the box, twice its width and height.
        roi_x = x - w / 2
        roi_y = y - h / 2
        roi_w = w * 2
        roi_h = h * 2
        if int(roi_w) < 1 or int(roi_h) < 1:
            raise ValueError(f"Bounding box {boundingBox} is too small to build a tracking window")
        self.roi = (roi_x, roi_y, roi_w, roi_h)
        if self.verbose:
            print(f"roi={self.roi}")
        self.model_width = int(roi_w)
        self.model_height = int(roi_h)
        self.initial_roi_width = self.model_width
        self.initial_roi_height = self.model_height
        self.prev_bbox_size = (w, h)
        if self.verbose:
            print(f"model_width={self.model_width}, model_height={self.model_height}")
            print(f"prev_bbox_size={self.prev_bbox_size}")

        output_sigma_val = np.sqrt(w * h) * self.params.output_sigma_factor
        # Stored pre-folded as -1 / (2 * sigma^2) so the target is exp(output_sigma * d^2).
        self.output_sigma = -0.5 / (output_sigma_val * output_sigma_val)

        win_size = (self.model_width, self.model_height)
        self.hann = self._createHanningWindow(win_size)
        if self.verbose:
            print(f"hann shape={self._get_shape(self.hann)} type={type(self.hann)}")

        # Gaussian training target on the model grid, peaked at (cy - 1, cx - 1).
        sz_y, sz_x = self.model_height, self.model_width
        j, i = np.meshgrid(np.arange(sz_x), np.arange(sz_y))
        cy, cx = sz_y // 2, sz_x // 2
        dist_sq = (i - cy + 1)**2 + (j - cx + 1)**2
        target = np.exp(self.output_sigma * dist_sq)
        self.yf = self._fft2(target)
        if self.verbose:
            print(f"yf shape={self._get_shape(self.yf)} type={type(self.yf)}")

        if 'GRAY' in self.params.desc_npca.upper(): self.descriptors_npca.append('GRAY')
        if 'CN' in self.params.desc_npca.upper(): self.descriptors_npca.append('CN')
        if self.use_custom_extractor_npca: self.descriptors_npca.append('CUSTOM')

        if 'GRAY' in self.params.desc_pca.upper(): self.descriptors_pca.append('GRAY')
        if 'CN' in self.params.desc_pca.upper(): self.descriptors_pca.append('CN')
        if self.use_custom_extractor_pca: self.descriptors_pca.append('CUSTOM')

        features_pca, features_npca = self._extractFeatures(img_to_process, self.roi, resize_size=(self.model_width, self.model_height))
        if not features_pca and not features_npca:
            raise ValueError("No valid features extracted during initialization")

        # Z holds the running (uncompressed) appearance model per feature group.
        if features_npca: self.Z[1] = self._stack(features_npca)
        if features_pca: self.Z[0] = self._stack(features_pca)
        if self.verbose:
            if 0 in self.Z: print(f"Z[0] shape={self._get_shape(self.Z[0])} type={type(self.Z[0])}")
            if 1 in self.Z: print(f"Z[1] shape={self._get_shape(self.Z[1])} type={type(self.Z[1])}")

        if self.params.compress_feature and self.Z.get(0) is not None:
            self._updateProjectionMatrix(self.Z[0], self.params.pca_learning_rate, self.params.compressed_size)

        x_parts = [self.Z.get(0), self.Z.get(1)]
        if self.verbose:
            print(f"x_parts types: {[type(p) for p in x_parts if p is not None]}")
        if self.params.compress_feature and x_parts[0] is not None:
            x_parts[0] = self._compress(x_parts[0])
            if self.verbose:
                print(f"After compress, x_parts[0] type={type(x_parts[0])} shape={self._get_shape(x_parts[0])}")
        self.x = self._stack([p for p in x_parts if p is not None])
        if self.verbose:
            print(f"x shape={self._get_shape(self.x)} type={type(self.x)}")

        k = self._denseGaussKernel(self.params.sigma, self.x)
        if self.verbose:
            print(f"k shape={self._get_shape(k)} type={type(k)}")
        kf = self._fft2(k)
        if self.verbose:
            print(f"kf shape={self._get_shape(kf)} type={type(kf)}")
        kf_lambda = kf + self.params.lambda_
        if self.params.split_coeff:
            # Numerator and denominator are kept apart so update() can blend each one.
            self.alphaf = self._pixelWiseMult(self.yf, kf)
            self.alphaf_den = self._pixelWiseMult(kf, kf_lambda)
            if self.verbose:
                print(f"alphaf shape={self._get_shape(self.alphaf)} type={type(self.alphaf)}")
                print(f"alphaf_den shape={self._get_shape(self.alphaf_den)} type={type(self.alphaf_den)}")
        else:
            self.alphaf = self.yf / (kf_lambda + 1e-10)
            if self.verbose:
                print(f"alphaf shape={self._get_shape(self.alphaf)} type={type(self.alphaf)}")

    def update(self, image):
        """
        Locate the target in `image`, adapt the box size, then refresh the model.

        On the first call after init() (self.frame == 0) the detection and
        scale-search stages are skipped and only the model refresh runs.

        Args:
            image: current frame as a NumPy array.

        Returns:
            (True, (x, y, w, h)) with the box clipped to the frame and given in
            full-resolution pixels, or (False, None) when no features could be
            extracted or the response peak is below params.detect_thresh.
        """
        if self.verbose:
            print("Entering update")
        img_to_process = image
        if self.resize_image:
            img_to_process = cv2.resize(image, (image.shape[1] // 2, image.shape[0] // 2), interpolation=cv2.INTER_LINEAR)
        if self.verbose:
            print(f"img_to_process shape={img_to_process.shape}")

        if self.frame > 0:
            # ---- Detection: correlate the current window with the stored model ----
            features_pca, features_npca = self._extractFeatures(img_to_process, self.roi, resize_size=(self.model_width, self.model_height))
            if not features_pca and not features_npca:
                return False, None

            if features_npca: self.X[1] = self._stack(features_npca)
            if features_pca: self.X[0] = self._stack(features_pca)
            if self.verbose:
                if 0 in self.X: print(f"X[0] shape={self._get_shape(self.X[0])} type={type(self.X[0])}")
                if 1 in self.X: print(f"X[1] shape={self._get_shape(self.X[1])} type={type(self.X[1])}")

            if self.params.compress_feature and self.X.get(0) is not None:
                self.X[0] = self._compress(self.X[0])
                self.Zc[0] = self._compress(self.Z[0])
            else:
                self.Zc[0] = self.Z.get(0)
            self.Zc[1] = self.Z.get(1)
            if self.verbose:
                if 0 in self.Zc: print(f"Zc[0] shape={self._get_shape(self.Zc[0])} type={type(self.Zc[0])}")
                if 1 in self.Zc: print(f"Zc[1] shape={self._get_shape(self.Zc[1])} type={type(self.Zc[1])}")

            x_parts = [self.X.get(0), self.X.get(1)]
            if self.verbose:
                print(f"x_parts types in update: {[type(p) for p in x_parts if p is not None]}")
            z_parts = [self.Zc.get(0), self.Zc.get(1)]
            if self.verbose:
                print(f"z_parts types in update: {[type(p) for p in z_parts if p is not None]}")
            x = self._stack([p for p in x_parts if p is not None])
            z = self._stack([p for p in z_parts if p is not None])
            if self.verbose:
                print(f"x shape in update={self._get_shape(x)} type={type(x)}")
                print(f"z shape in update={self._get_shape(z)} type={type(z)}")

            k = self._denseGaussKernel(self.params.sigma, x, z)
            if self.verbose:
                print(f"k shape in update={self._get_shape(k)} type={type(k)}")

            kf = self._fft2(k)
            if self.verbose:
                print(f"kf shape in update={self._get_shape(kf)} type={type(kf)}")
            self.response = self._calcResponse(self.alphaf, kf)
            if self.verbose:
                print(f"response shape={self._get_shape(self.response)} type={type(self.response)}")

            if self.use_gpu:
                response_np = self.response.cpu().numpy()
            else:
                response_np = self.response
            peak_y, peak_x = np.unravel_index(np.argmax(response_np), response_np.shape)

            peak_value = response_np[peak_y, peak_x]
            if peak_value < self.params.detect_thresh:
                return False, None

            # The response is sampled on the fixed model grid, while the ROI may have
            # been resized by earlier scale adaptation. Convert the peak offset from
            # grid cells to ROI pixels before moving the window.
            resp_h, resp_w = response_np.shape[0], response_np.shape[1]
            disp_y = (peak_y - resp_h // 2 + 1) * (self.roi[3] / resp_h)
            disp_x = (peak_x - resp_w // 2 + 1) * (self.roi[2] / resp_w)

            self.roi = (self.roi[0] + disp_x, self.roi[1] + disp_y, self.roi[2], self.roi[3])
            if self.verbose:
                print(f"Updated roi after displacement={self.roi}")

            # ---- Scale search: re-score the window at a few sizes around the centre ----
            current_physical_w = self.roi[2]
            current_physical_h = self.roi[3]
            center_x = self.roi[0] + current_physical_w / 2
            center_y = self.roi[1] + current_physical_h / 2
            scales = np.arange(0.97, 1.031, 0.015)  # Fewer scales for efficiency
            peaks = []
            z_for_kernel = self._stack([p for p in [self.Zc.get(0), self.Zc.get(1)] if p is not None])
            if self.verbose:
                print(f"z_for_kernel shape={self._get_shape(z_for_kernel)} type={type(z_for_kernel)}")
            for s in scales:
                temp_physical_w = current_physical_w * s
                temp_physical_h = current_physical_h * s
                temp_roi_x = center_x - temp_physical_w / 2
                temp_roi_y = center_y - temp_physical_h / 2
                temp_roi = (temp_roi_x, temp_roi_y, temp_physical_w, temp_physical_h)
                if self.verbose:
                    print(f"Scale {s}: temp_roi={temp_roi}")
                scale_pca, scale_npca = self._extractFeatures(img_to_process, temp_roi, resize_size=(self.model_width, self.model_height))
                if not scale_pca and not scale_npca:
                    peaks.append(-np.inf)
                    continue
                x_temp = {}
                if scale_npca: x_temp[1] = self._stack(scale_npca)
                if scale_pca: x_temp[0] = self._stack(scale_pca)
                if self.params.compress_feature and x_temp.get(0) is not None:
                    x_temp[0] = self._compress(x_temp.get(0))
                x_scaled = self._stack([p for p in [x_temp.get(0), x_temp.get(1)] if p is not None])
                if self.verbose:
                    print(f"x shape for scale {s}={self._get_shape(x_scaled)} type={type(x_scaled)}")
                k_scaled = self._denseGaussKernel(self.params.sigma, x_scaled, z_for_kernel)
                kf_scaled = self._fft2(k_scaled)
                response = self._calcResponse(self.alphaf, kf_scaled)
                if self.use_gpu:
                    peak = torch.max(response).item()
                else:
                    peak = np.max(response)
                peaks.append(peak)
                if self.verbose:
                    print(f"Peak for scale {s}: {peak}")
            best_idx = int(np.argmax(peaks))
            best_s = scales[best_idx]
            if not np.isfinite(peaks[best_idx]):
                # No candidate produced features; keep the size instead of
                # letting argmax fall back to the first (smallest) scale.
                best_s = 1.0
            elif peak_value < self.params.detect_thresh * 1.5:  # Low confidence; skip scale adaptation
                best_s = 1.0
                if self.verbose:
                    print("Low confidence; skipping scale adaptation")
            if self.verbose:
                print(f"Best scale: {best_s}")
            # The object box is half the search window in each dimension.
            current_object_w = current_physical_w / 2
            current_object_h = current_physical_h / 2
            new_object_w = current_object_w * best_s
            new_object_h = current_object_h * best_s
            if self.prev_bbox_size is not None:
                # Limit the per-frame size change to a fraction of the previous size
                # (never less than 5 pixels of allowed change).
                prev_w, prev_h = self.prev_bbox_size
                max_w_change = max(5, prev_w * self.size_smoothing_factor)
                max_h_change = max(5, prev_h * self.size_smoothing_factor)
                delta_w = new_object_w - prev_w
                delta_h = new_object_h - prev_h
                if abs(delta_w) > max_w_change:
                    new_object_w = prev_w + np.sign(delta_w) * max_w_change
                if abs(delta_h) > max_h_change:
                    new_object_h = prev_h + np.sign(delta_h) * max_h_change
            self.prev_bbox_size = (new_object_w, new_object_h)
            if self.verbose:
                print(f"Updated prev_bbox_size={self.prev_bbox_size}")
            new_physical_w = new_object_w * 2
            new_physical_h = new_object_h * 2
            # Clamp to prevent unbounded growth
            max_w = min(self.initial_roi_width * self.max_roi_scale_factor, img_to_process.shape[1] * 0.5)
            max_h = min(self.initial_roi_height * self.max_roi_scale_factor, img_to_process.shape[0] * 0.5)
            new_physical_w = min(max(new_physical_w, self.initial_roi_width * 0.5), max_w)
            new_physical_h = min(max(new_physical_h, self.initial_roi_height * 0.5), max_h)
            # Resize about the same centre found after the displacement step.
            self.roi = (center_x - new_physical_w / 2, center_y - new_physical_h / 2, new_physical_w, new_physical_h)
            if self.verbose:
                print(f"Updated roi after scale={self.roi}")

        # ---- Model refresh: blend the appearance at the final ROI into the model ----
        features_pca, features_npca = self._extractFeatures(img_to_process, self.roi, resize_size=(self.model_width, self.model_height))
        if not features_pca and not features_npca:
            return False, None

        if features_npca: self.X[1] = self._stack(features_npca)
        if features_pca: self.X[0] = self._stack(features_pca)
        if self.verbose:
            if 0 in self.X: print(f"X[0] shape after update extract={self._get_shape(self.X[0])} type={type(self.X[0])}")
            if 1 in self.X: print(f"X[1] shape after update extract={self._get_shape(self.X[1])} type={type(self.X[1])}")

        interp = self.params.interp_factor
        if self.X.get(0) is not None:
            self.Z[0] = (1 - interp) * self.Z[0] + interp * self.X[0]
        if self.X.get(1) is not None:
            self.Z[1] = (1 - interp) * self.Z[1] + interp * self.X[1]

        if self.params.compress_feature and self.Z.get(0) is not None:
            self._updateProjectionMatrix(self.Z[0], self.params.pca_learning_rate, self.params.compressed_size)

        x_parts = [self.X.get(0), self.X.get(1)]
        if self.verbose:
            print(f"x_parts types for model update: {[type(p) for p in x_parts if p is not None]}")
        if self.params.compress_feature and x_parts[0] is not None:
            x_parts[0] = self._compress(x_parts[0])

        x = self._stack([p for p in x_parts if p is not None])
        if self.verbose:
            print(f"x shape for model update={self._get_shape(x)} type={type(x)}")

        k = self._denseGaussKernel(self.params.sigma, x)
        if self.verbose:
            print(f"k shape for model update={self._get_shape(k)} type={type(k)}")
        kf = self._fft2(k)
        if self.verbose:
            print(f"kf shape for model update={self._get_shape(kf)} type={type(kf)}")
        kf_lambda = kf + self.params.lambda_
        if self.params.split_coeff:
            new_alphaf = self._pixelWiseMult(self.yf, kf)
            new_alphaf_den = self._pixelWiseMult(kf, kf_lambda)
        else:
            new_alphaf = self.yf / (kf_lambda + 1e-10)

        self.alphaf = (1 - interp) * self.alphaf + interp * new_alphaf
        if self.params.split_coeff:
            self.alphaf_den = (1 - interp) * self.alphaf_den + interp * new_alphaf_den

        # Object box: centred in the search window, half its size.
        rx, ry, rw, rh = self.roi
        obj_w, obj_h = rw / 2, rh / 2
        obj_x, obj_y = rx + rw / 2, ry + rh / 2

        if self.resize_image:
            # Tracking ran on the half-resolution frame; map back to full resolution.
            obj_x *= 2
            obj_y *= 2
            obj_w *= 2
            obj_h *= 2

        img_h, img_w = image.shape[:2]
        left = max(0.0, obj_x - obj_w / 2)
        top = max(0.0, obj_y - obj_h / 2)
        right = min(float(img_w), obj_x + obj_w / 2)
        bottom = min(float(img_h), obj_y + obj_h / 2)
        final_bbox = (left, top, right - left, bottom - top)
        if self.verbose:
            print(f"Final bbox={final_bbox}")

        self.frame += 1
        return True, final_bbox

    def setFeatureExtractor(self, callback, pca_func=False):
        if pca_func:
            self.extractor_pca.append(callback)
            self.use_custom_extractor_pca = True
        else:
            self.extractor_npca.append(callback)
            self.use_custom_extractor_npca = True

    def _get_shape(self, obj):
        if self.use_gpu and torch.is_tensor(obj):
            return obj.shape
        else:
            return obj.shape if hasattr(obj, 'shape') else None

    def _stack(self, arrays):
        if self.use_gpu:
            tensors = []
            for a in arrays:
                if a is not None:
                    if torch.is_tensor(a):
                        tensors.append(a.to(self.device).float())
                    else:
                        tensors.append(torch.from_numpy(a).to(self.device).float())
            return torch.dstack(tensors) if tensors else None
        else:
            return np.dstack(arrays) if arrays else None

    def _to_tensor(self, arr):
        if self.use_gpu and not torch.is_tensor(arr):
            return torch.from_numpy(arr).to(self.device).float()
        return arr

    def _to_numpy(self, tensor):
        if self.use_gpu and torch.is_tensor(tensor):
            return tensor.cpu().numpy()
        return tensor

    def _fft2(self, src):
        if self.use_gpu:
            src = self._to_tensor(src)
            return torch.fft.fft2(src, dim=(0, 1))
        else:
            return fft.fft2(src, axes=(0, 1))

    def _ifft2(self, src):
        if self.use_gpu:
            result = torch.fft.ifft2(src, dim=(0, 1)).real
            return result
        else:
            return np.real(fft.ifft2(src, axes=(0, 1)))

    def _pixelWiseMult(self, s1, s2, conjB=False):
        s1 = self._to_tensor(s1)
        s2 = self._to_tensor(s2)
        if conjB:
            result = s1 * torch.conj(s2) if self.use_gpu else s1 * np.conj(s2)
        else:
            result = s1 * s2
        return result

    def _sumChannels(self, src):
        src = self._to_tensor(src)
        if self.use_gpu:
            return torch.sum(src, dim=2) if src.dim() == 3 else src
        else:
            return np.sum(src, axis=2) if src.ndim == 3 else src

    def _createHanningWindow(self, s):
        """
        Build a float32 Hann window with OpenCV.

        Args:
            s: window size as given to cv2.createHanningWindow; the caller in
               this class passes (width, height).

        Returns:
            A float32 tensor on self.device when use_gpu is set, else the NumPy array.
        """
        window = cv2.createHanningWindow(s, cv2.CV_32F)
        if not self.use_gpu:
            return window
        return torch.from_numpy(window).to(self.device).float()

    def _shift(self, mat, dx, dy):
        if self.use_gpu:
            mat = self._to_tensor(mat)
            return torch.roll(mat, shifts=(dy, dx), dims=(0, 1))
        else:
            return np.roll(mat, (dy, dx), axis=(0, 1))
    
    def _extractFeatures(self, image, roi, resize_size=None):
        if self.verbose:
            print(f"Entering _extractFeatures with roi={roi}, resize_size={resize_size}")
        features_npca = []
        features_pca = []

        for d in self.descriptors_npca:
            if d == 'CUSTOM':
                continue
            if self.verbose:
                print(f"Processing descriptor_npca: {d}")
            patch = self._get_patch(image, roi)
            if patch is None:
                if self.verbose:
                    print("Patch is None for npca")
                continue
            if self.verbose:
                print(f"Patch shape for npca {d}: {patch.shape}")
            if resize_size:
                if self.use_gpu:
                    patch_tensor = torch.from_numpy(patch).to(self.device).permute(2, 0, 1).unsqueeze(0).float()
                    patch = torch.nn.functional.interpolate(patch_tensor, size=(resize_size[1], resize_size[0]), mode='bilinear', align_corners=False).squeeze(0).permute(1, 2, 0).cpu().numpy()
                else:
                    patch = cv2.resize(patch, resize_size, interpolation=cv2.INTER_LINEAR)
                if self.verbose:
                    print(f"Resized patch shape for npca {d}: {patch.shape}")
            if d == 'GRAY':
                f = self._extractGray(patch, apply_hann=False)
            elif d == 'CN':
                f = self._extractCN(patch, apply_hann=False)
            else:
                f = None
            if f is not None:
                if self.verbose:
                    print(f"f shape for npca {d}: {f.shape}")
                if f.ndim == 2:
                    f = f[:, :, np.newaxis]
                    if self.verbose:
                        print(f"After newaxis for npca {d}: {f.shape}")
                hann_expanded = self._to_tensor(self.hann)[:, :, None] if self.use_gpu else self.hann[:, :, np.newaxis]
                if self.verbose:
                    print(f"hann_expanded shape for npca {d}: {self._get_shape(hann_expanded)} type={type(hann_expanded)}")
                f_tensor = self._to_tensor(f)
                if self.verbose:
                    print(f"f_tensor shape before multiply: {self._get_shape(f_tensor)} type={type(f_tensor)}")
                multiplied = f_tensor * hann_expanded
                f = self._to_numpy(multiplied)
                if self.verbose:
                    print(f"After hann multiply for npca {d}: {f.shape} type={type(f)}")
                features_npca.append(f)

        for d in self.descriptors_pca:
            if d == 'CUSTOM':
                continue
            if self.verbose:
                print(f"Processing descriptor_pca: {d}")
            patch = self._get_patch(image, roi)
            if patch is None:
                if self.verbose:
                    print("Patch is None for pca")
                continue
            if self.verbose:
                print(f"Patch shape for pca {d}: {patch.shape}")
            if resize_size:
                if self.use_gpu:
                    patch_tensor = torch.from_numpy(patch).to(self.device).permute(2, 0, 1).unsqueeze(0).float()
                    patch = torch.nn.functional.interpolate(patch_tensor, size=(resize_size[1], resize_size[0]), mode='bilinear', align_corners=False).squeeze(0).permute(1, 2, 0).cpu().numpy()
                else:
                    patch = cv2.resize(patch, resize_size, interpolation=cv2.INTER_LINEAR)
                if self.verbose:
                    print(f"Resized patch shape for pca {d}: {patch.shape}")
            if d == 'GRAY':
                f = self._extractGray(patch, apply_hann=False)
            elif d == 'CN':
                f = self._extractCN(patch, apply_hann=False)
            else:
                f = None
            if f is not None:
                if self.verbose:
                    print(f"f shape for pca {d}: {f.shape}")
                if f.ndim == 2:
                    f = f[:, :, np.newaxis]
                    if self.verbose:
                        print(f"After newaxis for pca {d}: {f.shape}")
                hann_expanded = self._to_tensor(self.hann)[:, :, None] if self.use_gpu else self.hann[:, :, np.newaxis]
                if self.verbose:
                    print(f"hann_expanded shape for pca {d}: {self._get_shape(hann_expanded)} type={type(hann_expanded)}")
                f_tensor = self._to_tensor(f)
                if self.verbose:
                    print(f"f_tensor shape before multiply: {self._get_shape(f_tensor)} type={type(f_tensor)}")
                multiplied = f_tensor * hann_expanded
                f = self._to_numpy(multiplied)
                if self.verbose:
                    print(f"After hann multiply for pca {d}: {f.shape} type={type(f)}")
                features_pca.append(f)

        for extractor in self.extractor_npca:
            if self.verbose:
                print("Processing custom extractor_npca")
            feat = extractor(image, roi)
            if feat is not None:
                if self.verbose:
                    print(f"Custom feat shape npca: {feat.shape}")
                if feat.ndim == 2:
                    feat = feat[:, :, np.newaxis]
                    if self.verbose:
                        print(f"After newaxis custom npca: {feat.shape}")
                hann_expanded = self._to_tensor(self.hann)[:, :, None] if self.use_gpu else self.hann[:, :, np.newaxis]
                if self.verbose:
                    print(f"hann_expanded shape custom npca: {self._get_shape(hann_expanded)} type={type(hann_expanded)}")
                feat_tensor = self._to_tensor(feat)
                if self.verbose:
                    print(f"feat_tensor shape before multiply custom npca: {self._get_shape(feat_tensor)} type={type(feat_tensor)}")
                multiplied = feat_tensor * hann_expanded
                feat = self._to_numpy(multiplied)
                if self.verbose:
                    print(f"After hann multiply custom npca: {feat.shape} type={type(feat)}")
                features_npca.append(feat)
        
        for extractor in self.extractor_pca:
            if self.verbose:
                print("Processing custom extractor_pca")
            feat = extractor(image, roi)
            if feat is not None:
                if self.verbose:
                    print(f"Custom feat shape pca: {feat.shape}")
                if feat.ndim == 2:
                    feat = feat[:, :, np.newaxis]
                    if self.verbose:
                        print(f"After newaxis custom pca: {feat.shape}")
                hann_expanded = self._to_tensor(self.hann)[:, :, None] if self.use_gpu else self.hann[:, :, np.newaxis]
                if self.verbose:
                    print(f"hann_expanded shape custom pca: {self._get_shape(hann_expanded)} type={type(hann_expanded)}")
                feat_tensor = self._to_tensor(feat)
                if self.verbose:
                    print(f"feat_tensor shape before multiply custom pca: {self._get_shape(feat_tensor)} type={type(feat_tensor)}")
                multiplied = feat_tensor * hann_expanded
                feat = self._to_numpy(multiplied)
                if self.verbose:
                    print(f"After hann multiply custom pca: {feat.shape} type={type(feat)}")
                features_pca.append(feat)
            
        return features_pca, features_npca

    def _get_patch(self, img, roi):
        if self.verbose:
            print(f"Getting patch for roi={roi}")
        x, y, w, h = map(int, roi)
        img_h, img_w = img.shape[:2]
        vx, vy = max(0, x), max(0, y)
        rb, bb = min(x + w, img_w), min(y + h, img_h)
        vw, vh = rb - vx, bb - vy
        if vw <= 0 or vh <= 0:
            return None
        sub = img[vy:bb, vx:rb]
        top, bot = max(0, -y), max(0, y + h - img_h)
        left, rgt = max(0, -x), max(0, x + w - img_w)
        patch = cv2.copyMakeBorder(sub, top, bot, left, rgt, cv2.BORDER_REPLICATE)
        if self.verbose:
            print(f"Returned patch shape={patch.shape}")
        return patch

    def _extractGray(self, p, apply_hann=True):
        if p is None or p.size == 0: return None
        if self.use_gpu:
            p_tensor = torch.from_numpy(p).to(self.device).float()
            if p_tensor.dim() == 3:
                f = torch.mean(p_tensor, dim=2)
            else:
                f = p_tensor
            f = f / 255.0 - 0.5
            if apply_hann:
                hann = self._to_tensor(self.hann)
                f = f * hann
            return f.cpu().numpy()
        else:
            f = cv2.cvtColor(p, cv2.COLOR_BGR2GRAY) if p.ndim > 2 else p
            f = f.astype(np.float32) / 255.0 - 0.5
            if apply_hann:
                f = f * self.hann
            return f

    def _extractCN(self, p, apply_hann=True):
        if p is None or p.size == 0: return None
        if self._color_names_table is None: self._loadColorNamesTable()
        if self.use_gpu:
            if isinstance(self._color_names_table, np.ndarray):
                self._color_names_table = torch.from_numpy(self._color_names_table).to(self.device).float()
            p_tensor = torch.from_numpy(p).to(self.device).long()
            px = p_tensor.view(-1, 3) // 8
            idx = px[:, 2] + 32 * px[:, 1] + 1024 * px[:, 0]
            f = self._color_names_table[idx].view(p.shape[0], p.shape[1], -1)
            if apply_hann:
                hann = self._to_tensor(self.hann)[:, :, None]
                f = f * hann
            return f.cpu().numpy()
        else:
            px = p.reshape(-1, 3).astype(np.int32)
            idx = (px[:, 2] // 8) + 32 * (px[:, 1] // 8) + 1024 * (px[:, 0] // 8)
            f = self._color_names_table[idx].reshape(p.shape[0], p.shape[1], 10).astype(np.float32)
            if apply_hann:
                f = f * self.hann[:, :, np.newaxis]
            return f

    def _loadColorNamesTable(self, path="colornames.npy"):
        try:
            self._color_names_table = np.load(path)
        except IOError:
            raise IOError(f"Color table '{path}' not found. Ensure the file exists or generate it from the original ColorNames array.")

    def _denseGaussKernel(self, sigma, x, y=None):
        """
        Evaluate the dense Gaussian kernel between two feature maps.

        With ``xy`` being the inverse FFT of the channel-summed product of
        ``fft2(x)`` and ``fft2(y)`` (multiplied with ``conjB=True``), the
        result is::

            exp(-max(0, (sum(x**2) + sum(y**2) - 2 * xy) / N) / sigma**2)

        where ``N`` is the total number of elements in ``x``.

        Args:
            sigma: Gaussian kernel bandwidth.
            x: First feature map (converted with ``self._to_tensor``).
            y: Optional second feature map; when omitted, ``x`` is used for
                both operands.

        Returns:
            The kernel values: the output of ``torch.exp`` on the GPU path,
            of ``np.exp`` otherwise.
        """
        if self.verbose:
            print("Entering _denseGaussKernel")
        x = self._to_tensor(x)
        if self.verbose:
            print(f"x shape in kernel={self._get_shape(x)} type={type(x)}")
        if y is None:
            # Auto-correlation case: both operands are the same map.
            y = x
        else:
            y = self._to_tensor(y)
            if self.verbose:
                print(f"y shape in kernel={self._get_shape(y)} type={type(y)}")
        xf = self._fft2(x)
        yf = self._fft2(y)
        if self.verbose:
            print(f"xf shape={self._get_shape(xf)} type={type(xf)}, yf shape={self._get_shape(yf)} type={type(yf)}")
        # Squared norms of both inputs (scalars).
        if self.use_gpu:
            nx_sq = torch.sum(x ** 2).item()
            ny_sq = torch.sum(y ** 2).item()
        else:
            nx_sq = np.sum(x**2)
            ny_sq = np.sum(y**2)
        if self.verbose:
            print(f"nx_sq={nx_sq}, ny_sq={ny_sq}")
        # Frequency-domain product; presumably xf * conj(yf) — confirm against _pixelWiseMult.
        xyf = self._pixelWiseMult(xf, yf, conjB=True)
        if self.verbose:
            print(f"xyf shape={self._get_shape(xyf)} type={type(xyf)}")
        xy_sum = self._sumChannels(xyf)
        if self.verbose:
            print(f"xy_sum shape={self._get_shape(xy_sum)} type={type(xy_sum)}")
        xy = self._ifft2(xy_sum)
        if self.verbose:
            print(f"xy shape={self._get_shape(xy)} type={type(xy)}")
        if self.params.wrap_kernel:
            # Optional shift of the correlation map by half its height/width.
            shift_dy = int(self._get_shape(x)[0] // 2)
            shift_dx = int(self._get_shape(x)[1] // 2)
            xy = self._shift(xy, shift_dx, shift_dy)
        # Normalised squared distance, clamped at zero before the exponential.
        if self.use_gpu:
            d = (nx_sq + ny_sq - 2 * xy) / x.numel()
            k = torch.exp(torch.clamp(d, min=0) / (sigma**2 * -1))
        else:
            d = (nx_sq + ny_sq - 2 * xy) / x.size
            k = np.exp(-np.maximum(0, d) / (sigma**2))
        if self.verbose:
            print(f"k shape returned={self._get_shape(k)} type={type(k)}")
        return k

    def _calcResponse(self, alphaf, kf):
        """
        Turn the filter coefficients and a kernel spectrum into a response map.

        The element-wise product of ``alphaf`` and ``kf`` is inverse
        transformed. When ``params.split_coeff`` is enabled the product is
        first divided by ``self.alphaf_den`` (plus a small epsilon).

        Args:
            alphaf: Filter coefficients (the numerator when coefficients are split).
            kf: Kernel spectrum.

        Returns:
            Whatever ``self._ifft2`` produces for the resulting spectrum.
        """
        if self.verbose:
            print("Entering _calcResponse")
        alphaf = self._to_tensor(alphaf)
        kf = self._to_tensor(kf)
        if self.verbose:
            print(f"alphaf shape={self._get_shape(alphaf)} type={type(alphaf)}, kf shape={self._get_shape(kf)} type={type(kf)}")

        spectrum = self._pixelWiseMult(alphaf, kf)
        if self.params.split_coeff:
            # Split coefficients: normalise by the stored denominator.
            spectrum = spectrum / (self._to_tensor(self.alphaf_den) + 1e-10)
        return self._ifft2(spectrum)

    def _updateProjectionMatrix(self, src, pca_rate, compressed_sz):
        """
        Refresh the PCA projection matrix from a new feature map.

        The per-channel covariance of ``src`` is blended into
        ``self.old_cov_mtx`` with weight ``pca_rate``, an SVD of the blended
        matrix yields the leading ``compressed_sz`` directions (stored in
        ``self.proj_mtx``), and the running covariance is then blended once
        more with the rank-reduced reconstruction.

        NOTE(review): the running covariance is therefore updated twice per
        call (first with the new covariance, then with the rank-reduced
        term) — confirm this is intended.

        Args:
            src: Feature map of shape (H, W) or (H, W, C).
            pca_rate: Blending weight for new information.
            compressed_sz: Requested number of output channels; capped at the
                number of input channels.
        """
        if self.verbose:
            print("Entering _updateProjectionMatrix")
        src = self._to_tensor(src)
        if self.verbose:
            print(f"src shape={self._get_shape(src)} type={type(src)}")

        n_pixels = src.shape[0] * src.shape[1]
        n_channels = 1 if len(src.shape) <= 2 else src.shape[2]
        keep = min(compressed_sz, n_channels)
        samples = src.reshape(n_pixels, n_channels)

        # Sample covariance of the channels over all pixels.
        if self.use_gpu:
            centered = samples - torch.mean(samples, dim=0)
            frame_cov = (centered.t() @ centered) / (n_pixels - 1)
        else:
            centered = samples - np.mean(samples, axis=0)
            frame_cov = (centered.T @ centered) / (n_pixels - 1)

        if self.old_cov_mtx is None:
            self.old_cov_mtx = frame_cov
        else:
            self.old_cov_mtx = self._to_tensor(self.old_cov_mtx)
            self.old_cov_mtx = (1 - pca_rate) * self.old_cov_mtx + pca_rate * frame_cov

        svd = torch.linalg.svd if self.use_gpu else linalg.svd
        basis, sing_vals, _ = svd(self.old_cov_mtx, full_matrices=True)
        self.proj_mtx = basis[:, :keep]

        # Rank-reduced reconstruction from the kept directions.
        if self.use_gpu:
            reduced = self.proj_mtx @ torch.diag(sing_vals[:keep]) @ self.proj_mtx.t()
        else:
            reduced = self.proj_mtx @ np.diag(sing_vals[:keep]) @ self.proj_mtx.T
        self.old_cov_mtx = (1 - pca_rate) * self.old_cov_mtx + pca_rate * reduced

        if self.verbose:
            print(f"proj_mtx shape={self._get_shape(self.proj_mtx)} type={type(self.proj_mtx)}")

    def _compress(self, src):
        """
        Project a feature map onto the current PCA basis.

        Args:
            src: Feature map of shape (H, W) or (H, W, C).

        Returns:
            The (H, W, K) projected map, where K is the number of columns of
            ``self.proj_mtx``. Returned as-is on the GPU path and passed
            through ``self._to_numpy`` otherwise.
        """
        if self.verbose:
            print(f"Compressing src shape={self._get_shape(src)} type={type(src)}")
        src = self._to_tensor(src)
        self.proj_mtx = self._to_tensor(self.proj_mtx)

        n_channels = src.shape[2] if len(src.shape) > 2 else 1
        # The matrix product is written the same way for both backends.
        projected = src.reshape(-1, n_channels) @ self.proj_mtx
        result = projected.reshape(src.shape[0], src.shape[1], -1)

        if self.verbose:
            print(f"Compressed shape={self._get_shape(result)} type={type(result)}")
        return result if self.use_gpu else self._to_numpy(result)

## TrackingSystem Class

In [None]:
class TrackingSystem:
    def __init__(self, model='./yolo11n.pt', tracker_type='CKCF', base_detect_interval=24, conf_threshold=0.5, 
                 max_lost_frames=30, lost_track_buffer=60,
                 use_kalman=True, track_classes=None, 
                 appearance_weight=0.6, match_cost_threshold=0.85, 
                 reid_cost_threshold=0.3, occlusion_iou_threshold=0.2,
                 iou_gating_threshold=0.1, **kwargs):
        """
        Set up detection, short-term tracking and re-identification.

        Args:
            model: Path of the YOLO weights passed to ``YOLO``.
            tracker_type: Key into ``tracker_constructors`` (case-insensitive).
            base_detect_interval: Detection and matching run on frames whose
                index is a multiple of this value.
            conf_threshold: Minimum detection confidence.
            max_lost_frames: ``lost_frames`` value at which a track is moved
                to the lost buffer.
            lost_track_buffer: Maximum number of lost tracks kept for Re-ID.
            use_kalman: Whether tracks carry a Kalman filter.
            track_classes: Class names to keep; ``None`` or empty keeps all.
            appearance_weight: Weight of the appearance cost; the IoU cost
                gets ``1 - appearance_weight``.
            match_cost_threshold: Maximum cost for a track/detection match.
            reid_cost_threshold: Maximum appearance cost for reviving a lost track.
            occlusion_iou_threshold: IoU above which an unmatched track is
                considered occluded by a matched one.
            iou_gating_threshold: Pairs with a lower IoU are never matched.
            **kwargs: Accepted for compatibility and ignored.

        Raises:
            ValueError: If ``tracker_type`` is not a known tracker.
        """
        # --- Tracker Constructors ---
        # Validated before anything else so that a typo in tracker_type fails
        # immediately instead of after the model weights have been loaded.
        self.tracker_constructors = {
            'CSRT': cv2.legacy.TrackerCSRT_create, 'KCF': cv2.legacy.TrackerKCF_create,
            'MOSSE': cv2.legacy.TrackerMOSSE_create, 'MEDIAN_FLOW': cv2.legacy.TrackerMedianFlow_create,
            "CKCF": KCFLOW,
        }
        tracker_key = tracker_type.upper()
        if tracker_key not in self.tracker_constructors:
            raise ValueError(f"Invalid tracker type: {tracker_type}. Choose from {list(self.tracker_constructors.keys())}")
        self.tracker_type = tracker_key

        # --- Check Cuda Presence ---
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f'object tracker running on {self.device}')

        # --- Core Parameters ---
        self.detect_interval = base_detect_interval
        self.conf_threshold = conf_threshold
        self.max_lost_frames = max_lost_frames
        self.track_classes = track_classes if track_classes is not None else []
        self.model = YOLO(model).to(self.device)
        print('--- Yolo loaded successfully ---')

        # --- Cost & Matching Parameters ---
        self.appearance_weight = appearance_weight
        self.match_cost_threshold = match_cost_threshold
        self.reid_cost_threshold = reid_cost_threshold
        self.occlusion_iou_threshold = occlusion_iou_threshold
        self.iou_gating_threshold = iou_gating_threshold

        # --- State Management ---
        self.frame_idx = 0
        self.tracked_objects = []
        self.lost_tracks = deque(maxlen=lost_track_buffer)
        self.next_track_id = 0

        # --- Kalman Filter ---
        self.use_kalman = use_kalman

        # --- Re-ID Models & Warm-up ---
        # One dummy forward pass per configured class so the first real frame
        # does not pay the initialisation cost.
        self.reid_models = self._load_reid_models()
        for config in self.reid_models.values():
            reid_model = config['model']
            dummy_input = torch.randn(1, 3, 256, 128).to(self.device)
            with torch.no_grad():
                reid_model(dummy_input)
        print("--- osnet loaded successfuly ---")

        print(f"--- Object Tracker Initialized ---")

    def _load_reid_models(self):
        """
        Build the per-class Re-ID configuration.

        An ``osnet_x1_0`` network is created, its weights are read from
        ``osnet_x1_0_msmt17_256x128.pth``, and it is put in eval mode on
        ``self.device``. Both the ``'person'`` and ``'car'`` entries use this
        same network and the same 256x128 evaluation transform.

        Returns:
            Dict mapping class name to ``{'model': ..., 'transform': ...}``.
        """
        osnet = torchreid.models.build_model(name='osnet_x1_0', num_classes=4101, pretrained=False)
        torchreid.utils.load_pretrained_weights(osnet, 'osnet_x1_0_msmt17_256x128.pth')
        osnet.to(self.device).eval()

        # build_transforms returns a pair; only the first element is kept.
        eval_transform, _ = torchreid.data.transforms.build_transforms(height=256, width=128, is_train=False)

        return {
            cls_name: {'model': osnet, 'transform': eval_transform}
            for cls_name in ('person', 'car')
        }

    def _extract_embedding(self, frame, bbox, track):
        """Extracts a feature embedding from a single bounding box."""
        reid_config = track.get('reid_config')
        if not reid_config: return None
        model, transform = reid_config['model'], reid_config['transform']
        
        x1, y1, x2, y2 = [int(c) for c in bbox]
        roi = frame[y1:y2, x1:x2]
        if roi.size == 0: return None

        roi_rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
        image_tensor = transform(Image.fromarray(roi_rgb)).unsqueeze(0).to(self.device)
        
        with torch.no_grad():
            embedding = model(image_tensor)
        
        return torch.nn.functional.normalize(embedding, p=2, dim=1).cpu().numpy().flatten()

    def _extract_batch_embeddings(self, frame, detections):
        """Extracts embeddings for a batch of detections, grouped by class."""
        grouped_dets = {}
        for i, det in enumerate(detections):
            cls_name = det['class_name']
            if cls_name in self.reid_models:
                grouped_dets.setdefault(cls_name, []).append((i, det))

        for cls_name, dets_with_indices in grouped_dets.items():
            reid_config = self.reid_models[cls_name]
            model, transform = reid_config['model'], reid_config['transform']
            
            rois_with_indices = []
            for original_idx, det in dets_with_indices:
                x1, y1, x2, y2 = det['x1'], det['y1'], det['x2'], det['y2']
                roi = frame[y1:y2, x1:x2]
                if roi.size > 0:
                    roi_rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
                    rois_with_indices.append({'roi': Image.fromarray(roi_rgb), 'idx': original_idx})
            
            if not rois_with_indices: continue

            batch_tensor = torch.stack([transform(item['roi']) for item in rois_with_indices]).to(self.device)
            with torch.no_grad():
                batch_embeddings = model(batch_tensor)
            
            normalized_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1).cpu().numpy()

            for item, emb in zip(rois_with_indices, normalized_embeddings):
                detections[item['idx']]['embedding'] = emb

    def process_frame(self, frame):
        """Main processing function for each frame."""
        self._apply_boundary_conditions(frame.shape)

        newly_lost = [t for t in self.tracked_objects if t['lost_frames'] >= self.max_lost_frames]
        for t in newly_lost:
            t['state'] = 'LOST'
            self.lost_tracks.append(t)
        self.tracked_objects = [t for t in self.tracked_objects if t['lost_frames'] < self.max_lost_frames]

        if self.use_kalman: self._predict_phase()
        self._update_phase(frame)
        
        if self.frame_idx % self.detect_interval == 0:
            self._match_and_update_phase(frame)

        annotated_frame = self._drawing_phase(frame)
        self.frame_idx += 1
        return annotated_frame
    
    def _apply_boundary_conditions(self, frame_shape):
        if not self.tracked_objects: return
        
        bboxes = np.array([t['bbox'] for t in self.tracked_objects])
        visibility = self._get_box_visibility(bboxes, frame_shape)
        
        for i, track in enumerate(self.tracked_objects):
            if visibility[i] < 0.5:
                track['lost_frames'] = self.max_lost_frames

    def _predict_phase(self):
        for obj in self.tracked_objects:
            obj['kf'].predict()
            predicted_state = obj['kf'].statePost
            cx, cy, w, h = predicted_state[:4]
            obj['bbox'] = (int(cx - w/2), int(cy - h/2), int(cx + w/2), int(cy + h/2))

    def _update_phase(self, frame):
        for obj in self.tracked_objects:
            if obj['state'] == 'OCCLUDED': continue
            success, bbox = obj['tracker'].update(frame)
            if success:
                x1, y1, w, h = [int(v) for v in bbox]
                obj['bbox'] = (x1, y1, x1 + w, y1 + h)
                obj['lost_frames'] = 0 # Reset lost counter on successful short-term track
                if self.use_kalman:
                    measurement = np.array([x1 + w/2, y1 + h/2, w, h], dtype=np.float32)
                    obj['kf'].correct(measurement)
            else:
                obj['lost_frames'] += 1

    def _match_and_update_phase(self, frame):
        detections = self.detect(frame)
        if not detections: return
        self._extract_batch_embeddings(frame, detections)
        
        # --- Stage 1: Match Active Tracks with Detections ---
        if self.tracked_objects:
            cost_matrix = self._build_cost_matrix(self.tracked_objects, detections)
            track_indices, det_indices = linear_sum_assignment(cost_matrix)
            
            matched_track_indices = set()
            matched_det_indices = set()
            for t_idx, d_idx in zip(track_indices, det_indices):
                if cost_matrix[t_idx, d_idx] < self.match_cost_threshold:
                    self._update_matched_track(frame, self.tracked_objects[t_idx], detections[d_idx])
                    matched_track_indices.add(t_idx)
                    matched_det_indices.add(d_idx)
            
            unmatched_track_indices = set(range(len(self.tracked_objects))) - matched_track_indices
            for t_idx in unmatched_track_indices:
                self._handle_unmatched_track(t_idx, matched_track_indices)
        else:
            matched_det_indices = set()
        
        # --- Stage 2: Re-identify Lost Tracks with Unmatched Detections ---
        unmatched_dets = [d for i, d in enumerate(detections) if i not in matched_det_indices]
        if unmatched_dets and self.lost_tracks:
            reid_cost_matrix = self._build_cost_matrix(list(self.lost_tracks), unmatched_dets, only_appearance=True)
            lost_indices, reid_det_indices = linear_sum_assignment(reid_cost_matrix)

            revived_lost_indices = set()
            for lt_idx, d_idx in zip(lost_indices, reid_det_indices):
                if reid_cost_matrix[lt_idx, d_idx] < self.reid_cost_threshold:
                    revived_track = self.lost_tracks[lt_idx]
                    detection = unmatched_dets[d_idx]
                    
                    self._update_matched_track(frame, revived_track, detection)
                    self.tracked_objects.append(revived_track)
                    revived_lost_indices.add(lt_idx)
            
            self.lost_tracks = deque([t for i, t in enumerate(self.lost_tracks) if i not in revived_lost_indices], maxlen=self.lost_tracks.maxlen)
    
    def _build_cost_matrix(self, tracks, detections, only_appearance=False):
        """Builds the cost matrix using vectorized GPU operations."""
        num_tracks = len(tracks)
        num_dets = len(detections)
        if num_tracks == 0 or num_dets == 0:
            return np.empty((num_tracks, num_dets))

        # --- Prepare data as tensors on the GPU ---
        track_embeddings = torch.tensor(
            np.array([t['embedding_gallery'][-1] for t in tracks if t['embedding_gallery']]),
            device=self.device, dtype=torch.float32
        )
        det_embeddings = torch.tensor(
            np.array([d['embedding'] for d in detections if 'embedding' in d]),
            device=self.device, dtype=torch.float32
        )

        # --- Vectorized Appearance Cost (Cosine Distance on GPU) ---
        # 1 - cosine_similarity = cosine distance
        app_cost = 1 - (track_embeddings @ det_embeddings.T)
        
        if only_appearance:
            return app_cost.cpu().numpy()

        # --- Vectorized IoU Cost (on GPU) ---
        track_bboxes = torch.tensor([t['bbox'] for t in tracks], device=self.device, dtype=torch.float32)
        det_bboxes = torch.tensor(
            [[d['x1'], d['y1'], d['x2'], d['y2']] for d in detections], 
            device=self.device, dtype=torch.float32
        )
        iou_matrix = self._calculate_iou(track_bboxes, det_bboxes)
        iou_cost = 1 - iou_matrix

        # --- Vectorized Class Mismatch Mask (on GPU) ---
        track_classes = np.array([t['class_name'] for t in tracks])
        det_classes = np.array([d['class_name'] for d in detections])
        mismatch_mask = torch.tensor(track_classes[:, None] != det_classes, device=self.device)
        
        # --- Combine Costs ---
        cost_matrix = (self.appearance_weight * app_cost) + ((1 - self.appearance_weight) * iou_cost)
        cost_matrix[mismatch_mask] = 1e6  # Invalidate non-matching classes
        cost_matrix[iou_matrix < self.iou_gating_threshold] = 1e6 # Apply IoU gating

        return cost_matrix.cpu().numpy()

    def _update_matched_track(self, frame, track, det):
        x1, y1, x2, y2 = det['x1'], det['y1'], det['x2'], det['y2']
        w, h = x2 - x1, y2 - y1
        
        track['bbox'] = (x1, y1, x2, y2)
        track['lost_frames'] = 0
        track['state'] = 'CONFIRMED'
            
        new_cv_tracker = self.tracker_constructors[self.tracker_type]()
        new_cv_tracker.init(frame, (x1, y1, w, h))
        track['tracker'] = new_cv_tracker

        if 'embedding' in det and det['embedding'] is not None:
            track['embedding_gallery'].append(det['embedding'])
            
        if self.use_kalman:
            measurement = np.array([x1 + w/2, y1 + h/2, w, h], dtype=np.float32)
            track['kf'].correct(measurement)
            track['kf'].statePost[4:] = 0; track['kf'].statePre[4:] = 0

    def _handle_unmatched_track(self, t_idx, matched_track_indices):
        track = self.tracked_objects[int(t_idx)]
        # Check for occlusion against currently tracked (matched) objects
        if matched_track_indices:
            matched_bboxes = np.array([self.tracked_objects[int(m_idx)]['bbox'] for m_idx in matched_track_indices])
            ious = self._calculate_iou_numpy(np.array([track['bbox']]), matched_bboxes)
            if np.max(ious) > self.occlusion_iou_threshold:
                track['state'] = 'OCCLUDED'
                return

        track['lost_frames'] += 1
        if track['state'] == 'OCCLUDED': track['state'] = 'CONFIRMED'

    def add_manual_track(self, frame, bbox, class_name):
        if class_name not in self.reid_models:
            print(f"Warning: No Re-ID model for class '{class_name}'.")
            return

        x1, y1, x2, y2 = [int(c) for c in bbox]
        w = x2 - x1
        h = y2 - y1
        if w <= 0 or h <= 0:
            print("Warning: Invalid bounding box dimensions.")
            return

        new_track = {
            'id': self.next_track_id, 'class_name': class_name,
            'bbox': (x1, y1, x2, y2), 'lost_frames': 0, 'state': 'CONFIRMED', 
            'embedding_gallery': deque(maxlen=20),
            'reid_config': self.reid_models[class_name]
        }

        tracker = self.tracker_constructors[self.tracker_type]()
        tracker.init(frame, (x1, y1, w, h))
        new_track['tracker'] = tracker

        embedding = self._extract_embedding(frame, new_track['bbox'], new_track)
        if embedding is not None: new_track['embedding_gallery'].append(embedding)

        if self.use_kalman:
            new_track['kf'] = self._create_kalman_filter()
            cx, cy = x1 + w/2, y1 + h/2
            new_track['kf'].statePost = np.array([cx, cy, w, h, 0, 0, 0, 0], dtype=np.float32)
        
        self.tracked_objects.append(new_track)
        self.next_track_id += 1
                                
    def _drawing_phase(self, frame):
        frame_copy = frame.copy()
        if not self.tracked_objects: return frame_copy

        bboxes = np.array([obj['bbox'] for obj in self.tracked_objects])
        visibilities = self._get_box_visibility(bboxes, frame.shape)

        for i, obj in enumerate(self.tracked_objects):
            if obj['state'] == 'OCCLUDED' or visibilities[i] < 0.7: continue
            
            color = (0, 255, 0)
            x1, y1, x2, y2 = [int(c) for c in obj['bbox']]
            label = f"{obj['class_name']} {obj['id']}"
            cv2.rectangle(frame_copy, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame_copy, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
        return frame_copy

    # --- Utility Methods ---
    def detect(self, frame):
        results = self.model(frame, verbose=False)[0]
        detections = []
        for box in results.boxes:
            conf = box.conf[0].item()
            if conf > self.conf_threshold:
                class_name = self.model.names[int(box.cls[0].item())]
                if self.track_classes and class_name not in self.track_classes: continue
                coords = box.xyxy[0].tolist()
                detections.append({
                    'class_name': class_name, 'x1': int(coords[0]), 'y1': int(coords[1]),
                    'x2': int(coords[2]), 'y2': int(coords[3]), 'conf': conf
                })
        return detections
    
    def _get_box_visibility(self, bboxes, frame_shape):
        frame_h, frame_w = frame_shape[:2]
        x1, y1, x2, y2 = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
        
        total_area = (x2 - x1) * (y2 - y1)
        total_area[total_area <= 0] = 1e-6

        visible_x1, visible_y1 = np.maximum(x1, 0), np.maximum(y1, 0)
        visible_x2, visible_y2 = np.minimum(x2, frame_w), np.minimum(y2, frame_h)
        
        visible_w = np.maximum(0, visible_x2 - visible_x1)
        visible_h = np.maximum(0, visible_y2 - visible_y1)
        visible_area = visible_w * visible_h
        return visible_area / total_area

    def _calculate_iou(self, bboxes1, bboxes2):
        """Calculates IoU for two sets of bounding boxes using PyTorch tensors."""
        # Broadcasting to get intersection coordinates
        xA = torch.maximum(bboxes1[:, 0].unsqueeze(1), bboxes2[:, 0])
        yA = torch.maximum(bboxes1[:, 1].unsqueeze(1), bboxes2[:, 1])
        xB = torch.minimum(bboxes1[:, 2].unsqueeze(1), bboxes2[:, 2])
        yB = torch.minimum(bboxes1[:, 3].unsqueeze(1), bboxes2[:, 3])

        interArea = torch.clamp(xB - xA, min=0) * torch.clamp(yB - yA, min=0)

        boxAArea = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
        boxBArea = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])

        iou = interArea / (boxAArea.unsqueeze(1) + boxBArea - interArea + 1e-6)
        return iou

    def _calculate_iou_numpy(self, bboxes1, bboxes2):
        """A NumPy version for CPU-bound occlusion checks."""
        xA = np.maximum(bboxes1[:, 0][:, np.newaxis], bboxes2[:, 0])
        yA = np.maximum(bboxes1[:, 1][:, np.newaxis], bboxes2[:, 1])
        xB = np.minimum(bboxes1[:, 2][:, np.newaxis], bboxes2[:, 2])
        yB = np.minimum(bboxes1[:, 3][:, np.newaxis], bboxes2[:, 3])
        interArea = np.maximum(0, xB - xA) * np.maximum(0, yB - yA)
        boxAArea = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
        boxBArea = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
        return interArea / (boxAArea[:, np.newaxis] + boxBArea - interArea + 1e-6)
    
    def _create_kalman_filter(self):
        """
        Create the 8-state / 4-measurement Kalman filter used per track.

        The transition matrix adds state entries 4..7 onto entries 0..3 at
        every step (a constant-velocity model), and the measurement matrix
        observes entries 0..3 directly. Callers in this class fill the state
        as ``(cx, cy, w, h, 0, 0, 0, 0)``.

        Returns:
            The configured ``cv2.KalmanFilter``.
        """
        kf = cv2.KalmanFilter(8, 4)

        # Identity plus ones on the 4th super-diagonal: value_i += rate_i.
        kf.transitionMatrix = np.eye(8, dtype=np.float32) + np.eye(8, k=4, dtype=np.float32)
        # Only the first four state entries are measured.
        kf.measurementMatrix = np.eye(4, 8, dtype=np.float32)

        kf.processNoiseCov = 0.03 * np.eye(8, dtype=np.float32)
        kf.processNoiseCov[4:, 4:] *= 10
        kf.measurementNoiseCov = 0.1 * np.eye(4, dtype=np.float32)
        return kf

## VideoPlayer Class

In [None]:
class VideoPlayer:
    """Interactive OpenCV playback window that feeds frames to a tracker.

    Frames come either from a video file or from a directory of images
    (played in sorted filename order). Playback pauses once the second frame
    has been read so the user can choose targets, then every following frame
    is handed to ``tracker.process_frame``.

    States stored in ``self.state``:
        * ``'INITIALIZING'`` - frames are shown untouched until the second
          frame arrives.
        * ``'PAUSED_FOR_SELECTION'`` - the last frame is frozen; the mouse
          toggles detections or draws manual boxes.
        * ``'PLAYING'`` - frames are processed by the tracker.

    The tracker passed to :meth:`play` must provide ``detect(frame)``
    (an iterable of dicts with ``x1``/``y1``/``x2``/``y2``/``class_name``
    keys), ``process_frame(frame)``, ``add_manual_track(frame, bbox,
    class_name)`` and the attributes ``tracked_objects`` and
    ``next_track_id``.
    """

    # Frame rate assumed when the source cannot report a usable one.
    DEFAULT_FPS = 30
    # A drawn box must exceed this many pixels in both width and height.
    MIN_ROI_SIZE = 5

    def __init__(self, source, target_fps=24, size_multiplier=1.0, window_title="Video Playback"):
        """Open the source and create the display window.

        Args:
            source: path to a video file, or to a directory of images.
            target_fps: playback rate cap. ``0`` uses the source's own FPS,
                ``-1`` disables throttling.
            size_multiplier: window size relative to the frame size.
            window_title: title (and identifier) of the OpenCV window.

        Raises:
            ValueError: the directory holds no supported image files.
            IOError: the first image or the video file cannot be opened.
        """
        self.window_title = window_title
        self.source = source
        self.target_fps = target_fps

        if os.path.isdir(self.source):
            self.source_type = 'images'
            image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')
            self.image_files = sorted(
                os.path.join(self.source, f)
                for f in os.listdir(self.source)
                if f.lower().endswith(image_extensions)
            )
            if not self.image_files:
                raise ValueError("Source directory contains no supported image files.")
            first_frame = cv2.imread(self.image_files[0])
            if first_frame is None:
                raise IOError(f"Could not read the first image: {self.image_files[0]}")
            self.frame_height, self.frame_width = first_frame.shape[:2]
            self.cap = None
            self.original_fps = self.DEFAULT_FPS
        else:
            self.source_type = 'video'
            self.cap = cv2.VideoCapture(self.source)
            if not self.cap.isOpened():
                raise IOError(f"Could not open video file: {self.source}")
            self.frame_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            self.frame_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            reported_fps = self.cap.get(cv2.CAP_PROP_FPS)
            # Some sources report 0 or NaN; using that as target FPS would
            # later divide by zero when computing the frame delay.
            self.original_fps = self._sanitize_fps(reported_fps, self.DEFAULT_FPS)
            if self.original_fps != reported_fps:
                print(f"Source reported an unusable FPS ({reported_fps}); assuming {self.original_fps}.")

        if self.target_fps == 0:
            self.target_fps = self.original_fps
            print(f"Target FPS set to 0. Using original video FPS: {self.target_fps:.2f}")

        # Scale fonts and layout relative to 1080p, with a floor of 0.5.
        self.ui_scale_factor = max(0.5, min(self.frame_height, 2200.0) / 1080.0)

        self.total_processing_time = 0.0
        self.processed_frame_count = 0
        self.state = 'INITIALIZING'
        self.selectable_detections, self.user_selections = [], []
        self.is_drawing_roi, self.show_help = False, True
        self.roi_start_point, self.roi_end_point, self.new_manual_box = None, None, None

        # Class IDs offered when labelling a manually drawn box.
        self.YOLO_CLASSES = {
            0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
            5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
            10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
            14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
            20: 'other'
        }

        cv2.namedWindow(self.window_title, cv2.WINDOW_NORMAL)
        cv2.resizeWindow(
            self.window_title,
            max(1, int(self.frame_width * size_multiplier)),
            max(1, int(self.frame_height * size_multiplier)),
        )

        cv2.setMouseCallback(self.window_title, self._mouse_callback)
        print("--- Video Player Initialized for Interactive Tracking ---")

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _sanitize_fps(reported_fps, fallback):
        """Return ``reported_fps`` if it is a positive number, else ``fallback``."""
        # ``> 0`` is False for 0, negatives and NaN alike.
        return reported_fps if reported_fps > 0 else fallback

    @staticmethod
    def _selection_bbox(sel):
        """Return the (x1, y1, x2, y2) box of a selection dict.

        Selections either carry a ``'bbox'`` entry or the separate
        ``x1``/``y1``/``x2``/``y2`` keys of a detection.
        """
        bbox = sel.get('bbox')
        # Explicit emptiness test instead of ``or`` so array-like boxes work.
        if bbox is None or len(bbox) == 0:
            return (sel['x1'], sel['y1'], sel['x2'], sel['y2'])
        return bbox

    def _compute_wait_ms(self, processing_time):
        """Return the ``cv2.waitKey`` delay (ms) for the current state.

        While playing with a positive ``target_fps`` the delay pads the frame
        to the target duration. The result is never below 1 because
        ``waitKey(0)`` would block until a key is pressed.
        """
        if self.state == 'PLAYING' and self.target_fps > 0:
            remaining = 1.0 / self.target_fps - processing_time
            if remaining > 0:
                return max(1, int(remaining * 1000))
            return 1
        if self.state == 'PAUSED_FOR_SELECTION':
            return 20
        return 1

    def _read_frame(self, frame_idx):
        """Return the next frame, or ``None`` when the source is exhausted or unreadable."""
        if self.source_type == 'video':
            ok, frame = self.cap.read()
            return frame if ok else None
        if self.source_type == 'images' and frame_idx < len(self.image_files):
            return cv2.imread(self.image_files[frame_idx])
        return None

    # ------------------------------------------------------------ mouse input

    def _finish_roi(self):
        """Turn the finished drag into ``self.new_manual_box`` if it is large enough."""
        self.is_drawing_roi = False
        start, end = self.roi_start_point, self.roi_end_point
        self.roi_start_point, self.roi_end_point = None, None
        if not (start and end):
            return
        # Dragging past the window edge can yield out-of-frame coordinates.
        xs = sorted(min(max(v, 0), self.frame_width) for v in (start[0], end[0]))
        ys = sorted(min(max(v, 0), self.frame_height) for v in (start[1], end[1]))
        if xs[1] - xs[0] > self.MIN_ROI_SIZE and ys[1] - ys[0] > self.MIN_ROI_SIZE:
            self.new_manual_box = (xs[0], ys[0], xs[1], ys[1])

    def _toggle_selection_at(self, x, y):
        """Deselect the top-most selection under (x, y), else select a detection there."""
        for i in reversed(range(len(self.user_selections))):
            bbox = self._selection_bbox(self.user_selections[i])
            if bbox[0] < x < bbox[2] and bbox[1] < y < bbox[3]:
                removed_item = self.user_selections.pop(i)
                # Only detector boxes go back to the selectable pool.
                if 'x1' in removed_item:
                    self.selectable_detections.append(removed_item)
                return
        for i in reversed(range(len(self.selectable_detections))):
            det = self.selectable_detections[i]
            if det['x1'] < x < det['x2'] and det['y1'] < y < det['y2']:
                self.user_selections.append(self.selectable_detections.pop(i))
                return

    def _mouse_callback(self, event, x, y, flags, param):
        """OpenCV mouse handler; active only while paused for selection.

        Left-drag draws a manual box, right-click toggles the box under the
        cursor between selected and selectable.
        """
        if self.state != 'PAUSED_FOR_SELECTION':
            return

        if event == cv2.EVENT_LBUTTONDOWN:
            self.is_drawing_roi = True
            self.roi_start_point, self.roi_end_point = (x, y), (x, y)
        elif event == cv2.EVENT_MOUSEMOVE:
            if self.is_drawing_roi:
                self.roi_end_point = (x, y)
        elif event == cv2.EVENT_LBUTTONUP:
            if self.is_drawing_roi:
                self._finish_roi()
        elif event == cv2.EVENT_RBUTTONDOWN:
            self._toggle_selection_at(x, y)

    # ---------------------------------------------------------------- drawing

    def _draw_pause_menu(self, frame):
        """Return ``frame`` with the translucent help banner drawn at the top."""
        s = self.ui_scale_factor
        bg_height = int(240 * s)
        title_scale, head_scale, text_scale = 1.8 * s, 1.0 * s, 0.9 * s
        thick_main, thick_sub = max(1, int(3 * s)), max(1, int(2 * s))
        white = (255, 255, 255)

        overlay = frame.copy()
        cv2.rectangle(overlay, (0, 0), (frame.shape[1], bg_height), (0, 0, 0), -1)
        frame = cv2.addWeighted(overlay, 0.7, frame, 0.3, 0)

        # (text, x, y, font, scale, colour, thickness)
        lines = (
            ("PAUSED - SELECTION MODE", 25, 60, cv2.FONT_HERSHEY_TRIPLEX, title_scale, (0, 255, 255), thick_main),
            ("Mouse Controls:", 25, 115, cv2.FONT_HERSHEY_SIMPLEX, head_scale, white, thick_main),
            ("- Left-Click & Drag: Draw a new box to track", 35, 145, cv2.FONT_HERSHEY_SIMPLEX, text_scale, white, thick_sub),
            ("- Right-Click: Select (Red) / Deselect (Green)", 35, 170, cv2.FONT_HERSHEY_SIMPLEX, text_scale, white, thick_sub),
            ("Keyboard: C: Confirm | H: Toggle Help | Space: Pause | Q: Quit", 25, 210, cv2.FONT_HERSHEY_SIMPLEX, text_scale, white, thick_sub),
        )
        for text, x, y, font, scale, color, thickness in lines:
            cv2.putText(frame, text, (int(x * s), int(y * s)), font, scale, color, thickness)
        return frame

    def _get_numeric_input(self, frame):
        """Show a modal class-ID prompt over ``frame`` and return the chosen ID.

        Digits extend the input, Backspace deletes, Enter confirms and Esc
        cancels.

        Returns:
            The selected key of ``self.YOLO_CLASSES``, or ``None`` on Esc.
        """
        s = self.ui_scale_factor
        title_scale, text_scale = 1.8 * s, 1.2 * s
        thick_main, thick_sub = max(1, int(4 * s)), max(1, int(3 * s))
        x_start, y_start, y_step = int(50 * s), int(120 * s), int(45 * s)
        column_width = int(420 * s)
        bottom_limit = frame.shape[0] - 30

        # The dimmed background does not change between key presses.
        blackout = frame.copy()
        cv2.rectangle(blackout, (0, 0), (frame.shape[1], frame.shape[0]), (0, 0, 0), -1)
        background = cv2.addWeighted(blackout, 0.85, frame, 0.15, 0)

        num_input = ""
        while True:
            canvas = background.copy()
            # num_input only ever holds digit characters, so int() is safe.
            highlighted_id = int(num_input) if num_input else -1

            cv2.putText(canvas, "Enter Class ID & Press Enter:", (int(50 * s), int(65 * s)),
                        cv2.FONT_HERSHEY_TRIPLEX, title_scale, (0, 255, 255), thick_main)
            x, y = x_start, y_start
            for class_id, name in self.YOLO_CLASSES.items():
                # Wrap into a new column instead of dropping entries that
                # would fall below the bottom of the frame.
                if y >= bottom_limit and y > y_start:
                    x, y = x + column_width, y_start
                is_current = class_id == highlighted_id
                color = (0, 255, 0) if is_current else (255, 255, 255)
                thickness = thick_main if is_current else thick_sub
                cv2.putText(canvas, f"{class_id}: {name}", (x, y),
                            cv2.FONT_HERSHEY_SIMPLEX, text_scale, color, thickness)
                y += y_step

            cv2.imshow(self.window_title, canvas)
            # Mask like play() does: some backends set high bits on key codes.
            key = cv2.waitKey(0) & 0xFF
            if key in (13, 10):  # Enter (CR or LF)
                if num_input and int(num_input) in self.YOLO_CLASSES:
                    return int(num_input)
                print("Error: Invalid ID. Please try again.")
                num_input = ""
            elif key in (8, 127):  # Backspace / Delete
                num_input = num_input[:-1]
            elif ord('0') <= key <= ord('9'):
                num_input += chr(key)
            elif key == 27:  # Esc
                return None

    def _draw_selection_overlay(self, display_frame):
        """Draw help, detections, selections and the in-progress box; return the frame."""
        if self.show_help:
            display_frame = self._draw_pause_menu(display_frame)
        for det in self.selectable_detections:
            cv2.rectangle(display_frame, (det['x1'], det['y1']), (det['x2'], det['y2']), (0, 0, 255), 2)
        for sel in self.user_selections:
            # cv2.rectangle needs integer points; boxes may come from the tracker.
            x1, y1, x2, y2 = (int(v) for v in self._selection_bbox(sel)[:4])
            cv2.rectangle(display_frame, (x1, y1), (x2, y2), (0, 255, 0), 3)
        if self.is_drawing_roi and self.roi_start_point and self.roi_end_point:
            cv2.rectangle(display_frame, self.roi_start_point, self.roi_end_point, (255, 255, 0), 2)
        return display_frame

    # --------------------------------------------------------------- main loop

    def play(self, tracker):
        """Run the interactive playback loop until the source ends or Q is pressed.

        Keys: Q quits, H toggles the help banner, Space pauses for selection
        while playing, C confirms the selection and resumes. Resources are
        released even if the tracker raises.

        Args:
            tracker: object providing ``detect``, ``process_frame``,
                ``add_manual_track``, ``tracked_objects`` and
                ``next_track_id``.
        """
        frame_idx = -1  # incremented before each read, so the first frame is index 0
        temp_frame = None

        try:
            while True:
                loop_start_time = time.perf_counter()

                # --- Frame loading: advance only when not paused ---
                if self.state in ('INITIALIZING', 'PLAYING'):
                    frame_idx += 1
                    frame = self._read_frame(frame_idx)
                    if frame is None:
                        break
                    temp_frame = frame.copy()
                else:
                    frame = temp_frame.copy()

                # --- State machine ---
                display_frame = frame.copy()
                if self.state == 'INITIALIZING' and frame_idx >= 1:
                    self.state = 'PAUSED_FOR_SELECTION'
                    self.selectable_detections = tracker.detect(display_frame)
                elif self.state == 'PLAYING':
                    display_frame = tracker.process_frame(display_frame)
                elif self.state == 'PAUSED_FOR_SELECTION':
                    display_frame = self._draw_selection_overlay(display_frame)
                    if self.new_manual_box:
                        class_id = self._get_numeric_input(display_frame)
                        if class_id is not None:
                            self.user_selections.append(
                                {'bbox': self.new_manual_box, 'class_name': self.YOLO_CLASSES[class_id]}
                            )
                        self.new_manual_box = None

                # --- Live FPS and display ---
                processing_time = time.perf_counter() - loop_start_time
                live_fps = 1.0 / processing_time if processing_time > 0 else float('inf')
                if self.state != 'PAUSED_FOR_SELECTION':
                    self.total_processing_time += processing_time
                    self.processed_frame_count += 1

                s = self.ui_scale_factor
                cv2.putText(display_frame, f"FPS: {live_fps:.1f}", (int(20 * s), int(40 * s)),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2 * s, (0, 255, 0), max(1, int(2 * s)))
                cv2.imshow(self.window_title, display_frame)

                # --- Keyboard ---
                key = cv2.waitKey(self._compute_wait_ms(processing_time)) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('h'):
                    self.show_help = not self.show_help
                elif key == 32 and self.state == 'PLAYING':
                    self.state = 'PAUSED_FOR_SELECTION'
                    # Detect on the raw frame, not the annotated display frame.
                    self.selectable_detections = tracker.detect(frame)
                    self.user_selections = list(tracker.tracked_objects)
                elif key == ord('c') and self.state == 'PAUSED_FOR_SELECTION':
                    # Rebuild the tracker's targets from the confirmed selection.
                    tracker.tracked_objects, tracker.next_track_id = [], 0
                    for sel in self.user_selections:
                        tracker.add_manual_track(temp_frame, self._selection_bbox(sel), sel['class_name'])
                    self.selectable_detections, self.user_selections, self.state = [], [], 'PLAYING'

            if self.processed_frame_count > 0 and self.total_processing_time > 0:
                avg_fps = self.processed_frame_count / self.total_processing_time
                print(f"\n--- Playback Finished ---\nAverage Processing FPS: {avg_fps:.2f}\n-------------------------")
        finally:
            self.release()

    def release(self):
        """Release the video capture (if any) and close all OpenCV windows."""
        print("Releasing resources...")
        if self.cap is not None and self.cap.isOpened():
            self.cap.release()
        cv2.destroyAllWindows()
        # Pump the GUI event loop a few times so the windows actually close.
        for _ in range(5):
            cv2.waitKey(1)

## Realtime Playback

In [None]:
# Source to play: a video file, or a directory of images such as the
# commented-out OTB100 sequence below.
# VIDEO_PATH = './assets/OTB100/human2/img/'
VIDEO_PATH = './assets/footage/person4.mp4'
# NOTE(review): MODEL_PATH is not passed to TrackingSystem in this cell;
# presumably it is read elsewhere -- confirm.
MODEL_PATH = './yolo11n.pt'
TARGET_FPS = 0 # 0: standard video fps / -1: max fps
WINDOW_SIZE = .75

# Kept outside the try block so the error path can tell whether a window
# and capture were already created.
player = None
try:
    tracker = TrackingSystem(
        tracker_type='ckcf',
        track_classes=['person', 'car'],
        use_kalman=False,
        base_detect_interval=8000
    )

    player = VideoPlayer(
        source=VIDEO_PATH,
        target_fps=TARGET_FPS,
        size_multiplier=WINDOW_SIZE,
        window_title="Realtime Player"
    )

    player.play(tracker)
except Exception as e:
    # I/O errors already carry a complete message; prefix everything else.
    print(e if isinstance(e, IOError) else f"Error: {e}")
    # A failure after the player was created would otherwise leave the
    # OpenCV window and the capture open.
    if player is not None:
        player.release()