<div align="center">
  <a href="http://www.sharif.edu/">
    <img src="https://cdn.freebiesupply.com/logos/large/2x/sharif-logo-png-transparent.png" alt="SUT Logo" width="140">
  </a>
  
  # Sharif University of Technology
  ### Electrical Engineering Department

  ## Signals and Systems
  #### *Final Project - Spring 2025*
</div>

---

<div align="center">
  <h1>
    <b>Object Tracker</b>
  </h1>
  <p>
    An object tracking system using YOLO for detection and various algorithms (KCF, CSRT, MOSSE) for tracking.
  </p>
</div>

<br>

| Professor                  |
| :-------------------------: |
| Dr. Mohammad Mehdi Mojahedian |

<br>

| Contributors              |
| :-----------------------: |
| **Amirreza Mousavi** |
| **Mahdi Falahi** |
| **Zahra Miladipour** |

---

## Imports

In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
import time
import torch
from scipy.optimize import linear_sum_assignment
import os
import torchreid
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from collections import deque

## Custom KCF

In [None]:
class KCFParams:
    """
    Plain settings container read by the KCF tracker.

    Every value is an ordinary instance attribute, so callers can create an
    instance and overwrite individual fields before handing it to a tracker.
    """
    def __init__(self):
        # Detection / kernel settings: minimum accepted response peak, Gaussian
        # kernel bandwidth, and the regulariser added to the kernel spectrum.
        self.detect_thresh, self.sigma, self.lambda_ = 0.1, 0.1, 0.0001

        # Model adaptation: blending weight for the running template/filter and
        # the factor that scales sqrt(w * h) into the target-label bandwidth.
        self.interp_factor, self.output_sigma_factor = 0.1, 1.0 / 16.0

        # Work on a half-resolution image when the box area exceeds this limit.
        self.resize, self.max_patch_size = True, 80 * 80

        # Keep filter numerator/denominator separately; optionally shift the
        # correlation map by half the window size.
        self.split_coeff, self.wrap_kernel = True, False

        # Descriptor names for the uncompressed and the PCA-compressed groups.
        self.desc_npca, self.desc_pca = 'GRAY', 'CN'

        # PCA compression: on/off switch, number of kept components, and the
        # update rate of the running covariance.
        self.compress_feature, self.compressed_size = True, 2
        self.pca_learning_rate = 0.15

class TrackerKCF:
    """
    Python implementation of the Kernelized Correlation Filter (KCF) tracker.

    Typical use: construct, call ``init(image, boundingBox)`` once with the
    object's box as ``(x, y, w, h)``, then call ``update(image)`` per frame.
    """
    def __init__(self, parameters=None):
        """
        Args:
            parameters: Settings object exposing the ``KCFParams`` attributes.
                When omitted, a fresh ``KCFParams()`` is created for this
                tracker, so instances never share one settings object.
        """
        self.params = parameters if parameters is not None else KCFParams()
        self.roi = None
        self.frame = 0
        self.resize_image = False
        self.output_sigma = 0.0
        self.yf = None
        self.alphaf = None
        self.alphaf_den = None
        self.z = None
        self.x = None
        self.hann = None
        self.hann_cn = None
        self.features_pca = []
        self.features_npca = []
        self.descriptors_pca = []
        self.descriptors_npca = []
        self.proj_mtx = None
        self.old_cov_mtx = None
        self.use_custom_extractor_pca = False
        self.use_custom_extractor_npca = False
        self.extractor_pca = []
        self.extractor_npca = []
        self.X = {}
        self.Z = {}
        self.Zc = {}
        self.k = None
        self.kf = None
        self.kf_lambda = None
        self.new_alphaf = None
        self.response = None
        self._color_names_table = None

    def init(self, image, boundingBox):
        """
        Build the initial appearance model from ``image`` and ``boundingBox``.

        Args:
            image: Frame as a NumPy array; a 2-D array or a single-channel
                3-D array is treated as grayscale.
            boundingBox: ``(x, y, w, h)`` of the target in ``image``.

        Raises:
            ValueError: If no descriptor produced any feature for the box.
        """
        # Effective descriptor lists are derived locally: grayscale input drops
        # 'CN' for this tracker only, without rewriting the parameter object
        # (which the caller may be sharing between trackers).
        desc_pca = self.params.desc_pca
        desc_npca = self.params.desc_npca
        if len(image.shape) == 2 or image.shape[2] == 1:
            desc_pca = desc_pca.replace('CN', '')
            desc_npca = desc_npca.replace('CN', '')

        # Start from a clean slate so that calling init() again does not
        # duplicate descriptors or reuse the previous target's model.
        self.frame = 0
        self.resize_image = False
        self.descriptors_pca = []
        self.descriptors_npca = []
        self.X, self.Z, self.Zc = {}, {}, {}
        self.proj_mtx = None
        self.old_cov_mtx = None
        self.alphaf = None
        self.alphaf_den = None

        x, y, w, h = boundingBox
        img_to_process = image
        if self.params.resize and w * h > self.params.max_patch_size:
            # Large targets are tracked on a half-resolution image.
            self.resize_image = True
            x /= 2.0
            y /= 2.0
            w /= 2.0
            h /= 2.0
            img_to_process = cv2.resize(image, (image.shape[1] // 2, image.shape[0] // 2), interpolation=cv2.INTER_LINEAR_EXACT)

        # Search window: twice the box size, centred on the box.
        roi_x = x - w / 2
        roi_y = y - h / 2
        roi_w = w * 2
        roi_h = h * 2
        self.roi = (roi_x, roi_y, roi_w, roi_h)

        # Stored as the exponent coefficient -1 / (2 * sigma^2) of the label.
        output_sigma_val = np.sqrt(w * h) * self.params.output_sigma_factor
        self.output_sigma = -0.5 / (output_sigma_val * output_sigma_val)

        win_size = (int(roi_w), int(roi_h))
        self.hann = self._createHanningWindow(win_size)
        self.hann_cn = np.dstack([self.hann] * 10)

        # Gaussian regression target over the window, kept in the Fourier domain.
        sz_y, sz_x = int(roi_h), int(roi_w)
        j, i = np.meshgrid(np.arange(sz_x), np.arange(sz_y))
        cy, cx = sz_y // 2, sz_x // 2
        dist_sq = (i - cy + 1)**2 + (j - cx + 1)**2
        y = np.exp(self.output_sigma * dist_sq)
        self.yf = self._fft2(y)

        if 'GRAY' in desc_npca.upper(): self.descriptors_npca.append('GRAY')
        if 'CN' in desc_npca.upper(): self.descriptors_npca.append('CN')
        if self.use_custom_extractor_npca: self.descriptors_npca.append('CUSTOM')

        if 'GRAY' in desc_pca.upper(): self.descriptors_pca.append('GRAY')
        if 'CN' in desc_pca.upper(): self.descriptors_pca.append('CN')
        if self.use_custom_extractor_pca: self.descriptors_pca.append('CUSTOM')

        features_pca, features_npca = self._extractFeatures(img_to_process, self.roi)
        if not features_pca and not features_npca:
            raise ValueError("No valid features extracted during initialization")

        # Z holds the running templates: key 0 = PCA group, key 1 = non-PCA group.
        if features_npca: self.Z[1] = np.dstack(features_npca)
        if features_pca: self.Z[0] = np.dstack(features_pca)

        if self.params.compress_feature and self.Z.get(0) is not None:
            self._updateProjectionMatrix(self.Z[0], self.params.pca_learning_rate, self.params.compressed_size)

        x_parts = [self.Z.get(0), self.Z.get(1)]
        if self.params.compress_feature and x_parts[0] is not None:
            x_parts[0] = self._compress(x_parts[0])
        self.x = np.dstack([p for p in x_parts if p is not None])

        # Train the initial filter from the template's auto-correlation kernel.
        k = self._denseGaussKernel(self.params.sigma, self.x)
        kf = self._fft2(k)
        kf_lambda = kf + self.params.lambda_
        if self.params.split_coeff:
            self.alphaf = self._pixelWiseMult(self.yf, kf)
            self.alphaf_den = self._pixelWiseMult(kf, kf_lambda)
        else:
            self.alphaf = self.yf / (kf_lambda + 1e-10)

    def update(self, image):
        """
        Locate the target in ``image`` and adapt the model.

        Returns:
            ``(True, (left, top, width, height))`` with the box clipped to the
            image, or ``(False, None)`` when no features could be extracted or
            the response peak is below ``params.detect_thresh``.
        """
        img_to_process = image
        if self.resize_image:
            img_to_process = cv2.resize(image, (image.shape[1] // 2, image.shape[0] // 2), interpolation=cv2.INTER_LINEAR_EXACT)

        # Detection is skipped on the first call after init(): the model was
        # just trained, so that call only refreshes it.
        if self.frame > 0:
            features_pca, features_npca = self._extractFeatures(img_to_process, self.roi)
            if not features_pca and not features_npca:
                return False, None

            if features_npca: self.X[1] = np.dstack(features_npca)
            if features_pca: self.X[0] = np.dstack(features_pca)

            # Project both the new patch and the template with the same matrix.
            if self.params.compress_feature and self.X.get(0) is not None:
                self.X[0] = self._compress(self.X[0])
                self.Zc[0] = self._compress(self.Z[0])
            else:
                self.Zc[0] = self.Z.get(0)
            self.Zc[1] = self.Z.get(1)

            x_parts = [self.X.get(0), self.X.get(1)]
            z_parts = [self.Zc.get(0), self.Zc.get(1)]
            x = np.dstack([p for p in x_parts if p is not None])
            z = np.dstack([p for p in z_parts if p is not None])

            k = self._denseGaussKernel(self.params.sigma, x, z)

            kf = self._fft2(k)
            self.response = self._calcResponse(self.alphaf, kf)

            peak_y, peak_x = np.unravel_index(np.argmax(self.response), self.response.shape)

            if self.response[peak_y, peak_x] < self.params.detect_thresh:
                return False, None

            # Peak offset relative to the label's centre gives the displacement.
            disp_y = peak_y - self.response.shape[0] // 2 + 1
            disp_x = peak_x - self.response.shape[1] // 2 + 1

            self.roi = (self.roi[0] + disp_x, self.roi[1] + disp_y, self.roi[2], self.roi[3])

        # Re-extract at the (possibly moved) window and update the model.
        features_pca, features_npca = self._extractFeatures(img_to_process, self.roi)
        if not features_pca and not features_npca:
            return False, None

        if features_npca: self.X[1] = np.dstack(features_npca)
        if features_pca: self.X[0] = np.dstack(features_pca)

        interp = self.params.interp_factor
        if self.X.get(0) is not None:
            self.Z[0] = (1 - interp) * self.Z[0] + interp * self.X[0]
        if self.X.get(1) is not None:
            self.Z[1] = (1 - interp) * self.Z[1] + interp * self.X[1]

        if self.params.compress_feature and self.Z.get(0) is not None:
            self._updateProjectionMatrix(self.Z[0], self.params.pca_learning_rate, self.params.compressed_size)

        x_parts = [self.X.get(0), self.X.get(1)]
        if self.params.compress_feature and x_parts[0] is not None:
            x_parts[0] = self._compress(x_parts[0])

        x = np.dstack([p for p in x_parts if p is not None])

        k = self._denseGaussKernel(self.params.sigma, x)
        kf = self._fft2(k)
        kf_lambda = kf + self.params.lambda_
        if self.params.split_coeff:
            new_alphaf = self._pixelWiseMult(self.yf, kf)
            new_alphaf_den = self._pixelWiseMult(kf, kf_lambda)
        else:
            new_alphaf = self.yf / (kf_lambda + 1e-10)

        # Exponential moving average of the filter coefficients.
        self.alphaf = (1 - interp) * self.alphaf + interp * new_alphaf
        if self.params.split_coeff:
            self.alphaf_den = (1 - interp) * self.alphaf_den + interp * new_alphaf_den

        # The object box is the central half of the search window.
        rx, ry, rw, rh = self.roi
        obj_w, obj_h = rw / 2, rh / 2
        obj_x, obj_y = rx + rw / 2, ry + rh / 2

        if self.resize_image:
            # Map back from the half-resolution image to input coordinates.
            obj_x *= 2
            obj_y *= 2
            obj_w *= 2
            obj_h *= 2

        img_h, img_w = image.shape[:2]
        left = max(0.0, obj_x - obj_w / 2)
        top = max(0.0, obj_y - obj_h / 2)
        right = min(float(img_w), obj_x + obj_w / 2)
        bottom = min(float(img_h), obj_y + obj_h / 2)
        final_bbox = (left, top, right - left, bottom - top)

        self.frame += 1
        return True, final_bbox

    def setFeatureExtractor(self, callback, pca_func=False):
        """
        Register a custom extractor called as ``callback(image, roi)``.

        Args:
            callback: Callable returning a feature array or ``None``.
            pca_func: When true the features join the PCA-compressed group,
                otherwise the uncompressed group.
        """
        if pca_func:
            self.extractor_pca.append(callback)
            self.use_custom_extractor_pca = True
        else:
            self.extractor_npca.append(callback)
            self.use_custom_extractor_npca = True

    def _fft2(self, src):
        """2-D FFT over the first two axes (applied per channel for 3-D input)."""
        return np.fft.fft2(src, axes=(0, 1))

    def _ifft2(self, src):
        """Real part of the 2-D inverse FFT over the first two axes."""
        return np.real(np.fft.ifft2(src, axes=(0, 1)))

    def _pixelWiseMult(self, s1, s2, conjB=False):
        """Element-wise product; ``s2`` is conjugated first when ``conjB`` is true."""
        return s1 * np.conj(s2) if conjB else s1 * s2

    def _sumChannels(self, src):
        """Sum a 3-D array over its channel axis (axis 2)."""
        return np.sum(src, axis=2)

    def _createHanningWindow(self, s):
        """Create a float32 Hann window; ``s`` is forwarded to OpenCV as the window size."""
        return cv2.createHanningWindow(s, cv2.CV_32F)

    def _shift(self, mat, dx, dy):
        """Circularly shift ``mat`` by ``dy`` rows and ``dx`` columns."""
        return np.roll(mat, (dy, dx), axis=(0, 1))

    def _extractFeatures(self, image, roi):
        """
        Collect features for ``roi`` from built-in descriptors and custom extractors.

        Returns:
            ``(features_pca, features_npca)``: two lists of 3-D arrays; either
            list may be empty.
        """
        features_npca = []
        for d in self.descriptors_npca:
            if d != 'CUSTOM':
                f = self._getSubWindow(image, roi, d)
                if f is not None:
                    if f.ndim == 2:
                        f = f[:, :, np.newaxis]
                    features_npca.append(f)
        features_pca = []
        for d in self.descriptors_pca:
            if d != 'CUSTOM':
                f = self._getSubWindow(image, roi, d)
                if f is not None:
                    if f.ndim == 2:
                        f = f[:, :, np.newaxis]
                    features_pca.append(f)

        # Custom extractors return raw features; the Hann window is applied here.
        for extractor in self.extractor_npca:
            feat = extractor(image, roi)
            if feat is not None:
                if feat.ndim == 2:
                    feat = feat[:, :, np.newaxis]
                hann_win = np.dstack([self.hann] * feat.shape[2]) if feat.ndim == 3 and feat.shape[2] > 1 else self.hann
                features_npca.append(feat * hann_win)

        for extractor in self.extractor_pca:
            feat = extractor(image, roi)
            if feat is not None:
                if feat.ndim == 2:
                    feat = feat[:, :, np.newaxis]
                hann_win = np.dstack([self.hann] * feat.shape[2]) if feat.ndim == 3 and feat.shape[2] > 1 else self.hann
                features_pca.append(feat * hann_win)

        return features_pca, features_npca

    def _getSubWindow(self, img, roi, desc):
        """
        Crop ``roi`` from ``img`` and run descriptor ``desc`` ('GRAY' or 'CN') on it.

        Parts of the window outside the image are filled by replicating the
        border pixels. Returns ``None`` when the window does not overlap the
        image at all or ``desc`` is not a built-in descriptor.
        """
        x, y, w, h = int(roi[0]), int(roi[1]), int(roi[2]), int(roi[3])
        img_h, img_w = img.shape[:2]
        vx, vy = max(0, x), max(0, y)
        rb, bb = min(x + w, img_w), min(y + h, img_h)
        vw, vh = rb - vx, bb - vy
        if vw <= 0 or vh <= 0: return None
        patch = img[vy:vy + vh, vx:vx + vw]
        # Padding needed on each side to restore the full w x h window.
        top, bot = vy - y, (y + h) - bb
        left, rgt = vx - x, (x + w) - rb
        patch = cv2.copyMakeBorder(patch, top, bot, left, rgt, cv2.BORDER_REPLICATE)
        if patch.size == 0: return None
        if desc == 'GRAY': return self._extractGray(patch)
        elif desc == 'CN': return self._extractCN(patch)
        return None

    def _extractGray(self, p):
        """Grayscale feature: intensities mapped to [-0.5, 0.5] and Hann-windowed."""
        if p is None or p.size == 0: return None
        f = cv2.cvtColor(p, cv2.COLOR_BGR2GRAY) if p.ndim > 2 else p
        f = f.astype(np.float32) / 255.0 - 0.5
        return f * self.hann

    def _extractCN(self, p):
        """
        Color-names feature: look up a 10-value vector per pixel and Hann-window it.

        Each channel is quantised to 32 levels (``// 8``) and the three levels
        are combined into one table index.
        """
        if p is None or p.size == 0: return None
        if self._color_names_table is None: self._loadColorNamesTable()
        px = p.reshape(-1, 3).astype(np.int32)
        idx = (px[:, 2] // 8) + 32 * (px[:, 1] // 8) + 1024 * (px[:, 0] // 8)
        f = self._color_names_table[idx].reshape(p.shape[0], p.shape[1], 10).astype(np.float32)
        return f * self.hann_cn

    def _loadColorNamesTable(self, path="colornames.npy"):
        """Load the color-names lookup table from ``path``; re-raise ``IOError`` with guidance."""
        try:
            self._color_names_table = np.load(path)
        except IOError:
            raise IOError(f"Color table '{path}' not found. Ensure the file exists or generate it from the original ColorNames array.")

    def _denseGaussKernel(self, sigma, x, y=None):
        """
        Gaussian kernel correlation of ``x`` with ``y`` (or with itself) for all cyclic shifts.

        The cross-correlation is computed in the Fourier domain and summed over
        channels; squared distances are normalised by ``x.size`` and clamped at
        zero before the exponential.
        """
        if y is None: y = x
        xf = self._fft2(x)
        yf = self._fft2(y)
        nx_sq = np.sum(x**2)
        ny_sq = np.sum(y**2)
        xyf = self._pixelWiseMult(xf, yf, conjB=True)
        xy_sum = self._sumChannels(xyf)
        xy = self._ifft2(xy_sum)
        if self.params.wrap_kernel:
            shift_dy = int(x.shape[0] // 2)
            shift_dx = int(x.shape[1] // 2)
            xy = self._shift(xy, shift_dx, shift_dy)
        d = (nx_sq + ny_sq - 2 * xy) / x.size
        k = np.exp(-np.maximum(0, d) / (sigma**2))
        return k

    def _calcResponse(self, alphaf, kf):
        """
        Spatial response map for kernel spectrum ``kf``.

        With ``split_coeff`` the product is additionally divided by the stored
        denominator ``self.alphaf_den`` (plus a small epsilon).
        """
        if not self.params.split_coeff:
            spec = self._pixelWiseMult(alphaf, kf)
            return self._ifft2(spec)
        else:
            spec = self._pixelWiseMult(alphaf, kf)
            response_spec = spec / (self.alphaf_den + 1e-10)
            return self._ifft2(response_spec)

    def _updateProjectionMatrix(self, src, pca_rate, compressed_sz):
        """
        Refresh the PCA projection ``self.proj_mtx`` from feature map ``src``.

        The channel covariance of ``src`` is blended into the running
        covariance with weight ``pca_rate``; the leading ``compressed_sz``
        singular vectors become the projection. The running covariance is then
        blended with the rank-reduced reconstruction.
        """
        num_pixels, num_channels = src.shape[0] * src.shape[1], src.shape[2]
        data = src.reshape(num_pixels, num_channels)
        mean = np.mean(data, axis=0)
        data_nomean = data - mean
        new_cov = (data_nomean.T @ data_nomean) / (num_pixels - 1)
        if self.old_cov_mtx is None: self.old_cov_mtx = new_cov
        self.old_cov_mtx = (1 - pca_rate) * self.old_cov_mtx + pca_rate * new_cov
        u, s, _ = np.linalg.svd(self.old_cov_mtx)
        self.proj_mtx = u[:, :compressed_sz]
        proj_vars = np.diag(s[:compressed_sz])
        stab_term = self.proj_mtx @ proj_vars @ self.proj_mtx.T
        self.old_cov_mtx = (1 - pca_rate) * self.old_cov_mtx + pca_rate * stab_term

    def _compress(self, src):
        """Project the channels of ``src`` with ``self.proj_mtx``, keeping the spatial shape."""
        data = src.reshape(-1, src.shape[2])
        compressed = data @ self.proj_mtx
        return compressed.reshape(src.shape[0], src.shape[1], -1)

## Custom Tracker

In [None]:
import cv2
import numpy as np

class CustomTracker:
    """
    Single-object tracker based on sparse Lucas-Kanade optical flow.

    Corner points inside the box are tracked frame to frame, validated with a
    forward-backward check, and aggregated (median) into a translation and a
    scale change. A Kalman filter smooths centre and scale. When too few
    points remain, a dense Farneback flow estimate is used for that frame.

    Usage: ``init(frame, bbox)`` once, then ``update(frame)`` per frame.
    """
    def __init__(self, psr_threshold=3.5, filter_alpha=0.025, scale_lr=0.02, lambda_trust=0.001):
        # Parameters not used in this optical flow version, but included for compatibility
        self.PSR_THRESHOLD = psr_threshold
        self.FILTER_ALPHA = filter_alpha
        self.SCALE_LR = scale_lr
        self.LAMBDA = lambda_trust
        self.p0 = None
        self.old_gray = None
        self.bbox = None
        self.lk_params = dict(winSize=(15, 15),
                              maxLevel=2,
                              criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
        self.feature_params = dict(maxCorners=200,
                                   qualityLevel=0.3,
                                   minDistance=7,
                                   blockSize=7)
        self.max_scale_change = 0.2  # Limit per-frame scale change
        self.fb_error_threshold = 2.0  # Forward-backward error threshold from MedianFlow
        self.kalman = None  # Kalman filter from SORT adaptation
        self.min_points = 10  # Increased for stability
        self.min_scale = 0.8  # Prevent excessive shrinkage (relative to previous frame)

    def _init_kalman(self):
        """Create the 6-state / 3-measurement constant-velocity Kalman filter."""
        # 6D state: [x, y, vx, vy, s, vs] where s is scale, vs is scale velocity
        kalman = cv2.KalmanFilter(6, 3)  # Measure x, y, s
        kalman.transitionMatrix = np.eye(6, dtype=np.float32)
        kalman.transitionMatrix[0, 2] = 1.0  # x += vx
        kalman.transitionMatrix[1, 3] = 1.0  # y += vy
        kalman.transitionMatrix[4, 5] = 1.0  # s += vs
        kalman.measurementMatrix = np.zeros((3, 6), dtype=np.float32)
        kalman.measurementMatrix[0, 0] = 1.0
        kalman.measurementMatrix[1, 1] = 1.0
        kalman.measurementMatrix[2, 4] = 1.0
        kalman.processNoiseCov = np.eye(6, dtype=np.float32) * 0.05  # Increased for more flexibility
        kalman.measurementNoiseCov = np.eye(3, dtype=np.float32) * 0.05  # Lowered to trust measurements more (less lag)
        kalman.errorCovPost = np.eye(6, dtype=np.float32) * 0.1
        return kalman

    @staticmethod
    def _enhance_gray(frame):
        """Return a CLAHE contrast-enhanced grayscale version of ``frame``."""
        if len(frame.shape) == 2:  # Grayscale input
            gray_frame = frame.copy()
        else:
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        return clahe.apply(gray_frame)

    @staticmethod
    def _reject_flow_outliers(new_pts, old_pts):
        """
        Drop point pairs whose motion deviates strongly from the median motion.

        Args:
            new_pts: ``(N, 2)`` array of tracked positions in the new frame.
            old_pts: ``(N, 2)`` array of the matching positions in the old frame.

        Returns:
            ``(new_pts, old_pts)`` restricted to pairs whose deviation from the
            per-axis median displacement is at most two standard deviations of
            those deviations. If that test would reject every pair (for example
            when all points move identically and the spread is zero), the input
            is returned unfiltered instead of an empty set.
        """
        if len(new_pts) == 0:
            return new_pts, old_pts
        flow = new_pts - old_pts
        residual = flow - np.median(flow, axis=0)
        deviations = np.sqrt(np.sum(residual ** 2, axis=1))
        keep = deviations <= 2 * np.std(deviations)
        if not keep.any():
            return new_pts, old_pts
        return new_pts[keep], old_pts[keep]

    def _estimate_scale(self, new_pts, old_pts):
        """
        Estimate the frame-to-frame scale change from pairwise point distances.

        Returns the median ratio of new to old pairwise distances, clipped to
        ``[min_scale, 1 + max_scale_change]``; ``1.0`` when fewer than two
        pairs or no distinct pair is available.
        """
        if len(old_pts) < 2:
            return 1.0
        old_dists = np.linalg.norm(old_pts[:, None] - old_pts[None, :], axis=2)
        new_dists = np.linalg.norm(new_pts[:, None] - new_pts[None, :], axis=2)
        distinct = old_dists > 0  # skips the diagonal and coincident points
        ratios = new_dists[distinct] / old_dists[distinct]
        scale = np.median(ratios) if len(ratios) > 0 else 1.0
        return np.clip(scale, self.min_scale, 1 + self.max_scale_change)

    def _detect_grid_features(self):
        """
        Detect corners in a 4x4 grid over the current box of ``self.old_gray``.

        Returns a list of ``(K, 1, 2)`` float32 arrays in frame coordinates
        (possibly empty). Tiles that fall outside the image are skipped.
        """
        x, y, w, h = self.bbox
        roi_gray = self.old_gray[max(0, y):y+h, max(0, x):x+w]
        grid_size = 4
        step_y, step_x = max(1, h // grid_size), max(1, w // grid_size)  # Handle small ROIs
        found = []
        for gy in range(grid_size):
            for gx in range(grid_size):
                sub_y_start = gy * step_y
                sub_x_start = gx * step_x
                sub_roi = roi_gray[sub_y_start:sub_y_start + step_y, sub_x_start:sub_x_start + step_x]
                if sub_roi.size == 0:
                    # Box extends past the frame: nothing to detect in this tile.
                    continue
                sub_p0 = cv2.goodFeaturesToTrack(sub_roi, **self.feature_params)
                if sub_p0 is not None:
                    sub_p0[:, :, 0] += sub_x_start + max(0, x)
                    sub_p0[:, :, 1] += sub_y_start + max(0, y)
                    found.append(sub_p0)
        return found

    def init(self, frame, bbox):
        """
        Start tracking ``bbox`` (``(x, y, w, h)``, truncated to ints) in ``frame``.

        Returns:
            ``True`` when corner features were found and the Kalman filter was
            initialised; ``False`` when the box yields no features (the Kalman
            filter is not created in that case).
        """
        x, y, w, h = map(int, bbox)
        self.bbox = (x, y, w, h)
        self.old_gray = self._enhance_gray(frame)
        roi_gray = self.old_gray[y:y+h, x:x+w]
        p0 = None
        if roi_gray.size > 0:
            p0 = cv2.goodFeaturesToTrack(roi_gray, mask=None, **self.feature_params)
        if p0 is None or len(p0) == 0:
            self.p0 = np.empty((0, 1, 2), dtype=np.float32)
            return False
        # Corner coordinates are relative to the ROI; move them to frame coordinates.
        p0[:, :, 0] += x
        p0[:, :, 1] += y
        self.p0 = p0

        # Initialize Kalman
        self.kalman = self._init_kalman()
        cx, cy = x + w / 2, y + h / 2
        self.kalman.statePost = np.array([cx, cy, 0, 0, 1.0, 0], dtype=np.float32)  # Initial scale=1.0
        return True

    def update(self, frame):
        """
        Track the object into ``frame``.

        Returns:
            ``(True, bbox)`` with the updated integer ``(x, y, w, h)`` box, or
            ``(False, bbox)`` with the last known box when the tracker is not
            initialised or too few points survive tracking/validation.
        """
        if self.kalman is None or self.old_gray is None or self.p0 is None:
            # init() was never called or did not find any features.
            return False, self.bbox

        frame_gray = self._enhance_gray(frame)

        # Advance the filter every frame so correct() below always works on a
        # fresh prediction, including on the dense-flow fallback path.
        self.kalman.predict()

        validated_new = np.empty((0, 2), dtype=np.float32)
        if len(self.p0) < self.min_points:
            # Dense flow fallback
            # NOTE(review): medians are taken over the whole frame, not the box.
            flow = cv2.calcOpticalFlowFarneback(self.old_gray, frame_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
            dx = np.median(flow[..., 0])
            dy = np.median(flow[..., 1])
            # Approximate scale from flow divergence (trace of Jacobian)
            dx_dx = np.gradient(flow[..., 0])[1]
            dy_dy = np.gradient(flow[..., 1])[0]
            divergence = dx_dx + dy_dy
            meas_scale = 1.0 + np.median(divergence) * 0.01  # Small adjustment based on divergence
            meas_scale = np.clip(meas_scale, self.min_scale, 1 + self.max_scale_change)
        else:
            # Adjust LK params for small ROI
            roi_area = self.bbox[2] * self.bbox[3]
            if roi_area < 5000:
                self.lk_params['maxLevel'] = 1
                self.lk_params['winSize'] = (10, 10)
                self.feature_params['qualityLevel'] = 0.1  # Lower for more points in low-res

            # Optical flow forward
            p1, st, err = cv2.calcOpticalFlowPyrLK(self.old_gray, frame_gray, self.p0, None, **self.lk_params)

            if p1 is None or np.sum(st) < self.min_points:
                self.p0 = np.empty((0, 1, 2), dtype=np.float32)
                return False, self.bbox

            st_flat = st.ravel() == 1
            good_new = p1[st_flat].reshape(-1, 2)
            good_old = self.p0[st_flat].reshape(-1, 2)

            # Forward-backward validation: track the new points back and keep
            # those that land close to where they started.
            p0r = good_new.reshape(-1, 1, 2)
            p_back, st_back, err_back = cv2.calcOpticalFlowPyrLK(frame_gray, self.old_gray, p0r, None, **self.lk_params)

            if p_back is not None:
                st_back_flat = st_back.ravel() == 1
                p_back_resh = p_back[st_back_flat].reshape(-1, 2)
                fb_errors = np.linalg.norm(good_old[st_back_flat] - p_back_resh, axis=1)
                self.fb_error_threshold = max(1.0, 0.01 * np.mean([self.bbox[2], self.bbox[3]]))  # Adaptive
                valid_submask = fb_errors < self.fb_error_threshold
                valid_mask = np.zeros(len(good_old), dtype=bool)
                valid_mask[st_back_flat] = valid_submask
                validated_new = good_new[valid_mask]
                validated_old = good_old[valid_mask]
            else:
                validated_new = good_new
                validated_old = good_old

            if len(validated_new) < self.min_points:
                return False, self.bbox

            # Additional deviation check
            validated_new, validated_old = self._reject_flow_outliers(validated_new, validated_old)

            # Median aggregation
            dx = np.median(validated_new[:, 0] - validated_old[:, 0])
            dy = np.median(validated_new[:, 1] - validated_old[:, 1])
            meas_scale = self._estimate_scale(validated_new, validated_old)

        # Measured values
        x, y, w, h = self.bbox
        meas_cx = x + w / 2 + dx
        meas_cy = y + h / 2 + dy

        # Kalman correct with blend for lag reduction
        measurement = np.array([meas_cx, meas_cy, meas_scale], dtype=np.float32)
        corrected = self.kalman.correct(measurement)
        corr_cx, corr_cy, _, _, corr_scale, _ = corrected.flatten()
        corr_cx = 0.7 * meas_cx + 0.3 * corr_cx  # Blend to reduce lag
        corr_cy = 0.7 * meas_cy + 0.3 * corr_cy
        corr_scale = 0.7 * meas_scale + 0.3 * corr_scale

        # Update bbox with smoothed values
        new_w = max(w * corr_scale, w * self.min_scale)  # Anti-shrinkage
        new_h = max(h * corr_scale, h * self.min_scale)
        new_x = corr_cx - new_w / 2
        new_y = corr_cy - new_h / 2

        # Recentering based on point centroid
        if len(validated_new) > 0:
            centroid_x, centroid_y = np.mean(validated_new, axis=0)
            new_x += (centroid_x - (new_x + new_w / 2)) * 0.5  # Damped adjustment
            new_y += (centroid_y - (new_y + new_h / 2)) * 0.5

        # Clamp to frame bounds
        frame_h, frame_w = frame_gray.shape
        new_x = max(0, min(new_x, frame_w - new_w))
        new_y = max(0, min(new_y, frame_h - new_h))
        self.bbox = (int(new_x), int(new_y), int(new_w), int(new_h))

        # Update points and gray frame
        self.p0 = validated_new.reshape(-1, 1, 2)
        self.old_gray = frame_gray.copy()

        # Re-detect features with grid for symmetry
        if len(self.p0) < 20:
            new_p0 = self._detect_grid_features()
            if new_p0:
                new_p0 = np.vstack(new_p0)
                self.p0 = np.vstack((self.p0, new_p0)) if len(self.p0) > 0 else new_p0

        return True, self.bbox

## ObjectTracker Wrapper Class

In [None]:
class ObjectTracker:
    """
    Multi-object tracker combining periodic YOLO detection, a short-term
    visual tracker per object, appearance embeddings (Re-ID) and an optional
    Kalman filter.

    Each track is a dict with the keys written by ``add_manual_track``:
    ``id``, ``class_name``, ``bbox`` as ``(x1, y1, x2, y2)``, ``lost_frames``,
    ``state`` (``'CONFIRMED'``, ``'OCCLUDED'`` or ``'LOST'``),
    ``embedding_gallery`` (bounded deque), ``reid_config``, ``tracker`` and,
    when ``use_kalman`` is set, ``kf``.

    Each detection is a dict with ``class_name``, ``x1``, ``y1``, ``x2``,
    ``y2``, ``conf`` and, once embeddings were extracted, ``embedding``.
    """
    def __init__(self, model='./yolo11n.pt', tracker_type='CSRT', base_detect_interval=24, conf_threshold=0.5, 
                 max_lost_frames=30, lost_track_buffer=60,
                 use_kalman=True, track_classes=None, 
                 appearance_weight=0.6, match_cost_threshold=0.85, 
                 reid_cost_threshold=0.3, occlusion_iou_threshold=0.2,
                 iou_gating_threshold=0.1, **kwargs):
        """
        Args:
            model: Path of the YOLO weights handed to ``YOLO(...)``.
            tracker_type: Case-insensitive key of ``tracker_constructors``.
            base_detect_interval: Detection/matching runs every this many frames.
            conf_threshold: Detections at or below this confidence are dropped.
            max_lost_frames: ``lost_frames`` value at which a track is moved
                to the lost-track buffer.
            lost_track_buffer: Maximum number of lost tracks kept for Re-ID.
            use_kalman: Enables the per-track Kalman filter.
            track_classes: Class names to keep; empty/None keeps every class.
            appearance_weight: Weight of the appearance cost; IoU cost gets
                ``1 - appearance_weight``.
            match_cost_threshold: Maximum cost for an active-track match.
            reid_cost_threshold: Maximum appearance cost to revive a lost track.
            occlusion_iou_threshold: IoU with a matched track above which an
                unmatched track is marked ``'OCCLUDED'``.
            iou_gating_threshold: Pairs with a lower IoU are never matched.
            **kwargs: ``psr_threshold``, ``filter_alpha``, ``scale_lr`` and
                ``lambda_trust`` are forwarded to the ``'CUSTOM'`` tracker.

        Raises:
            ValueError: If ``tracker_type`` is not a known tracker key.
        """
        # --- Tracker Constructors ---
        # Validated first so a typo fails before the models are loaded.
        self.tracker_constructors = {
            'CSRT': cv2.legacy.TrackerCSRT_create, 'KCF': cv2.legacy.TrackerKCF_create,
            'MOSSE': cv2.legacy.TrackerMOSSE_create, 'MEDIAN_FLOW': cv2.legacy.TrackerMedianFlow_create,
            'CUSTOM': CustomTracker, "CKCF": TrackerKCF
        }
        tracker_key = tracker_type.upper()
        if tracker_key not in self.tracker_constructors:
            raise ValueError(f"Invalid tracker type: {tracker_type}. Choose from {list(self.tracker_constructors.keys())}")
        self.tracker_type = tracker_key

        # --- Check Cuda Presence ---
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f'object tracker running on {self.device}')

        # --- Core Parameters ---
        self.detect_interval = base_detect_interval
        self.conf_threshold = conf_threshold
        self.max_lost_frames = max_lost_frames
        self.track_classes = track_classes if track_classes is not None else []
        self.model = YOLO(model).to(self.device)
        print('--- Yolo loaded successfully ---')
        
        # --- Cost & Matching Parameters ---
        self.appearance_weight = appearance_weight
        self.match_cost_threshold = match_cost_threshold
        self.reid_cost_threshold = reid_cost_threshold
        self.occlusion_iou_threshold = occlusion_iou_threshold
        self.iou_gating_threshold = iou_gating_threshold
        
        # --- State Management ---
        self.frame_idx = 0
        self.tracked_objects = []
        self.lost_tracks = deque(maxlen=lost_track_buffer)
        self.next_track_id = 0
        
        # --- Kalman Filter ---
        self.use_kalman = use_kalman

        # --- Re-ID Models & Warm-up ---
        # One dummy forward pass per configured class so the first real frame
        # does not pay the model start-up cost.
        self.reid_models = self._load_reid_models()
        for config in self.reid_models.values():
            reid_model = config['model']
            dummy_input = torch.randn(1, 3, 256, 128).to(self.device)
            with torch.no_grad():
                reid_model(dummy_input)
        print("--- osnet loaded successfuly ---")

        # --- Tracker Parameters (only the custom tracker takes any) ---
        self.tracker_params = {}
        if self.tracker_type == 'CUSTOM':
            self.tracker_params = {
                'psr_threshold': kwargs.get('psr_threshold', 5.5),
                'filter_alpha': kwargs.get('filter_alpha', 0.025),
                'scale_lr': kwargs.get('scale_lr', 0.02),
                'lambda_trust': kwargs.get('lambda_trust', 0.01)
            }
            print(f"--- Custom tracker configured with params: {self.tracker_params} ---")

        print("--- Object Tracker Initialized ---")

    def _load_reid_models(self):
        """
        Loads the pre-trained Re-ID model and its preprocessing transform.

        Returns:
            Dict mapping class name to ``{'model', 'transform'}``. ``'person'``
            and ``'car'`` share the same OSNet instance and transform.
        """
        models = {}
        person_model = torchreid.models.build_model(name='osnet_x1_0', num_classes=4101, pretrained=False)
        torchreid.utils.load_pretrained_weights(person_model, 'osnet_x1_0_msmt17_256x128.pth')
        person_model.to(self.device).eval()
        person_transform, _ = torchreid.data.transforms.build_transforms(height=256, width=128, is_train=False)
        models['person'] = {'model': person_model, 'transform': person_transform}
        models['car'] = {'model': person_model, 'transform': person_transform}
        return models

    @staticmethod
    def _crop_roi(frame, x1, y1, x2, y2):
        """
        Crops ``frame`` to the box after clamping it to the frame bounds.

        Clamping matters because a negative coordinate would otherwise be
        interpreted by NumPy as an index counted from the end of the axis.

        Returns:
            The cropped view, or None when the clamped box is empty.
        """
        frame_h, frame_w = frame.shape[:2]
        x1, y1 = max(0, int(x1)), max(0, int(y1))
        x2, y2 = min(frame_w, int(x2)), min(frame_h, int(y2))
        if x2 <= x1 or y2 <= y1:
            return None
        return frame[y1:y2, x1:x2]

    def _extract_embedding(self, frame, bbox, track):
        """
        Extracts an L2-normalized feature embedding for a single box.

        Args:
            frame: BGR image the box refers to.
            bbox: ``(x1, y1, x2, y2)`` box.
            track: Track dict; its ``'reid_config'`` supplies model and transform.

        Returns:
            Flat NumPy embedding, or None when the track has no Re-ID config
            or the box does not overlap the frame.
        """
        reid_config = track.get('reid_config')
        if not reid_config: return None
        model, transform = reid_config['model'], reid_config['transform']
        
        roi = self._crop_roi(frame, *bbox)
        if roi is None or roi.size == 0: return None

        roi_rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
        image_tensor = transform(Image.fromarray(roi_rgb)).unsqueeze(0).to(self.device)
        
        with torch.no_grad():
            embedding = model(image_tensor)
        
        return torch.nn.functional.normalize(embedding, p=2, dim=1).cpu().numpy().flatten()

    def _extract_batch_embeddings(self, frame, detections):
        """
        Extracts embeddings for a batch of detections, grouped by class.

        Detections whose class has a Re-ID model and whose box overlaps the
        frame get an ``'embedding'`` key added in place; the others are left
        untouched.
        """
        grouped_dets = {}
        for i, det in enumerate(detections):
            cls_name = det['class_name']
            if cls_name in self.reid_models:
                grouped_dets.setdefault(cls_name, []).append((i, det))

        for cls_name, dets_with_indices in grouped_dets.items():
            reid_config = self.reid_models[cls_name]
            model, transform = reid_config['model'], reid_config['transform']
            
            rois_with_indices = []
            for original_idx, det in dets_with_indices:
                roi = self._crop_roi(frame, det['x1'], det['y1'], det['x2'], det['y2'])
                if roi is not None and roi.size > 0:
                    roi_rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
                    rois_with_indices.append({'roi': Image.fromarray(roi_rgb), 'idx': original_idx})
            
            if not rois_with_indices: continue

            # One forward pass per class instead of one per detection.
            batch_tensor = torch.stack([transform(item['roi']) for item in rois_with_indices]).to(self.device)
            with torch.no_grad():
                batch_embeddings = model(batch_tensor)
            
            normalized_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1).cpu().numpy()

            for item, emb in zip(rois_with_indices, normalized_embeddings):
                detections[item['idx']]['embedding'] = emb

    def process_frame(self, frame):
        """
        Main processing function for each frame.

        Retires tracks that exceeded ``max_lost_frames``, runs prediction and
        the short-term trackers, runs detection/matching every
        ``detect_interval`` frames, and returns an annotated copy of ``frame``.
        """
        self._apply_boundary_conditions(frame.shape)

        newly_lost = [t for t in self.tracked_objects if t['lost_frames'] >= self.max_lost_frames]
        for t in newly_lost:
            t['state'] = 'LOST'
            self.lost_tracks.append(t)
        self.tracked_objects = [t for t in self.tracked_objects if t['lost_frames'] < self.max_lost_frames]

        if self.use_kalman: self._predict_phase()
        self._update_phase(frame)
        
        if self.frame_idx % self.detect_interval == 0:
            self._match_and_update_phase(frame)

        annotated_frame = self._drawing_phase(frame)
        self.frame_idx += 1
        return annotated_frame
    
    def _apply_boundary_conditions(self, frame_shape):
        """Marks tracks that are less than 50% inside the frame as fully lost."""
        if not self.tracked_objects: return
        
        bboxes = np.array([t['bbox'] for t in self.tracked_objects])
        visibility = self._get_box_visibility(bboxes, frame_shape)
        
        for i, track in enumerate(self.tracked_objects):
            if visibility[i] < 0.5:
                # Jump straight to the limit so process_frame retires the track.
                track['lost_frames'] = self.max_lost_frames

    def _predict_phase(self):
        """Advances every track's Kalman filter and sets its box from the state."""
        for obj in self.tracked_objects:
            kf = obj['kf']
            kf.predict()
            # The state is a column vector; flatten it so plain scalars are
            # unpacked instead of 1-element arrays.
            cx, cy, w, h = (float(v) for v in np.asarray(kf.statePost).ravel()[:4])
            obj['bbox'] = (int(cx - w/2), int(cy - h/2), int(cx + w/2), int(cy + h/2))

    def _update_phase(self, frame):
        """Runs each non-occluded track's short-term tracker on ``frame``."""
        for obj in self.tracked_objects:
            if obj['state'] == 'OCCLUDED': continue
            success, bbox = obj['tracker'].update(frame)
            if success:
                # Trackers report (x, y, w, h); tracks store corner coordinates.
                x1, y1, w, h = [int(v) for v in bbox]
                obj['bbox'] = (x1, y1, x1 + w, y1 + h)
                obj['lost_frames'] = 0 # Reset lost counter on successful short-term track
                if self.use_kalman:
                    measurement = np.array([x1 + w/2, y1 + h/2, w, h], dtype=np.float32)
                    obj['kf'].correct(measurement)
            else:
                obj['lost_frames'] += 1

    def _match_and_update_phase(self, frame):
        """
        Runs the detector and associates detections with tracks.

        Stage 1 assigns detections to active tracks with the combined
        appearance/IoU cost. Stage 2 tries to revive buffered lost tracks from
        the remaining detections using appearance only. Does nothing when the
        detector returns no detections.
        """
        detections = self.detect(frame)
        if not detections: return
        self._extract_batch_embeddings(frame, detections)
        
        # --- Stage 1: Match Active Tracks with Detections ---
        matched_det_indices = set()
        if self.tracked_objects:
            cost_matrix = self._build_cost_matrix(self.tracked_objects, detections)
            track_indices, det_indices = linear_sum_assignment(cost_matrix)
            
            matched_track_indices = set()
            for t_idx, d_idx in zip(track_indices, det_indices):
                # The assignment is always complete; the threshold rejects
                # pairs that were only paired for lack of alternatives.
                if cost_matrix[t_idx, d_idx] < self.match_cost_threshold:
                    self._update_matched_track(frame, self.tracked_objects[t_idx], detections[d_idx])
                    matched_track_indices.add(t_idx)
                    matched_det_indices.add(d_idx)
            
            unmatched_track_indices = set(range(len(self.tracked_objects))) - matched_track_indices
            for t_idx in unmatched_track_indices:
                self._handle_unmatched_track(t_idx, matched_track_indices)
        
        # --- Stage 2: Re-identify Lost Tracks with Unmatched Detections ---
        unmatched_dets = [d for i, d in enumerate(detections) if i not in matched_det_indices]
        if unmatched_dets and self.lost_tracks:
            reid_cost_matrix = self._build_cost_matrix(list(self.lost_tracks), unmatched_dets, only_appearance=True)
            lost_indices, reid_det_indices = linear_sum_assignment(reid_cost_matrix)

            revived_lost_indices = set()
            for lt_idx, d_idx in zip(lost_indices, reid_det_indices):
                if reid_cost_matrix[lt_idx, d_idx] < self.reid_cost_threshold:
                    revived_track = self.lost_tracks[lt_idx]
                    detection = unmatched_dets[d_idx]
                    
                    self._update_matched_track(frame, revived_track, detection)
                    self.tracked_objects.append(revived_track)
                    revived_lost_indices.add(lt_idx)
            
            self.lost_tracks = deque([t for i, t in enumerate(self.lost_tracks) if i not in revived_lost_indices], maxlen=self.lost_tracks.maxlen)
    
    def _build_cost_matrix(self, tracks, detections, only_appearance=False):
        """
        Builds the track-by-detection cost matrix with vectorized tensor ops.

        Row ``i`` always corresponds to ``tracks[i]`` and column ``j`` to
        ``detections[j]``. A track with an empty gallery or a detection without
        an embedding is represented by a zero vector, which yields the neutral
        appearance cost 1.0 for every pair it takes part in.

        Args:
            tracks: Track dicts (``embedding_gallery``, ``bbox``, ``class_name``).
            detections: Detection dicts (``x1``..``y2``, ``class_name``,
                optional ``embedding``).
            only_appearance: When True, IoU cost and IoU gating are skipped.

        Returns:
            ``(len(tracks), len(detections))`` NumPy array. Pairs of different
            classes, and (unless ``only_appearance``) pairs below the IoU gate,
            are set to 1e6.
        """
        num_tracks = len(tracks)
        num_dets = len(detections)
        if num_tracks == 0 or num_dets == 0:
            return np.empty((num_tracks, num_dets))

        # --- Collect the newest embedding per item, keeping positions aligned ---
        track_vecs = [t['embedding_gallery'][-1] if t['embedding_gallery'] else None for t in tracks]
        det_vecs = [d.get('embedding') for d in detections]
        emb_dim = next((len(v) for v in track_vecs + det_vecs if v is not None), 0)

        # --- Vectorized Appearance Cost (Cosine Distance) ---
        if emb_dim == 0:
            # Nothing to compare: every pair gets the neutral appearance cost.
            app_cost = torch.ones((num_tracks, num_dets), device=self.device, dtype=torch.float32)
        else:
            placeholder = np.zeros(emb_dim, dtype=np.float32)
            track_embeddings = torch.tensor(
                np.array([placeholder if v is None else v for v in track_vecs]),
                device=self.device, dtype=torch.float32
            )
            det_embeddings = torch.tensor(
                np.array([placeholder if v is None else v for v in det_vecs]),
                device=self.device, dtype=torch.float32
            )
            # Embeddings are L2-normalized, so the dot product is the cosine
            # similarity and 1 - similarity is the cosine distance.
            app_cost = 1 - (track_embeddings @ det_embeddings.T)

        # --- Vectorized Class Mismatch Mask ---
        track_classes = np.array([t['class_name'] for t in tracks])
        det_classes = np.array([d['class_name'] for d in detections])
        mismatch_mask = torch.tensor(track_classes[:, None] != det_classes, device=self.device)

        if only_appearance:
            # A track keeps its class label, so it must never be revived by a
            # detection of another class.
            app_cost[mismatch_mask] = 1e6
            return app_cost.cpu().numpy()

        # --- Vectorized IoU Cost ---
        track_bboxes = torch.tensor([t['bbox'] for t in tracks], device=self.device, dtype=torch.float32)
        det_bboxes = torch.tensor(
            [[d['x1'], d['y1'], d['x2'], d['y2']] for d in detections], 
            device=self.device, dtype=torch.float32
        )
        iou_matrix = self._calculate_iou(track_bboxes, det_bboxes)
        iou_cost = 1 - iou_matrix

        # --- Combine Costs ---
        cost_matrix = (self.appearance_weight * app_cost) + ((1 - self.appearance_weight) * iou_cost)
        cost_matrix[mismatch_mask] = 1e6  # Invalidate non-matching classes
        cost_matrix[iou_matrix < self.iou_gating_threshold] = 1e6 # Apply IoU gating

        return cost_matrix.cpu().numpy()

    def _update_matched_track(self, frame, track, det):
        """
        Re-anchors ``track`` on the matched detection ``det``.

        The track gets the detection's box, a fresh short-term tracker
        initialized on it, the detection's embedding appended to its gallery,
        and (with Kalman enabled) a corrected filter with zeroed velocities.
        """
        x1, y1, x2, y2 = det['x1'], det['y1'], det['x2'], det['y2']
        w, h = x2 - x1, y2 - y1
        
        track['bbox'] = (x1, y1, x2, y2)
        track['lost_frames'] = 0
        track['state'] = 'CONFIRMED'
            
        new_cv_tracker = self.tracker_constructors[self.tracker_type](**self.tracker_params)
        new_cv_tracker.init(frame, (x1, y1, w, h))
        track['tracker'] = new_cv_tracker

        if 'embedding' in det and det['embedding'] is not None:
            track['embedding_gallery'].append(det['embedding'])
            
        if self.use_kalman:
            kf = track['kf']
            measurement = np.array([x1 + w/2, y1 + h/2, w, h], dtype=np.float32)
            kf.correct(measurement)
            # Zero the velocity part of the state. The arrays are assigned
            # back instead of mutated in place: the cv2 property may return a
            # copy (NOTE(review): confirm), and assigning works in both cases.
            for state_name in ('statePost', 'statePre'):
                state = np.array(getattr(kf, state_name), dtype=np.float32)
                state[4:] = 0
                setattr(kf, state_name, state)

    def _handle_unmatched_track(self, t_idx, matched_track_indices):
        """
        Handles an active track that received no detection this round.

        If it overlaps any matched track by more than
        ``occlusion_iou_threshold`` it is marked ``'OCCLUDED'``; otherwise its
        lost counter is increased and an earlier ``'OCCLUDED'`` state is reset
        to ``'CONFIRMED'``.
        """
        track = self.tracked_objects[int(t_idx)]
        # Check for occlusion against currently tracked (matched) objects
        if matched_track_indices:
            matched_bboxes = np.array([self.tracked_objects[int(m_idx)]['bbox'] for m_idx in matched_track_indices])
            ious = self._calculate_iou_numpy(np.array([track['bbox']]), matched_bboxes)
            if np.max(ious) > self.occlusion_iou_threshold:
                track['state'] = 'OCCLUDED'
                return

        track['lost_frames'] += 1
        if track['state'] == 'OCCLUDED': track['state'] = 'CONFIRMED'

    def add_manual_track(self, frame, bbox, class_name):
        """
        Creates a new track from a user-supplied box.

        Args:
            frame: BGR image the box refers to.
            bbox: ``(x1, y1, x2, y2)`` box.
            class_name: Must be a key of ``reid_models``.

        Prints a warning and adds nothing when the class has no Re-ID model or
        the box has a non-positive width or height.
        """
        if class_name not in self.reid_models:
            print(f"Warning: No Re-ID model for class '{class_name}'.")
            return

        x1, y1, x2, y2 = [int(c) for c in bbox]
        w = x2 - x1
        h = y2 - y1
        if w <= 0 or h <= 0:
            print("Warning: Invalid bounding box dimensions.")
            return

        new_track = {
            'id': self.next_track_id, 'class_name': class_name,
            'bbox': (x1, y1, x2, y2), 'lost_frames': 0, 'state': 'CONFIRMED', 
            'embedding_gallery': deque(maxlen=20),
            'reid_config': self.reid_models[class_name]
        }

        tracker = self.tracker_constructors[self.tracker_type](**self.tracker_params)
        tracker.init(frame, (x1, y1, w, h))
        new_track['tracker'] = tracker

        embedding = self._extract_embedding(frame, new_track['bbox'], new_track)
        if embedding is not None: new_track['embedding_gallery'].append(embedding)

        if self.use_kalman:
            new_track['kf'] = self._create_kalman_filter()
            cx, cy = x1 + w/2, y1 + h/2
            # State layout: center, size, then their four velocities.
            new_track['kf'].statePost = np.array([cx, cy, w, h, 0, 0, 0, 0], dtype=np.float32)
        
        self.tracked_objects.append(new_track)
        self.next_track_id += 1
                                
    def _drawing_phase(self, frame):
        """
        Returns a copy of ``frame`` with box and ``"<class> <id>"`` label drawn
        for every track that is not occluded and at least 70% inside the frame.
        """
        frame_copy = frame.copy()
        if not self.tracked_objects: return frame_copy

        bboxes = np.array([obj['bbox'] for obj in self.tracked_objects])
        visibilities = self._get_box_visibility(bboxes, frame.shape)

        for i, obj in enumerate(self.tracked_objects):
            if obj['state'] == 'OCCLUDED' or visibilities[i] < 0.7: continue
            
            color = (0, 255, 0)
            x1, y1, x2, y2 = [int(c) for c in obj['bbox']]
            label = f"{obj['class_name']} {obj['id']}"
            cv2.rectangle(frame_copy, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame_copy, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
        return frame_copy

    # --- Utility Methods ---
    def detect(self, frame):
        """
        Runs YOLO on ``frame``.

        Returns:
            List of detection dicts (``class_name``, ``x1``, ``y1``, ``x2``,
            ``y2``, ``conf``) with confidence above ``conf_threshold`` and, if
            ``track_classes`` is non-empty, a class contained in it.
        """
        results = self.model(frame, verbose=False)[0]
        detections = []
        for box in results.boxes:
            conf = box.conf[0].item()
            if conf > self.conf_threshold:
                class_name = self.model.names[int(box.cls[0].item())]
                if self.track_classes and class_name not in self.track_classes: continue
                coords = box.xyxy[0].tolist()
                detections.append({
                    'class_name': class_name, 'x1': int(coords[0]), 'y1': int(coords[1]),
                    'x2': int(coords[2]), 'y2': int(coords[3]), 'conf': conf
                })
        return detections
    
    def _get_box_visibility(self, bboxes, frame_shape):
        """
        Computes which fraction of each box lies inside the frame.

        Args:
            bboxes: ``(N, 4)`` array-like of ``(x1, y1, x2, y2)`` boxes.
            frame_shape: Image shape; only height and width are used.

        Returns:
            Float array of length N. Boxes with non-positive area yield 0.0.
        """
        frame_h, frame_w = frame_shape[:2]
        # Work in floats: the boxes usually arrive as integers, and writing the
        # 1e-6 guard into an integer array would truncate it back to 0.
        bboxes = np.asarray(bboxes, dtype=np.float64).reshape(-1, 4)
        x1, y1, x2, y2 = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
        
        total_area = (x2 - x1) * (y2 - y1)
        total_area[total_area <= 0] = 1e-6

        visible_x1, visible_y1 = np.maximum(x1, 0), np.maximum(y1, 0)
        visible_x2, visible_y2 = np.minimum(x2, frame_w), np.minimum(y2, frame_h)
        
        visible_w = np.maximum(0, visible_x2 - visible_x1)
        visible_h = np.maximum(0, visible_y2 - visible_y1)
        visible_area = visible_w * visible_h
        return visible_area / total_area

    def _calculate_iou(self, bboxes1, bboxes2):
        """
        Calculates IoU for two sets of bounding boxes using PyTorch tensors.

        Args:
            bboxes1: ``(N, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes.
            bboxes2: ``(M, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes.

        Returns:
            ``(N, M)`` tensor of pairwise IoU values.
        """
        # Broadcasting to get intersection coordinates
        xA = torch.maximum(bboxes1[:, 0].unsqueeze(1), bboxes2[:, 0])
        yA = torch.maximum(bboxes1[:, 1].unsqueeze(1), bboxes2[:, 1])
        xB = torch.minimum(bboxes1[:, 2].unsqueeze(1), bboxes2[:, 2])
        yB = torch.minimum(bboxes1[:, 3].unsqueeze(1), bboxes2[:, 3])

        interArea = torch.clamp(xB - xA, min=0) * torch.clamp(yB - yA, min=0)

        boxAArea = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
        boxBArea = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])

        # The epsilon keeps the division defined for zero-area boxes.
        iou = interArea / (boxAArea.unsqueeze(1) + boxBArea - interArea + 1e-6)
        return iou

    def _calculate_iou_numpy(self, bboxes1, bboxes2):
        """
        A NumPy version for CPU-bound occlusion checks.

        Takes ``(N, 4)`` and ``(M, 4)`` arrays of ``(x1, y1, x2, y2)`` boxes and
        returns the ``(N, M)`` array of pairwise IoU values.
        """
        xA = np.maximum(bboxes1[:, 0][:, np.newaxis], bboxes2[:, 0])
        yA = np.maximum(bboxes1[:, 1][:, np.newaxis], bboxes2[:, 1])
        xB = np.minimum(bboxes1[:, 2][:, np.newaxis], bboxes2[:, 2])
        yB = np.minimum(bboxes1[:, 3][:, np.newaxis], bboxes2[:, 3])
        interArea = np.maximum(0, xB - xA) * np.maximum(0, yB - yA)
        boxAArea = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
        boxBArea = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
        return interArea / (boxAArea[:, np.newaxis] + boxBArea - interArea + 1e-6)
    
    def _create_kalman_filter(self):
        """
        Creates an 8-state / 4-measurement Kalman filter.

        The state is ``[cx, cy, w, h]`` followed by one velocity per component
        (constant-velocity transition); the measurement is ``[cx, cy, w, h]``.
        """
        kf = cv2.KalmanFilter(8, 4)
        kf.transitionMatrix = np.array([[1,0,0,0,1,0,0,0],[0,1,0,0,0,1,0,0],[0,0,1,0,0,0,1,0],[0,0,0,1,0,0,0,1],
                                         [0,0,0,0,1,0,0,0],[0,0,0,0,0,1,0,0],[0,0,0,0,0,0,1,0],[0,0,0,0,0,0,0,1]], np.float32)
        kf.measurementMatrix = np.array([[1,0,0,0,0,0,0,0],[0,1,0,0,0,0,0,0],[0,0,1,0,0,0,0,0],[0,0,0,1,0,0,0,0]], np.float32)
        kf.processNoiseCov = np.eye(8, dtype=np.float32) * 0.03
        # The velocity components get ten times the process noise.
        kf.processNoiseCov[4:, 4:] *= 10
        kf.measurementNoiseCov = np.eye(4, dtype=np.float32) * 0.1
        return kf

## VideoPlayer class

In [None]:
class VideoPlayer:
    """
    Interactive OpenCV player that feeds frames of a video file or an image
    directory to a tracker and lets the user choose what to track.

    States: ``'INITIALIZING'`` (before the first selection),
    ``'PAUSED_FOR_SELECTION'`` (mouse/keyboard selection UI) and ``'PLAYING'``
    (frames are passed to ``tracker.process_frame``).
    """
    def __init__(self, source, target_fps=24, size_multiplier=1.0, window_title="Video Playback"):
        """
        Args:
            source: Path to a video file, or to a directory of images that are
                played in sorted filename order.
            target_fps: Playback rate. 0 uses the source's own rate, -1 plays
                as fast as possible.
            size_multiplier: Scale of the window relative to the frame size.
            window_title: Name of the OpenCV window.

        Raises:
            ValueError: If the directory contains no supported image files.
            IOError: If the first image or the video cannot be opened.
        """
        self.window_title = window_title
        self.source = source
        self.target_fps = target_fps

        if os.path.isdir(self.source):
            self.source_type = 'images'
            image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')
            self.image_files = sorted([os.path.join(self.source, f) for f in os.listdir(self.source) if f.lower().endswith(image_extensions)])
            if not self.image_files: raise ValueError("Source directory contains no supported image files.")
            first_frame = cv2.imread(self.image_files[0])
            if first_frame is None: raise IOError(f"Could not read the first image: {self.image_files[0]}")
            self.frame_height, self.frame_width = first_frame.shape[:2]
            self.cap = None
            self.original_fps = 30
        else:
            self.source_type = 'video'
            self.cap = cv2.VideoCapture(self.source)
            if not self.cap.isOpened(): raise IOError(f"Could not open video file: {self.source}")
            self.frame_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            self.frame_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            self.original_fps = self.cap.get(cv2.CAP_PROP_FPS)
            # Some sources report no frame rate; use the same default as for
            # image directories so the pacing never divides by zero.
            if not self.original_fps or self.original_fps <= 0:
                self.original_fps = 30

        if self.target_fps == 0:
            self.target_fps = self.original_fps
            print(f"Target FPS set to 0. Using original video FPS: {self.target_fps:.2f}")

        # --- Adaptive UI Scaling Factor ---
        self.ui_scale_factor = max(0.5, min(self.frame_height, 2200.0) / 1080.0) # Base scale on 1080p, with a minimum

        self.total_processing_time = 0.0
        self.processed_frame_count = 0
        self.state = 'INITIALIZING'
        # selectable_detections: detector output not yet chosen (drawn red).
        # user_selections: chosen detections / manual boxes / existing tracks (drawn green).
        self.selectable_detections, self.user_selections = [], []
        self.is_drawing_roi, self.show_help = False, True
        self.roi_start_point, self.roi_end_point, self.new_manual_box = None, None, None
        
        # Class IDs offered when the user labels a manually drawn box.
        self.YOLO_CLASSES = {
            0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 
            5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
            10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
            14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
            20: 'other'
        }
        
        cv2.namedWindow(self.window_title, cv2.WINDOW_NORMAL)
        cv2.resizeWindow(self.window_title, int(self.frame_width * size_multiplier), int(self.frame_height * size_multiplier))
        
        cv2.setMouseCallback(self.window_title, self._mouse_callback)
        print("--- Video Player Initialized for Interactive Tracking ---")

    @staticmethod
    def _frame_wait_ms(target_fps, processing_time):
        """
        Returns the ``cv2.waitKey`` delay that pads a frame to ``1 / target_fps``.

        The result is never below 1: a delay of 0 would make ``cv2.waitKey``
        block until a key is pressed. A non-positive ``target_fps`` (e.g. -1,
        "max fps") also yields 1.
        """
        if target_fps <= 0:
            return 1
        remaining = 1.0 / target_fps - processing_time
        return max(1, int(remaining * 1000))

    def _mouse_callback(self, event, x, y, flags, param):
        """
        Mouse handler, active only in ``'PAUSED_FOR_SELECTION'``.

        Left-drag draws a manual box (stored in ``new_manual_box`` when it is
        more than 5 px wide and high after clamping to the frame). Right-click
        deselects the selection under the cursor, or otherwise selects the
        detection under the cursor.
        """
        if self.state != 'PAUSED_FOR_SELECTION': return

        if event == cv2.EVENT_LBUTTONDOWN:
            self.is_drawing_roi = True
            self.roi_start_point, self.roi_end_point = (x, y), (x, y)
        elif event == cv2.EVENT_MOUSEMOVE:
            if self.is_drawing_roi: self.roi_end_point = (x, y)
        elif event == cv2.EVENT_LBUTTONUP:
            if self.is_drawing_roi:
                self.is_drawing_roi = False
                if self.roi_end_point and self.roi_start_point:
                    (sx, sy), (ex, ey) = self.roi_start_point, self.roi_end_point
                    # Normalize the drag direction and keep the box inside the
                    # frame so it can be cropped and tracked.
                    x1, y1 = max(0, min(sx, ex)), max(0, min(sy, ey))
                    x2 = min(self.frame_width, max(sx, ex))
                    y2 = min(self.frame_height, max(sy, ey))
                    # Require a real drag in both directions; a flat box
                    # cannot be tracked.
                    if x2 - x1 > 5 and y2 - y1 > 5:
                        self.new_manual_box = (x1, y1, x2, y2)
                self.roi_start_point, self.roi_end_point = None, None
        elif event == cv2.EVENT_RBUTTONDOWN:
            removed_selection = False
            # Iterate newest-first so the most recently added box wins.
            for i, sel in reversed(list(enumerate(self.user_selections))):
                bbox = sel.get('bbox') or (sel['x1'], sel['y1'], sel['x2'], sel['y2'])
                if bbox[0] < x < bbox[2] and bbox[1] < y < bbox[3]:
                    removed_item = self.user_selections.pop(i)
                    # Only detector output goes back to the selectable pool.
                    if 'x1' in removed_item: self.selectable_detections.append(removed_item)
                    removed_selection = True
                    break
            if not removed_selection:
                for i, det in reversed(list(enumerate(self.selectable_detections))):
                    if det['x1'] < x < det['x2'] and det['y1'] < y < det['y2']:
                        self.user_selections.append(self.selectable_detections.pop(i))
                        break

    def _draw_pause_menu(self, frame):
        """Returns ``frame`` blended with the help banner of the selection mode."""
        s = self.ui_scale_factor
        # Scaled values for fonts and layout
        bg_height = int(240 * s)
        title_scale, head_scale, text_scale = 1.8 * s, 1.0 * s, 0.9 * s
        thick_main, thick_sub = max(1, int(3 * s)), max(1, int(2 * s))

        overlay = frame.copy()
        cv2.rectangle(overlay, (0, 0), (frame.shape[1], bg_height), (0, 0, 0), -1)
        frame = cv2.addWeighted(overlay, 0.7, frame, 0.3, 0)
        
        cv2.putText(frame, "PAUSED - SELECTION MODE", (int(25*s), int(60*s)), cv2.FONT_HERSHEY_TRIPLEX, title_scale, (0, 255, 255), thick_main)
        cv2.putText(frame, "Mouse Controls:", (int(25*s), int(115*s)), cv2.FONT_HERSHEY_SIMPLEX, head_scale, (255, 255, 255), thick_main)
        cv2.putText(frame, "- Left-Click & Drag: Draw a new box to track", (int(35*s), int(145*s)), cv2.FONT_HERSHEY_SIMPLEX, text_scale, (255, 255, 255), thick_sub)
        cv2.putText(frame, "- Right-Click: Select (Red) / Deselect (Green)", (int(35*s), int(170*s)), cv2.FONT_HERSHEY_SIMPLEX, text_scale, (255, 255, 255), thick_sub)
        cv2.putText(frame, "Keyboard: C: Confirm | H: Toggle Help | Space: Pause | Q: Quit", (int(25*s), int(210*s)), cv2.FONT_HERSHEY_SIMPLEX, text_scale, (255, 255, 255), thick_sub)
        return frame
    
    def _get_numeric_input(self, frame):
        """
        Shows the class list over ``frame`` and reads a class ID from the keyboard.

        Digits extend the input, Backspace removes the last digit, Enter
        confirms and Esc cancels.

        Returns:
            The chosen key of ``YOLO_CLASSES``, or None when cancelled.
        """
        s = self.ui_scale_factor
        # Scaled values for fonts and layout
        title_scale, text_scale = 1.8 * s, 1.2 * s
        thick_main, thick_sub = max(1, int(4 * s)), max(1, int(3 * s))
        y_offset_start, y_offset_inc = int(120*s), int(45*s)

        num_input = ""
        while True:
            frame_copy, overlay = frame.copy(), frame.copy()
            cv2.rectangle(overlay, (0, 0), (frame_copy.shape[1], frame_copy.shape[0]), (0, 0, 0), -1)
            frame_copy = cv2.addWeighted(overlay, 0.85, frame_copy, 0.15, 0)
            
            current_selection_id = -1
            try:
                if num_input: current_selection_id = int(num_input)
            except ValueError: pass

            cv2.putText(frame_copy, "Enter Class ID & Press Enter:", (int(50*s), int(65*s)), cv2.FONT_HERSHEY_TRIPLEX, title_scale, (0, 255, 255), thick_main)
            y_offset = y_offset_start
            for i, name in self.YOLO_CLASSES.items():
                # Entries that would fall below the frame are not drawn.
                if y_offset < frame.shape[0] - 30:
                    color = (0, 255, 0) if i == current_selection_id else (255, 255, 255)
                    thickness = thick_main if i == current_selection_id else thick_sub
                    cv2.putText(frame_copy, f"{i}: {name}", (int(50*s), y_offset), cv2.FONT_HERSHEY_SIMPLEX, text_scale, color, thickness)
                    y_offset += y_offset_inc
            
            cv2.imshow(self.window_title, frame_copy)
            # Mask to the low byte, as play() does, so modifier bits that some
            # backends add do not hide the key.
            key = cv2.waitKey(0) & 0xFF
            if key in (13, 10):  # Enter (CR or LF depending on the backend)
                try:
                    if num_input and int(num_input) in self.YOLO_CLASSES: return int(num_input)
                    else: print(f"Error: Invalid ID. Please try again."); num_input = ""
                except ValueError: print("Error: Invalid input."); num_input = ""
            elif key in (8, 127): num_input = num_input[:-1]  # Backspace / Delete
            elif ord('0') <= key <= ord('9'): num_input += chr(key)
            elif key == 27: return None

    def play(self, tracker):
        """
        Runs the playback loop until the source ends or ``q`` is pressed.

        Args:
            tracker: Object providing ``detect(frame)``,
                ``process_frame(frame)``, ``add_manual_track(frame, bbox,
                class_name)`` and the attributes ``tracked_objects`` and
                ``next_track_id``.

        The window and capture are released even if processing raises.
        """
        frame_idx = -1 # Start at -1 to handle loop logic correctly
        temp_frame = None

        try:
            while True:
                loop_start_time = time.perf_counter()

                # --- Unified Frame Loading ---
                ret, frame = False, None
                if self.state in ['INITIALIZING', 'PLAYING']:
                    frame_idx += 1
                    if self.source_type == 'video':
                        ret, frame = self.cap.read()
                    elif self.source_type == 'images':
                        if frame_idx < len(self.image_files):
                            frame = cv2.imread(self.image_files[frame_idx])
                            ret = frame is not None
                    if ret: temp_frame = frame.copy()
                    else: break
                else: # Paused state: keep showing the last loaded frame
                    frame = temp_frame.copy()

                # --- State Machine ---
                display_frame = frame.copy()
                if self.state == 'INITIALIZING' and frame_idx >= 1:
                    self.state = 'PAUSED_FOR_SELECTION'
                    self.selectable_detections = tracker.detect(display_frame)
                elif self.state == 'PLAYING':
                    display_frame = tracker.process_frame(display_frame)
                elif self.state == 'PAUSED_FOR_SELECTION':
                    if self.show_help: display_frame = self._draw_pause_menu(display_frame)
                    for det in self.selectable_detections: cv2.rectangle(display_frame, (det['x1'], det['y1']), (det['x2'], det['y2']), (0, 0, 255), 2)
                    for sel in self.user_selections:
                        bbox = sel.get('bbox') or (sel['x1'], sel['y1'], sel['x2'], sel['y2'])
                        cv2.rectangle(display_frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 3)
                    if self.is_drawing_roi and self.roi_start_point and self.roi_end_point:
                        cv2.rectangle(display_frame, self.roi_start_point, self.roi_end_point, (255, 255, 0), 2)
                    if self.new_manual_box:
                        # A finished manual box needs a class before it can be selected.
                        class_id = self._get_numeric_input(display_frame)
                        if class_id is not None:
                            self.user_selections.append({'bbox': self.new_manual_box, 'class_name': self.YOLO_CLASSES[class_id]})
                        self.new_manual_box = None
                
                # --- Live FPS and Final Display ---
                processing_time = time.perf_counter() - loop_start_time
                live_fps = 1.0 / processing_time if processing_time > 0 else float('inf')
                if self.state != 'PAUSED_FOR_SELECTION':
                    self.total_processing_time += processing_time
                    self.processed_frame_count += 1
                
                s = self.ui_scale_factor
                cv2.putText(display_frame, f"FPS: {live_fps:.1f}", (int(20*s), int(40*s)), cv2.FONT_HERSHEY_SIMPLEX, 1.2*s, (0, 255, 0), max(1, int(2*s)))
                cv2.imshow(self.window_title, display_frame)

                wait_ms = 1
                if self.target_fps != -1 and self.state == 'PLAYING':
                    wait_ms = self._frame_wait_ms(self.target_fps, processing_time)
                elif self.state == 'PAUSED_FOR_SELECTION': wait_ms = 20
                
                key = cv2.waitKey(wait_ms) & 0xFF
                if key == ord('q'): break
                elif key == ord('h'): self.show_help = not self.show_help
                elif key == 32 and self.state == 'PLAYING':
                    self.state = 'PAUSED_FOR_SELECTION'
                    self.selectable_detections = tracker.detect(frame)
                    # Current tracks start out selected so confirming keeps them.
                    self.user_selections = list(tracker.tracked_objects)
                elif key == ord('c') and self.state == 'PAUSED_FOR_SELECTION':
                    tracker.tracked_objects, tracker.next_track_id = [], 0
                    # IDs restart at 0, so tracks buffered for re-identification
                    # must go too or a revived one could reuse a new track's ID.
                    lost_tracks = getattr(tracker, 'lost_tracks', None)
                    if lost_tracks is not None: lost_tracks.clear()
                    for sel in self.user_selections:
                        bbox = sel.get('bbox') or (sel['x1'], sel['y1'], sel['x2'], sel['y2'])
                        tracker.add_manual_track(temp_frame, bbox, sel['class_name'])
                    self.selectable_detections, self.user_selections, self.state = [], [], 'PLAYING'

            if self.processed_frame_count > 0 and self.total_processing_time > 0:
                avg_fps = self.processed_frame_count / self.total_processing_time
                print(f"\n--- Playback Finished ---\nAverage Processing FPS: {avg_fps:.2f}\n-------------------------")
        finally:
            self.release()

    def release(self):
        """Releases the capture (if any) and closes all OpenCV windows."""
        print("Releasing resources...")
        if self.cap and self.cap.isOpened(): self.cap.release()
        cv2.destroyAllWindows()
        # Pump the GUI event loop a few times so the windows actually close.
        for _ in range(5): cv2.waitKey(1)

## Realtime Playback

In [None]:
# --- Playback configuration ---
# VIDEO_PATH may be a video file or a directory of images, e.g.:
# VIDEO_PATH = './assets/OTB100/human2/img/'
VIDEO_PATH = './assets/footage/person2.mp4'
MODEL_PATH = './yolo11n.pt'
TARGET_FPS = 0 # 0: standard video fps / -1: max fps
WINDOW_SIZE = .75

try:
    # MODEL_PATH is passed explicitly so that editing the constant above
    # actually changes which weights are loaded.
    tracker = ObjectTracker(
        model=MODEL_PATH,
        tracker_type='ckcf',
        track_classes=['person', 'car'],
        use_kalman=False,
        base_detect_interval=80
    )

    player = VideoPlayer(
        source=VIDEO_PATH,
        target_fps=TARGET_FPS,
        size_multiplier=WINDOW_SIZE,
        window_title="Realtime Player"
    )
    
    player.play(tracker)
except IOError as e:
    # Unreadable video / image source: the message already names the path.
    print(e)
except Exception as e:
    # Top-level boundary of the notebook cell: report and end the cell.
    print(f"Error: {e}")