In [2]:
# Install required packages if not installed:
# pip install ultralytics torch torchvision torchreid opencv-python filterpy scikit-learn numpy

In [3]:
from ultralytics import YOLO
import torch
import torchreid
import cv2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from filterpy.kalman import KalmanFilter



In [4]:
import os

# Path to your YOLO model weights
yolo_weights_path = 'best.pt'

if not os.path.exists(yolo_weights_path):
	raise FileNotFoundError(f"YOLO weights file not found: {yolo_weights_path}. Please provide the correct path to your model weights.")

# Load YOLOv11 Model
yolo_model = YOLO(yolo_weights_path)

In [5]:
# Load ReID Model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
reid_model = torchreid.models.build_model(
    name='osnet_x1_0',
    num_classes=1000,
    loss='softmax',
    pretrained=True
)
reid_model.eval().to(device)

Successfully loaded imagenet pretrained weights from "C:\Users\Karthikeya/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"


OSNet(
  (conv1): ConvLayer(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
  )
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (conv2): Sequential(
    (0): OSBlock(
      (conv1): Conv1x1(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
      )
      (conv2a): LightConv3x3(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
      )
      (conv2b): Sequential(
        (

In [6]:
# Preprocessing for ReID
def preprocess(crop):
    if crop is None or crop.size == 0:
        return None
    crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
    crop = cv2.resize(crop, (128, 256))

    img = crop.astype(np.float32) / 255.0
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img = (img - mean) / std

    img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).float().to(device)
    return img

def extract_reid_features(crop):
    img_tensor = preprocess(crop)
    if img_tensor is None:
        return None
    with torch.no_grad():
        features = reid_model(img_tensor)
    return features.cpu().numpy().flatten()


In [7]:
# Kalman Filter initialization
def create_kalman_filter(x, y):
    kf = KalmanFilter(dim_x=4, dim_z=2)
    kf.x = np.array([x, y, 0, 0])
    kf.F = np.array([[1, 0, 1, 0],
                     [0, 1, 0, 1],
                     [0, 0, 1, 0],
                     [0, 0, 0, 1]])
    kf.H = np.array([[1, 0, 0, 0],
                     [0, 1, 0, 0]])
    kf.P *= 1000.
    kf.R *= 10.
    kf.Q *= 0.01
    return kf

In [8]:
# IoU calculation
def iou(bbox1, bbox2):
    x1, y1, x2, y2 = bbox1
    x1g, y1g, x2g, y2g = bbox2

    xi1, yi1 = max(x1, x1g), max(y1, y1g)
    xi2, yi2 = min(x2, x2g), min(y2, y2g)
    inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)

    bbox1_area = (x2 - x1) * (y2 - y1)
    bbox2_area = (x2g - x1g) * (y2g - y1g)

    union_area = bbox1_area + bbox2_area - inter_area
    return inter_area / union_area if union_area > 0 else 0


In [9]:
# Tracker class
class Tracker:
    def __init__(self, id, bbox, embedding):
        self.id = id
        self.bbox = bbox
        self.embedding = embedding
        cx = (bbox[0] + bbox[2]) / 2
        cy = (bbox[1] + bbox[3]) / 2
        self.kf = create_kalman_filter(cx, cy)
        self.age = 0
        self.missing = 0

    def predict(self):
        self.kf.predict()
        cx, cy = self.kf.x[0], self.kf.x[1]
        w = self.bbox[2] - self.bbox[0]
        h = self.bbox[3] - self.bbox[1]
        self.bbox = (cx - w/2, cy - h/2, cx + w/2, cy + h/2)

    def update(self, bbox, embedding):
        cx = (bbox[0] + bbox[2]) / 2
        cy = (bbox[1] + bbox[3]) / 2
        self.kf.update([cx, cy])
        self.bbox = bbox
        self.embedding = embedding
        self.missing = 0
        self.age += 1

In [10]:
# Parameters
IOU_THRESHOLD = 0.3
REID_THRESHOLD = 0.7
MAX_MISSING_FRAMES = 30
padding = 20

# Initialize
trackers = []

# Video setup
cap = cv2.VideoCapture('Inputs/15sec_input_720p.mp4')
fps = cap.get(cv2.CAP_PROP_FPS)
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))

out = cv2.VideoWriter('Output/output_new.mp4',
                      cv2.VideoWriter_fourcc(*'mp4v'),
                      fps,
                      (frame_width, frame_height))

In [11]:
# Main Loop
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    detections = []
    # Use torch.no_grad() to speed up inference and reduce memory usage
    with torch.no_grad():
        results = yolo_model(frame)

        # Use numpy array only once
        boxes = results[0].boxes.xyxy.cpu().numpy() if hasattr(results[0].boxes.xyxy, 'cpu') else results[0].boxes.xyxy
        for box in boxes:
            x1, y1, x2, y2 = map(int, box[:4])
            x1 = max(0, x1 - padding)
            y1 = max(0, y1 - padding)
            x2 = min(frame.shape[1]-1, x2 + padding)
            y2 = min(frame.shape[0]-1, y2 + padding)

            crop = frame[y1:y2, x1:x2]
            # Only extract features if crop is valid
            if crop.size == 0 or (y2 - y1) <= 0 or (x2 - x1) <= 0:
                continue
            embedding = extract_reid_features(crop)
            if embedding is None:
                continue
            detections.append((x1, y1, x2, y2, embedding))

    # Predict existing trackers (vectorized for performance)
    for tracker in trackers:
        tracker.predict()
        tracker.missing += 1

    matched = set()
    # Precompute tracker embeddings for batch cosine similarity
    tracker_embeddings = np.array([t.embedding for t in trackers]) if trackers else None
    for det in detections:
        x1, y1, x2, y2, embedding = det
        bbox = (x1, y1, x2, y2)

        best_match = None
        best_score = 0

        if tracker_embeddings is not None and len(tracker_embeddings) > 0:
            # Batch cosine similarity for all trackers
            reid_scores = cosine_similarity(embedding.reshape(1, -1), tracker_embeddings)[0]
        else:
            reid_scores = []

        for idx, tracker in enumerate(trackers):
            iou_score = iou(bbox, tracker.bbox)
            reid_score = reid_scores[idx] if len(reid_scores) > idx else 0
            combined = (0.6 * iou_score) + (0.4 * reid_score)

            if combined > best_score and (iou_score > IOU_THRESHOLD or reid_score > REID_THRESHOLD):
                best_score = combined
                best_match = tracker

        if best_match:
            best_match.update(bbox, embedding)
            matched.add(best_match.id)
        else:
            # Assign a human-readable ID (e.g., Player 1, Player 2, ...)
            new_id = f"Player {len(trackers) + 1}"
            trackers.append(Tracker(new_id, bbox, embedding))

    # Remove lost trackers
    trackers = [t for t in trackers if t.missing <= MAX_MISSING_FRAMES]

    # Draw results
    for tracker in trackers:
        x1, y1, x2, y2 = map(int, tracker.bbox)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f'ID {tracker.id}', (x1, y1-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    out.write(frame)

cap.release()
out.release()
print(f"✅ Processing complete. Total unique IDs: {len(trackers)}")


0: 384x640 1 ball, 16 players, 2 referees, 191.5ms
Speed: 13.1ms preprocess, 191.5ms inference, 498.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 18 players, 2 referees, 68.9ms
Speed: 2.3ms preprocess, 68.9ms inference, 4.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 66.4ms
Speed: 3.4ms preprocess, 66.4ms inference, 8.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 70.3ms
Speed: 2.2ms preprocess, 70.3ms inference, 10.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 72.0ms
Speed: 2.1ms preprocess, 72.0ms inference, 15.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 71.9ms
Speed: 1.9ms preprocess, 71.9ms inference, 5.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 2 referees, 70.0ms
Speed: 3.3ms preprocess, 70.0ms inference, 30.2ms postprocess per imag