In [15]:
import cv2
import numpy as np
from scipy.optimize import linear_sum_assignment
from ultralytics import YOLO

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Tracking variables
frame_id = 0
tracks = {}
id_counter = 0
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def compute_iou(bbox1, bbox2):
    x1, y1, w1, h1 = bbox1
    x2, y2, w2, h2 = bbox2
    xi1, yi1 = max(x1, x2), max(y1, y2)
    xi2, yi2 = min(x1 + w1, x2 + w2), min(y1 + h1, y2 + h2)
    inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
    bbox1_area = w1 * h1
    bbox2_area = w2 * h2
    union_area = bbox1_area + bbox2_area - inter_area
    return inter_area / union_area if union_area != 0 else 0

def compute_cost_matrix(detections, previous_detections):
    cost_matrix = np.zeros((len(previous_detections), len(detections)))
    for i, prev_det in enumerate(previous_detections):
        for j, det in enumerate(detections):
            cost_matrix[i, j] = 1 - compute_iou(prev_det['bbox'], det['bbox'])
    return cost_matrix

def update_tracks(tracks, detections, previous_detections):
    global id_counter
    active_ids = set()

    if len(previous_detections) == 0:
        for det in detections:
            det_id = id_counter
            tracks[det_id] = {'bbox': det['bbox'], 'color': generate_color(det_id)}
            det['id'] = det_id
            unique_people_ids.add(det_id)  # Add new ID to unique set
            active_ids.add(det_id)
            id_counter += 1
    else:
        cost_matrix = compute_cost_matrix(detections, previous_detections)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        assigned = set()

        for r, c in zip(row_ind, col_ind):
            if cost_matrix[r, c] < 0.5:  # Match only if cost (1 - IoU) is below threshold
                prev_det = previous_detections[r]
                det = detections[c]
                det_id = prev_det['id']
                tracks[det_id]['bbox'] = det['bbox']
                det['id'] = det_id
                active_ids.add(det_id)
                assigned.add(c)
        
        for i, det in enumerate(detections):
            if i not in assigned:
                det_id = id_counter
                tracks[det_id] = {'bbox': det['bbox'], 'color': generate_color(det_id)}
                det['id'] = det_id
                unique_people_ids.add(det_id)  # Add new ID to unique set
                active_ids.add(det_id)
                id_counter += 1

    return active_ids

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

previous_detections = []

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append({
                'bbox': (x, y, w, h),
                'confidence': confidence
            })
    
    # Step 3: Update tracks and get current frame IDs
    current_frame_ids = update_tracks(tracks, detections, previous_detections)
    previous_detections = detections

    # Step 4: Draw results on frame
    for det in detections:
        if 'id' in det:
            draw_bounding_box(frame, det['bbox'], tracks[det['id']]['color'], det['id'])
    
    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 5: Save the processed frame to the output video
    out.write(frame)
    frame_id += 1

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 25.0ms
Speed: 3.0ms preprocess, 25.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 23.7ms
Speed: 4.5ms preprocess, 23.7ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 24.2ms
Speed: 4.0ms preprocess, 24.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 24.0ms
Speed: 5.0ms preprocess, 24.0ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 22.1ms
Speed: 4.0ms preprocess, 22.1ms inference, 3.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.5ms
Speed: 4.5ms preprocess, 21.5ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.1ms
Speed: 2.1ms preprocess, 21.1ms inference, 2.0ms postprocess per image at shape (1, 3, 38

In [19]:
import cv2
import numpy as np
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Initialize Deep SORT tracker
deepsort = DeepSort(max_age=30, n_init=3, nn_budget=70)

# Colors for drawing
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

def generate_color(id):
    # Ensure the id is converted to an integer, handling non-integer inputs gracefully
    try:
        id = int(id)  # Convert id to integer if possible
    except ValueError:
        id = 0  # Fallback to a default seed in case of a non-integer id
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            # Format as [[x, y, w, h], confidence]
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append([[x, y, w, h], confidence])

    # Step 3: Update Deep SORT tracker with detections
    tracks = deepsort.update_tracks(detections, frame=frame)
    
    # Step 4: Draw results on frame
    for track in tracks:
        if not track.is_confirmed():
            continue
        track_id = track.track_id
        unique_people_ids.add(track_id)
        bbox = track.to_ltrb()  # Convert to [x1, y1, x2, y2] format
        x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2] - bbox[0]), int(bbox[3] - bbox[1])
        
        # Draw bounding box and ID
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, (x, y, w, h), colors[track_id], track_id)

    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 5: Save the processed frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 24.1ms
Speed: 4.0ms preprocess, 24.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.0ms
Speed: 3.0ms preprocess, 20.0ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.5ms
Speed: 2.0ms preprocess, 20.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 15.2ms
Speed: 1.5ms preprocess, 15.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 10.0ms
Speed: 4.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 6

In [25]:
import cv2
import numpy as np
from ultralytics import YOLO
from scipy.spatial.distance import cdist

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Colors for drawing
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

# Tracking parameters
max_distance = 50  # Maximum distance to consider a match between detections
max_frames_to_keep = 30  # Keep track up to this many frames without a match

class Track:
    def __init__(self, bbox, track_id):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0  # Frames since last update

# Initialize tracker dictionary
tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            # Format as [[x, y, w, h], confidence]
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))

    # Step 3: Calculate distance between current detections and existing tracks
    track_bboxes = np.array([track.bbox for track in tracks.values()])
    detected_bboxes = np.array(detections)
    
    if track_bboxes.size > 0 and detected_bboxes.size > 0:
        distances = cdist(track_bboxes[:, :2], detected_bboxes[:, :2])  # Compute distance between track and detection centers
        row_indices, col_indices = np.where(distances < max_distance)

        # Step 4: Update matched tracks
        matched_detections = set()
        for row, col in zip(row_indices, col_indices):
            track_id = list(tracks.keys())[row]
            bbox = detections[col]
            tracks[track_id].bbox = bbox
            tracks[track_id].age = 0
            matched_detections.add(col)

        # Step 5: Handle unmatched detections
        unmatched_detections = set(range(len(detections))) - matched_detections
        for i in unmatched_detections:
            tracks[next_track_id] = Track(detections[i], next_track_id)
            unique_people_ids.add(next_track_id)
            next_track_id += 1

    else:
        # Add all detections as new tracks if no previous tracks exist
        for i, bbox in enumerate(detections):
            tracks[next_track_id] = Track(bbox, next_track_id)
            unique_people_ids.add(next_track_id)
            next_track_id += 1

    # Step 6: Remove stale tracks
    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    # Step 7: Draw results on frame
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 8: Save the processed frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 25.5ms
Speed: 3.3ms preprocess, 25.5ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 25.6ms
Speed: 3.1ms preprocess, 25.6ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.3ms
Speed: 4.0ms preprocess, 20.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 19.8ms
Speed: 3.5ms preprocess, 19.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.2ms
Speed: 4.0ms preprocess, 20.2ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.0ms
Speed: 3.1ms preprocess, 20.0ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.5ms
Speed: 4.9ms preprocess, 20.5ms inference, 2.1ms postprocess per image at shape (1, 3, 38

In [6]:
import cv2
import numpy as np
from ultralytics import YOLO
from scipy.spatial.distance import cdist

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Colors for drawing
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

# Tracking parameters
max_distance = 50  # Maximum distance to consider a match between detections
max_frames_to_keep = 30  # Keep track up to this many frames without a match

class Track:
    def __init__(self, bbox, track_id, frame):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0  # Frames since last update
        self.color_histogram = self.compute_color_histogram(frame, bbox)
        self.kalman_filter = self.create_kalman_filter(bbox)

    def compute_color_histogram(self, frame, bbox):
        x, y, w, h = bbox
        person_img = frame[y:y + h, x:x + w]
        if person_img.size > 0:  # Ensure the cropped image is not empty
            hist = cv2.calcHist([person_img], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
            cv2.normalize(hist, hist)
            return hist
        return None  # Return None if the image region is invalid

    def create_kalman_filter(self, bbox):
        # Initialize Kalman Filter for tracking
        kalman = cv2.KalmanFilter(4, 2)
        kalman.measurementMatrix = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], np.float32)
        kalman.transitionMatrix = np.array([[1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]], np.float32)
        kalman.processNoiseCov = np.eye(4, dtype=np.float32) * 0.03
        x, y, w, h = bbox
        kalman.statePre = np.array([[x + w / 2], [y + h / 2], [0], [0]], np.float32)
        return kalman

    def update(self, bbox, frame):
        self.bbox = bbox
        self.color_histogram = self.compute_color_histogram(frame, bbox)
        self.age = 0

# Initialize tracker dictionary
tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret or frame is None:
        break  # Exit if the frame is not valid

    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))

    # Step 3: Calculate distance between current detections and existing tracks
    track_bboxes = np.array([track.bbox for track in tracks.values()])
    detected_bboxes = np.array(detections)
    
    unmatched_detections = set(range(len(detections)))
    if track_bboxes.size > 0 and detected_bboxes.size > 0:
        for track_id, track in list(tracks.items()):
            if len(unmatched_detections) == 0:
                break

            track_center = np.array([track.bbox[0] + track.bbox[2] // 2, track.bbox[1] + track.bbox[3] // 2])
            detected_centers = np.array([[d[0] + d[2] // 2, d[1] + d[3] // 2] for d in detections])

            if detected_centers.size > 0:
                distances = np.linalg.norm(detected_centers - track_center, axis=1)
                similarities = []
                for i in unmatched_detections:
                    hist = Track(detections[i], track_id, frame).color_histogram
                    similarity = cv2.compareHist(track.color_histogram, hist, cv2.HISTCMP_CORREL)
                    similarities.append(similarity)

                match_scores = [
                    (d, distances[d], similarities[i]) 
                    for i, d in enumerate(unmatched_detections) 
                    if distances[d] < max_distance and similarities[i] > 0.5
                ]

                if match_scores:
                    match_scores.sort(key=lambda x: (x[1], -x[2]))
                    best_match = match_scores[0][0]
                    track.update(detections[best_match], frame)
                    unmatched_detections.remove(best_match)
                else:
                    track.age += 1

    # Step 5: Create new tracks for unmatched detections
    for i in unmatched_detections:
        if frame is not None:
            tracks[next_track_id] = Track(detections[i], next_track_id, frame)
            unique_people_ids.add(next_track_id)
            next_track_id += 1

    # Step 6: Remove stale tracks
    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    # Step 7: Draw results on frame
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 8: Save the processed frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 30.0ms
Speed: 0.0ms preprocess, 30.0ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.9ms
Speed: 5.2ms preprocess, 21.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 17.8ms
Speed: 6.3ms preprocess, 17.8ms inference, 6.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.1ms
Speed: 5.8ms preprocess, 21.1ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.8ms
Speed: 8.2ms preprocess, 21.8ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.0ms
Speed: 7.2ms preprocess, 21.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.7ms
Speed: 10.2ms preprocess, 20.7ms inference, 2.3ms postprocess per image at shape (1, 3, 3

In [21]:
import cv2
import numpy as np
from ultralytics import YOLO
from scipy.spatial.distance import cdist

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Colors for drawing
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

# Tracking parameters
max_frames_to_keep = 30  # Keep track up to this many frames without a match

class Track:
    def __init__(self, bbox, track_id, frame):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0  # Frames since last update
        self.color_histogram = self.compute_color_histogram(frame, bbox)
        self.kalman_filter = self.create_kalman_filter(bbox)
        self.history = [bbox]  # Store history of positions for consistency

    def compute_color_histogram(self, frame, bbox):
        x, y, w, h = bbox
        person_img = frame[y:y + h, x:x + w]
        if person_img.size > 0:  # Ensure the cropped image is not empty
            hist = cv2.calcHist([person_img], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
            cv2.normalize(hist, hist)
            return hist
        return None  # Return None if the image region is invalid

    def create_kalman_filter(self, bbox):
        kalman = cv2.KalmanFilter(4, 2)
        kalman.measurementMatrix = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], np.float32)
        kalman.transitionMatrix = np.array([[1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]], np.float32)
        kalman.processNoiseCov = np.eye(4, dtype=np.float32) * 0.03
        x, y, w, h = bbox
        kalman.statePre = np.array([[x + w / 2], [y + h / 2], [0], [0]], np.float32)
        return kalman

    def update(self, bbox, frame):
        self.bbox = bbox
        self.color_histogram = self.compute_color_histogram(frame, bbox)
        self.age = 0
        self.history.append(bbox)  # Update history with new position
        if len(self.history) > 5:  # Limit history length
            self.history.pop(0)

# Initialize tracker dictionary
tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret or frame is None:
        break  # Exit if the frame is not valid

    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))

    # Step 3: Track Matching and Update
    unmatched_detections = set(range(len(detections)))
    for track_id, track in list(tracks.items()):
        if len(unmatched_detections) == 0:
            break

        track_center = np.array([track.bbox[0] + track.bbox[2] // 2, track.bbox[1] + track.bbox[3] // 2])
        detected_centers = np.array([[d[0] + d[2] // 2, d[1] + d[3] // 2] for d in detections])

        if detected_centers.size > 0:
            distances = np.linalg.norm(detected_centers - track_center, axis=1)
            similarities = []
            for i in unmatched_detections:
                hist = Track(detections[i], track_id, frame).color_histogram
                similarity = cv2.compareHist(track.color_histogram, hist, cv2.HISTCMP_CORREL)
                similarities.append(similarity)

            match_scores = [
                (d, distances[d], similarities[i])
                for i, d in enumerate(unmatched_detections)
                if distances[d] < max(50, 0.2 * detections[d][2]) and similarities[i] > 0.5
            ]

            if match_scores:
                match_scores.sort(key=lambda x: (x[1], -x[2]))
                best_match = match_scores[0][0]
                track.update(detections[best_match], frame)
                unmatched_detections.remove(best_match)
            else:
                track.age += 1

    # Step 4: Add New Tracks for Unmatched Detections
    for i in unmatched_detections:
        if frame is not None:
            tracks[next_track_id] = Track(detections[i], next_track_id, frame)
            unique_people_ids.add(next_track_id)
            next_track_id += 1

    # Step 5: Remove stale tracks
    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    # Step 6: Draw results on frame
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 7: Save the processed frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 30.3ms
Speed: 0.0ms preprocess, 30.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 23.8ms
Speed: 4.5ms preprocess, 23.8ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.3ms
Speed: 4.8ms preprocess, 21.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 22.7ms
Speed: 4.8ms preprocess, 22.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 23.9ms
Speed: 1.0ms preprocess, 23.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 22.2ms
Speed: 7.1ms preprocess, 22.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 19.8ms
Speed: 2.5ms preprocess, 19.8ms inference, 0.0ms postprocess per image at shape (1, 3, 38

In [None]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from scipy.spatial.distance import cdist
import torchreid  # Import torchreid library for Re-ID

# Load YOLO model for object detection
model = YOLO("yolo11n.pt")

# Load pre-trained Re-ID model from torchreid
reid_model = torchreid.models.build_model(name='osnet_x1_0', num_classes=1, pretrained=True)
reid_model.eval()
reid_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)

# Check if the video capture opened successfully
if not cap.isOpened():
    print("Error: Could not open input video.")
    exit()

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Colors for drawing
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

# Tracking parameters
max_frames_to_keep = 30  # Keep track up to this many frames without a match

class Track:
    def __init__(self, bbox, track_id, frame):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0  # Frames since last update
        self.reid_feature = self.compute_reid_feature(frame, bbox)
        self.history = [bbox]  # Store history of positions for consistency

    def compute_reid_feature(self, frame, bbox):
        # Extract the image region of the bounding box
        x, y, w, h = bbox
        person_img = frame[y:y + h, x:x + w]
        if person_img.size > 0:
            # Preprocess image and compute Re-ID feature
            person_img = cv2.resize(person_img, (128, 256))  # Resize for Re-ID model
            person_img = cv2.cvtColor(person_img, cv2.COLOR_BGR2RGB)  # Convert to RGB
            person_img = torch.tensor(person_img).permute(2, 0, 1).unsqueeze(0).float() / 255.0
            person_img = person_img.to('cuda' if torch.cuda.is_available() else 'cpu')
            with torch.no_grad():
                feature = reid_model(person_img)
            return feature.cpu().numpy().flatten()
        return None  # Return None if the image region is invalid

    def update(self, bbox, frame):
        self.bbox = bbox
        self.reid_feature = self.compute_reid_feature(frame, bbox)
        self.age = 0
        self.history.append(bbox)  # Update history with new position
        if len(self.history) > 5:  # Limit history length
            self.history.pop(0)

# Initialize tracker dictionary
tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret or frame is None:
        break  # Exit if the frame is not valid

    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    reid_features = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:  # Assuming class_id 0 is for 'person'
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))
            # Compute Re-ID feature for each detection
            reid_features.append(Track((x, y, w, h), None, frame).compute_reid_feature(frame, (x, y, w, h)))

    # Step 3: Track Matching and Update
    unmatched_detections = set(range(len(detections)))
    for track_id, track in list(tracks.items()):
        if len(unmatched_detections) == 0:
            break

        # Match using Re-ID feature similarity and distance
        similarities = []
        for i in unmatched_detections:
            similarity = 1 - cdist([track.reid_feature], [reid_features[i]], metric='cosine')[0][0]
            similarities.append((i, similarity))

        if similarities:
            similarities.sort(key=lambda x: -x[1])  # Sort by similarity in descending order
            best_match = similarities[0][0]
            if similarities[0][1] > 0.5:  # Match threshold
                track.update(detections[best_match], frame)
                unmatched_detections.remove(best_match)
            else:
                track.age += 1

    # Step 4: Add New Tracks for Unmatched Detections
    for i in unmatched_detections:
        tracks[next_track_id] = Track(detections[i], next_track_id, frame)
        unique_people_ids.add(next_track_id)
        next_track_id += 1

    # Step 5: Remove stale tracks
    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    # Step 6: Draw results on frame
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 7: Save the processed frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))


Successfully loaded imagenet pretrained weights from "C:\Users\User/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']

0: 384x640 7 persons, 2 cars, 1 truck, 25.3ms
Speed: 24.0ms preprocess, 25.3ms inference, 20.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 0.0ms
Speed: 0.0ms preprocess, 0.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 9.7ms
Speed: 8.5ms preprocess, 9.7ms inference, 6.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 3.0ms
Speed: 4.0ms preprocess, 3.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 14.3ms
Speed: 0.0ms preprocess, 14.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 9.3ms
Spee

In [25]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from scipy.spatial.distance import cdist
import torchreid  # Import torchreid library for Re-ID

# Load YOLO model for object detection
model = YOLO("yolo11n.pt")

# Load pre-trained Re-ID model from torchreid
reid_model = torchreid.models.build_model(name='osnet_x1_0', num_classes=1, pretrained=True)
reid_model.eval()
reid_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)

if not cap.isOpened():
    print("Error: Could not open input video.")
    exit()

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Colors for drawing
colors = {}
unique_people_ids = set()

# Tracking parameters
max_frames_to_keep = 30

class Track:
    def __init__(self, bbox, track_id, frame):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0
        self.reid_feature = self.compute_reid_feature(frame, bbox)
        self.history = [bbox]

    def compute_reid_feature(self, frame, bbox):
        x, y, w, h = bbox
        person_img = frame[y:y + h, x:x + w]
        if person_img.size > 0:
            person_img = cv2.resize(person_img, (128, 256))
            person_img = cv2.cvtColor(person_img, cv2.COLOR_BGR2RGB)
            person_img = torch.tensor(person_img).permute(2, 0, 1).unsqueeze(0).float() / 255.0
            person_img = person_img.to('cuda' if torch.cuda.is_available() else 'cpu')
            with torch.no_grad():
                feature = reid_model(person_img)
            return feature.cpu().numpy().flatten()
        return None

    def update(self, bbox, frame):
        self.bbox = bbox
        self.reid_feature = self.compute_reid_feature(frame, bbox)
        self.age = 0
        self.history.append(bbox)
        if len(self.history) > 5:
            self.history.pop(0)

# Initialize tracker dictionary
tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret or frame is None:
        break

    # Detection
    results = model(frame)
    detections = []
    reid_features = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))
            reid_features.append(Track((x, y, w, h), None, frame).compute_reid_feature(frame, (x, y, w, h)))

    # Tracking
    unmatched_detections = set(range(len(detections)))
    for track_id, track in list(tracks.items()):
        if len(unmatched_detections) == 0:
            break

        similarities = []
        for i in unmatched_detections:
            similarity = 1 - cdist([track.reid_feature], [reid_features[i]], metric='cosine')[0][0]
            similarities.append((i, similarity))

        if similarities:
            similarities.sort(key=lambda x: -x[1])
            best_match = similarities[0][0]
            if similarities[0][1] > 0.5:
                track.update(detections[best_match], frame)
                unmatched_detections.remove(best_match)
            else:
                track.age += 1

    # New Tracks
    for i in unmatched_detections:
        tracks[next_track_id] = Track(detections[i], next_track_id, frame)
        unique_people_ids.add(next_track_id)
        next_track_id += 1

    # Remove stale tracks
    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    # Draw
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    # Total unique people count
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    # Write frame
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))


Successfully loaded imagenet pretrained weights from "C:\Users\User/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']

0: 384x640 7 persons, 2 cars, 1 truck, 22.6ms
Speed: 9.7ms preprocess, 22.6ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 13.5ms
Speed: 6.8ms preprocess, 13.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.0ms
Speed: 0.0ms preprocess, 8.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 1.7ms
Speed: 5.2ms preprocess, 1.7ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.6ms
Speed: 0.0ms preprocess, 8.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 15.7ms
Speed

In [27]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from scipy.spatial.distance import cdist
import torchreid
from filterpy.kalman import KalmanFilter

# Load YOLO model for object detection
model = YOLO("yolo11n.pt")

# Load pre-trained Re-ID model from torchreid
reid_model = torchreid.models.build_model(name='osnet_x1_0', num_classes=1, pretrained=True)
reid_model.eval()
reid_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)

if not cap.isOpened():
    print("Error: Could not open input video.")
    exit()

fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

colors = {}
unique_people_ids = set()
max_frames_to_keep = 30

class Track:
    def __init__(self, bbox, track_id, frame):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0
        self.reid_features = [self.compute_reid_feature(frame, bbox)]
        self.kf = self.init_kalman_filter(bbox)

    def compute_reid_feature(self, frame, bbox):
        x, y, w, h = bbox
        person_img = frame[y:y + h, x:x + w]
        if person_img.size > 0:
            person_img = cv2.resize(person_img, (128, 256))
            person_img = cv2.cvtColor(person_img, cv2.COLOR_BGR2RGB)
            person_img = torch.tensor(person_img).permute(2, 0, 1).unsqueeze(0).float() / 255.0
            person_img = person_img.to('cuda' if torch.cuda.is_available() else 'cpu')
            with torch.no_grad():
                feature = reid_model(person_img)
            return feature.cpu().numpy().flatten()
        return None

    def init_kalman_filter(self, bbox):
        kf = KalmanFilter(dim_x=7, dim_z=4)
        kf.x[:4] = np.array([bbox[0], bbox[1], bbox[2], bbox[3]]).reshape(-1, 1)  # Reshape to (4, 1)
        kf.F = np.eye(7)
        kf.H = np.eye(4, 7)
        kf.P *= 10.
        kf.R *= 0.01
        kf.Q *= 0.1
        return kf

    def predict(self):
        self.kf.predict()
        self.bbox = self.kf.x[:4].flatten()  # Flatten to get back to (4,)

    def update(self, bbox, frame):
        self.kf.update(np.array(bbox).reshape(-1, 1))  # Reshape bbox to (4, 1)
        self.reid_features.append(self.compute_reid_feature(frame, bbox))
        self.reid_features = self.reid_features[-5:]  # Keep last 5 features
        self.age = 0

tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Detection
    results = model(frame)
    detections = []
    reid_features = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))
            reid_features.append(Track((x, y, w, h), None, frame).compute_reid_feature(frame, (x, y, w, h)))

    unmatched_detections = set(range(len(detections)))
    for track_id, track in list(tracks.items()):
        track.predict()
        if len(unmatched_detections) == 0:
            break

        similarities = []
        for i in unmatched_detections:
            avg_feature = np.mean(track.reid_features, axis=0)
            similarity = 1 - cdist([avg_feature], [reid_features[i]], metric='cosine')[0][0]
            similarities.append((i, similarity))

        if similarities:
            similarities.sort(key=lambda x: -x[1])
            best_match = similarities[0][0]
            if similarities[0][1] > 0.6:  # Adjust the threshold as needed
                track.update(detections[best_match], frame)
                unmatched_detections.remove(best_match)
            else:
                track.age += 1

    for i in unmatched_detections:
        tracks[next_track_id] = Track(detections[i], next_track_id, frame)
        unique_people_ids.add(next_track_id)
        next_track_id += 1

    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    out.write(frame)

cap.release()
out.release()
print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))


Successfully loaded imagenet pretrained weights from "C:\Users\User/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']

0: 384x640 7 persons, 2 cars, 1 truck, 26.9ms
Speed: 8.7ms preprocess, 26.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.6ms
Speed: 8.4ms preprocess, 8.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 6.7ms
Speed: 7.7ms preprocess, 6.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 3.8ms
Speed: 3.6ms preprocess, 3.8ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 2.2ms
Speed: 7.1ms preprocess, 2.2ms inference, 7.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 2.3ms
Speed: 1

In [28]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import torchreid
from filterpy.kalman import KalmanFilter

# Load YOLO model for object detection
model = YOLO("yolo11n.pt")

# Load pre-trained Re-ID model from torchreid
reid_model = torchreid.models.build_model(name='osnet_x1_0', num_classes=1, pretrained=True)
reid_model.eval()
reid_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)

if not cap.isOpened():
    print("Error: Could not open input video.")
    exit()

fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

colors = {}
unique_people_ids = set()
max_frames_to_keep = 30

class Track:
    def __init__(self, bbox, track_id, frame):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0
        self.reid_features = [self.compute_reid_feature(frame, bbox)]
        self.kf = self.init_kalman_filter(bbox)

    def compute_reid_feature(self, frame, bbox):
        x, y, w, h = bbox
        person_img = frame[y:y + h, x:x + w]
        if person_img.size > 0:
            person_img = cv2.resize(person_img, (128, 256))
            person_img = cv2.cvtColor(person_img, cv2.COLOR_BGR2RGB)
            person_img = torch.tensor(person_img).permute(2, 0, 1).unsqueeze(0).float() / 255.0
            person_img = person_img.to('cuda' if torch.cuda.is_available() else 'cpu')
            with torch.no_grad():
                feature = reid_model(person_img)
            return feature.cpu().numpy().flatten()
        return None

    def init_kalman_filter(self, bbox):
        kf = KalmanFilter(dim_x=7, dim_z=4)
        kf.x[:4] = np.array([bbox[0], bbox[1], bbox[2], bbox[3]]).reshape(-1, 1)  # Reshape to (4, 1)
        kf.F = np.eye(7)
        kf.H = np.eye(4, 7)
        kf.P *= 10.
        kf.R *= 0.01
        kf.Q *= 0.1
        return kf

    def predict(self):
        self.kf.predict()
        self.bbox = self.kf.x[:4].flatten()  # Flatten to get back to (4,)

    def update(self, bbox, frame):
        self.kf.update(np.array(bbox).reshape(-1, 1))  # Reshape bbox to (4, 1)
        self.reid_features.append(self.compute_reid_feature(frame, bbox))
        self.reid_features = self.reid_features[-5:]  # Keep last 5 features
        self.age = 0

tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Detection
    results = model(frame)
    detections = []
    reid_features = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))
            reid_features.append(Track((x, y, w, h), None, frame).compute_reid_feature(frame, (x, y, w, h)))

    # Kalman prediction for existing tracks
    for track_id, track in tracks.items():
        track.predict()

    # Hungarian algorithm for matching
    if len(tracks) > 0 and len(detections) > 0:
        cost_matrix = np.ones((len(tracks), len(detections)))
        for i, track in enumerate(tracks.values()):
            avg_feature = np.mean(track.reid_features, axis=0)
            for j, reid_feature in enumerate(reid_features):
                similarity = 1 - cdist([avg_feature], [reid_feature], metric='cosine')[0][0]
                cost_matrix[i, j] = 1 - similarity  # Hungarian algorithm minimizes cost

        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        matched_detections = set()
        for r, c in zip(row_ind, col_ind):
            if cost_matrix[r, c] < 0.4:  # Threshold for matching
                track_id = list(tracks.keys())[r]
                tracks[track_id].update(detections[c], frame)
                matched_detections.add(c)
            else:
                tracks[list(tracks.keys())[r]].age += 1

        unmatched_detections = set(range(len(detections))) - matched_detections
    else:
        unmatched_detections = set(range(len(detections)))

    # Create new tracks for unmatched detections
    for i in unmatched_detections:
        tracks[next_track_id] = Track(detections[i], next_track_id, frame)
        unique_people_ids.add(next_track_id)
        next_track_id += 1

    # Remove stale tracks
    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    # Draw
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    out.write(frame)

cap.release()
out.release()
print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))


Successfully loaded imagenet pretrained weights from "C:\Users\User/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']

0: 384x640 7 persons, 2 cars, 1 truck, 25.8ms
Speed: 3.0ms preprocess, 25.8ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 7.1ms
Speed: 2.4ms preprocess, 7.1ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.2ms
Speed: 0.0ms preprocess, 8.2ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 0.0ms
Speed: 0.6ms preprocess, 0.0ms inference, 7.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 15.9ms
Speed: 0.0ms preprocess, 15.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 0.0ms
Speed:

In [38]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import torchreid
from filterpy.kalman import KalmanFilter

# Load YOLO model for object detection
model = YOLO("yolo11n.pt")

# Load pre-trained Re-ID model from torchreid
reid_model = torchreid.models.build_model(name='osnet_x1_0', num_classes=1, pretrained=True)
reid_model.eval()
reid_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)

if not cap.isOpened():
    print("Error: Could not open input video.")
    exit()

fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

colors = {}
unique_people_ids = set()
max_missed = 10  # Increased for occlusion handling

class Track:
    def __init__(self, bbox, track_id, reid_feature):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0
        self.missed = 0
        self.reid_features = [reid_feature]  # Store multiple features for consistency
        self.reid_feature = reid_feature  # Initial re-ID feature
        self.kf = self.init_kalman_filter(bbox)

    def init_kalman_filter(self, bbox):
        kf = KalmanFilter(dim_x=7, dim_z=4)
        kf.x[:4] = np.array([bbox[0], bbox[1], bbox[2], bbox[3]]).reshape(-1, 1)
        kf.F = np.eye(7)
        kf.H = np.eye(4, 7)
        kf.P *= 10.
        kf.R *= 0.01
        kf.Q *= 0.1
        return kf

    def predict(self):
        self.kf.predict()
        self.bbox = self.kf.x[:4].flatten()

    def update(self, bbox, reid_feature):
        self.kf.update(np.array(bbox).reshape(-1, 1))
        self.reid_features.append(reid_feature)
        # Keep last 5 features to maintain stable ID during occlusions
        self.reid_features = self.reid_features[-5:]
        # Update the average re-ID feature
        self.reid_feature = np.mean(self.reid_features, axis=0)
        self.age = 0
        self.missed = 0

def extract_reid_feature(frame, bbox):
    x, y, w, h = bbox
    person_img = frame[y:y + h, x:x + w]
    if person_img.size > 0:
        person_img = cv2.resize(person_img, (128, 256))
        person_img = cv2.cvtColor(person_img, cv2.COLOR_BGR2RGB)
        person_img = torch.tensor(person_img).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        person_img = person_img.to('cuda' if torch.cuda.is_available() else 'cpu')
        with torch.no_grad():
            feature = reid_model(person_img)
        return feature.cpu().numpy().flatten()
    return None

tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

def iou(bbox1, bbox2):
    x1, y1, w1, h1 = bbox1
    x2, y2, w2, h2 = bbox2
    xi1, yi1 = max(x1, x2), max(y1, y2)
    xi2, yi2 = min(x1 + w1, x2 + w2), min(y1 + h1, y2 + h2)
    inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
    union_area = w1 * h1 + w2 * h2 - inter_area
    return inter_area / union_area if union_area != 0 else 0

# Calculate cost matrix with appearance and IoU similarity
def calculate_cost_matrix(tracks, detections, features):
    cost_matrix = np.ones((len(tracks), len(detections)))
    for i, track in enumerate(tracks.values()):
        for j, (detection, feature) in enumerate(zip(detections, features)):
            similarity = 1 - cdist([track.reid_feature], [feature], metric='cosine')[0][0]
            overlap = iou(track.bbox, detection)
            cost_matrix[i, j] = 1 - (0.7 * similarity + 0.3 * overlap)  # Adjust weights as needed
    return cost_matrix

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Detection
    results = model(frame)
    detections, features = [], []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:  # Filter non-pedestrians
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            if h / w > 1.5:  # Simple ratio to filter out non-human objects
                detections.append((x, y, w, h))
                features.append(extract_reid_feature(frame, (x, y, w, h)))

    # Predict existing track locations
    for track in tracks.values():
        track.predict()

    # Calculate cost matrix and perform matching
    if tracks and detections:
        cost_matrix = calculate_cost_matrix(tracks, detections, features)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        matched, unmatched_dets = set(), set(range(len(detections)))
        for r, c in zip(row_ind, col_ind):
            if cost_matrix[r, c] < 0.5:  # Threshold for matching
                tracks[list(tracks.keys())[r]].update(detections[c], features[c])
                matched.add(c)
            else:
                tracks[list(tracks.keys())[r]].missed += 1

        unmatched_dets -= matched
    else:
        unmatched_dets = set(range(len(detections)))

    # Add new tracks for unmatched detections
    for i in unmatched_dets:
        tracks[next_track_id] = Track(detections[i], next_track_id, features[i])
        unique_people_ids.add(next_track_id)
        next_track_id += 1

    # Remove stale tracks
    tracks = {track_id: track for track_id, track in tracks.items() if track.missed <= max_missed}

    # Draw bounding boxes
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    out.write(frame)

cap.release()
out.release()
print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))


Successfully loaded imagenet pretrained weights from "C:\Users\User/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']

0: 384x640 7 persons, 2 cars, 1 truck, 28.6ms
Speed: 0.0ms preprocess, 28.6ms inference, 3.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 10.0ms
Speed: 0.0ms preprocess, 10.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 0.0ms
Speed: 0.0ms preprocess, 0.0ms inference, 7.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.7ms
Speed: 0.0ms preprocess, 8.7ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 0.0ms
Speed: 0.0ms preprocess, 0.0ms inference, 8.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.3ms
Speed:

In [39]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import torchreid
from filterpy.kalman import KalmanFilter

# Load YOLO model for object detection
model = YOLO("yolo11n.pt")

# Load pre-trained Re-ID model from torchreid
reid_model = torchreid.models.build_model(name='osnet_x1_0', num_classes=1, pretrained=True)
reid_model.eval()
reid_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)

if not cap.isOpened():
    print("Error: Could not open input video.")
    exit()

fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

colors = {}
unique_people_ids = set()
max_missed = 10  # Allowable missed frames before deletion

class Track:
    def __init__(self, bbox, track_id, reid_feature):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0
        self.missed = 0
        self.reid_features = [reid_feature]
        self.reid_feature = reid_feature
        self.kf = self.init_kalman_filter(bbox)

    def init_kalman_filter(self, bbox):
        kf = KalmanFilter(dim_x=7, dim_z=4)
        kf.x[:4] = np.array([bbox[0], bbox[1], bbox[2], bbox[3]]).reshape(-1, 1)
        kf.F = np.eye(7)
        kf.H = np.eye(4, 7)
        kf.P *= 10.
        kf.R *= 0.01
        kf.Q *= 0.1
        return kf

    def predict(self):
        self.kf.predict()
        self.bbox = self.kf.x[:4].flatten()

    def update(self, bbox, reid_feature):
        self.kf.update(np.array(bbox).reshape(-1, 1))
        self.reid_features.append(reid_feature)
        self.reid_features = self.reid_features[-5:]
        self.reid_feature = np.mean(self.reid_features, axis=0)
        self.age = 0
        self.missed = 0

def extract_reid_feature(frame, bbox):
    x, y, w, h = bbox
    person_img = frame[y:y + h, x:x + w]
    if person_img.size > 0:
        person_img = cv2.resize(person_img, (128, 256))
        person_img = cv2.cvtColor(person_img, cv2.COLOR_BGR2RGB)
        person_img = torch.tensor(person_img).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        person_img = person_img.to('cuda' if torch.cuda.is_available() else 'cpu')
        with torch.no_grad():
            feature = reid_model(person_img)
        return feature.cpu().numpy().flatten()
    return None

tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

def iou(bbox1, bbox2):
    x1, y1, w1, h1 = bbox1
    x2, y2, w2, h2 = bbox2
    xi1, yi1 = max(x1, x2), max(y1, y2)
    xi2, yi2 = min(x1 + w1, x2 + w2), min(y1 + h1, y2 + h2)
    inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
    union_area = w1 * h1 + w2 * h2 - inter_area
    return inter_area / union_area if union_area != 0 else 0

def calculate_cost_matrix(tracks, detections, features):
    cost_matrix = np.ones((len(tracks), len(detections)))
    for i, track in enumerate(tracks.values()):
        for j, (detection, feature) in enumerate(zip(detections, features)):
            similarity = 1 - cdist([track.reid_feature], [feature], metric='cosine')[0][0]
            overlap = iou(track.bbox, detection)
            cost_matrix[i, j] = 1 - (0.7 * similarity + 0.3 * overlap)  # Adjust weights as needed
    return cost_matrix

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    results = model(frame)
    detections, features = [], []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            if h / w > 1.5:
                detections.append((x, y, w, h))
                features.append(extract_reid_feature(frame, (x, y, w, h)))

    for track in tracks.values():
        track.predict()

    if tracks and detections:
        cost_matrix = calculate_cost_matrix(tracks, detections, features)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        matched, unmatched_dets = set(), set(range(len(detections)))
        for r, c in zip(row_ind, col_ind):
            if cost_matrix[r, c] < 0.5:
                tracks[list(tracks.keys())[r]].update(detections[c], features[c])
                matched.add(c)
            else:
                tracks[list(tracks.keys())[r]].missed += 1

        unmatched_dets -= matched
    else:
        unmatched_dets = set(range(len(detections)))

    for i in unmatched_dets:
        tracks[next_track_id] = Track(detections[i], next_track_id, features[i])
        unique_people_ids.add(next_track_id)
        next_track_id += 1

    tracks = {track_id: track for track_id, track in tracks.items() if track.missed <= max_missed}

    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    out.write(frame)

cap.release()
out.release()
print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))


Successfully loaded imagenet pretrained weights from "C:\Users\User/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']

0: 384x640 7 persons, 2 cars, 1 truck, 23.9ms
Speed: 4.0ms preprocess, 23.9ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 26.1ms
Speed: 0.0ms preprocess, 26.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 14.4ms
Speed: 0.0ms preprocess, 14.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 0.0ms
Speed: 0.0ms preprocess, 0.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.5ms
Speed: 0.0ms preprocess, 8.5ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 13.1ms
Spe