In [15]:
import cv2
import numpy as np
from scipy.optimize import linear_sum_assignment
from ultralytics import YOLO

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Tracking variables
frame_id = 0
tracks = {}
id_counter = 0
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def compute_iou(bbox1, bbox2):
    x1, y1, w1, h1 = bbox1
    x2, y2, w2, h2 = bbox2
    xi1, yi1 = max(x1, x2), max(y1, y2)
    xi2, yi2 = min(x1 + w1, x2 + w2), min(y1 + h1, y2 + h2)
    inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
    bbox1_area = w1 * h1
    bbox2_area = w2 * h2
    union_area = bbox1_area + bbox2_area - inter_area
    return inter_area / union_area if union_area != 0 else 0

def compute_cost_matrix(detections, previous_detections):
    cost_matrix = np.zeros((len(previous_detections), len(detections)))
    for i, prev_det in enumerate(previous_detections):
        for j, det in enumerate(detections):
            cost_matrix[i, j] = 1 - compute_iou(prev_det['bbox'], det['bbox'])
    return cost_matrix

def update_tracks(tracks, detections, previous_detections):
    global id_counter
    active_ids = set()

    if len(previous_detections) == 0:
        for det in detections:
            det_id = id_counter
            tracks[det_id] = {'bbox': det['bbox'], 'color': generate_color(det_id)}
            det['id'] = det_id
            unique_people_ids.add(det_id)  # Add new ID to unique set
            active_ids.add(det_id)
            id_counter += 1
    else:
        cost_matrix = compute_cost_matrix(detections, previous_detections)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        assigned = set()

        for r, c in zip(row_ind, col_ind):
            if cost_matrix[r, c] < 0.5:  # Match only if cost (1 - IoU) is below threshold
                prev_det = previous_detections[r]
                det = detections[c]
                det_id = prev_det['id']
                tracks[det_id]['bbox'] = det['bbox']
                det['id'] = det_id
                active_ids.add(det_id)
                assigned.add(c)
        
        for i, det in enumerate(detections):
            if i not in assigned:
                det_id = id_counter
                tracks[det_id] = {'bbox': det['bbox'], 'color': generate_color(det_id)}
                det['id'] = det_id
                unique_people_ids.add(det_id)  # Add new ID to unique set
                active_ids.add(det_id)
                id_counter += 1

    return active_ids

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

previous_detections = []

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append({
                'bbox': (x, y, w, h),
                'confidence': confidence
            })
    
    # Step 3: Update tracks and get current frame IDs
    current_frame_ids = update_tracks(tracks, detections, previous_detections)
    previous_detections = detections

    # Step 4: Draw results on frame
    for det in detections:
        if 'id' in det:
            draw_bounding_box(frame, det['bbox'], tracks[det['id']]['color'], det['id'])
    
    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 5: Save the processed frame to the output video
    out.write(frame)
    frame_id += 1

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 25.0ms
Speed: 3.0ms preprocess, 25.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 23.7ms
Speed: 4.5ms preprocess, 23.7ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 24.2ms
Speed: 4.0ms preprocess, 24.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 24.0ms
Speed: 5.0ms preprocess, 24.0ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 22.1ms
Speed: 4.0ms preprocess, 22.1ms inference, 3.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.5ms
Speed: 4.5ms preprocess, 21.5ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 21.1ms
Speed: 2.1ms preprocess, 21.1ms inference, 2.0ms postprocess per image at shape (1, 3, 38

In [19]:
import cv2
import numpy as np
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Initialize Deep SORT tracker
deepsort = DeepSort(max_age=30, n_init=3, nn_budget=70)

# Colors for drawing
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

def generate_color(id):
    # Ensure the id is converted to an integer, handling non-integer inputs gracefully
    try:
        id = int(id)  # Convert id to integer if possible
    except ValueError:
        id = 0  # Fallback to a default seed in case of a non-integer id
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            # Format as [[x, y, w, h], confidence]
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append([[x, y, w, h], confidence])

    # Step 3: Update Deep SORT tracker with detections
    tracks = deepsort.update_tracks(detections, frame=frame)
    
    # Step 4: Draw results on frame
    for track in tracks:
        if not track.is_confirmed():
            continue
        track_id = track.track_id
        unique_people_ids.add(track_id)
        bbox = track.to_ltrb()  # Convert to [x1, y1, x2, y2] format
        x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2] - bbox[0]), int(bbox[3] - bbox[1])
        
        # Draw bounding box and ID
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, (x, y, w, h), colors[track_id], track_id)

    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 5: Save the processed frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 24.1ms
Speed: 4.0ms preprocess, 24.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.0ms
Speed: 3.0ms preprocess, 20.0ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.5ms
Speed: 2.0ms preprocess, 20.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 15.2ms
Speed: 1.5ms preprocess, 15.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 10.0ms
Speed: 4.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 6

In [25]:
import cv2
import numpy as np
from ultralytics import YOLO
from scipy.spatial.distance import cdist

# Load YOLO model
model = YOLO("yolo11n.pt")

# Set up video capture and output
input_video = 'easy_9.mp4'
output_video = 'output_video.mp4'
cap = cv2.VideoCapture(input_video)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# Colors for drawing
colors = {}
unique_people_ids = set()  # Set to keep track of unique person IDs seen in the video

# Tracking parameters
max_distance = 50  # Maximum distance to consider a match between detections
max_frames_to_keep = 30  # Keep track up to this many frames without a match

class Track:
    def __init__(self, bbox, track_id):
        self.bbox = bbox
        self.track_id = track_id
        self.age = 0  # Frames since last update

# Initialize tracker dictionary
tracks = {}
next_track_id = 0

def generate_color(id):
    np.random.seed(id)
    return tuple(np.random.randint(0, 255, 3).tolist())

def draw_bounding_box(frame, bbox, color, det_id):
    x, y, w, h = map(int, bbox)
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    cv2.putText(frame, f'ID: {det_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Step 1: Perform detection
    results = model(frame)

    # Step 2: Extract bounding boxes for people (class index 0)
    detections = []
    for det in results[0].boxes.data.cpu().numpy():
        x1, y1, x2, y2, confidence, class_id = det
        if int(class_id) == 0:
            # Format as [[x, y, w, h], confidence]
            x, y, w, h = int(x1), int(y1), int(x2 - x1), int(y2 - y1)
            detections.append((x, y, w, h))

    # Step 3: Calculate distance between current detections and existing tracks
    track_bboxes = np.array([track.bbox for track in tracks.values()])
    detected_bboxes = np.array(detections)
    
    if track_bboxes.size > 0 and detected_bboxes.size > 0:
        distances = cdist(track_bboxes[:, :2], detected_bboxes[:, :2])  # Compute distance between track and detection centers
        row_indices, col_indices = np.where(distances < max_distance)

        # Step 4: Update matched tracks
        matched_detections = set()
        for row, col in zip(row_indices, col_indices):
            track_id = list(tracks.keys())[row]
            bbox = detections[col]
            tracks[track_id].bbox = bbox
            tracks[track_id].age = 0
            matched_detections.add(col)

        # Step 5: Handle unmatched detections
        unmatched_detections = set(range(len(detections))) - matched_detections
        for i in unmatched_detections:
            tracks[next_track_id] = Track(detections[i], next_track_id)
            unique_people_ids.add(next_track_id)
            next_track_id += 1

    else:
        # Add all detections as new tracks if no previous tracks exist
        for i, bbox in enumerate(detections):
            tracks[next_track_id] = Track(bbox, next_track_id)
            unique_people_ids.add(next_track_id)
            next_track_id += 1

    # Step 6: Remove stale tracks
    to_remove = [track_id for track_id, track in tracks.items() if track.age > max_frames_to_keep]
    for track_id in to_remove:
        del tracks[track_id]

    # Step 7: Draw results on frame
    for track_id, track in tracks.items():
        if track_id not in colors:
            colors[track_id] = generate_color(track_id)
        draw_bounding_box(frame, track.bbox, colors[track_id], track_id)
        track.age += 1

    # Display the total count of unique people seen in the video
    cv2.putText(frame, f'Total Unique People: {len(unique_people_ids)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    # Step 8: Save the processed frame to the output video
    out.write(frame)

# Release resources
cap.release()
out.release()

print("Tracking complete. Video saved to", output_video)
print("Total unique people seen in video:", len(unique_people_ids))



0: 384x640 7 persons, 2 cars, 1 truck, 25.5ms
Speed: 3.3ms preprocess, 25.5ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 25.6ms
Speed: 3.1ms preprocess, 25.6ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.3ms
Speed: 4.0ms preprocess, 20.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 19.8ms
Speed: 3.5ms preprocess, 19.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.2ms
Speed: 4.0ms preprocess, 20.2ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.0ms
Speed: 3.1ms preprocess, 20.0ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 cars, 1 truck, 20.5ms
Speed: 4.9ms preprocess, 20.5ms inference, 2.1ms postprocess per image at shape (1, 3, 38