In [1]:
# YOLOv8 Object Tracking and Video Output to File (Jupyter Cell)

import cv2
import numpy as np
from ultralytics import YOLO
from collections import deque
import os
import uuid

# Initialize YOLOv8 model
# A single module-level model instance is shared by every process_frame() call;
# it is also the source of the class-id -> class-name mapping (model.names).
model = YOLO("yolov8n.pt")

# Constants
# Maximum number of centroid points remembered per track; used as the `maxlen`
# of each track's trail deque, so older points are dropped automatically.
MAX_TRAIL_LENGTH = 30
# Tracker state shared across frames: maps an integer track id to a dict with
# the keys "bbox", "centroid", "class_name", "confidence", "trail", "color"
# and "last_seen". It is mutated in place by process_frame().
tracks = {}

# Ensure output directory exists
# "outputs" is a relative path, so it is created under the current working
# directory; exist_ok=True makes re-running the cell harmless.
os.makedirs("outputs", exist_ok=True)

# Utility functions
def get_color(idx):
    """Return a deterministic colour for a track id.

    The same ``idx`` always yields the same colour, so a track keeps its
    colour for as long as it keeps its id.

    A private ``RandomState`` seeded with ``idx`` is used instead of
    ``np.random.seed(idx)``: the values produced are the same, but NumPy's
    process-wide random state is no longer reset as a side effect of asking
    for a colour.

    Args:
        idx: Non-negative integer seed (the track id).

    Returns:
        A 3-tuple of Python ints, each in the range 0..254 (the upper bound
        of ``randint`` is exclusive).
    """
    rng = np.random.RandomState(idx)
    return tuple(map(int, rng.randint(0, 255, 3)))

def calculate_iou(box1, box2):
    """Compute the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: ``(x1, y1, x2, y2)`` with ``(x1, y1)`` the top-left and
            ``(x2, y2)`` the bottom-right corner.
        box2: Second box, same layout as ``box1``.

    Returns:
        A float in ``[0.0, 1.0]``. Boxes that are disjoint, that only touch
        along an edge or corner, or that are degenerate (zero area) yield
        ``0.0``.
    """
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2

    # Width/height of the overlap rectangle; non-positive means no overlap.
    overlap_w = min(x2_1, x2_2) - max(x1_1, x1_2)
    overlap_h = min(y2_1, y2_2) - max(y1_1, y1_2)
    if overlap_w <= 0 or overlap_h <= 0:
        # Always a float, so callers never see int 0 on one path and a float
        # on another.
        return 0.0

    intersection = overlap_w * overlap_h
    area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
    area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
    union = area1 + area2 - intersection
    # A positive overlap implies a positive union; the guard is defensive.
    return float(intersection / union) if union > 0 else 0.0

# Frame processing
def _extract_detections(results):
    """Turn one YOLO result object into a list of plain detection dicts."""
    detections = []
    for box in results.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        cls_id = int(box.cls[0])
        detections.append({
            "bbox": (x1, y1, x2, y2),
            "centroid": ((x1 + x2) // 2, (y1 + y2) // 2),
            "class_name": model.names[cls_id],
            "confidence": float(box.conf[0]),
        })
    return detections


def _new_track(det, track_id, frame_count):
    """Build the state dict for a track that starts at detection ``det``."""
    return {
        "bbox": det["bbox"],
        "centroid": det["centroid"],
        "class_name": det["class_name"],
        "confidence": det["confidence"],
        "trail": deque([det["centroid"]], maxlen=MAX_TRAIL_LENGTH),
        "color": get_color(track_id),
        "last_seen": frame_count,
    }


def _match_tracks(detections, frame_count, max_age, iou_threshold):
    """Greedily update live tracks from detections; return matched indices."""
    matched_detections = set()
    for track in tracks.values():
        if frame_count - track["last_seen"] > max_age:
            continue  # track has been lost for too long to be revived
        best_iou = iou_threshold
        best_det = -1
        for i, det in enumerate(detections):
            if i in matched_detections:
                continue  # each detection feeds at most one track
            iou = calculate_iou(track["bbox"], det["bbox"])
            if iou > best_iou:
                best_iou = iou
                best_det = i
        if best_det >= 0:
            det = detections[best_det]
            track.update({
                "bbox": det["bbox"],
                "centroid": det["centroid"],
                "class_name": det["class_name"],
                "confidence": det["confidence"],
                "last_seen": frame_count,
            })
            track["trail"].append(det["centroid"])
            matched_detections.add(best_det)
    return matched_detections


def _draw_tracks(frame, frame_count, max_age):
    """Draw box, label, centroid and fading trail of every live track."""
    for track_id, track in tracks.items():
        if frame_count - track["last_seen"] > max_age:
            continue
        x1, y1, x2, y2 = track["bbox"]
        color = track["color"]
        label = f"{track['class_name']} #{track_id} {track['confidence']:.2f}"
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
        # Clamp the text baseline so labels of boxes touching the top edge
        # stay inside the image instead of being drawn off-frame.
        label_y = max(y1 - 10, 15)
        cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
        cv2.circle(frame, track["centroid"], 4, color, -1)
        trail = list(track["trail"])
        for i in range(1, len(trail)):
            # Older segments are blended further towards white (alpha starts
            # near 0.3); the newest segments approach the track colour.
            alpha = 0.3 + 0.7 * (i / len(trail))
            trail_color = tuple(int(c * alpha + 255 * (1 - alpha)) for c in color)
            cv2.line(frame, trail[i-1], trail[i], trail_color, 2)


def process_frame(frame, frame_count, max_age=30, iou_threshold=0.3):
    """Detect objects in ``frame``, update the tracker and draw the result.

    Detections from the module-level ``model`` are matched greedily to the
    tracks stored in the module-level ``tracks`` dict by bounding-box IoU.
    Unmatched detections start new tracks with the next free integer id.
    Tracks are never removed from ``tracks``; ones not seen for more than
    ``max_age`` frames are merely skipped for matching and drawing.

    Args:
        frame: Image passed to the model and drawn on in place with OpenCV.
        frame_count: Zero-based index of the frame within the current video.
            A value of 0 resets the tracker, so tracks left over from a
            previously processed video (whose ``last_seen`` would make their
            age negative and therefore look "live") are not matched or drawn.
        max_age: Number of frames a track may go unseen before it is ignored.
        iou_threshold: Minimum IoU (exclusive) for a detection to continue an
            existing track.

    Returns:
        The same ``frame`` object, annotated.
    """
    if frame_count == 0:
        # New video: drop stale state. With an empty tracker the general path
        # below assigns ids 1..n, exactly like a dedicated first-frame branch.
        tracks.clear()

    detections = _extract_detections(model(frame)[0])
    matched_detections = _match_tracks(detections, frame_count, max_age, iou_threshold)

    next_id = max(tracks) + 1 if tracks else 1
    for i, det in enumerate(detections):
        if i not in matched_detections:
            tracks[next_id] = _new_track(det, next_id, frame_count)
            next_id += 1

    _draw_tracks(frame, frame_count, max_age)
    return frame

# Main function
def process_video(input_path):
    """Run detection and tracking over a video and write an annotated copy.

    Every frame of ``input_path`` is passed through ``process_frame`` and
    written to a uniquely named ``outputs/processed_<hex>.mp4`` file encoded
    with the ``mp4v`` FourCC at the input's resolution and frame rate.

    Args:
        input_path: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to process.

    Returns:
        The path of the written output video.

    Raises:
        OSError: If the input cannot be opened or the output writer cannot
            be created.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        # Without this check a bad path silently produced an empty output
        # file followed by a success message.
        raise OSError(f"Could not open input video: {input_path}")

    out = None
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Some sources report 0 or NaN; fall back to a sane frame rate so
            # the writer can still be created.
            fps = 30.0

        output_path = f"outputs/processed_{uuid.uuid4().hex}.mp4"
        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
        if not out.isOpened():
            raise OSError(f"Could not open output video for writing: {output_path}")

        # Start every video with a clean tracker so ids and trails from a
        # previous run do not leak into this one.
        tracks.clear()

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            processed = process_frame(frame, frame_count)
            out.write(processed)
            frame_count += 1
    finally:
        # Release both handles even if detection or writing raised, so the
        # input is not left locked and the output container is finalised.
        cap.release()
        if out is not None:
            out.release()

    print(f"✅ Video saved to: {output_path}")
    return output_path

# Usage example: set `input_video` to the path of the video you want to
# process. The call below runs as soon as this cell/file is executed and
# writes the annotated result into the "outputs" directory.
input_video = r"D:\projects\Object_Tracking\macv-obj-tracking-video.mp4"  # raw string keeps the Windows backslashes literal
process_video(input_video)



0: 384x640 8 persons, 1 chair, 69.6ms
Speed: 5.2ms preprocess, 69.6ms inference, 226.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 chair, 17.7ms
Speed: 3.8ms preprocess, 17.7ms inference, 5.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 handbag, 1 chair, 8.4ms
Speed: 2.0ms preprocess, 8.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 handbag, 1 chair, 6.4ms
Speed: 1.3ms preprocess, 6.4ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 handbag, 1 chair, 8.8ms
Speed: 1.6ms preprocess, 8.8ms inference, 3.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 chair, 8.2ms
Speed: 1.6ms preprocess, 8.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 handbags, 1 chair, 6.2ms
Speed: 1.3ms preprocess, 6.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640