In [None]:
!pip install supervision
!pip install ultralytics

In [None]:
from ultralytics import YOLO
import cv2
from matplotlib import pyplot as plt
import numpy as np
from PIL import Image
import time
import supervision as sv

3.1.1 Draw the segmentation mask of the largest car: Use ultralytics library only

In [None]:
!git clone https://github.com/ashanW004/ETM4272.git

In [None]:
# cv2.imread returns None (it does not raise) when the file is missing or cannot
# be decoded, so fail here with a clear message rather than in a later cell.
CARS_IMAGE_PATH = "/content/ETM4272/demo5_images/cars1.jpg"
img = cv2.imread(CARS_IMAGE_PATH)
if img is None:
    raise FileNotFoundError(f"Could not read image: {CARS_IMAGE_PATH}")

In [None]:
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO

# Load YOLOv8 segmentation model
model = YOLO('yolov8x-seg.pt')  # Using the largest segmentation model

# Load the image. cv2.imread returns None (it does not raise) on failure.
cars_image_path = "/content/ETM4272/demo5_images/cars1.jpg"
img = cv2.imread(cars_image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {cars_image_path}")

# RGB copy used only for drawing/display with matplotlib.
image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Run segmentation on the BGR array: Ultralytics treats numpy input as BGR,
# so passing the RGB copy would feed the model channel-swapped colours.
results = model(img)
result = results[0]

# Find the largest *car* based on bounding box area.
largest_idx = None
max_area = 0.0
masks = []
boxes = []

# result.masks is None when the model segmented nothing at all.
if result.masks is not None:
    masks = result.masks.xy  # List of polygons, one (N, 2) array per detection
    boxes = result.boxes.xyxy.cpu().numpy()  # Bounding boxes as (x1, y1, x2, y2)
    class_ids = result.boxes.cls.cpu().numpy().astype(int)

    for i, (box, class_id) in enumerate(zip(boxes, class_ids)):
        # Ignore everything that is not a car (trucks, buses, people, ...).
        if model.names[int(class_id)] != "car":
            continue
        # A detection with an empty polygon cannot be drawn; skip it.
        if len(masks[i]) == 0:
            continue
        x1, y1, x2, y2 = box
        area = float((x2 - x1) * (y2 - y1))
        if area > max_area:
            max_area = area
            largest_idx = i

# Draw the mask for the largest car
if largest_idx is not None:
    mask = np.array(masks[largest_idx], dtype=np.int32)

    # Draw filled polygon mask on a blank image
    mask_img = np.zeros_like(image)
    cv2.fillPoly(mask_img, [mask], (255, 0, 0))  # Red, because `image` is RGB

    # Overlay mask on the original image
    overlaid = cv2.addWeighted(image, 0.7, mask_img, 0.3, 0)

    # Display the image with the largest car's mask
    plt.figure(figsize=(10, 6))
    plt.imshow(overlaid)
    plt.axis("off")
    plt.show()
else:
    print("No car detected!")


3.2 Estimate distance to centroid of chair

In [None]:
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO
import supervision as sv
from transformers import AutoProcessor
from transformers import AutoModelForDepthEstimation

# Load YOLOv8 model for object detection
model = YOLO('yolov8x.pt')  # YOLOv8x for high accuracy

# Load the image. cv2.imread returns None (it does not raise) on failure.
image_path = "/content/ETM4272/demo5_images/kingChair.jpg"
image_bgr = cv2.imread(image_path)
if image_bgr is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Perform object detection on the BGR array (Ultralytics treats numpy input as BGR).
results = model(image_bgr)

# Annotate a copy so the clean pixels remain available for depth estimation.
# The copy stays in BGR, which is what cv2 drawing colours and cv2.imwrite expect.
image = image_bgr.copy()

# Centroid of the most confident chair (None until a chair is found).
centroid_x, centroid_y = None, None
best_confidence = -1.0

# Extract bounding boxes of chairs
for result in results:
    for box in result.boxes:
        cls = int(box.cls[0])  # Class ID
        if model.names[cls] != "chair":
            continue

        x1, y1, x2, y2 = map(int, box.xyxy[0])  # Bounding box corners
        box_cx = (x1 + x2) // 2
        box_cy = (y1 + y2) // 2
        print(f"Chair detected at: ({x1}, {y1}), ({x2}, {y2})")
        print(f"Centroid: ({box_cx}, {box_cy})")

        # Draw bounding box (green) and centroid (red); colours are BGR.
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.circle(image, (box_cx, box_cy), 5, (0, 0, 255), -1)

        # When several chairs are found, query depth at the most confident one
        # instead of whichever happened to be iterated last.
        confidence = float(box.conf[0])
        if confidence > best_confidence:
            best_confidence = confidence
            centroid_x, centroid_y = box_cx, box_cy

# Save the annotated image (BGR, as cv2.imwrite expects)
detected_image_path = "detected_chair.jpg"
cv2.imwrite(detected_image_path, image)

# Show detected image (single BGR -> RGB conversion for matplotlib)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.show()

# Check if centroid was found
if centroid_x is None or centroid_y is None:
    print("No chair detected!")
else:
    # Load the Depth Anything (small) depth-estimation checkpoint
    processor = AutoProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
    depth_model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")

    # Build the PIL image from the same pixels the detector saw, so the depth
    # map and the detection coordinates share one geometry.
    image_pil = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))

    # Get depth map
    inputs = processor(image_pil, return_tensors="pt")
    with torch.no_grad():
        depth_output = depth_model(**inputs).predicted_depth

    # predicted_depth is at the processor's resized resolution, not the image's.
    # Resample it to the original height/width so that the centroid's pixel
    # coordinates index the correct location.
    depth_resized = torch.nn.functional.interpolate(
        depth_output.unsqueeze(1),
        size=image_pil.size[::-1],  # PIL size is (width, height)
        mode="bicubic",
        align_corners=False,
    )

    # Convert depth map to NumPy array
    depth_map = depth_resized.squeeze().cpu().numpy()

    # Show depth map
    plt.imshow(depth_map, cmap="plasma")
    plt.colorbar(label="Depth Value")
    plt.scatter(centroid_x, centroid_y, color="red", marker="x", label="Centroid")
    plt.legend()
    plt.axis("off")
    plt.show()

    # Query depth value at centroid
    depth_value = depth_map[centroid_y, centroid_x]
    print(f"Depth at centroid ({centroid_x}, {centroid_y}): {depth_value}")
    # NOTE(review): this checkpoint is documented as a relative-depth model, so
    # the value is not a metric distance. A metric depth model or a known
    # reference distance is needed to report meters - confirm before relying on it.
    print(f"Estimated relative depth at chair centroid: {depth_value:.2f} (unitless, not meters)")


3.3.1 Draw bounding boxes around the "cars" using supervision.

In [None]:
import gdown

# Google Drive location of the input video and the local file name to save it under.
url = "https://drive.google.com/uc?id=1zcKvnDDEdyFF4B0B3eYud6DHU19nl0o4"
output = "vehicles_video.mp4"

# Download with progress output enabled.
gdown.download(url=url, output=output, quiet=False)

In [None]:
import numpy as np
import supervision as sv
from ultralytics import YOLO

# Load YOLO model
model = YOLO("yolo11n.pt")  # Ensure the model file is present

# Class index for "car", looked up from the model's own label map.
CAR_CLASS_ID = next(class_id for class_id, name in model.names.items() if name == "car")

# Define a BoxAnnotator instance (for drawing bounding boxes)
box_annotator = sv.BoxAnnotator()

def callback(frame: np.ndarray, _: int) -> np.ndarray:
    """Detect cars in one video frame and return a copy with their boxes drawn.

    Args:
        frame: The current video frame supplied by ``sv.process_video``.
        _: Index of the frame; unused.

    Returns:
        A new array containing the frame with one bounding box per detected car.
    """
    # verbose=False stops Ultralytics printing a log line for every frame.
    results = model(frame, verbose=False)

    # Convert results to Supervision format
    detections = sv.Detections.from_ultralytics(results[0])

    # Keep only cars; the model also reports people, trucks, buses, etc.
    detections = detections[detections.class_id == CAR_CLASS_ID]

    # Annotate a copy so the caller's frame is not modified in place.
    return box_annotator.annotate(scene=frame.copy(), detections=detections)

# Process the video
sv.process_video(
    source_path="vehicles_video.mp4",
    target_path="result_tracking.mp4",
    callback=callback
)

print("Processing complete. Check 'result_tracking.mp4'")


3.3.1 Track the cars. Add the tracker id to the bounding boxes around the cars. Use supervision

In [None]:
import numpy as np
import supervision as sv
from ultralytics import YOLO

# Load YOLO model
model = YOLO("yolov8n.pt")  # Ensure this model exists

# Create a ByteTrack tracker
tracker = sv.ByteTrack()

# Define the BoxAnnotator
box_annotator = sv.BoxAnnotator()

def callback(frame: np.ndarray, _: int) -> np.ndarray:
    # Run YOLO on the frame
    results = model(frame)

    # Convert YOLO results to Supervision detections
    detections = sv.Detections.from_ultralytics(results[0])

    # Update tracker with new detections
    detections = tracker.update_with_detections(detections)

    # Generate labels with tracker ID
    labels = [f"ID {round(tracker_id)}" for tracker_id in detections.tracker_id]

    # Annotate frame with bounding boxes and tracker IDs
    annotated_frame = box_annotator.annotate(scene=frame, detections=detections)

    # Add labels manually using Supervision's LabelAnnotator
    label_annotator = sv.LabelAnnotator()
    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)

    return annotated_frame

# Process the video
sv.process_video(
    source_path="vehicles_video.mp4",
    target_path="tracked_cars.mp4",
    callback=callback
)

print("✅ Processing complete! Check 'tracked_cars.mp4'")


3.3.2 Draw the track traces for the tracked cars. Use supervision

In [None]:
import numpy as np
import supervision as sv
from ultralytics import YOLO
import cv2  # OpenCV for drawing traces

# Load YOLO model
model = YOLO("yolov8n.pt")  # Ensure this model exists

# Class index for "car", looked up from the model's own label map.
CAR_CLASS_ID = next(class_id for class_id, name in model.names.items() if name == "car")

# Number of past positions kept (and drawn) per tracked car.
TRACE_LENGTH = 60

# Create a ByteTrack tracker
tracker = sv.ByteTrack()

# Annotators are created once here rather than once per frame inside the callback.
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

# TraceAnnotator keeps a bounded per-tracker history of box centres itself, so no
# hand-maintained path dictionary (which would grow for the whole video) is needed.
trace_annotator = sv.TraceAnnotator(
    position=sv.Position.CENTER,
    trace_length=TRACE_LENGTH,
    thickness=2,
)

def callback(frame: np.ndarray, _: int) -> np.ndarray:
    """Detect and track cars in one frame; return a copy with traces, boxes and IDs.

    Args:
        frame: The current video frame supplied by ``sv.process_video``.
        _: Index of the frame; unused.

    Returns:
        A new array containing the frame with each tracked car's recent path,
        bounding box and "ID <n>" label drawn on it.
    """
    # verbose=False stops Ultralytics printing a log line for every frame.
    results = model(frame, verbose=False)

    # Convert YOLO results to Supervision detections
    detections = sv.Detections.from_ultralytics(results[0])

    # Keep only cars before tracking so other classes never receive a track ID.
    detections = detections[detections.class_id == CAR_CLASS_ID]

    # Update tracker with new detections
    detections = tracker.update_with_detections(detections)

    # One label per tracked detection, in the same order as the detections.
    labels = [f"ID {int(tracker_id)}" for tracker_id in detections.tracker_id]

    # Draw on a copy so the caller's frame is not modified in place:
    # traces first, then boxes and labels on top of them.
    annotated_frame = trace_annotator.annotate(scene=frame.copy(), detections=detections)
    annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
    annotated_frame = label_annotator.annotate(
        scene=annotated_frame, detections=detections, labels=labels
    )

    return annotated_frame

# Process the video
sv.process_video(
    source_path="vehicles_video.mp4",
    target_path="tracked_cars_with_traces.mp4",
    callback=callback
)

print("✅ Processing complete! Check 'tracked_cars_with_traces.mp4'")


Questions:

Explain what "def callback(frame: np.ndarray, _: int) -> np.ndarray:" does.


---


The definition `def callback(frame: np.ndarray, _: int) -> np.ndarray:` declares a callback function that takes a NumPy array (here, a single video frame) and returns a NumPy array (the processed frame). Here's a breakdown:

**Key Components:**

Parameters:

frame: np.ndarray:
A numpy array representing input data (e.g., an image or video frame).

`_: int`:
An integer argument. Naming a parameter `_` is a Python convention indicating that the value is intentionally ignored/unused by the function.

Return Type:

-> np.ndarray:
The function returns a processed/modified numpy array (e.g., a filtered or transformed version of frame).

Purpose:
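This function is not called directly. It is passed to another routine that invokes it repeatedly; in this notebook that routine is `sv.process_video`, which calls the callback once for each frame of the source video and writes every returned frame to the target video.

The second parameter is the index of the current frame. `sv.process_video` always supplies it, so the callback must accept it to match the expected signature, even though the callbacks in this notebook do not use it in their logic.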

**Completed section**