# Imports

In [1]:
from ultralytics import YOLO
import matplotlib.pyplot as plt
from PIL import Image
from matplotlib.patches import Polygon
import numpy as np
import cv2
from time import time
import torch
import supervision as sv
from depth.depth_anything_v2.dpt import DepthAnythingV2
from matplotlib.colors import Normalize

xFormers not available
xFormers not available


# Device

In [2]:
# Prefer CUDA when PyTorch reports it as available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report which device was picked
if device.type == 'cuda':
    print("GPU is available. Using GPU.")
else:
    print("GPU is not available. Using CPU.")

GPU is available. Using GPU.


# Object Detection Model

In [3]:
# Build a YOLOv9c segmentation model from the pretrained weights file
model = YOLO("yolov9c-seg.pt")
# ByteTrack tracker provided by the `supervision` package
tracker = sv.ByteTrack()

# Move the model to the selected device. As the last expression of the notebook
# cell, the returned object is echoed as the cell output.
model.to(device)

YOLO(
  (model): SegmentationModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): RepNCSPELAN4(
        (cv1): Conv(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Sequential(
          (0): RepCSP(
            (cv1): Conv(
              (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
           

In [4]:

def calculate_head_and_leg_points(polygon_points, threshold=0.12):
    """
    Estimate a head point and a leg point from a polygon outline.

    The points with the smallest Y values are averaged to get the head's X
    coordinate and the points with the largest Y values are averaged to get
    the leg's X coordinate. The Y coordinates of the results are then pinned
    to the polygon's minimum and maximum Y respectively.

    Parameters:
    polygon_points (array-like): Sequence of (x, y) polygon points. Values are
        converted to int32, so fractional coordinates are truncated.
    threshold (float): Fraction of the points to sample at each Y extreme
        (default is 0.12). At least one point is always sampled.

    Returns:
    tuple: (head_avg, leg_avg), each a NumPy integer array [x, y].

    Raises:
    ValueError: If polygon_points contains no points.
    """
    poly = np.array(polygon_points, dtype=np.int32)
    if poly.size == 0:
        # Fail early with a clear message instead of an obscure reduction error.
        raise ValueError("polygon_points must contain at least one (x, y) point")

    # Extract Y-coordinates
    y_coords = poly[:, 1]

    # Number of points sampled at each extreme; never fewer than one
    sample_count = max(int(threshold * len(y_coords)), 1)

    # Sort once and take both ends of the ordering
    order = np.argsort(y_coords)
    head_points = poly[order[:sample_count]]
    leg_points = poly[order[-sample_count:]]

    # Average the sampled points; astype(int) truncates the fractional part
    head_avg = np.mean(head_points, axis=0).astype(int)
    leg_avg = np.mean(leg_points, axis=0).astype(int)

    # Only the averaged X is kept; Y is snapped to the polygon's extremes
    leg_avg[1] = np.max(y_coords)
    head_avg[1] = np.min(y_coords)

    return head_avg, leg_avg

In [5]:
def process_detection_results(model, frame):
    """
    Run the detector on one frame and derive head/leg points per detection.

    Parameters:
    model: Callable detector; invoked as model(image, classes=[0], conf=0.45)
        and expected to return a sequence whose first item exposes `.boxes`
        and `.masks`.
    frame: Image passed to cv2.cvtColor with COLOR_BGR2RGB before inference.

    Returns:
    tuple: (results, legs_and_heads, boxes_list) where `results` is the first
        item returned by the model, `legs_and_heads` maps the detection index
        to its (head, leg) points and `boxes_list` holds one xyxy list per
        detection in the same order.
    """
    # NOTE(review): Ultralytics documents NumPy image inputs as BGR; confirm
    # that converting to RGB before inference is intended.
    rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Keep only class id 0 (presumably "person" for COCO weights - confirm)
    # and discard detections below 0.45 confidence.
    detection = model(rgb_image, classes=[0], conf=0.45)[0]
    det_masks = detection.masks
    det_boxes = detection.boxes

    boxes_list = []
    legs_and_heads = {}

    for det_idx, det_box in enumerate(det_boxes):
        # Box corners as a plain Python list
        boxes_list.append(det_box.cpu().xyxy.tolist()[0])

        # Outline of the mask that belongs to this detection
        outline = det_masks[det_idx].xy[0]
        legs_and_heads[det_idx] = calculate_head_and_leg_points(outline)

    return detection, legs_and_heads, boxes_list

In [6]:
def get_IOU(bbox1, bbox2):
    """
    Calculate the Intersection over Union (IoU) between two bounding boxes.

    Args:
    - bbox1: Coordinates of the first bounding box in the format [x1, y1, x2, y2].
    - bbox2: Coordinates of the second bounding box in the format [x1, y1, x2, y2].

    Returns:
    - iou: Intersection over Union (IoU) score between the two bounding boxes.
      0.0 is returned when the union has no positive area (for example when
      both boxes are degenerate), instead of dividing by zero.
    """
    x1, y1, x2, y2 = bbox1
    X1, Y1, X2, Y2 = bbox2

    # Overlap extent along each axis, clamped to zero for disjoint boxes
    inter_w = max(0, min(x2, X2) - max(x1, X1))
    inter_h = max(0, min(y2, Y2) - max(y1, Y1))
    interArea = inter_w * inter_h

    # Calculate areas of bounding boxes
    bbox1_area = (x2 - x1) * (y2 - y1)
    bbox2_area = (X2 - X1) * (Y2 - Y1)

    union = bbox1_area + bbox2_area - interArea
    if union <= 0:
        # Nothing to normalise by; treat as no overlap
        return 0.0

    return interArea / union

In [7]:
def match(xyxy, boxes):
    """
    Return the index of the box in `boxes` with the highest IoU against `xyxy`.

    Only a strictly larger IoU replaces the current winner, so ties keep the
    earliest box. Index 0 is returned when `boxes` is empty or when no box has
    an IoU above zero.
    """
    winner, top_score = 0, 0.0

    for position, candidate in enumerate(boxes):
        score = get_IOU(candidate, xyxy)
        if score > top_score:
            winner, top_score = position, score

    return winner


# Depth estimation model

In [5]:
def load_depth_model(device, encoder='vitl', load_from='depth/assets/depth_anything_v2_metric_vkitti_vitl.pth', max_depth=30):
    """
    Build a DepthAnythingV2 model, load its checkpoint and put it in eval mode.

    Parameters:
    device: Target device; also used as `map_location` when reading the checkpoint.
    encoder (str): Key into the configuration table below
        ('vits', 'vitb', 'vitl' or 'vitg'). An unknown key raises KeyError.
    load_from (str): Path of the state-dict checkpoint file.
    max_depth: Forwarded to the DepthAnythingV2 constructor as `max_depth`.

    Returns:
    The model moved to `device` with `.eval()` applied.
    """
    # Constructor arguments for each supported encoder variant
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]},
    }

    # Encoder settings plus the requested depth cap
    settings = dict(model_configs[encoder], max_depth=max_depth)
    network = DepthAnythingV2(**settings)

    # Restore the weights, mapping stored tensors onto the target device
    weights = torch.load(load_from, map_location=device)
    network.load_state_dict(weights)

    return network.to(device).eval()

In [6]:
# Build the depth model using load_depth_model's default encoder, checkpoint path and max_depth
depth_anything= load_depth_model(device)

# Camera calibration on the first frame

In [8]:
from CameraCalibrateApp import CameraCalibrateApp

video_path = "walking.mp4"  # Update the video path accordingly
# The calibration app returns the annotated reference lines and the frame they
# were drawn on; each line_data entry unpacks as (start, end, true_length).
app = CameraCalibrateApp(video_path)
line_data, frame = app.start()

# Depth map for the calibration frame
# (assumes a 2-D array indexed [row, column] - TODO confirm)
depth_map = depth_anything.infer_image(frame)

# Running mean of (pixel length * depth / true length) over all reference lines.
# NOTE(review): presumably acts as an effective focal length in pixels under a
# pinhole camera model - confirm, together with the units of true_length.
av_length_ratio = 0.0
for start, end, true_length in line_data:
    # Euclidean length of the reference line in pixels
    length_pixels = np.sqrt((end[0] - start[0]) ** 2 + (end[1] - start[1]) ** 2)
    # Integer midpoint (x, y) of the line, used to sample a single depth value
    middle_point = ((start[0] + end[0]) // 2, (start[1] + end[1]) // 2)
    # Index order is [y, x]
    depth = depth_map[middle_point[1], middle_point[0]]

    length_ratio = (length_pixels * depth )/ true_length

    # Each line contributes an equal share of the mean
    av_length_ratio+= length_ratio/len(line_data)


print(av_length_ratio)

963.3033538486895


# Video

In [11]:
def annotate_frame(frame, head_pos, leg_pos, obj_id):
    """
    Draw head/leg markers, the connecting line and the object id on a frame.

    Parameters:
    frame: Image to draw on; it is modified in place by the OpenCV calls.
    head_pos: (x, y) of the head; any indexable pair of numbers (tuple, list
        or NumPy array). Values are converted to plain ints.
    leg_pos: (x, y) of the leg, same accepted forms as head_pos.
    obj_id: Identifier rendered as text via str(obj_id).

    Returns:
    The same `frame` object that was passed in.
    """
    # OpenCV drawing functions expect integer point coordinates; coerce NumPy
    # arrays/scalars to plain int tuples so the calls parse reliably.
    head_pt = (int(head_pos[0]), int(head_pos[1]))
    leg_pt = (int(leg_pos[0]), int(leg_pos[1]))

    # Draw circles or markers for head and leg positions
    cv2.circle(frame, head_pt, 5, (0, 255, 0), -1)  # Green circle for head
    cv2.circle(frame, leg_pt, 5, (0, 0, 255), -1)   # Red circle for leg
    cv2.line(frame, head_pt, leg_pt, (255, 0, 0), 2)

    # Draw the id at the midpoint between the head and the leg
    midpoint = ((head_pt[0] + leg_pt[0]) // 2, (head_pt[1] + leg_pt[1]) // 2)
    cv2.putText(frame, str(obj_id), midpoint, cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    return frame