# 1.0 Imports

In [1]:
from ultralytics import YOLO
import matplotlib.pyplot as plt
from PIL import Image
from matplotlib.patches import Polygon
import numpy as np
import cv2
from tqdm import tqdm
import itertools
from time import time
import torch
import supervision as sv
from depthAnythingModel.depth_anything_v2.dpt import DepthAnythingV2
from matplotlib.colors import Normalize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from Utils.ObjectDetection import ObjectDetectionUtils
from collections import defaultdict, deque
from sklearn.linear_model import RANSACRegressor
from Gui.CameraCalibrateApp import CameraCalibrateApp

xFormers not available
xFormers not available


In [2]:
VIDEO_PATH = 'walking4.mp4'

In [3]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("GPU is available. Using GPU.")
else:
    device = torch.device('cpu')
    print("GPU is not available. Using CPU.")

GPU is available. Using GPU.


# 2.0 load the models

## 2.1 object Detection Model

In [4]:
# Build a YOLOv9c model from pretrained weight
yolo_model = YOLO("pretrained weights/yolov9c-seg.pt")
tracker = sv.ByteTrack()

yolo_model.to(device)

YOLO(
  (model): SegmentationModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): RepNCSPELAN4(
        (cv1): Conv(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Sequential(
          (0): RepCSP(
            (cv1): Conv(
              (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
           

In [5]:
MyDetector =  ObjectDetectionUtils(yolo_model)

## 2.2 Depth estimation model

In [6]:
def load_depth_model(device, encoder='vitl', load_from='pretrained weights/depth_anything_v2_metric_vkitti_vitl.pth', max_depth=25):
    
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }
    
    depth_anything = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth})
    depth_anything.load_state_dict(torch.load(load_from, map_location=device))
    depth_anything = depth_anything.to(device).eval()
    
    return depth_anything

In [7]:
depth_anything= load_depth_model(device)

## 2.3 Automatic Camera Calibration Model

In [8]:
class AutomaticCalibrationModel(nn.Module):
    def __init__(self):
        super(AutomaticCalibrationModel, self).__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc21 = nn.Linear(128, 260)
        self.fc22 = nn.Linear(260, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc21(x))
        x = torch.relu(self.fc22(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))  
        return x

def custom_loss(y_pred, y_true):
    epsilon = 1e-8  # Small constant to prevent log(0)
    y_pred = torch.clamp(y_pred, min=epsilon)  # Clipping predictions to avoid log(0)
    y_true = torch.clamp(y_true, min=epsilon)
    l = torch.mean((torch.log(y_pred) - torch.log(y_true))**2)
    return l

In [9]:
# Initialize the model, loss function, and optimizer
cam_clib_model = AutomaticCalibrationModel().to(device)
optimizer = optim.Adam(cam_clib_model.parameters(), lr=0.001)

# 3.0 Select the ground truth heights on the first frame

In [26]:
app = CameraCalibrateApp(VIDEO_PATH)
ground_truth_lines, frame = app.start()

print(f'You have drawn {len(ground_truth_lines)} lines.')

depth_map = depth_anything.infer_image(frame)

You have drawn 2 lines.


In [27]:
anchor_lines = []

for start_loaction, end_location, true_length in ground_truth_lines:
    length_pixels = np.sqrt((end_location[0] - start_loaction[0]) ** 2 + (end_location[1] - start_loaction[1]) ** 2)
    middle_point = ((start_loaction[0] + end_location[0]) // 2, (start_loaction[1] + end_location[1]) // 2)

    depth = depth_map[middle_point[1], middle_point[0]]

    
    anchor_lines.append((length_pixels,  true_length,  depth))

anchor_lines


[(463.62161295608297, 175.0, 4.2877007), (270.22398117117586, 165.0, 7.024391)]

# 4.0 Data Generation for the automatic camera calibration model

## 4.1 collect data from tracking

In [31]:
cap = cv2.VideoCapture(VIDEO_PATH)

number_of_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

tracking_data = {}
frame_count = -1

step = 5  # sample each 10 frames
frames_limit = min (1000, number_of_frames)

# Initialize tqdm progress bar
pbar = tqdm(total=frames_limit, desc="Data Genartion in progress")

while cap.isOpened():
    ret, frame = cap.read()
    frame_count += 1
    if not ret:
        break

    if frame_count % step != 0:
        continue

    results, legs_and_heads, boxes = MyDetector.infer_obj_detection(frame)

    detections = sv.Detections.from_ultralytics(results)
    detections = tracker.update_with_detections(detections)
    depth_map = depth_anything.infer_image(frame)


    for detection_idx, _ in enumerate(detections):
        xyxy = detections[detection_idx].xyxy.tolist()[0]
        obj_id = detections[detection_idx].tracker_id[0]

        best_idx = MyDetector.match_best_box(xyxy, boxes)

        head_pos, leg_pos = legs_and_heads[best_idx]

        start, end = head_pos, leg_pos
        length_pixels = np.sqrt((end[0] - start[0]) ** 2 + (end[1] - start[1]) ** 2)
        middle_point = ((start[0] + end[0]) // 2, (start[1] + end[1]) // 2)

        depth = depth_map[middle_point[1], middle_point[0]]

        if obj_id not in tracking_data:
            tracking_data[obj_id] = [(depth, length_pixels)]
        else:
            tracking_data[obj_id].append((depth, length_pixels))

    # Update progress bar
    pbar.update(step)

    cv2.waitKey(1)

    if frame_count >= frames_limit:
        break

pbar.close()
cap.release()
cv2.destroyAllWindows()

Data Genartion in progress:   0%|          | 0/1000 [00:00<?, ?it/s]

Data Genartion in progress: 1005it [02:24,  6.97it/s]                          


In [32]:
tracking_data

{120: [(4.339463, 492.7961038807024),
  (4.2860794, 484.11672146291335),
  (4.590423, 465.47395201020646),
  (4.646207, 444.25330612163145),
  (4.626579, 442.4669479181468),
  (4.7669263, 426.0422514258416),
  (5.0626855, 408.0784238354192),
  (4.9861913, 408.0110292626904),
  (4.951779, 393.24928480545265),
  (5.1532793, 394.1433749284643),
  (5.3978295, 379.26903380054637),
  (5.3774185, 366.0),
  (5.439351, 360.068049124051),
  (5.5783195, 351.2406582387637),
  (5.538653, 348.01293079424505),
  (5.6337037, 349.3780187705002),
  (5.8793564, 345.63853951780317),
  (5.994641, 333.54160160315837),
  (5.9638095, 321.0763149159402),
  (5.972226, 309.16176995223714),
  (6.341868, 312.1938500355188),
  (6.2516265, 312.0256399721023),
  (6.2985654, 309.0),
  (6.256668, 294.49108645254444),
  (6.385944, 288.39036044916617),
  (6.558166, 285.70089254323307),
  (6.540873, 273.02930245671433),
  (6.6526103, 270.0074073057997),
  (6.555584, 270.09072549793336),
  (6.7820992, 252.57276179350774),


## 4.2 preparing the dataset for the DL model

In [33]:
def get_data_element(p1, p2):
    """Calculate the depth difference and scale change between two points."""
    depth_diff = p2[0] - p1[0]
    
    if depth_diff == 0:
        return None
    
    return p1[0], depth_diff, p2[1] / p1[1]

def generate_and_prepare_tensors(tracking_data, device):
    """Generate dataset using permutations of tracking data elements and prepare tensors on the specified device."""
    dataset = []
    for id_data in tracking_data.values():
        # Generate permutations of two elements
        permutations = list(itertools.permutations(id_data, 2))
        for perm in permutations:
            p1, p2 = perm
            scale_change = get_data_element(p1, p2)
            if scale_change is not None:
                dataset.append(scale_change)
    
    X = [[elem[0], elem[1]] for elem in dataset]
    y = [elem[2] for elem in dataset]

    X = torch.tensor(X, dtype=torch.float32).to(device)
    y = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(device)

    return X, y

In [34]:
X, y = generate_and_prepare_tensors(tracking_data, device)

print(X.shape)

torch.Size([113282, 2])


In [35]:
# Create a dataset and split into training and test sets
full_dataset = TensorDataset(X, y)
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Training the model

In [36]:
num_epochs = 5
for epoch in range(num_epochs):
    cam_clib_model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = cam_clib_model(X_batch)
        loss = custom_loss(y_pred, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(cam_clib_model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

    if (epoch+1) % 1 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')


Epoch 1/5, Loss: 0.02941455878317356
Epoch 2/5, Loss: 0.003180544124916196
Epoch 3/5, Loss: 0.001214879914186895
Epoch 4/5, Loss: 0.0013922632206231356
Epoch 5/5, Loss: 0.5053448677062988


testing the model

In [39]:
# Evaluate the model
cam_clib_model.eval()
test_loss = 0.0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = cam_clib_model(X_batch)
        loss = custom_loss(y_pred, y_batch)
        test_loss += loss.item()

test_loss /= len(test_loader)
print(f'Test Loss: {test_loss}')

Test Loss: 0.044290874654503355


the infer fuction 

In [40]:
def infer_cam_calib(cam_clib_model, current_depth, depth_diff):
    current_depth = float(current_depth)
    depth_diff = float(depth_diff)
    scale_change = cam_clib_model(torch.tensor([current_depth, depth_diff], dtype=torch.float32 , device=device )  )
    return scale_change.item()
print (infer_cam_calib(cam_clib_model, 4, 5))
print (infer_cam_calib(cam_clib_model, 4, -2))

0.40119585394859314
1.9375927448272705


# 5.0 Estimating heights from the Video

In [41]:
def annotate_frame(frame, head_pos, leg_pos, height):          
    # Draw circles or markers for head and leg positions
    cv2.circle(frame, head_pos, 5, (0, 255, 0), -1)  # Green circle for head
    cv2.circle(frame, leg_pos, 5, (0, 0, 255), -1)   # Red circle for leg
    cv2.line(frame, head_pos, leg_pos, (255, 0, 0), 2) 
    
    # Draw the height in the center point of the head and the leg
    midpoint = ((head_pos[0] + leg_pos[0]) // 2, (head_pos[1] + leg_pos[1]) // 2)
    cv2.putText(frame, str(height), midpoint, cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    return frame

In [42]:
def estimate_height(c_length_pixels, c_depth, anchors):
    n = len(anchors)

    av_c_tall = 0.0

    for a_length_pixels,  a_true_length,  a_depth in anchors:
        c_tranformed_length_pixels = infer_cam_calib(cam_clib_model, c_depth, a_depth - c_depth) * c_length_pixels
        c_tall = a_true_length * ( c_tranformed_length_pixels / a_length_pixels   )

        av_c_tall += c_tall/n

    return av_c_tall  

In [43]:
def detect_head_and_leg(video_path, output_video_path, tracker, depth_anything, anchors):
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Couldn't open video.")
        return

    # Get video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    frame_count = 0
    object_heights = defaultdict(lambda: deque(maxlen=10))  # Store height measurements for each object

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        results, legs_and_heads, boxes = MyDetector.infer_obj_detection(frame)

        detections = sv.Detections.from_ultralytics(results)
        detections = tracker.update_with_detections(detections)

        depth_map = depth_anything.infer_image(frame)

        for detection_idx, _ in enumerate(detections):
            xyxy = detections[detection_idx].xyxy.tolist()[0]
            obj_id = detections[detection_idx].tracker_id[0]

            best_idx = MyDetector.match_best_box(xyxy, boxes)
            head_pos, leg_pos = legs_and_heads[best_idx]

            # Calculate pixel distance and depth
            start, end = head_pos, leg_pos
            length_pixels = np.sqrt((end[0] - start[0]) ** 2 + (end[1] - start[1]) ** 2)
            middle_point = ((start[0] + end[0]) // 2, (start[1] + end[1]) // 2)
            depth = depth_map[middle_point[1], middle_point[0]]

            # Estimate height
            height = round(estimate_height(length_pixels, depth, anchors), 2)
            object_heights[obj_id].append(height)

            # Use RANSAC to robustly estimate the height
            if len(object_heights[obj_id]) >= 5:  # Minimum samples required for RANSAC
                X = np.arange(len(object_heights[obj_id])).reshape(-1, 1)
                y = np.array(object_heights[obj_id])
                ransac = RANSACRegressor(min_samples=0.5)  # Use more samples for RANSAC
                ransac.fit(X, y)
                height_ransac = ransac.predict(np.array([[len(object_heights[obj_id])-1]]))[0]
            else:
                height_ransac = height

            # Apply a moving average to smooth the RANSAC output
            smoothed_height = np.mean(list(object_heights[obj_id])[-5:])  # Moving average over last 5 measurements
            stable_height = 0.7 * height_ransac + 0.3 * smoothed_height  # Weighted average for more stability

            frame = annotate_frame(frame, head_pos, leg_pos, round(round (stable_height), 2))

        # Display the frame
        cv2.imshow('Frame', frame)

        # Write the frame into the output video file
        out.write(frame)

        # Exit if 'Esc' key is pressed
        if cv2.waitKey(1) & 0xFF == 27:  # 27 is the Esc key
            break

        frame_count += 1
        print(f"Processed frame {frame_count}/{total_frames}")

    # Release everything when finished
    cap.release()
    out.release()
    cv2.destroyAllWindows()

# Example usage
output_video_path = 'output_video.mp4'
detect_head_and_leg(VIDEO_PATH, output_video_path, tracker, depth_anything, anchor_lines)

Processed frame 1/3493
Processed frame 2/3493
Processed frame 3/3493
Processed frame 4/3493




Processed frame 5/3493




Processed frame 6/3493




Processed frame 7/3493




Processed frame 8/3493
Processed frame 9/3493




Processed frame 10/3493
Processed frame 11/3493




Processed frame 12/3493




Processed frame 13/3493




Processed frame 14/3493
Processed frame 15/3493
Processed frame 16/3493




Processed frame 17/3493




Processed frame 18/3493




Processed frame 19/3493
Processed frame 20/3493




Processed frame 21/3493
Processed frame 22/3493




Processed frame 23/3493




Processed frame 24/3493




Processed frame 25/3493
Processed frame 26/3493




Processed frame 27/3493




Processed frame 28/3493




Processed frame 29/3493




Processed frame 30/3493
Processed frame 31/3493
Processed frame 32/3493




Processed frame 33/3493




Processed frame 34/3493
Processed frame 35/3493
Processed frame 36/3493




Processed frame 37/3493




Processed frame 38/3493
Processed frame 39/3493
Processed frame 40/3493




Processed frame 41/3493
Processed frame 42/3493
Processed frame 43/3493
Processed frame 44/3493
Processed frame 45/3493
Processed frame 46/3493
Processed frame 47/3493
Processed frame 48/3493
Processed frame 49/3493
Processed frame 50/3493




Processed frame 51/3493




Processed frame 52/3493




Processed frame 53/3493
Processed frame 54/3493
Processed frame 55/3493
Processed frame 56/3493
Processed frame 57/3493
Processed frame 58/3493




Processed frame 59/3493




Processed frame 60/3493




Processed frame 61/3493
Processed frame 62/3493




Processed frame 63/3493
Processed frame 64/3493




Processed frame 65/3493
Processed frame 66/3493
Processed frame 67/3493




Processed frame 68/3493




Processed frame 69/3493




Processed frame 70/3493
Processed frame 71/3493
Processed frame 72/3493
Processed frame 73/3493




Processed frame 74/3493




Processed frame 75/3493




Processed frame 76/3493




Processed frame 77/3493




Processed frame 78/3493




Processed frame 79/3493
Processed frame 80/3493
Processed frame 81/3493
Processed frame 82/3493
Processed frame 83/3493
Processed frame 84/3493




Processed frame 85/3493
Processed frame 86/3493
Processed frame 87/3493
Processed frame 88/3493
Processed frame 89/3493




Processed frame 90/3493
Processed frame 91/3493
Processed frame 92/3493
Processed frame 93/3493
Processed frame 94/3493
Processed frame 95/3493
Processed frame 96/3493
Processed frame 97/3493




Processed frame 98/3493




Processed frame 99/3493




Processed frame 100/3493




Processed frame 101/3493
Processed frame 102/3493
Processed frame 103/3493
Processed frame 104/3493




Processed frame 105/3493
Processed frame 106/3493




Processed frame 107/3493




Processed frame 108/3493
Processed frame 109/3493


