In [16]:
!pip install opencv-python opencv-python-headless torch torchvision timm



In [17]:
!pip install opencv-contrib-python



#Import Libraries

In [18]:
import cv2
import numpy as np
import torch
import time

In [19]:
print(dir(cv2.aruco))

['ArucoDetector', 'Board', 'CORNER_REFINE_APRILTAG', 'CORNER_REFINE_CONTOUR', 'CORNER_REFINE_NONE', 'CORNER_REFINE_SUBPIX', 'CharucoBoard', 'CharucoDetector', 'CharucoParameters', 'DICT_4X4_100', 'DICT_4X4_1000', 'DICT_4X4_250', 'DICT_4X4_50', 'DICT_5X5_100', 'DICT_5X5_1000', 'DICT_5X5_250', 'DICT_5X5_50', 'DICT_6X6_100', 'DICT_6X6_1000', 'DICT_6X6_250', 'DICT_6X6_50', 'DICT_7X7_100', 'DICT_7X7_1000', 'DICT_7X7_250', 'DICT_7X7_50', 'DICT_APRILTAG_16H5', 'DICT_APRILTAG_16h5', 'DICT_APRILTAG_25H9', 'DICT_APRILTAG_25h9', 'DICT_APRILTAG_36H10', 'DICT_APRILTAG_36H11', 'DICT_APRILTAG_36h10', 'DICT_APRILTAG_36h11', 'DICT_ARUCO_MIP_36H12', 'DICT_ARUCO_MIP_36h12', 'DICT_ARUCO_ORIGINAL', 'DetectorParameters', 'Dictionary', 'Dictionary_getBitsFromByteList', 'Dictionary_getByteListFromBits', 'GridBoard', 'RefineParameters', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_native', 'drawDetectedCornersCharuco', 'drawDetectedDiamonds', 'drawDetectedMarkers', 

#Depth Estimation Model

In [20]:
class DepthEstimationModel:
    def __init__(self, model_type="MiDaS_small"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = torch.hub.load("intel-isl/MiDaS", model_type)
        self.model.to(self.device)
        self.model.eval()

        self.transform = torch.hub.load("intel-isl/MiDaS", "transforms").small_transform

    def estimate_depth(self, img):
        input_batch = self.transform(img).to(self.device)

        with torch.no_grad():
            prediction = self.model(input_batch)
            prediction = torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=img.shape[:2],
                mode="bicubic",
                align_corners=False,
            ).squeeze()

        depth_map = prediction.cpu().numpy()
        return cv2.normalize(depth_map, None, 0, 1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

#Calibrate Camera

In [21]:
def calibrate_camera_with_aruco(video_path):
    # Load the predefined dictionary
    aruco_dict = cv2.aruco.getPredefinedDictionary(cv2.aruco.DICT_4X4_250)
    aruco_params = cv2.aruco.DetectorParameters()

    # Create ArucoDetector object
    detector = cv2.aruco.ArucoDetector(aruco_dict, aruco_params)

    cap = cv2.VideoCapture(video_path)

    all_corners = []  # List to store 2D corners (image points)
    all_ids = []      # List to store marker IDs
    obj_points = []   # List to store 3D object points

    # Define the 3D points for a single marker (assuming square markers)
    marker_length = 0.05  # Example marker length (adjust as necessary)

    # 3D coordinates for the ArUco marker in real world (z=0 because it's on a plane)
    obj_point = np.array([[0, 0, 0],
                          [marker_length, 0, 0],
                          [marker_length, marker_length, 0],
                          [0, marker_length, 0]], dtype=np.float32)

    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert to grayscale
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Detect ArUco markers
        corners, ids, _ = detector.detectMarkers(gray)

        if ids is not None:
            # Add detected 2D corners and corresponding 3D object points
            for i in range(len(ids)):
                all_corners.append(corners[i])  # Add 2D corners for this marker
                all_ids.append(ids[i])          # Add marker ID
                obj_points.append(obj_point)    # Add the same 3D points for each detected marker

        frame_count += 1
        if frame_count >= 600:  # Limit to 600 frames for calibration
            break

    cap.release()

    # Perform camera calibration (now with properly formatted object and image points)
    ret, camera_matrix, dist_coeffs, rvecs, tvecs = cv2.calibrateCamera(
        obj_points, all_corners, gray.shape[::-1], None, None
    )

    if ret:
        print("Camera calibration successful!")
        return camera_matrix, dist_coeffs
    else:
        print("Camera calibration failed.")
        return None, None

# Replace 'your_video.mp4' with your video file path
camera_matrix, dist_coeffs = calibrate_camera_with_aruco('ex2.mp4')
print("Camera Matrix:", camera_matrix)
print("Distortion Coefficients:", dist_coeffs)


Camera calibration successful!
Camera Matrix: [[     341.92           0      426.06]
 [          0      196.29      170.51]
 [          0           0           1]]
Distortion Coefficients: [[   0.061903   -0.073217    0.049853    0.015175    0.011426]]


#Initialise the video path

In [22]:
# Video capture from file or camera
video_path = 'ex2.mp4'  # Replace with 0 for webcam if needed

#Loading YOLOv5 for Car Detection

In [23]:
# Load YOLOv5 model (for car detection)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
yolo_model.eval()

Using cache found in /Users/shaden/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 ðŸš€ 2024-9-11 Python-3.12.1 torch-2.4.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
  

# Initialize depth estimation model

In [24]:
pip install timm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
import torch
import torch.nn as nn
import timm
# Initialize depth estimation model
depth_model = DepthEstimationModel()

Using cache found in /Users/shaden/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /Users/shaden/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master
Using cache found in /Users/shaden/.cache/torch/hub/intel-isl_MiDaS_master


#Load example

In [26]:
# Calibrate the camera
camera_matrix, dist_coeffs = calibrate_camera_with_aruco(video_path)
if camera_matrix is None or dist_coeffs is None:
    print("Camera calibration failed. Exiting.")
    exit()

Camera calibration successful!


In [27]:
cap = cv2.VideoCapture(video_path)

In [28]:
# Get video dimensions
ret, frame = cap.read()
height, width = frame.shape[:2]
print(f"Video dimensions: {width}x{height}")

Video dimensions: 848x480


#Codec and create VideoWriter object

In [29]:
# Define the codec and create VideoWriter object
output_path = 'Output/output_video.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4 files
out = cv2.VideoWriter(output_path, fourcc, 20.0, (width, height))  # 20 FPS

#Process the video and transform to 3D

In [30]:
from IPython.display import clear_output

# Initialize frame count
frame_count = 0

# Process the video and transform it to 3D
while cap.isOpened():
    ret, frame = cap.read()

    if not ret:
        break

    # Estimate depth (assuming your depth model works like this)
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    depth_map = depth_model.estimate_depth(img)

    # YOLOv5 car detection
    results = yolo_model(frame)
    detections = results.xyxy[0]

    # Filter out cars (YOLOv5 class 'car' is class 2)
    car_detections = detections[detections[:, 5] == 2]

    # Create a blank canvas for 3D visualization
    canvas_3d = np.zeros((height, width, 3), dtype=np.uint8)

    # Loop over detected cars
    for det in car_detections:
        x1, y1, x2, y2, conf, cls = map(int, det[:6])

        # Calculate the center of the car
        center_x = (x1 + x2) // 2
        center_y = (y1 + y2) // 2

        # Get the depth value at the center of the car
        depth_value = depth_map[center_y, center_x]

        # Convert 2D point to 3D using depth information
        point_2d = np.array([[center_x, center_y]], dtype=np.float32)
        point_3d = cv2.undistortPoints(point_2d, camera_matrix, dist_coeffs)
        point_3d = point_3d[0][0]
        point_3d = np.array([point_3d[0], point_3d[1], 1.0]) * depth_value

        # Project 3D point back to 2D for visualization
        rvec = np.zeros(3, dtype=np.float32)
        tvec = np.zeros(3, dtype=np.float32)
        point_2d_proj, _ = cv2.projectPoints(point_3d.reshape(1, 1, 3), rvec, tvec, camera_matrix, dist_coeffs)
        x_proj, y_proj = point_2d_proj[0][0]

        # Draw original bounding box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Draw projected point on the 3D canvas
        cv2.circle(canvas_3d, (int(x_proj), int(y_proj)), 5, (0, 0, 255), -1)

        # Add depth information
        cv2.putText(frame, f"Depth: {depth_value:.2f}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    # Combine original frame and 3D canvas
    combined_frame = cv2.addWeighted(frame, 0.7, canvas_3d, 0.3, 0)

    # Write the frame to the output video
    out.write(combined_frame)

    # Display frame count every 30 frames (optional for logging purposes)
    frame_count += 1
    if frame_count % 30 == 0:
        print(f"Processed frame {frame_count}")

    # Clear the output to prevent cluttering
    clear_output(wait=True)

    # Add a small delay if needed (adjust or remove if processing is already fast enough)
    time.sleep(0.1)

# Release video and writer objects
cap.release()
out.release()

print("Video processing complete. Output saved as 'output_video.mp4'")

Video processing complete. Output saved as 'output_video.mp4'
