In [2]:
import os
import shutil
import PIL
import matplotlib.pyplot as plt
from ultralytics import YOLO
import random
import dill
import torch
import cv2
import numpy as np
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import cv2
import torch.nn.functional as F


In [3]:
model_type = "DPT_Hybrid"  # Choose model type as per requirements

# Load the pretrained MiDaS network from torch.hub.
midas = torch.hub.load("intel-isl/MiDaS", model_type)
# Inference mode: without this, dropout / normalisation layers keep their
# training-time behaviour even though the weights are frozen below.
midas.eval()

# Pick the preprocessing transform that matches the chosen model type.
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
if model_type in ("DPT_Large", "DPT_Hybrid"):
    transform = midas_transforms.dpt_transform
else:
    transform = midas_transforms.small_transform

# Freeze the backbone so no gradients are computed for its weights.
# (Kept as the last statement so the cell produces no rich output.)
for param in midas.parameters():
    param.requires_grad = False

Using cache found in /home/aditya/.cache/torch/hub/intel-isl_MiDaS_master
  model = create_fn(
Using cache found in /home/aditya/.cache/torch/hub/intel-isl_MiDaS_master


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AbsoluteDepthModel(nn.Module):
    """Wrap a MiDaS-style backbone with a pixel-wise head that outputs absolute depth.

    The backbone's relative (inverse) depth map is resized to ``output_size``
    and passed through a stack of 1x1 convolutions, so every pixel is
    remapped independently of its neighbours. A final ReLU keeps the
    prediction non-negative.

    Parameters:
        midas_model (nn.Module): Backbone producing a depth map of shape
            (batch, H, W) or (batch, 1, H, W).
        output_size (tuple[int, int]): (height, width) the depth map is
            resized to before the 1x1 head. Defaults to (352, 1216), the
            size that was previously hard-coded.
    """

    def __init__(self, midas_model, output_size=(352, 1216)):
        super().__init__()
        self.midas_model = midas_model
        # Plain attribute (not a parameter/buffer), so state_dict keys are unchanged.
        self.output_size = tuple(output_size)

        # 1x1 convolution layers to learn pixel-wise transformations
        self.conv = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=1),  # Input channels = 1 (depth map)
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(32, 1, kernel_size=1),  # Output channels = 1 (absolute depth)
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        """Predict absolute depth for a batch of preprocessed images.

        Parameters:
            x (torch.Tensor): Input batch, passed unchanged to the backbone.

        Returns:
            torch.Tensor: Non-negative depth of shape (batch, *output_size).
        """
        # Step 1: relative inverse depth from the backbone.
        relative_depth = self.midas_model(x)

        # Step 2: make sure there is an explicit channel axis. The backbone
        # is expected to return (batch, H, W); a (batch, 1, H, W) result is
        # accepted as-is.
        if relative_depth.dim() == 3:
            relative_depth = relative_depth.unsqueeze(1)

        # Step 3: resize the depth map to the configured dimensions.
        resized_depth = F.interpolate(
            relative_depth,
            size=self.output_size,
            mode="bicubic",
            align_corners=False,
        )

        # Step 4: pixel-wise transformation, clamped to non-negative depth.
        absolute_depth = self.relu(self.conv(resized_depth))

        return absolute_depth.squeeze(1)  # Shape: (batch, H, W)


In [5]:
# Build the absolute-depth wrapper around the MiDaS backbone and restore
# its trained weights.
depth_model = AbsoluteDepthModel(midas)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
# eval() puts the wrapper and the wrapped backbone into inference mode.
depth_model = depth_model.to(device).eval()
# map_location lets a checkpoint saved on GPU load on a CPU-only machine,
# matching the CPU fallback chosen for `device` above.
state_dict = torch.load('depth.pth', map_location=device, weights_only=True)
# Kept as the last expression so the cell still displays the key-matching report.
depth_model.load_state_dict(state_dict)

True


<All keys matched successfully>

In [6]:
def depth_predict(image_in,test_image,point):
    """Return the predicted depth at one pixel of an image.

    Runs the module-level ``depth_model`` on ``image_in``, resizes the
    resulting depth map to the resolution of ``test_image`` and reads the
    value at ``point``.

    Parameters:
        image_in (torch.Tensor): Preprocessed input batch for ``depth_model``.
            Only the first batch element is used.
        test_image: Original image (anything ``np.asarray`` can convert);
            only its height and width are used.
        point (sequence): ``(x, y)`` pixel position in ``test_image``
            coordinates. Values are truncated to int and clamped to the
            image bounds.

    Returns:
        numpy scalar: Depth value at the requested pixel.
    """
    # Run on whatever device the model lives on instead of assuming CUDA,
    # so the function also works on CPU-only machines.
    first_param = next(depth_model.parameters(), None)
    target_device = first_param.device if first_param is not None else image_in.device

    height, width = np.asarray(test_image).shape[:2]

    with torch.no_grad():
        output = depth_model(image_in.to(target_device))
        output = torch.nn.functional.interpolate(
            output.unsqueeze(1),
            size=(height, width),
            mode="bicubic",
            align_corners=False,
        )
    # Index batch 0 / channel 0 explicitly: a bare squeeze() would also
    # drop a spatial axis of length 1.
    predictions = output[0, 0].cpu().numpy()

    # Clamp so that a point on the far border cannot raise IndexError and a
    # negative coordinate cannot silently wrap around to the opposite side.
    x = min(max(int(point[0]), 0), width - 1)
    y = min(max(int(point[1]), 0), height - 1)

    return predictions[y, x]

In [7]:
# 3x3 camera intrinsic matrix: focal lengths on the diagonal, principal
# point in the last column.
# NOTE(review): values look like a KITTI rectified-camera calibration —
# confirm they match the images being processed.
K = np.eye(3)
K[0, 0] = K[1, 1] = 721.5377            # fx, fy
K[0, 2], K[1, 2] = 596.5593, 149.854    # cx, cy


In [8]:
def image_to_camera_coords(u, v, z, intrinsic_matrix):
    """
    Back-project a pixel with known depth into the camera coordinate frame.

    Inverts the pinhole projection: the pixel offset from the principal
    point is scaled by depth and divided by the focal length on each axis.

    Parameters:
        u (float): Pixel column.
        v (float): Pixel row.
        z (float): Depth at that pixel; x and y come out in the same unit.
        intrinsic_matrix (numpy.ndarray): Camera intrinsics, read at
            [0, 0] (fx), [1, 1] (fy), [0, 2] (cx) and [1, 2] (cy).

    Returns:
        numpy.ndarray: The point (x, y, z) in camera coordinates.
    """
    focal_x, focal_y = intrinsic_matrix[0, 0], intrinsic_matrix[1, 1]
    centre_x, centre_y = intrinsic_matrix[0, 2], intrinsic_matrix[1, 2]

    return np.array([
        (u - centre_x) * z / focal_x,
        (v - centre_y) * z / focal_y,
        z,
    ])

In [10]:

# Load the YOLO model
class_model = YOLO('/yolo.pt')

# Load images
image_dir = "/Dataset/image"
images = os.listdir(image_dir)

# Randomly shuffle and sample images.
# The sample is capped at the number of available files so a small folder
# does not make random.sample raise ValueError.
random.shuffle(images)
random_images = random.sample(images, min(20, len(images)))

# Define the output video parameters
output_video_path = "depth_video.avi"
frame_width = 1280  # Define frame width
frame_height = 720  # Define frame height
fps = 1  # Frames per second
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # Codec

# Initialize VideoWriter
video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
if not video_writer.isOpened():
    # Otherwise every write() below would be silently dropped.
    raise RuntimeError(f"Could not open video writer for {output_video_path}")

try:
    # Iterate over the random sample (previously the sample was computed
    # but the loop ran over every file in the directory).
    for img in random_images:
        image_path = os.path.join(image_dir, img)

        # Load the image for annotation; imread returns None for files it
        # cannot decode (e.g. non-image entries in the directory).
        image = cv2.imread(image_path)
        if image is None:
            print(f"Skipping unreadable image: {image_path}")
            continue

        # Predict using the YOLO model
        results = class_model.predict(source=image_path, imgsz=640, conf=0.25)

        # RGB copy for the depth network; the context manager closes the file handle.
        with Image.open(image_path) as pil_image:
            test_image = pil_image.convert("RGB")
        image_in = transform(np.array(test_image))

        for result in results:
            for detection in result.boxes:
                x1, y1, x2, y2 = detection.xyxy[0].tolist()
                x_center = (x1 + x2) / 2
                y_center = (y1 + y2) / 2
                label = result.names[int(detection.cls)]

                # Predict depth at the center of the detection.
                # NOTE: depth_predict is called once per detection with the
                # same image tensor.
                depth = depth_predict(image_in, test_image, (x_center, y_center))

                # Convert to camera coordinates
                x, y, z = image_to_camera_coords(x_center, y_center, depth, K)
                label_text = f"{label}: ({x:.1f}, {y:.1f}, {z:.2f})"
                print(label_text)

                # Write label, camera coordinates, and depth on the image.
                # `image` comes from cv2.imread (BGR), so (255, 0, 0) is blue.
                cv2.putText(image, label_text, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX,
                            0.7, (255, 0, 0), 2, cv2.LINE_AA)

                # Draw a filled blue dot (radius 5) at the center of the detected object
                cv2.circle(image, (int(x_center), int(y_center)), 5, (255, 0, 0), -1)

        # Resize the image to the output video frame size
        resized_frame = cv2.resize(image, (frame_width, frame_height))
        # Write the frame to the video
        video_writer.write(resized_frame)
finally:
    # Release the video writer even if a frame fails, so the file is finalised.
    video_writer.release()

print(f"Video saved at {output_video_path}")


image 1/1 /media/aditya/ExtraSpace/3D_Object_Detection/Dataset/image/2011_09_26_drive_0023_sync_image_0000000098_image_02.png: 192x640 8 cars, 1 van, 41.8ms
Speed: 2.9ms preprocess, 41.8ms inference, 100.2ms postprocess per image at shape (1, 3, 192, 640)
car: (6.6, 2.0, 14.20)
car: (-10.1, 1.6, 15.28)
car: (-8.8, 1.2, 14.80)
car: (-6.1, 1.1, 15.14)
car: (3.3, 0.5, 16.95)
van: (-3.7, 0.5, 16.16)
car: (0.8, 0.4, 17.12)
car: (-4.6, 0.7, 15.73)
car: (-4.2, 0.7, 15.98)

image 1/1 /media/aditya/ExtraSpace/3D_Object_Detection/Dataset/image/2011_09_30_drive_0016_sync_image_0000000083_image_02.png: 192x640 2 cars, 7.9ms
Speed: 0.8ms preprocess, 7.9ms inference, 1.2ms postprocess per image at shape (1, 3, 192, 640)
car: (0.0, 0.6, 16.11)
car: (-1.6, 0.4, 17.31)

image 1/1 /media/aditya/ExtraSpace/3D_Object_Detection/Dataset/image/2011_09_29_drive_0026_sync_image_0000000047_image_02.png: 192x640 3 cars, 8.0ms
Speed: 0.6ms preprocess, 8.0ms inference, 0.8ms postprocess per image at shape (1, 3, 

  plt.figure(figsize=(20,10))


pedestrian: (-9.3, 1.8, 15.01)
cyclist: (4.8, 1.6, 14.96)
cyclist: (11.8, 1.5, 16.08)
cyclist: (9.0, 1.3, 16.78)
pedestrian: (8.1, 0.9, 17.22)
pedestrian: (8.7, 1.0, 17.13)
pedestrian: (-0.1, 0.6, 16.93)
pedestrian: (8.2, 0.9, 17.16)

image 1/1 /media/aditya/ExtraSpace/3D_Object_Detection/Dataset/image/2011_10_03_drive_0047_sync_image_0000000599_image_02.png: 192x640 5 cars, 2 miscs, 8.0ms
Speed: 0.7ms preprocess, 8.0ms inference, 0.9ms postprocess per image at shape (1, 3, 192, 640)
misc: (-8.5, 0.5, 15.72)
car: (1.3, 0.8, 15.59)
misc: (-2.6, -0.0, 17.29)
car: (-4.1, 0.8, 15.63)
car: (2.8, 0.5, 16.81)
car: (-3.3, 0.6, 16.04)
car: (4.6, 0.4, 17.06)

image 1/1 /media/aditya/ExtraSpace/3D_Object_Detection/Dataset/image/2011_09_26_drive_0036_sync_image_0000000185_image_02.png: 192x640 7 cars, 2 vans, 1 truck, 8.0ms
Speed: 0.6ms preprocess, 8.0ms inference, 0.9ms postprocess per image at shape (1, 3, 192, 640)
car: (9.6, 2.1, 14.12)
car: (-6.4, 1.4, 14.61)
car: (5.5, 1.4, 14.63)
car: (-2.5

: 