<a href="https://colab.research.google.com/github/Mragankk/Object_detection_and_Depth_estimation/blob/main/YOLO%2BDepth_pro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Object Detection & Depth Estimation using YOLO and Apple's ML-Depth-Pro

This notebook performs:
- **Object detection** using [Ultralytics YOLO](https://github.com/ultralytics/ultralytics)
- **Depth estimation** using Apple's [ML-Depth-Pro](https://github.com/apple/ml-depth-pro)
- Visualization of results with bounding boxes and depth values


**Install Dependencies**
- **YOLO** (Ultralytics) for object detection
- **ML-Depth-Pro** for depth estimation
- OpenCV, PyTorch, Pillow, Open3D for image processing and 3D visualization


In [None]:
%pip install numpy opencv-python==4.9.0.80 opencv-contrib-python==4.9.0.80 opencv-python-headless==4.9.0.80 -q

%pip install ultralytics timm torch torchvision open3d pillow -q

!rm -rf ml-depth-pro
!git clone https://github.com/apple/ml-depth-pro -q
%cd ml-depth-pro
!pip install -e . -q
%cd ..

!mkdir -p checkpoints

!hf download apple/DepthPro depth_pro.pt --local-dir checkpoints

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.2/62.2 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.3/68.3 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**Restart the Runtime**

After the installation completes successfully, restart the Colab runtime to ensure all newly installed packages are loaded properly.  

You can do this by:  
- Clicking on **Runtime** in the top menu  
- Selecting **Restart session**  
- Then re-running the next cells

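**Upload Images**\
Upload your test image from your local machine.
The detection cells below read the image from `/content/lab.jpg`, so either upload a file named `lab.jpg` or update `image_path` / `img_path` in those cells to match the uploaded file name.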

In [None]:
from google.colab import files

# Open the Colab upload widget; the code below treats the keys of the
# returned mapping as the uploaded file names.
uploaded = files.upload()

# Echo each uploaded name so it can be copied into the path variables below.
for filename in uploaded:
    print(f"Uploaded file: {filename}")

### Object Detection and Depth Estimation
We use YOLO to detect objects in the image and store their bounding boxes & labels.

In [None]:
import cv2
import torch
import numpy as np
from PIL import Image
from ultralytics import YOLO
import depth_pro as dp
from google.colab.patches import cv2_imshow

# Select the compute device once; the depth-estimation cell reuses `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load YOLO model and move it to the selected device (a single .to() is enough).
yolo_model = YOLO('/content/yolo11x.pt', verbose=False).to(device)

image_path = '/content/lab.jpg'

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear message rather than crashing on `.shape`.
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image at {image_path!r}. Check the file path.")
original_height, original_width = image.shape[:2]

results = yolo_model(image)

# Store bounding boxes and class names; boxes are also drawn onto `image` in place.
obj_boxes = []
obj_names = []
for r in results:
    boxes = r.boxes.xyxy.cpu().numpy()
    classes = r.boxes.cls.cpu().numpy()
    for box, cls in zip(boxes, classes):
        # Pixel indices must be integers for drawing and for depth lookup later.
        x1, y1, x2, y2 = map(int, box[:4])
        obj_boxes.append((x1, y1, x2, y2))
        obj_names.append(r.names[int(cls)])
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

# RGB copy of the source image loaded through Pillow.
pil_image = Image.open(image_path).convert("RGB")

## Depth Estimation
We pass the image into Apple's ML-Depth-Pro model to estimate the distance (in meters) for each detected object.


In [None]:
import os

# Build the DepthPro model together with its preprocessing transform.
depth_model, transform = dp.create_model_and_transforms()
depth_model = depth_model.to(device).eval()

# load_rgb also returns a focal length in pixels, which is forwarded to infer().
img, _, f_px = dp.load_rgb(image_path)
depth_input = transform(img).to(device)

prediction = depth_model.infer(depth_input, f_px=f_px)
depth = prediction["depth"]  # depth in 'm'

depth_np = depth.squeeze().cpu().numpy()

# Label styling is the same for every box, so set it once outside the loop.
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 1.2
font_thickness = 2

for (x1, y1, x2, y2), obj_name in zip(obj_boxes, obj_names):
    # Sample the depth map at the centre of the bounding box (row = y, column = x).
    center_x = (x1 + x2) // 2
    center_y = (y1 + y2) // 2

    depth_value = depth_np[center_y, center_x]
    text = f'{obj_name}: {depth_value:.2f}m'
    text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]

    text_x = x1
    # Place the label above the box, but keep it inside the image when the
    # box touches the top edge (otherwise the label would be drawn off-image).
    text_y = max(y1 - 10, text_size[1] + 10)
    rect_x1 = text_x - 5
    rect_y1 = text_y - text_size[1] - 10
    rect_x2 = text_x + text_size[0] + 5
    rect_y2 = text_y + 5

    # Filled black background so the white text stays readable.
    cv2.rectangle(image, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 0, 0), -1)
    cv2.putText(image, text, (text_x, text_y), font, font_scale, (255, 255, 255), font_thickness)

# dsize is (width, height); the aspect ratio of the source image is not preserved.
resized_img = cv2.resize(image, (2500, 3000), interpolation=cv2.INTER_CUBIC)

# cv2.imwrite does not create missing directories and only signals failure
# through its return value, so create the folder and check the result.
output_path = '/content/results/final_image.jpg'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
if not cv2.imwrite(output_path, resized_img):
    print(f"Warning: could not write {output_path}")

# cv2_imshow renders inline in the notebook, so no HighGUI window needs to be
# waited on or destroyed afterwards.
cv2_imshow(resized_img)

In [None]:
import torch
# Release cached GPU memory that PyTorch is holding but not currently using.
# NOTE(review): objects that are still referenced (e.g. the loaded models) stay
# allocated; delete them first if the goal is to free their memory as well.
torch.cuda.empty_cache()

### Single Object Detection

In [None]:
from PIL import Image
from ultralytics import YOLO
import numpy as np
import depth_pro as dp
import cv2
from google.colab.patches import cv2_imshow
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# A single .to(device) moves the model; no extra .cuda() call is needed.
yolo_model = YOLO('/content/yolo11x.pt', verbose=False).to(device)

img_path = '/content/lab.jpg'

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear message.
image = cv2.imread(img_path)
if image is None:
    raise FileNotFoundError(f"Could not read image at {img_path!r}. Check the file path.")

result = yolo_model(image)

### Bounding Boxes

In [None]:
# Class label to keep; check coco.names for other object classes.
TARGET_CLASS = 'chair'

# Collect (and draw) only the boxes whose predicted label matches TARGET_CLASS.
obj_boxes = []
for r in result:
    boxes = r.boxes.xyxy.cpu().numpy()
    classes = r.boxes.cls.cpu().numpy()

    for box, cls in zip(boxes, classes):
        if r.names[int(cls)] == TARGET_CLASS:
            # Integer pixel coordinates are required for drawing and indexing.
            x1, y1, x2, y2 = map(int, box[:4])
            obj_boxes.append((x1, y1, x2, y2))
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

# Fixed-size preview; the aspect ratio of the source image is not preserved.
resized_img = cv2.resize(image, (500, 500), interpolation=cv2.INTER_CUBIC)

# cv2_imshow renders inline in the notebook, so no HighGUI window needs to be
# waited on or destroyed afterwards.
cv2_imshow(resized_img)

In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np

# Convert depth tensor to NumPy array
depth_np = depth.cpu().numpy().squeeze()

# Min-max scale to 0-255 and convert to 8-bit for visibility. The saved PNG
# therefore holds relative depth only, not the original values.
depth_normalized = cv2.normalize(depth_np, None, 0, 255, cv2.NORM_MINMAX)
depth_uint8 = np.uint8(depth_normalized)

# cv2.imwrite reports failure only through its return value.
if not cv2.imwrite("depth_map_for_image.png", depth_uint8):
    print("Warning: could not write depth_map_for_image.png")

# Plot the unscaled depth values with a labelled colour bar so the figure
# can be read on its own.
fig, ax = plt.subplots()
depth_plot = ax.imshow(depth_np, cmap='magma')
ax.set_title("Estimated depth map")
fig.colorbar(depth_plot, ax=ax, label="Depth (m)")
plt.show()

In [None]:
import cv2
import numpy as np
import open3d as o3d
from google.colab.patches import cv2_imshow

# Load the depth map
depth_map = cv2.imread("/content/depth_map_for_image.png", cv2.IMREAD_UNCHANGED)

# Check if the image is loaded
if depth_map is None:
    raise FileNotFoundError("Error: The depth image file was not loaded. Check the file path.")

# Invert the depth map (flip depth values)
depth_map = cv2.bitwise_not(depth_map)  # Works for 8-bit depth
# If depth is 16-bit or floating point, use: depth_map = np.max(depth_map) - depth_map

# Normalize and apply a colormap for visualization
depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX)
depth_colored = cv2.applyColorMap(depth_normalized.astype(np.uint8), cv2.COLORMAP_JET)

# Show fixed depth map. cv2_imshow renders inline in the notebook, so no
# HighGUI window needs to be waited on or destroyed afterwards.
cv2_imshow(depth_colored)

# Convert to Open3D Image
depth_o3d = o3d.geometry.Image(depth_map.astype(np.uint16))

# Camera intrinsics (modify based on camera).
# The image size and principal point are taken from the depth map itself so
# they always match the data; the principal point is assumed to be the image
# centre. The focal lengths are placeholders.
# NOTE(review): replace fx/fy with the real focal length in pixels if known.
depth_height, depth_width = depth_map.shape[:2]
fx, fy = 525, 525
cx, cy = (depth_width - 1) / 2, (depth_height - 1) / 2
intrinsic = o3d.camera.PinholeCameraIntrinsic(depth_width, depth_height, fx, fy, cx, cy)

# Create point cloud
# NOTE(review): the input is an inverted 8-bit depth image, so the resulting
# geometry is relative rather than metric.
pcd = o3d.geometry.PointCloud.create_from_depth_image(depth_o3d, intrinsic)

# Visualize
# NOTE(review): draw_geometries opens a native viewer window; confirm it works
# in the target environment (a hosted notebook may have no display).
o3d.visualization.draw_geometries([pcd])

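**Run Depth Estimation**
- Loads the image at `img_path`
- Passes it through the DepthPro model
- Generates a depth map
- Labels each selected bounding box with the depth at its centre
- Displays the annotated image and saves it as `chair_detection_with_depth.jpg`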

In [None]:
# depth model and preprocessing transform
depth_model, transform = dp.create_model_and_transforms()
depth_model = depth_model.to(device).eval()

# load_rgb also returns a focal length in pixels, which is forwarded to infer().
img, _, f_px = dp.load_rgb(img_path)
depth_input = transform(img).to(device)

prediction = depth_model.infer(depth_input, f_px=f_px)
depth = prediction["depth"]  # depth in 'm'

depth_np = depth.squeeze().cpu().numpy()

# Label styling is the same for every box, so set it once outside the loop.
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 1.2
font_thickness = 2

for x1, y1, x2, y2 in obj_boxes:
    # Sample the depth map at the centre of the bounding box (row = y, column = x).
    center_x = (x1 + x2) // 2
    center_y = (y1 + y2) // 2

    depth_value = depth_np[center_y, center_x]
    text = f'Depth: {depth_value:.2f}m'
    text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]

    text_x = x1
    # Place the label above the box, but keep it inside the image when the
    # box touches the top edge (otherwise the label would be drawn off-image).
    text_y = max(y1 - 10, text_size[1] + 10)
    rect_x1 = text_x - 5
    rect_y1 = text_y - text_size[1] - 10
    rect_x2 = text_x + text_size[0] + 5
    rect_y2 = text_y + 5

    # Filled black background so the white text stays readable.
    cv2.rectangle(image, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 0, 0), -1)
    cv2.putText(image, text, (text_x, text_y), font, font_scale, (255, 255, 255), font_thickness)

# cv2_imshow renders inline in the notebook, so no HighGUI window needs to be
# waited on or destroyed afterwards.
cv2_imshow(image)

# cv2.imwrite reports failure only through its return value.
if not cv2.imwrite("chair_detection_with_depth.jpg", image):
    print("Warning: could not write chair_detection_with_depth.jpg")