In [1]:
!pip install torch torchvision opencv-python open3d timm

Collecting open3d
  Downloading open3d-0.19.0-cp310-cp310-manylinux_2_31_x86_64.whl.metadata (4.3 kB)
Collecting dash>=2.6.0 (from open3d)
  Downloading dash-2.18.2-py3-none-any.whl.metadata (10 kB)
Collecting configargparse (from open3d)
  Downloading ConfigArgParse-1.7-py3-none-any.whl.metadata (23 kB)
Collecting ipywidgets>=8.0.4 (from open3d)
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting addict (from open3d)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting pyquaternion (from open3d)
  Downloading pyquaternion-0.9.9-py3-none-any.whl.metadata (1.4 kB)
Collecting flask>=3.0.0 (from open3d)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting werkzeug>=3.0.0 (from open3d)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash>=2.6.0->open3d)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 

In [2]:
import cv2
import torch
import numpy as np
import open3d as o3d
from tqdm import tqdm
import os
import plotly.graph_objs as go

In [3]:
# Define function to extract frames
def extract_frames(video_path, output_folder):
    """Dump every frame of a video into ``output_folder`` as JPEG images.

    Frames are written as ``frame_<index>.jpg`` where ``<index>`` is the
    zero-based position of the frame in read order.

    Args:
        video_path: Path of the video handed to ``cv2.VideoCapture``.
        output_folder: Directory that receives the images; created if missing.

    Returns:
        None. Progress and problems are reported with ``print``; when the
        video cannot be opened an error line is printed and nothing is written.
    """
    # Open the video
    video_capture = cv2.VideoCapture(video_path)

    # try/finally guarantees the capture handle is released on every exit
    # path: the early return below as well as any exception raised mid-loop.
    try:
        # Check if video opened successfully
        if not video_capture.isOpened():
            print("Error opening video stream or file")
            return

        # Create output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        frame_count = 0
        failed_writes = 0
        while True:
            ret, frame = video_capture.read()
            if not ret:
                # No more frames (or a read error): stop extracting.
                break

            frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
            # cv2.imwrite signals failure through its boolean return value;
            # count the misses instead of silently ignoring them.
            if not cv2.imwrite(frame_filename, frame):
                failed_writes += 1
            # The index advances per frame read so file names keep matching
            # frame positions even if an individual write failed.
            frame_count += 1
    finally:
        video_capture.release()

    if failed_writes:
        print(f"Warning: {failed_writes} of {frame_count} frames could not be written")
    print(f"Frames extracted and saved to {output_folder}")

# Example usage: write every frame of the sample clip into /content/frames as
# frame_<n>.jpg. Both paths are absolute `/content/...` locations (presumably
# a Colab runtime -- adjust them when running elsewhere).
video_path = "/content/leo.mp4"
output_folder = "/content/frames"
extract_frames(video_path, output_folder)

Frames extracted and saved to /content/frames


In [3]:
def create_dirs(base_path):
    """Ensure the two output folders exist below ``base_path``.

    Args:
        base_path: Parent directory for all generated artefacts.

    Returns:
        A ``(depth_dir, cloud_dir)`` tuple holding the paths of the
        ``depth_maps`` and ``point_clouds`` sub-directories, in that order.
    """
    output_dirs = tuple(
        os.path.join(base_path, subfolder)
        for subfolder in ('depth_maps', 'point_clouds')
    )
    # exist_ok=True keeps repeated calls harmless.
    for directory in output_dirs:
        os.makedirs(directory, exist_ok=True)
    return output_dirs

def depth_to_colored_point_cloud(depth_map, color_image, fx=525.0, fy=525.0, cx=319.5, cy=239.5):
    """Back-project a depth map into 3-D points paired with per-pixel colours.

    A pixel at row ``r`` and column ``c`` with depth ``z`` becomes the point
    ``((c - cx) * z / fx, (r - cy) * z / fy, z)``. Pixels whose depth is not
    strictly positive are discarded together with their colour.

    Args:
        depth_map: 2-D array of depth values, indexed ``[row, column]``.
        color_image: Array that is flattened to ``(-1, 3)`` and divided by
            255; it must provide one 3-channel colour per depth pixel.
        fx: Divisor applied to the x coordinate (focal length along columns).
        fy: Divisor applied to the y coordinate (focal length along rows).
        cx: Column offset subtracted before projecting (principal point x).
        cy: Row offset subtracted before projecting (principal point y).

    Returns:
        ``(points, colors)``: an ``(N, 3)`` array of xyz coordinates and the
        matching ``(N, 3)`` array of colours scaled by ``1 / 255``.
    """
    height, width = depth_map.shape

    # Broadcastable pixel-index grids: columns as a row vector, rows as a
    # column vector, so no full (height, width) index arrays are materialised.
    col_idx = np.arange(width)[np.newaxis, :]
    row_idx = np.arange(height)[:, np.newaxis]

    x_coords = (col_idx - cx) * depth_map / fx
    y_coords = (row_idx - cy) * depth_map / fy

    # Stack along a trailing axis, then flatten to one xyz row per pixel.
    xyz = np.dstack((x_coords, y_coords, depth_map)).reshape(-1, 3)
    rgb = color_image.reshape(-1, 3) / 255.0

    # Keep only pixels that carry a strictly positive depth.
    valid = xyz[:, 2] > 0
    return xyz[valid], rgb[valid]

In [4]:
def _normalize_depth_to_uint8(depth_map):
    """Linearly rescale an array to the 0-255 range and cast it to ``uint8``.

    A constant (or non-finite-range) input has nothing to stretch; it maps to
    all zeros instead of dividing by zero.
    """
    depth_min = depth_map.min()
    depth_range = depth_map.max() - depth_min
    if not depth_range > 0:
        return np.zeros(depth_map.shape, dtype=np.uint8)
    return ((depth_map - depth_min) / depth_range * 255).astype(np.uint8)


def process_video(video_path, base_path):
    """Estimate a depth map and a coloured point cloud for every video frame.

    For each frame the MiDaS ``DPT_Large`` model is run, its prediction is
    resized to the frame resolution, min-max normalised to 8 bits and saved as
    ``<base_path>/depth_maps/depth_<idx>.png``. The same 8-bit map is
    back-projected with ``depth_to_colored_point_cloud`` (called with its
    default intrinsics, whatever the frame size) and written to
    ``<base_path>/point_clouds/cloud_<idx>.ply``.

    NOTE(review): MiDaS documents its output as *relative inverse* depth, yet
    the normalised map is used directly as ``z``. The clouds are therefore not
    metric and near/far ordering may be flipped -- confirm this is intended.

    Args:
        video_path: Path of the input video handed to ``cv2.VideoCapture``.
        base_path: Directory under which the two output folders are created.

    Returns:
        None. If the video cannot be opened an error line is printed and no
        model is loaded.
    """
    depth_dir, cloud_dir = create_dirs(base_path)

    cap = cv2.VideoCapture(video_path)
    # try/finally releases the capture on every exit path, including errors
    # raised by model loading, inference or file writing.
    try:
        # Fail fast before the expensive model download/load.
        if not cap.isOpened():
            print("Error opening video stream or file")
            return

        midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        midas.to(device).eval()

        midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
        transform = midas_transforms.dpt_transform

        # The container's frame count is only used to size the progress bar;
        # the loop itself runs until read() fails so an under-reported count
        # cannot silently drop trailing frames.
        reported_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_idx = 0
        with tqdm(total=reported_frames if reported_frames > 0 else None) as progress:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV decodes to BGR; the MiDaS transform expects RGB.
                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                input_batch = transform(img).to(device)

                with torch.no_grad():
                    prediction = midas(input_batch)
                    # Resize the network output back to the frame resolution.
                    prediction = torch.nn.functional.interpolate(
                        prediction.unsqueeze(1),
                        size=img.shape[:2],
                        mode="bicubic",
                        align_corners=False,
                    ).squeeze()

                depth_map = _normalize_depth_to_uint8(prediction.cpu().numpy())

                # Generate colored point cloud
                points, colors = depth_to_colored_point_cloud(depth_map, img)
                pcd = o3d.geometry.PointCloud()
                pcd.points = o3d.utility.Vector3dVector(points)
                pcd.colors = o3d.utility.Vector3dVector(colors)

                # Save outputs
                cv2.imwrite(os.path.join(depth_dir, f'depth_{frame_idx:06d}.png'), depth_map)
                o3d.io.write_point_cloud(os.path.join(cloud_dir, f'cloud_{frame_idx:06d}.ply'), pcd)

                frame_idx += 1
                progress.update(1)
    finally:
        cap.release()

In [None]:
# Run the depth / point-cloud pipeline on the sample clip. Results land in
# <base_path>/depth_maps and <base_path>/point_clouds. This is the slow cell:
# the run recorded below needed roughly 6.5 s per frame.
video_path = '/content/leo.mp4'
base_path = '/content/output'
process_video(video_path, base_path)

Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master
100%|██████████| 58/58 [06:21<00:00,  6.58s/it]


In [None]:
def visualize_point_cloud(pcd, marker_size=2):
    """Render a point cloud as an interactive Plotly 3-D scatter plot.

    Args:
        pcd: Object exposing ``points`` and ``colors`` attributes that
            ``np.asarray`` can convert (the notebook passes an Open3D point
            cloud). ``colors`` may be empty, in which case Plotly's default
            marker colour is used.
        marker_size: Size of each scatter marker.

    Returns:
        None. The figure is displayed through ``fig.show()``.
    """
    # Extract point cloud data; reshape so an empty cloud still has 3 columns.
    points = np.asarray(pcd.points).reshape(-1, 3)
    colors = np.asarray(pcd.colors)

    marker = dict(size=marker_size, opacity=0.8)

    # A cloud may carry no colours (or a mismatched number of them). Calling
    # ``max()`` on an empty array raises, so only colourise when usable.
    if colors.size > 0 and len(colors) == len(points):
        # Heuristic kept from the original: a maximum of at most 1 is taken to
        # mean normalised 0-1 channels, which are scaled up to 0-255.
        if colors.max() <= 1:
            colors = colors * 255
        # Always emit integer channels so the CSS strings read 'rgb(200,10,5)'
        # rather than 'rgb(200.0,10.0,5.0)'.
        colors = np.clip(colors, 0, 255).astype(np.uint8)

        # Combine colors for Plotly (format as 'rgb(r, g, b)')
        marker['color'] = [f'rgb({r},{g},{b})' for r, g, b in colors.tolist()]

    # Create a 3D scatter plot using Plotly
    fig = go.Figure(data=[go.Scatter3d(
        x=points[:, 0],
        y=points[:, 1],
        z=points[:, 2],
        mode='markers',
        marker=marker
    )])

    fig.update_layout(
        scene=dict(
            xaxis_title="X-axis",
            yaxis_title="Y-axis",
            zaxis_title="Z-axis",
            aspectmode="data",
        ),
        title="Point Cloud Visualization",
        margin=dict(l=0, r=0, b=0, t=40)
    )

    fig.show()

# Load one of the clouds written by process_video (frame index 47, following
# its cloud_<idx>.ply naming under /content/output/point_clouds) and show it.
pcd_file = '/content/output/point_clouds/cloud_000047.ply'

pcd = o3d.io.read_point_cloud(pcd_file)

visualize_point_cloud(pcd)
