In [1]:
# Optional config for better memory efficiency.
# The allocator option is written to the environment before `torch` is imported below
# (presumably so the CUDA caching allocator sees it when it initializes — confirm against the PyTorch docs).
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Required imports
import torch
from mapanything.models import MapAnything
from mapanything.utils.image import load_images

In [2]:
# Pick the inference device: prefer the GPU whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained weights. This needs internet access or a pre-populated
# Hugging Face hub cache. For the Apache 2.0 licensed checkpoint, use
# "facebook/map-anything-apache" instead.
model = MapAnything.from_pretrained("facebook/map-anything")

# Move the loaded model onto the selected device.
model = model.to(device)

Loading pretrained dinov2_vitg14 from torch hub


Using cache found in C:\Users\jordan/.cache\torch\hub\facebookresearch_dinov2_main
  return t.to(


In [3]:
from read_write_model import read_model
from read_write_model import extract_intrinsics_extrinsics_from_colmap
from read_write_model import get_image_paths

In [4]:
# Location of the COLMAP sparse reconstruction to read (adjust to your own data).
colmap_path = "../../mvat/data/D3/colmap/sparse/0"

# Fail early, with a readable message, when the reconstruction directory is missing.
assert os.path.exists(colmap_path), "COLMAP path does not exist. Please check the path."

# Parse the reconstruction; pass ext=".bin" instead for binary COLMAP models.
cameras, images, points3D = read_model(colmap_path, ext=".txt")
print(
    f"Loaded COLMAP model with {len(cameras)} cameras, {len(images)} images, {len(points3D)} points"
)

# Derive the camera intrinsics / extrinsics arrays from the parsed model and report their shapes.
intrinsics, extrinsics = extract_intrinsics_extrinsics_from_colmap(cameras, images)
print(f"Extracted intrinsics shape: {intrinsics.shape}")
print(f"Extracted extrinsics shape: {extrinsics.shape}")

# Resolve image paths from the COLMAP image records and keep only the first four.
# NOTE(review): how paths are resolved relative to `colmap_path` depends on
# `get_image_paths` — confirm the images actually live where it expects.
image_paths = get_image_paths(images, colmap_path)[:4]

Loaded COLMAP model with 1 cameras, 68 images, 20825 points
Extracted intrinsics shape: (68, 3, 3)
Extracted extrinsics shape: (68, 4, 4)


In [None]:
# Inspect the keys of the `images` mapping returned by `read_model`.
images.keys()

In [None]:
# Show the cameras next to the first four COLMAP image records.
# `images` is a mapping (it exposes `.keys()`), so it cannot be sliced directly;
# slice a list of its values instead.
cameras, list(images.values())[:4]

In [None]:
# Load and preprocess the input views from an explicit list of image paths.
# The list is bound to `image_files` rather than `images` so that the COLMAP
# `images` mapping returned by `read_model` is not overwritten; this keeps the
# earlier inspection cells re-runnable in any order.
image_files = [
    "../data/4356/images/T_S04856.jpg",
    "../data/4356/images/T_S04857.jpg",
    "../data/4356/images/T_S04858.jpg",
    "../data/4356/images/T_S04859.jpg",
]

# Depth map paths (presumably matching the views above by file name — confirm).
# Not consumed by anything in this cell; only the first entry is enabled.
depth_maps = [
    "../../mvat/data/D3/colmap/depth/T_S04856.tif",
    # "../../mvat/data/D3/colmap/depth/T_S04857.tif",
    # "../../mvat/data/D3/colmap/depth/T_S04858.tif",
    # "../../mvat/data/D3/colmap/depth/T_S04859.tif",
]

# Build the model-ready views from the image files.
views = load_images(image_files)

In [None]:
# Run inference on the loaded views; the result is kept in `predictions`
# and read per view by the cells that follow.
predictions = model.infer(
    views,                            # Input views
    memory_efficient_inference=False, # Trades off speed for more views (up to 2000 views on 140 GB)
    use_amp=True,                     # Use mixed precision inference (recommended)
    amp_dtype="bf16",                 # bf16 inference (recommended; falls back to fp16 if bf16 not supported)
    apply_mask=True,                  # Apply masking to dense geometry outputs
    mask_edges=True,                  # Remove edge artifacts by using normals and depth
    apply_confidence_mask=False,      # Filter low-confidence regions (disabled here)
    confidence_percentile=10,         # Remove bottom 10 percentile confidence pixels (presumably only used when apply_confidence_mask=True — confirm)
)


In [None]:

# Access results for each view - Complete list of metric outputs
# Every name below is rebound on each iteration, so once the loop finishes
# they hold the values of the last view only. The index `i` is not used in the body.
for i, pred in enumerate(predictions):
    # Geometry outputs
    pts3d = pred["pts3d"]                     # 3D points in world coordinates (B, H, W, 3)
    pts3d_cam = pred["pts3d_cam"]             # 3D points in camera coordinates (B, H, W, 3)
    depth_z = pred["depth_z"]                 # Z-depth in camera frame (B, H, W, 1)
    depth_along_ray = pred["depth_along_ray"] # Depth along ray in camera frame (B, H, W, 1)

    # Camera outputs
    ray_directions = pred["ray_directions"]   # Ray directions in camera frame (B, H, W, 3)
    # NOTE(review): this rebinds `intrinsics`, replacing the COLMAP-derived array
    # produced by `extract_intrinsics_extrinsics_from_colmap` in the earlier cell.
    intrinsics = pred["intrinsics"]           # Recovered pinhole camera intrinsics (B, 3, 3)
    camera_poses = pred["camera_poses"]       # OpenCV (+X - Right, +Y - Down, +Z - Forward) cam2world poses in world frame (B, 4, 4)
    cam_trans = pred["cam_trans"]             # OpenCV (+X - Right, +Y - Down, +Z - Forward) cam2world translation in world frame (B, 3)
    cam_quats = pred["cam_quats"]             # OpenCV (+X - Right, +Y - Down, +Z - Forward) cam2world quaternion in world frame (B, 4)

    # Quality and masking
    confidence = pred["conf"]                 # Per-pixel confidence scores (B, H, W)
    mask = pred["mask"]                       # Combined validity mask (B, H, W, 1)
    non_ambiguous_mask = pred["non_ambiguous_mask"]                # Non-ambiguous regions (B, H, W)
    non_ambiguous_mask_logits = pred["non_ambiguous_mask_logits"]  # Mask logits (B, H, W)

    # Scaling
    metric_scaling_factor = pred["metric_scaling_factor"]  # Applied metric scaling (B,)

    # Original input
    img_no_norm = pred["img_no_norm"]         # Denormalized input images for visualization (B, H, W, 3)

In [None]:
import matplotlib.pyplot as plt
from depth_anything_3.utils.visualize import visualize_depth

# Plot every input view (top row) above its predicted z-depth map (bottom row).
n_images = len(predictions)

# squeeze=False keeps `axes` a (2, n_images) array even for a single view,
# so the [row, column] indexing below needs no special case.
fig, axes = plt.subplots(2, n_images, figsize=(12, 6), squeeze=False)

for i in range(n_images):
    image_ax, depth_ax = axes[0, i], axes[1, i]

    # Top row: the denormalized input image.
    image_ax.imshow(predictions[i]["img_no_norm"].squeeze().cpu().numpy())
    image_ax.set_title(f"Input {i+1}")
    image_ax.axis('off')

    # Bottom row: the z-depth prediction rendered through a colormap.
    depth_vis = visualize_depth(
        predictions[i]["depth_z"].squeeze().cpu().numpy(),
        cmap="Spectral",
    )
    depth_ax.imshow(depth_vis)
    depth_ax.set_title(f"Depth {i+1}")
    depth_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Inspect the z-depth prediction of the first view as a NumPy array (size-1 dimensions dropped by squeeze()).
predictions[0]['depth_z'].squeeze().cpu().numpy()

In [None]:
# Inspect the z-depth prediction of the last view as a NumPy array (size-1 dimensions dropped by squeeze()).
predictions[-1]['depth_z'].squeeze().cpu().numpy()