# MegaSAM Pipeline - Google Colab

This notebook runs the MegaSAM pipeline for camera tracking and depth estimation.

**Requirements:** GPU runtime (Runtime > Change runtime type > T4 GPU)

In [None]:
# Check GPU availability
# Runs the NVIDIA status tool in a shell; if no GPU runtime is attached the
# command fails instead of printing a device table.
!nvidia-smi

## 1. Clone Repository and Initialize Submodules

In [None]:
# Clone the repository
!git clone https://github.com/JonnyShiUW/cse455-mega-sam-impl.git
%cd cse455-mega-sam-impl

# Initialize submodules
!git submodule update --init --recursive

In [None]:
# Move the kernel's working directory into the repo's `implementation` folder;
# every relative path in the following cells is resolved from here.
# The path is relative, so run this cell once, directly after the clone cell.
%cd implementation

## 2. Install Dependencies

In [None]:
# Use Colab's pre-installed PyTorch (compatible with the environment)
# Just install the additional dependencies
# NOTE(review): numpy is pinned to 1.26.3 below; the reason for the pin is not
# visible in this notebook. If it replaces the runtime's numpy, a runtime
# restart may be required - confirm.
!pip install opencv-python-headless tqdm imageio einops scipy matplotlib 
!pip install timm ninja numpy==1.26.3 huggingface-hub kornia
# The torch-scatter wheel index URL is assembled from the installed torch
# release (local "+..." suffix stripped) and its CUDA version with the dot
# removed, i.e. .../torch-<version>+cu<cuda>.html
!pip install torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__.split('+')[0])")+cu$(python -c "import torch; print(torch.version.cuda.replace('.',''))").html

In [None]:
# Install xformers (compatible with Colab's PyTorch version)
# NOTE(review): the install is unpinned, so pip picks the latest release. If
# that release requires a different torch than the preinstalled one, pip may
# replace torch - verify the torch version after this cell.
!pip install xformers

In [None]:
# Install UniDepth
# Unpinned install from the default package index.
# NOTE(review): confirm this package name resolves to the intended UniDepth.
!pip install unidepth

## 3. Compile DROID-SLAM Extensions

In [None]:
%cd DROID-SLAM
!python setup.py install
%cd ..

## 4. Download Model Checkpoints

In [None]:
# Create checkpoint directories
!mkdir -p mega-sam/Depth-Anything/checkpoints

# Download DepthAnything checkpoint (~1.2GB)
!wget -O mega-sam/Depth-Anything/checkpoints/depth_anything_vitl14.pth \
    "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth"

print("DepthAnything checkpoint downloaded!")

In [None]:
# Download RAFT checkpoint (~78MB)
!wget -O mega-sam/cvd_opt/raft-things.pth \
    "https://www.dropbox.com/s/4j4z58wuv8o0mfz/raft-things.pth?dl=1"

print("RAFT checkpoint downloaded!")

In [None]:
# Verify checkpoints
# The first two paths are the targets written by the two download cells;
# compare the listed sizes with the expected sizes noted there.
!ls -lh mega-sam/Depth-Anything/checkpoints/
!ls -lh mega-sam/cvd_opt/raft-things.pth
# NOTE(review): nothing in this notebook creates mega-sam/checkpoints/ -
# presumably it ships with the repository or a submodule; verify.
!ls -lh mega-sam/checkpoints/

## 5. Upload Your Input Frames

Upload your `test_video` folder of JPEG frames into the current working directory (the repo's `implementation` folder), or use the sample frames if they are included in the repo.

In [None]:
# Check if test_video frames exist
# Show the first 10 entries of the folder.
!ls test_video/ | head -10
# Count the frames. Only names ending in lowercase `.jpg` are matched, and
# errors are suppressed, so a missing folder or no matches prints 0.
!ls test_video/*.jpg 2>/dev/null | wc -l

In [None]:
# If you need to upload frames from Google Drive:
# from google.colab import drive
# drive.mount('/content/drive')
# !cp -r /content/drive/MyDrive/your_frames_folder ./test_video

## 6. Run MegaSAM Pipeline

In [None]:
# Sanity-check the PyTorch / CUDA runtime before launching the pipeline
import torch

cuda_ready = torch.cuda.is_available()
# Only query the device name when CUDA is usable; otherwise report a placeholder.
if cuda_ready:
    cuda_device_name = torch.cuda.get_device_name(0)
else:
    cuda_device_name = 'N/A'

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA device: {cuda_device_name}")

In [None]:
# Run the pipeline!
# `main.py` is resolved relative to the current working directory, so the
# kernel must still be in the directory entered by the earlier `%cd` cells.
!python main.py

## 7. Visualizations for Presentation

In [None]:
# Check outputs
# `outputs_cvd/` holds the .npz file loaded by the following cells.
!ls -lh outputs_cvd/
# NOTE(review): `reconstructions/` is not read anywhere else in this notebook;
# presumably another pipeline output directory - confirm.
!ls -lh reconstructions/

In [None]:
# Load and inspect final output
import numpy as np

# `data` is used by all later visualization cells.
data = np.load("outputs_cvd/marching_sgd_cvd_hr.npz")
print("Output contents:")
for key in data.files:
    # Fetch each array once: every `data[key]` lookup on a loaded .npz reads
    # the array from the archive again, so two lookups per key double the I/O.
    arr = data[key]
    print(f"  {key}: shape={arr.shape}, dtype={arr.dtype}")

In [None]:
# Preview the first frame next to its estimated depth map
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
rgb_ax, depth_ax = axes

# Each entry: (target axes, image to draw, panel title, extra imshow options)
preview_panels = (
    (rgb_ax, data['images'][0], 'Input Frame 0', {}),
    (depth_ax, data['depths'][0], 'Estimated Depth 0', {'cmap': 'turbo'}),
)
for panel_ax, panel_img, panel_title, imshow_opts in preview_panels:
    panel_ax.imshow(panel_img, **imshow_opts)
    panel_ax.set_title(panel_title)
    panel_ax.axis('off')

plt.tight_layout()
plt.show()

### Grid View: Multiple Frames with Depth Maps

In [None]:
# Show grid of frames at different time points:
# top row = RGB frames, bottom row = the matching depth maps.
import matplotlib.pyplot as plt
import numpy as np

# Pull both arrays out once. Indexing the object returned by np.load for an
# .npz reads the whole array from the archive on every lookup, so doing it
# inside the loop would re-read each array once per panel.
images = data['images']
depths = data['depths']

total_frames = len(images)
if total_frames == 0:
    raise ValueError("No frames found in data['images']; nothing to plot.")

# Never request more panels than there are frames (avoids duplicate columns).
n_samples = min(6, total_frames)
indices = np.linspace(0, total_frames - 1, n_samples, dtype=int)

# squeeze=False keeps `axes` two-dimensional even when there is one column.
fig, axes = plt.subplots(2, n_samples, figsize=(20, 7), squeeze=False)

for i, idx in enumerate(indices):
    # RGB frame
    axes[0, i].imshow(images[idx])
    axes[0, i].set_title(f'Frame {idx}')

    # Depth map
    axes[1, i].imshow(depths[idx], cmap='turbo')
    axes[1, i].set_title(f'Depth {idx}')

# Hide ticks and frames but leave the axes switched on: axis('off') would also
# hide the axis labels, so the row labels below would never be drawn.
for panel in axes.ravel():
    panel.set_xticks([])
    panel.set_yticks([])
    for spine in panel.spines.values():
        spine.set_visible(False)

axes[0, 0].set_ylabel('RGB', fontsize=14)
axes[1, 0].set_ylabel('Depth', fontsize=14)

plt.suptitle('MegaSAM: RGB Frames and Estimated Depth Maps', fontsize=16)
plt.tight_layout()
plt.savefig('grid_visualization.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: grid_visualization.png")

### Camera Trajectory Visualization (3D)

In [None]:
# Draw the estimated camera path as a 3D line with time-coloured samples
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

# The indexing below takes rows 0-2 of column 3 from each matrix in the stack,
# i.e. the translation part of every camera-to-world transform.
cam_c2w = data['cam_c2w']
positions = cam_c2w[:, :3, 3]
xs, ys, zs = positions[:, 0], positions[:, 1], positions[:, 2]

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# Continuous path
ax.plot(xs, ys, zs, 'b-', linewidth=2, label='Camera Path')

# Endpoints: first pose in green, last pose in red
endpoint_styles = (
    (positions[0], {'color': 'green', 'label': 'Start', 'marker': 'o'}),
    (positions[-1], {'color': 'red', 'label': 'End', 'marker': 's'}),
)
for endpoint, style in endpoint_styles:
    ax.scatter(*endpoint, s=100, **style)

# One colour per pose, running through the viridis map from first to last
colors = plt.cm.viridis(np.linspace(0, 1, len(positions)))
ax.scatter(xs, ys, zs, c=colors, s=20, alpha=0.6)

ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.set_title('Estimated Camera Trajectory (MegaSAM)')
ax.legend()

plt.tight_layout()
plt.savefig('camera_trajectory.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: camera_trajectory.png")

### Animated GIF: Side-by-Side RGB and Depth

In [None]:
# Create side-by-side animated GIF (input RGB on the left, depth on the right)
import imageio
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from tqdm import tqdm

frames_for_gif = []
depths = data['depths']
images = data['images']

# Normalize depths with one global range so colours are comparable across frames
depth_min, depth_max = depths.min(), depths.max()
# A constant depth volume would give a zero range and a 0/0 division below.
depth_range = (depth_max - depth_min) or 1.0

print("Creating animation frames...")
for i in tqdm(range(len(images))):
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # RGB
    axes[0].imshow(images[i])
    axes[0].set_title('Input RGB', fontsize=14)
    axes[0].axis('off')

    # Depth with consistent colormap
    depth_normalized = (depths[i] - depth_min) / depth_range
    depth_colored = cm.turbo(depth_normalized)[:, :, :3]
    axes[1].imshow(depth_colored)
    axes[1].set_title('Estimated Depth', fontsize=14)
    axes[1].axis('off')

    plt.suptitle(f'MegaSAM Output - Frame {i}/{len(images)-1}', fontsize=16)
    plt.tight_layout()

    # Convert figure to image.
    # buffer_rgba() replaces canvas.tostring_rgb(), which is deprecated and
    # missing from recent matplotlib releases. It already has (height, width, 4)
    # shape, so no manual reshape is needed. Copy the RGB channels because the
    # buffer belongs to the figure that is closed right after.
    fig.canvas.draw()
    rgba = np.asarray(fig.canvas.buffer_rgba())
    frames_for_gif.append(rgba[..., :3].copy())
    plt.close(fig)  # release the figure; one is created per frame

# Save as GIF
print("Saving GIF...")
imageio.mimsave('megasam_output.gif', frames_for_gif, fps=10)
print("Saved: megasam_output.gif")

# Display in notebook
from IPython.display import Image, display
display(Image(filename='megasam_output.gif'))

## 8. Download Results

In [None]:
# Download all visualization files for your presentation
from google.colab import files
import shutil

# Create a zip with all results
!mkdir -p megasam_presentation
!cp grid_visualization.png megasam_presentation/
!cp camera_trajectory.png megasam_presentation/
!cp megasam_output.gif megasam_presentation/
!cp outputs_cvd/marching_sgd_cvd_hr.npz megasam_presentation/

shutil.make_archive('megasam_presentation', 'zip', 'megasam_presentation')

print("Files included:")
!ls -lh megasam_presentation/

print("\nDownloading zip file...")
files.download('megasam_presentation.zip')

In [None]:
# Or save to Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# !cp -r outputs_cvd /content/drive/MyDrive/megasam_results/