<a href="https://colab.research.google.com/github/JanNogga/grid_fusion_pytorch/blob/main/colab_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install grid-fusion-pytorch



In [2]:
import os
import torch
import timeit
import numpy as np

# we only need moviepy if we want to visualize the fused voxel grids slice-by-slice
from moviepy.editor import ImageSequenceClip

# these modules are compiled the first time they are loaded
from torch_fusion.util import sample_rays, download_example_data, soften_semseg
from torch_fusion.counting_model import apply_counting_model
from torch_fusion.point_cloud_fusion import apply_point_cloud_fusion

Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/counting_model_util_cuda/build.ninja...
Building extension module counting_model_util_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module counting_model_util_cuda...
Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/point_cloud_fusion_util_cuda/build.ninja...
Building extension module point_cloud_fusion_util_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module point_cloud_fusion_util_cuda...


In [3]:
# make sure to use a GPU session in colab
assert torch.cuda.is_available()
device = torch.device("cuda")

In [4]:
# to show examples we need some example data - download it
example_data_path = 'example_data_download'
download_example_data(out_dir=example_data_path)
local_files = os.path.join(example_data_path, 'grid_fusion_pytorch_example_data')

Output directory already exists. Check if data is already downloaded, else delete example_data_download and try again.


### Counting model

In [5]:
# define batch size and number of classes for this example
B, C = 1, 39
# define resolution of the map
H, W, D = 140, 112, 100
# initialize empty counter map - the two channels are hits + misses
counter_map_in = torch.zeros([B, 2, H, W, D]).to(device)
# intialize semantic map with uniform distribution
semantic_map_in = torch.ones([B, C, H, W, D]).to(device)
semantic_map_in *= 1 / (semantic_map_in.shape[1])
# convert probabilities to log probs
semantic_map_in = torch.log(semantic_map_in)

# furthermore we need to specify the dimensions of this map in the real world
range_min = torch.load(os.path.join(local_files,'example_world_limits_lower.pt')).cuda()
print('Loaded world dimensions lower limits. Shape:', range_min.shape)
range_max = torch.load(os.path.join(local_files,'example_world_limits_upper.pt')).cuda()
print('Loaded world dimensions upper limits. Shape:', range_max.shape)
# this can be specified per batch element, else assume each map in the batch is equally large

Loaded world dimensions lower limits. Shape: torch.Size([3])
Loaded world dimensions upper limits. Shape: torch.Size([3])


In [6]:
cam_pose_batch = torch.load(os.path.join(local_files,'example_cam_pose_batch.pt')).cuda()
print('Loaded batch of camera poses. Shape:', cam_pose_batch.shape)
cam_k_batch = torch.load(os.path.join(local_files,'example_cam_k_batch.pt')).cuda()
print('Loaded batch of camera intrinsics. Shape:', cam_k_batch.shape)
depth_batch = torch.load(os.path.join(local_files,'example_depth_batch.pt')).cuda()
print('Loaded batch of depth images. Shape:', depth_batch.shape)
semseg_batch = torch.load(os.path.join(local_files,'example_semseg_batch.pt')).cuda()
print('Loaded batch of semantic segmentation masks. Shape:', semseg_batch.shape)

ray_origs, ray_dirs, _, _ = sample_rays(cam_pose_batch, cam_k_batch, depth=depth_batch, normalize=False)
print('Calculated ray origins. Shape:', ray_origs.shape)
print('Calculated ray directions. Shape:', ray_dirs.shape)
depth_reshape = depth_batch.squeeze(2).flatten(-2,-1)
print('Reshaped depth accordingly. Shape:', depth_reshape.shape)
# reshape semseg to match rays
semseg_reshape = semseg_batch.squeeze(2).flatten(-2,-1)
# convert class labels to eps-soft log probs
semseg_soft = torch.log(soften_semseg(semseg_reshape))
# mask background values with nan
semseg_soft[semseg_reshape == -1] = torch.full((semseg_soft.shape[-1],), fill_value=float('nan'), device=semseg_soft.device)
print('Converted semantic segmentation to eps-soft log probs. Shape:', semseg_soft.shape)

Loaded batch of camera poses. Shape: torch.Size([1, 22, 4, 4])
Loaded batch of camera intrinsics. Shape: torch.Size([1, 22, 3, 3])
Loaded batch of depth images. Shape: torch.Size([1, 22, 1, 480, 640])
Loaded batch of semantic segmentation masks. Shape: torch.Size([1, 22, 1, 480, 640])
Calculated ray origins. Shape: torch.Size([1, 22, 3])
Calculated ray directions. Shape: torch.Size([1, 22, 307200, 3])
Reshaped depth accordingly. Shape: torch.Size([1, 22, 307200])
Converted semantic segmentation to eps-soft log probs. Shape: torch.Size([1, 22, 307200, 39])


In [7]:
# no semantic annotation for rays, no semantic maps
counter_map_out_A = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=None,
                                                         ray_semseg=None, n_steps=4096, verbose=True)
print('Computed output counter map. Shape:', counter_map_out_A.shape, '\n')
# semantic annotation for rays, but no semantic maps
counter_map_out_B = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=None,
                                                         ray_semseg=semseg_soft, n_steps=4096, verbose=True)
print('Computed output counter map. Shape:', counter_map_out_B.shape, '\n')
# no semantic annotation for rays, but semantic maps
counter_map_out_C = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=semantic_map_in,
                                                         ray_semseg=None, n_steps=4096, verbose=True)
print('Computed output counter map. Shape:', counter_map_out_C.shape, '\n')
# semantic annotation for rays and semantic maps
counter_map_out_D, semantic_map_out_D = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=semantic_map_in,
                                                         ray_semseg=semseg_soft, n_steps=4096, verbose=True)
print('Computed output semantic map. Shape:', semantic_map_out_D.shape)
print('Computed output counter map. Shape:', counter_map_out_D.shape, '\n')

Computed output counter map. Shape: torch.Size([1, 2, 140, 112, 100]) 

Computed output counter map. Shape: torch.Size([1, 2, 140, 112, 100]) 

Computed output counter map. Shape: torch.Size([1, 2, 140, 112, 100]) 

Computed output semantic map. Shape: torch.Size([1, 39, 140, 112, 100])
Computed output counter map. Shape: torch.Size([1, 2, 140, 112, 100]) 



In [8]:
from torch_fusion.util import CMAP_39, visualize_voxel_grid
# visualize the results
# cmap needs to be num_classes x 3 tensor
cmap = CMAP_39.to(device)
vis_dict = visualize_voxel_grid(counter_map_out_D, grid_semantic=semantic_map_out_D, color_map=cmap)
fps = 10
# create clip
clip_reflect_z = ImageSequenceClip(list(vis_dict['reflectance']['z_axis']), fps=fps)
clip_reflect_z.ipython_display(width = 360, autoplay=1, loop=1)

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




In [9]:
n_trials = 10
n_calls = 10
timed_func = timeit.Timer(lambda: apply_counting_model(counter_map_in, ray_origs, ray_dirs, depth_reshape, range_min,
                                                       range_max, grid_semantic=semantic_map_in, ray_semseg=semseg_soft,
                                                       n_steps=4096, verbose=False, assert_inputs=False))
exec_times = timed_func.repeat(repeat=n_trials, number=n_calls)
num_rays = ray_dirs.shape[0]*ray_dirs.shape[1]*ray_dirs.shape[2]
print(f"Bayesian fusion took {np.mean(exec_times)/n_calls} seconds per call using {num_rays} rays.")
print(f"This corresponds to about {np.round((num_rays/(np.mean(exec_times)/n_calls))/1000000,1)} million ray casts per second.")

Bayesian fusion took 0.08790229012999588 seconds per call using 6758400 rays.
This corresponds to about 76.9 million ray casts per second.


#### Variation - only count hits

Just provide the same input but leave out the channel for misses in the counter maps.

In [10]:
# define batch size and number of classes for this example
B, C = 1, 39
# define resolution of the map
H, W, D = 140, 112, 100
# initialize empty counter map - the one channel is for the hits counter
counter_map_in = torch.zeros([B, 1, H, W, D]).to(device) # <--- !!! ONLY DIFFERENCE COMPARED TO ABOVE !!!

In [11]:
# no semantic annotation for rays, no semantic maps
occ_map_out_A = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=None,
                                                         ray_semseg=None, n_steps=4096, verbose=True)
print('Computed output occupancy map. Shape:', occ_map_out_A.shape, '\n')
# semantic annotation for rays, but no semantic maps
occ_map_out_B = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=None,
                                                         ray_semseg=semseg_soft, n_steps=4096, verbose=True)
print('Computed output occupancy map. Shape:',occ_map_out_B.shape, '\n')
# no semantic annotation for rays, but semantic maps
occ_map_out_C = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=semantic_map_in,
                                                         ray_semseg=None, n_steps=4096, verbose=True)
print('Computed output occupancy map. Shape:',occ_map_out_C.shape, '\n')
# semantic annotation for rays and semantic maps
occ_map_out_D, semantic_map_out_D = apply_counting_model(counter_map_in, ray_origs, ray_dirs,
                                                         depth_reshape, range_min, range_max,
                                                         grid_semantic=semantic_map_in,
                                                         ray_semseg=semseg_soft, n_steps=4096, verbose=True)
print('Computed output semantic map. Shape:',semantic_map_out_D.shape)
print('Computed output occupancy map. Shape:',occ_map_out_D.shape, '\n')

Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 

Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 

Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 

Computed output semantic map. Shape: torch.Size([1, 39, 140, 112, 100])
Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 



In [12]:
n_trials = 10
n_calls = 10
timed_func = timeit.Timer(lambda: apply_counting_model(counter_map_in, ray_origs, ray_dirs, depth_reshape, range_min,
                                                       range_max, grid_semantic=semantic_map_in, ray_semseg=semseg_soft,
                                                       n_steps=4096, verbose=False, assert_inputs=False))
exec_times = timed_func.repeat(repeat=n_trials, number=n_calls)
num_rays = ray_dirs.shape[0]*ray_dirs.shape[1]*ray_dirs.shape[2]
print(f"Bayesian fusion took {np.mean(exec_times)/n_calls} seconds per call using {num_rays} rays.")
print(f"This corresponds to about {np.round((num_rays/(np.mean(exec_times)/n_calls))/1000000000,1)} billion ray endpoints per second.")

Bayesian fusion took 0.009819778519986357 seconds per call using 6758400 rays.
This corresponds to about 0.7 billion ray endpoints per second.


### Directly fuse points clouds

Fusing point clouds with or without semantic annotation is possible too. Misses aren't counted in this setup.

In [13]:
# define batch size and number of classes for this example
B, C = 1, 39
# define resolution of the map
H, W, D = 140, 112, 100
# initialize empty occupancy map
occ_map_in = torch.zeros([B, 1, H, W, D]).to(device)
# intialize semantic map with uniform distribution
semantic_map_in = torch.ones([B, C, H, W, D]).to(device)
semantic_map_in *= 1 / (semantic_map_in.shape[1])
# convert probabilities to log probs
semantic_map_in = torch.log(semantic_map_in)

# furthermore we need to specify the dimensions of this map in the real world
range_min = torch.load(os.path.join(local_files,'example_world_limits_lower.pt')).cuda()
print('Loaded world dimensions lower limits. Shape:', range_min.shape)
range_max = torch.load(os.path.join(local_files,'example_world_limits_upper.pt')).cuda()
print('Loaded world dimensions upper limits. Shape:', range_max.shape)
# this can be specified per batch element, else assume each map in the batch is equally large

Loaded world dimensions lower limits. Shape: torch.Size([3])
Loaded world dimensions upper limits. Shape: torch.Size([3])


In [14]:
# load an example point cloud
# load the 3D locations
pc_locs = torch.load(os.path.join(local_files,'example_pointcloud_locations.pt')).cuda()
print('Loaded point cloud locations. Shape:', pc_locs.shape)
# next load the class probabilites per location
pc_probs = torch.load(os.path.join(local_files,'example_pointcloud_semantics.pt')).cuda()
print('Loaded point cloud class probabilities. Shape:', pc_probs.shape)
# convert these to log probs
pc_probs = torch.log(pc_probs)

# to mask points so they are not fused, set their locations to nan
test_masking = False
if test_masking:
    # arbitrarily decide which points to mask
    invalid_mask = torch.rand_like(pc_locs[:,:,0]) > 0.5
    print('Masking',invalid_mask.sum(),'locations!')
    invalid_mask = invalid_mask.unsqueeze(-1).expand(-1,-1,pc_locs.shape[-1])
    pc_locs[invalid_mask] = float('nan')

Loaded point cloud locations. Shape: torch.Size([1, 669621, 3])
Loaded point cloud class probabilities. Shape: torch.Size([1, 669621, 39])


In [15]:
# no point cloud semantics, no semantic maps
occ_map_out_A  = apply_point_cloud_fusion(occ_map_in, pc_locs, range_min, range_max,
                                                      grid_semantic=None,
                                                      point_cloud_logprobs=None, verbose=True)
print('Computed output occupancy map. Shape:', occ_map_out_A.shape, '\n')
# point cloud semantics, but no semantic map
occ_map_out_B  = apply_point_cloud_fusion(occ_map_in, pc_locs, range_min, range_max,
                                                      grid_semantic=None,
                                                      point_cloud_logprobs=pc_probs, verbose=True)
print('Computed output occupancy map. Shape:', occ_map_out_B.shape, '\n')
# semantic map, but no point cloud semantics
occ_map_out_C = apply_point_cloud_fusion(occ_map_in, pc_locs, range_min, range_max,
                                                      grid_semantic=semantic_map_in,
                                                      point_cloud_logprobs=None, verbose=True)
print('Computed output occupancy map. Shape:', occ_map_out_C.shape, '\n')
# point cloud semantics and semantic map
occ_map_out_D, semantic_map_out_D = apply_point_cloud_fusion(occ_map_in, pc_locs, range_min, range_max,
                                                      grid_semantic=semantic_map_in,
                                                      point_cloud_logprobs=pc_probs, verbose=True)
print('Computed output semantic map. Shape:', semantic_map_out_D.shape)
print('Computed output occupancy map. Shape:', occ_map_out_D.shape, '\n')

Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 

Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 

Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 

Computed output semantic map. Shape: torch.Size([1, 39, 140, 112, 100])
Computed output occupancy map. Shape: torch.Size([1, 1, 140, 112, 100]) 



In [16]:
vis_dict = visualize_voxel_grid(occ_map_out_D, grid_semantic=semantic_map_out_D, color_map=cmap)

clip_reflect_z = ImageSequenceClip(list(vis_dict['hits']['z_axis']), fps=fps)
clip_reflect_z.ipython_display(width = 360, autoplay=1, loop=1)

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                    

Moviepy - Done !
Moviepy - video ready __temp__.mp4




In [17]:
n_trials = 10
n_calls = 10
timed_func = timeit.Timer(lambda: apply_point_cloud_fusion(occ_map_in, pc_locs, range_min, range_max,
                                                           grid_semantic=semantic_map_in, point_cloud_logprobs=pc_probs,
                                                           verbose=False, assert_inputs=False))
exec_times = timed_func.repeat(repeat=n_trials, number=n_calls)
num_points = pc_locs.shape[0] * pc_locs.shape[1]
print(f"Bayesian fusion took {np.mean(exec_times)/n_calls} seconds per call using {num_points} points.")
print(f"This corresponds to about {np.round((num_points/(np.mean(exec_times)/n_calls))/1000000,1)} million fused points per second.")

Bayesian fusion took 0.0023656020700036605 seconds per call using 669621 points.
This corresponds to about 283.1 million fused points per second.
