In [1]:
from models.taichi_modules import (
    HashEncoder, DirEncoder, HashEmbedder, SHEncoder, 
    VolumeRendererTaichi
)
from models.custom_functions import VolumeRenderer
import tinycudann as tcnn
import taichi as ti
import torch

[Taichi] version 1.4.0, llvm 15.0.4, commit fbe92fd8, linux, python 3.9.16
[I 01/25/23 20:11:49.385 335178] [shell.py:_shell_pop_print@23] Graphical python shell detected, using wrapped sys.stdout




In [2]:
ti.init(arch=ti.cuda, device_memory_GB=4, kernel_profiler=True, offline_cache=False)
device = torch.device('cuda')
L=16; F=2; log2_T=19; N_min=16; b=1.3195079565048218

[Taichi] Starting on arch=cuda


In [3]:
# tcnn
cuda_hash_encoder = \
    tcnn.Encoding(
        n_input_dims=3,
        encoding_config={
            "otype": "Grid",
            "type": "Hash",
            "n_levels": L,
            "n_features_per_level": F,
            "log2_hashmap_size": log2_T,
            "base_resolution": N_min,
            "per_level_scale": b,
            "interpolation": "Linear"
        },
    ).to(device)

cuda_dir_encoder = \
    tcnn.Encoding(
        n_input_dims=3,
        encoding_config={
            "otype": "SphericalHarmonics",
            "degree": 4,
        },
    ).to(device)

cuda_render_func = VolumeRenderer.apply

In [4]:
b = cuda_hash_encoder.native_tcnn_module.hyperparams()['per_level_scale']

taichi_hash_encoder = HashEncoder(cuda_hash_encoder.params, b, 8192).to(device)
taichi_dir_encoder = DirEncoder(8192).to(device)
taichi_render_func = VolumeRendererTaichi(8192).to(device)

per_level_scale:  1.3195079565048218
offset_:  5722520


In [5]:
torch_hash_encoder = HashEmbedder()
torch_dir_encoder = SHEncoder()

In [6]:
samples_per_rays = 10
position = torch.load('./test_data/positions.t').float()
dirs = torch.load('./test_data/dir.t').float()
sigmas = torch.load('./test_data/sigmas.t').float()
rgbs = torch.load('./test_data/rgbs.t').float()
deltas = torch.load('./test_data/deltas.t').float()
ts = torch.load('./test_data/ts.t').float()
rays_a = torch.load('./test_data/rays_a.t')

## CUDA Hash

CUDA total test:
- _module_function
- _module_function_backward

In [8]:
# check forward
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    r1 = cuda_hash_encoder(position)
print('pytorch forward\n', prof.key_averages(group_by_stack_n=5).table(
    sort_by='self_cuda_time_total', row_limit=5))

# check backward
# a strange loss for better verification
loss1 = ((r1 * r1) - torch.tanh(r1)).sum()
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    loss1.backward()
print('pytorch backward\n', prof.key_averages(group_by_stack_n=5).table(
    sort_by='self_cuda_time_total', row_limit=5))

pytorch forward
 -------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
         _module_function        24.77%     270.000us        32.84%     358.000us     358.000us     644.000us        56.10%     645.000us     645.000us             1  
              aten::empty         2.39%      26.000us        18.90%     206.000us     103.000us     161.000us        14.02%     161.000us      80.500us             2  
              aten::copy_         1.65%      18.000us         3.49%      38.000us      19.000us     127.000us        11.06%     127.000us     

## CUDA Volume

In [None]:
# check forward
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    vr_samples, opacity, depth, r4, ws = VolumeRenderer.apply(sigmas, rgbs, deltas, ts, rays_a, 1e-4)
print('pytorch forward\n', prof.key_averages(group_by_stack_n=5).table(
    sort_by='self_cuda_time_total', row_limit=5))

# check backward
# a strange loss for better verification
loss4 = ((r4 * r4) - torch.tanh(r4)).sum()
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    loss4.backward()
print('pytorch backward\n', prof.key_averages(group_by_stack_n=5).table(
    sort_by='self_cuda_time_total', row_limit=5))

## Taichi

CUDA total test:
- _module_function
- _module_function_backward

CUDA total contains torch2ti and ti2torch. Use kernel_profiler instead

In [8]:
r3 = taichi_hash_encoder(position)
loss3 = ((r3 * r3) - torch.tanh(r3)).sum()
loss3.backward()
ti.profiler.clear_kernel_profiler_info()
# check forward
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    r3 = taichi_hash_encoder(position)
print('pytorch forward\n', prof.key_averages(group_by_stack_n=5).table(
    sort_by='self_cuda_time_total', row_limit=5))

# check backward
# a strange loss for better verification
loss3 = ((r3 * r3) - torch.tanh(r3)).sum()
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    loss3.backward()
print('pytorch backward\n', prof.key_averages(group_by_stack_n=5).table(
    sort_by='self_cuda_time_total', row_limit=5))

pytorch forward
 -------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
         _module_function        96.09%       1.866ms        99.79%       1.938ms       1.938ms       1.826ms        93.83%       1.946ms       1.946ms             1  
              aten::fill_         0.62%      12.000us         1.34%      26.000us      26.000us      82.000us         4.21%      82.000us      82.000us             1  
              aten::zeros         1.18%      23.000us         3.71%      72.000us      72.000us      16.000us         0.82%     120.000us     

In [9]:
ti.profiler.print_kernel_profiler_info('trace')

Kernel Profiler(trace, default) @ CUDA on NVIDIA GeForce RTX 3090 Ti
[  start.time | kernel.time |   regs  |   shared mem | grid size | block size | occupancy ] Kernel name
-------------------------------------------------------------------------------------------------------
[    0.000 ms |    0.012 ms |      18 |      0 bytes |         1 |          1 | 16 blocks ] torch2ti_c82_0_kernel_0_serial
[    0.012 ms |    0.020 ms |      18 |      0 bytes |      2688 |        128 | 12 blocks ] torch2ti_c82_0_kernel_1_range_for
[    0.032 ms |    0.008 ms |       8 |      0 bytes |         1 |          1 | 16 blocks ] torch2ti_c82_1_kernel_0_serial
[    0.040 ms |    0.111 ms |      15 |      0 bytes |      2688 |        128 | 12 blocks ] torch2ti_c82_1_kernel_1_range_for
[    0.151 ms |    0.007 ms |       8 |      0 bytes |         1 |          1 | 16 blocks ] hash_encode_kernel_c104_0_kernel_0_serial
[    0.158 ms |    0.987 ms |      40 |      0 bytes |      2688 |         16 | 16 blocks ]

## Taichi Volume

- composite_train_fw_c92_0_kernel_0_range_for
- composite_train_fw_x_reverse_grad_reverse_grad_kernel_0_range_for

In [14]:
vr_samples, opacity, depth, r5, ws = taichi_render_func(sigmas, rgbs, deltas, ts, rays_a, 1e-4)
loss5 = ((r5 * r5) - torch.tanh(r5)).sum()
loss5.backward()
ti.profiler.clear_kernel_profiler_info()

vr_samples, opacity, depth, r5, ws = taichi_render_func(sigmas, rgbs, deltas, ts, rays_a, 1e-4)
loss5 = ((r5 * r5) - torch.tanh(r5)).sum()
loss5.backward()

ti.profiler.print_kernel_profiler_info('trace')

Kernel Profiler(trace, default) @ CUDA on NVIDIA GeForce RTX 3090 Ti
[  start.time | kernel.time |   regs  |   shared mem | grid size | block size | occupancy ] Kernel name
-------------------------------------------------------------------------------------------------------
[    0.000 ms |    0.006 ms |       8 |      0 bytes |         1 |          1 | 16 blocks ] torch2ti_c82_0_kernel_0_serial
[    0.006 ms |    0.008 ms |      16 |      0 bytes |      2688 |        128 | 12 blocks ] torch2ti_c82_0_kernel_1_range_for
[    0.014 ms |    0.004 ms |      18 |      0 bytes |         1 |          1 | 16 blocks ] torch2ti_c82_1_kernel_1_serial
[    0.018 ms |    0.016 ms |      18 |      0 bytes |      2688 |        128 | 12 blocks ] torch2ti_c82_1_kernel_0_range_for
[    0.034 ms |    0.004 ms |       8 |      0 bytes |         1 |          1 | 16 blocks ] torch2ti_c82_2_kernel_0_serial
[    0.038 ms |    0.009 ms |      16 |      0 bytes |      2688 |        128 | 12 blocks ] torch2ti_c