In [1]:
!pip install torch Ninja > /dev/null

In [2]:
import torch
from torch.utils.cpp_extension import load_inline

In [3]:
cpp_source = """
std::string hello_world() {
  return "hello world !";
}"""

 Errors:

 - the build_directory has to exist is readable path
    - Created a directory specifically
    
 - Ninja is required to load C++ extensions
    - Resolved using pip install Ninja

In [6]:
hello_module = load_inline(
    name='hello_mod',
    cpp_sources=[cpp_source],
    functions=['hello_world'],
    verbose=True,
    build_directory='/home/aicoder/tmp'  # this directory has to exist
)

Emitting ninja build file /home/aicoder/tmp/build.ninja...
Building extension module hello_mod...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/2] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=hello_mod -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include/TH -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include/THC -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /home/aicoder/tmp/main.cpp -o main.o 
[2/2] c++ main.o -shared -L/home/aicoder/.local/lib/python3.10/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o hello_mod.so


Loading extension module hello_mod...


In [7]:
hello_module.hello_world()

'hello world !'

Building a Square Matrix Kernel

In [14]:
cuda_cpp_kernel = """
/*Here the kernel is defined where the work is being done*/
__global__ void square_matrix_kernel(const float* matrix, float* result, int width, int height){
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height && col < width){
      int idx = row * width + col;
      result[idx] = matrix[idx] * matrix[idx];
    }
}

/*Here a function is defined, that seems to return torch::Tensor obj*/
torch::Tensor square_matrix(torch::Tensor matrix){
  const auto height = matrix.size(0);
  const auto width = matrix.size(1);

  auto result = torch::empty_like(matrix);

  /*wht is dim3: https://stackoverflow.com/questions/31141541/cuda-block-grid-dimensions-when-to-use-dim3 */

  dim3 threads_per_block(16, 16);
  dim3 number_of_blocks((width + threads_per_block.x - 1) / threads_per_block.x,
                        (height + threads_per_block.y - 1) / threads_per_block.y);
  /*The kernel is being called below has been defined above */
  square_matrix_kernel<<<number_of_blocks, threads_per_block>>>(
    matrix.data_ptr<float>(), result.data_ptr<float>(), width, height
  );

  return result;
}"""

In [15]:
cpp_source = "torch::Tensor square_matrix(torch::Tensor matrix);"

Error: cannot open shared object file when using load_inline()

Your library is a dynamic library. You need to tell the operating system where it can locate it at runtime.

To do so, we will need to do those easy steps:

Find where the library is placed if you don't know it.

sudo find / -name the_name_of_the_file.so

Check for the existence of the dynamic library path environment variable(LD_LIBRARY_PATH)

echo $LD_LIBRARY_PATH

If there is nothing to be displayed, add a default path value (or not if you wish to)

LD_LIBRARY_PATH=/usr/local/lib

We add the desired path, export it and try the application.

Note that the path should be the directory where the path.so.something is. So if path.so.something is in /my_library/path.so.something, it should be:

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/my_library/

- Run sudo ldconfig

ldconfig creates the necessary links and cache to the most recent shared libraries found in the directories specified on the command line, in the file /etc/ld.so.conf, and in the trusted directories (/lib and /usr/lib).

- Running sudo ldconfig lead to following

 /usr/local/lib/libtbbbind.so.3 is not a symbolic link on many other files

-- Solution:

  Simply change the extension name

Error: CUDA_HOME environment variable is not set.

In [10]:
!echo $LD_LIBRARY_PATH

/usr/local/cuda-12.4/lib64


In [4]:
device = torch.device('cuda')

In [16]:
sqr_mat_ext = load_inline(
    name='sqr_mat_ext1',
    cpp_sources=[cpp_source],
    cuda_sources=cuda_cpp_kernel,
    functions=['square_matrix'],
    with_cuda=True,
    extra_cuda_cflags=["-O2"],
    verbose=True,
    build_directory='/home/aicoder/sqr_mat_ext',
    # extra_cuda_cflags=['--expt-relaxed-constexpr']
)

The input conditions for extension module sqr_mat_ext1 have changed. Bumping to version 1 and re-building as sqr_mat_ext1_v1...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/aicoder/sqr_mat_ext/build.ninja...
Building extension module sqr_mat_ext1_v1...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=sqr_mat_ext1_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include/TH -isystem /home/aicoder/.local/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda-12.4/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /home/aicoder/sqr_mat_ext/main.cpp -o main.o 
[2/3] /usr/local/cuda-12.4/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=sqr_mat_ext1_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/aicoder/.local

Loading extension module sqr_mat_ext1_v1...


In [1]:
# hello cuda

cuda_code = """
__global__ void helloCUDA(float f)
{
    printf("Hello thread %d", f=%f\n, threadIdx.x, f)
}

int main(){
    helloCUDA<<<1, 5>>>(1.2345f);
    cudaDeviceSynchronize();
    return 0;
}
"""

In [None]:
cpp_source = "int main();"
hellomod = load_inline(
    name='hellomod',
    cpp_sources=cpp_source,
    cuda_sources=cuda_code,
    functions=['main'],
    with_cuda=True,
    verbose=True,
    extra_cflags=['-O2'],
    build_directory='content'
)

In [1]:
import torch
tens1 = torch.randint(0, 10, size=(1, 10))
tens1 = torch.rand(size=(1,10))
tens1.dtype

torch.float32

In [28]:
sqr_mat_ext.square_matrix(tens1)

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         7.7052e+31, 1.9447e+31, 2.1715e-18, 2.3081e-12]])

In [3]:
torch.cuda.is_available()

True

In [37]:
# Another cuda kernel

cuda_kernel = """
extern "C" __global__
void square_kernel(const float* __restrict__ input, float* __restrict__ output, int size){
  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < size) {
    output[index] = input[index] * input[index];
  }
}"""

In [None]:
from torch.utils.cpp_extension import load_inline

module = load_inline(
    name='square',
    cpp_sources='',
    cuda_sources=cuda_kernel,
    functions=['square_kernel'],
    verbose=True
)
# error: ‘square_kernel’ was not declared in this scope

In [39]:
def square(input):
    output = torch.empty_like(input)
    threads_per_block = 1024
    blocks_per_grid = (input.numel() + (threads_per_block - 1)) // threads_per_block
    module.square_kernel(blocks_per_grid, threads_per_block, input, output, input.numel())
    return output

In [2]:
input_tensor = torch.randn(100)

In [None]:
input_tensor.to(device)
# Throwing illegal memory access when moving to device

In [6]:
def time_pytorch_function(func, input):
    # CUDA IS ASYNC so can't use python time module
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Warmup
    for _ in range(5):
        func(input)

    start.record()
    func(input)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end)

def square_2(a):
    return a * a

def square_3(a):
    return a ** 2
b = torch.randn(10000, 10000).cuda()

In [9]:
b.device

device(type='cuda', index=0)

In [10]:
time_pytorch_function(torch.square, b)
time_pytorch_function(square_2, b)
time_pytorch_function(square_3, b)

1.7377279996871948

In [11]:
from torch.autograd.profiler import profile

with profile(use_cuda=True) as prof:
    torch.square(b)

STAGE:2024-05-11 11:32:45 17512:17512 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-05-11 11:32:45 17512:17512 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-11 11:32:45 17512:17512 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [12]:
print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10))

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
             aten::square         6.77%     135.000us        17.86%     356.000us     356.000us     135.000us         6.65%       2.030ms       2.030ms             1  
                aten::pow         9.93%     198.000us        10.94%     218.000us     218.000us       1.887ms        92.96%       1.895ms       1.895ms             1  
        aten::result_type         0.05%       1.000us         0.05%       1.000us       1.000us       4.000us         0.20%       4.000us       4.000us        

In [13]:
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    square_2(b)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))


-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::mul         0.85%      79.000us         1.23%     114.000us     114.000us       1.780ms       100.00%       1.780ms       1.780ms             1  
          cudaEventRecord        81.47%       7.553ms        81.47%       7.553ms       3.776ms       0.000us         0.00%       0.000us       0.000us             2  
         cudaLaunchKernel         0.38%      35.000us         0.38%      35.000us      35.000us       0.000us         0.00%       0.000us       0.000us        

STAGE:2024-05-11 11:34:11 17512:17512 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-05-11 11:34:11 17512:17512 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-11 11:34:11 17512:17512 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [14]:
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    square_3(b)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::pow        28.21%     110.000us        42.05%     164.000us     164.000us       1.825ms        99.89%       1.827ms       1.827ms             1  
        aten::result_type         1.03%       4.000us         1.03%       4.000us       4.000us       1.000us         0.05%       1.000us       1.000us             1  
                 aten::to         0.26%       1.000us         0.26%       1.000us       1.000us       1.000us         0.05%       1.000us       1.000us        

STAGE:2024-05-11 11:34:26 17512:17512 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-05-11 11:34:26 17512:17512 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-11 11:34:26 17512:17512 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
