In [None]:
# Build prerequisites for torch.utils.cpp_extension.load_inline:
#   * ninja  - the loader emits a ninja build file for every inline extension
#   * ccache - provides the compiler wrappers in /usr/lib/ccache that CC/CXX point to
!pip install ninja
!sudo apt update
!sudo apt install ccache -y
# Left disabled: apt cannot locate a g++-11 package on this image.
#!sudo apt install g++-11 -y

In [None]:
# Refresh the apt package index, then upgrade every installed package without prompting.
!sudo apt update -y && sudo apt upgrade -y

In [17]:
# Attempt to install a newer host compiler. On this image apt answers
# "Unable to locate package g++-11", so the notebook keeps using the stock gcc/g++ 9.
!sudo apt install g++-11 -y

Reading package lists... Done
Building dependency tree       
Reading state information... Done
[1;31mE: [0mUnable to locate package g++-11[0m


In [18]:
# Show which gcc release the image ships.
!gcc --version

gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [15]:
# List the compiler wrappers provided by ccache; the g++-9 / gcc-9 entries are used as CXX / CC.
!ls /usr/lib/ccache

c++	 c99-gcc  g++	 gcc	x86_64-linux-gnu-g++	x86_64-linux-gnu-gcc
c89-gcc  cc	  g++-9  gcc-9	x86_64-linux-gnu-g++-9	x86_64-linux-gnu-gcc-9


In [12]:
import torch
import torch.utils.cpp_extension
import os

# Route the extension build through the ccache wrappers for the C and C++ compilers.
os.environ.update({
    'CXX': "/usr/lib/ccache/g++-9",
    'CC': "/usr/lib/ccache/gcc-9",
})

`output.data_ptr<float>()` returns a raw `float*` pointer to the underlying memory of a PyTorch tensor. This gives direct access to the tensor's data without going through the PyTorch API, which is how the tensors are handed to the CUDA kernels below.

In [20]:
# based on Jeremy's Lecture 3 notebook
# Shared preamble for every inline CUDA extension in this notebook: headers,
# input-validation macros and a ceiling-division helper for sizing launch grids.
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

# RGB -> grayscale extension: one kernel plus two host entry points
# (rgb_to_grayscale allocates the result, rgb_to_grayscale_out fills a caller-provided tensor).
cuda_src = cuda_begin + r'''
// One thread per pixel. `in` is read as three consecutive planes of n bytes each
// (offsets 0, n and 2*n); `out` receives n grayscale bytes.
__global__ void rgb_to_grayscale_kernel(unsigned char* out, unsigned char* in, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= n) return;  // the last block may extend past the image
    out[i] = 0.2989f*in[i] + 0.5870f*in[i+n] + 0.1140f*in[i+2*n];  // fix with f found by Andreas...
}

// Converts `input` (C x H x W) to grayscale and stores the result in `output` (H x W).
// Returns `output`.
torch::Tensor rgb_to_grayscale_out(torch::Tensor output, const torch::Tensor& input) {
    //CHECK_INPUT(input);
    // The kernel reads three planes, so anything with fewer channels would be read out of bounds.
    TORCH_CHECK(input.dim() == 3 && input.size(0) >= 3,
                "input must have shape (C, H, W) with at least 3 channels");
    TORCH_CHECK(output.dim() == 2, "output must have shape (H, W)");
    int h = input.size(1);
    int w = input.size(2);
    // Every condition has to hold. Chaining them with a logical OR (as before) let the
    // check pass as soon as any single one was true, so mismatched outputs slipped through.
    TORCH_CHECK((h == output.size(0)) && (w == output.size(1)) && (output.device() == input.device())
                && (output.scalar_type() == input.scalar_type()),
                "output must match input in height, width, device and dtype");
    if (w*h == 0) return output;  // nothing to convert; a zero-block launch is an invalid configuration
    int threads = 256;
    rgb_to_grayscale_kernel<<<cdiv(w*h,threads), threads>>>(
        output.data_ptr<unsigned char>(), input.data_ptr<unsigned char>(), w*h);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

// Allocating variant: creates an H x W tensor with the options of `input` and fills it.
torch::Tensor rgb_to_grayscale(const torch::Tensor& input) {
    //CHECK_INPUT(input);
    int h = input.size(1);
    int w = input.size(2);
    auto output = torch::empty({h,w}, input.options());
    rgb_to_grayscale_out(output, input);
    return output;
}
'''

# Declarations for the C++ side so load_inline can generate the Python bindings.
cpp_src = """
torch::Tensor rgb_to_grayscale(const torch::Tensor& input);
torch::Tensor rgb_to_grayscale_out(torch::Tensor output, const torch::Tensor& input);
"""

In [22]:
# Compile and load the grayscale extension.
#
# Build notes for Kaggle, which ships g++-9:
#   * The host compile step runs with -std=c++17 (see the build log below). g++-9 only offers
#     C++20 through the -std=c++2a flag:
#     https://stackoverflow.com/questions/75470181/cmake-knows-std-20-but-g9-doesnt
#   * The code only executed once the CHECK_INPUT(input) calls were commented out;
#     adding -std=c++20 to the CUDA flags made no difference.
#   * build_directory='.' is required: the rest of the file system is not writable, so the
#     build errors out without it.
# NOTE(review): '--ptxas-options=-v -std=c++20' is one list element holding two flags;
# confirm that is intended before splitting it.
module = torch.utils.cpp_extension.load_inline(
    name="test_ext", cpp_sources=cpp_src, cuda_sources=cuda_src, 
    functions=['rgb_to_grayscale', 'rgb_to_grayscale_out'], extra_cuda_cflags=['--ptxas-options=-v -std=c++20'], verbose=True,
build_directory='.')

The input conditions for extension module test_ext have changed. Bumping to version 4 and re-building as test_ext_v4...
Detected CUDA files, patching ldflags
Emitting ninja build file ./build.ninja...
Building extension module test_ext_v4...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/lib/ccache/g++-9 -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=test_ext_v4 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -c /kaggle/working/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc  -ccbin /usr/lib/ccache/gcc-9 -DTORCH_EXTENSION_NAME=test_ext_v4 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/to

Loading extension module test_ext_v4...


In [23]:
n = 2048
# Random uint8 test image on the GPU, laid out channel-first: 3 planes of 2048 x 2048 px.
t = torch.randint(0, 256, (3, n, n), dtype=torch.uint8, device="cuda")

In [24]:
# Run the allocating variant once; `out` is reused as the output buffer by the timing cells.
out = module.rgb_to_grayscale(t)
# hmm, there is synchronize in python too..
# Wait for the GPU to finish the queued work before moving on.
torch.cuda.synchronize()

In [25]:
import time

# Average wall-clock time per call: queue all launches, wait once for the GPU to finish,
# then divide the elapsed time by the number of calls.
N_CALLS = 10_000
start_ns = time.perf_counter_ns()
for _call in range(N_CALLS):
    module.rgb_to_grayscale_out(out, t)
torch.cuda.synchronize()
elapsed_ns = time.perf_counter_ns() - start_ns

print(elapsed_ns / N_CALLS / 1_000, "µs")

54.42138 µs


In [26]:
# Profile 10_000 calls. Each iteration synchronizes right after the launch, so every
# kernel has finished before the next one is queued.
with torch.profiler.profile() as prof:
    for i in range(10_000):
        module.rgb_to_grayscale_out(out, t)
        torch.cuda.synchronize()

# Per-operation averages (CPU and CUDA time, number of calls).
print(prof.key_averages().table())

STAGE:2024-05-18 02:32:55 34:34 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-18 02:32:56 34:34 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-18 02:32:56 34:34 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       cudaLaunchKernel        10.73%      64.400ms        10.73%      64.400ms       6.440us       0.000us         0.00%       0.000us       0.000us         10000  
rgb_to_grayscale_kernel(unsigned char*, unsigned cha...         0.00%       0.000us         0.00%       0.000us       0.000us     655.917ms       100.00%     655.917ms      65.592us         10000  
         

In [27]:
# gelu as fusion example
def gelu(x):
    """Tanh-approximated GELU written as separate elementwise PyTorch operations.

    Computes 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) and returns a
    tensor with the same shape as ``x``.
    """
    scale = (2/torch.pi)**0.5
    cubic = x + 0.044715 * x**3
    return 0.5 * x * (1 + torch.tanh(scale * cubic))

# Random 1024 x 1024 CUDA tensor, shared by the GELU and empty-kernel benchmarks.
x = torch.randn(1024, 1024, device="cuda")

In [28]:
# Baselines: the op-by-op Python gelu versus PyTorch's built-in tanh-approximated GELU.
# The synchronize is part of the timed statement so GPU execution time is included.
%timeit gelu(x); torch.cuda.synchronize()
%timeit torch.nn.functional.gelu(x, approximate='tanh'); torch.cuda.synchronize()

174 µs ± 590 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
35.5 µs ± 85.6 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [30]:
# GELU extension: the whole tanh approximation is evaluated inside a single kernel.
cuda_src = cuda_begin + r'''
// One thread per element: out[i] = tanh-approximated GELU of inp[i], for i < n.
__global__ void my_gelu_kernel(float* out, float* inp, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= n) return;  // the last block may extend past the data
    float x = inp[i];
    out[i] = 0.5f * x * (1.0f + tanhf(sqrtf(2.0f/3.141592653589793f) * (x + 0.044715f * (x * x * x))));
}

// Applies the kernel to `inp` and stores the result in the caller-provided `output`.
// Returns `output`.
torch::Tensor my_gelu_out(torch::Tensor output, const torch::Tensor& inp) {
    //CHECK_INPUT(inp);
    int n = inp.numel();
    // Every condition has to hold. Chaining them with a logical OR (as before) let the
    // check pass as soon as any single one was true, so mismatched outputs slipped through.
    TORCH_CHECK((output.sizes() == inp.sizes()) && (output.device() == inp.device())
                && (output.scalar_type() == inp.scalar_type()),
                "output must match inp in shape, device and dtype");
    if (n == 0) return output;  // nothing to compute; a zero-block launch is an invalid configuration
    int threads = 256;
    my_gelu_kernel<<<cdiv(n, threads), threads>>>(
        output.data_ptr<float>(), inp.data_ptr<float>(), n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

// Allocating variant: creates a tensor shaped like `inp` and fills it.
torch::Tensor my_gelu(const torch::Tensor& inp) {
    //CHECK_INPUT(inp);
    auto output = torch::empty_like(inp);
    my_gelu_out(output, inp);
    return output;
}
'''

# Declarations for the C++ side so load_inline can generate the Python bindings.
cpp_src = """
torch::Tensor my_gelu(const torch::Tensor& inp);
torch::Tensor my_gelu_out(torch::Tensor output, const torch::Tensor& inp);
"""

# Make sure the build uses the ccache compiler wrappers even if this cell is run on its own.
import os
os.environ['CXX'] = '/usr/lib/ccache/g++-9'
os.environ['CC'] = '/usr/lib/ccache/gcc-9'

# build_directory='.' keeps the build inside the writable working directory.
gelu_module = torch.utils.cpp_extension.load_inline(
    "test_ext_gelu", cpp_src, cuda_src, 
    functions=['my_gelu', 'my_gelu_out'], extra_cuda_cflags=['--ptxas-options=-v'], verbose=True, build_directory='.')

The input conditions for extension module test_ext_gelu have changed. Bumping to version 1 and re-building as test_ext_gelu_v1...
Detected CUDA files, patching ldflags
Emitting ninja build file ./build.ninja...
Building extension module test_ext_gelu_v1...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/lib/ccache/g++-9 -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=test_ext_gelu_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -c /kaggle/working/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc  -ccbin /usr/lib/ccache/gcc-9 -DTORCH_EXTENSION_NAME=test_ext_gelu_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-p

Loading extension module test_ext_gelu_v1...


In [31]:
# Largest absolute difference between the CUDA kernel and the op-by-op Python gelu.
(gelu_module.my_gelu(x) - gelu(x)).abs().max()

tensor(2.3842e-07, device='cuda:0')

In [32]:
# Time the custom kernel the same way as the baselines (synchronize inside the timed statement).
%timeit gelu_module.my_gelu(x); torch.cuda.synchronize()

43.2 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [35]:
# empty kernel to look at the latency

cuda_src = cuda_begin + r'''
// Deliberately does nothing: whatever time a call takes is launch overhead, not computation.
__global__ void my_empty_kernel(float* out, float* inp, int n) {
}

// Launches the empty kernel with the same grid sizing as the GELU kernel. Returns `output`.
torch::Tensor my_empty_out(torch::Tensor output, const torch::Tensor& inp) {
    // CHECK_INPUT(inp);
    int n = inp.numel(); // calculate num of elements in the tensor
    // Every condition has to hold. Chaining them with a logical OR (as before) let the
    // check pass as soon as any single one was true, so mismatched outputs slipped through.
    TORCH_CHECK((output.sizes() == inp.sizes()) && (output.device() == inp.device())
                && (output.scalar_type() == inp.scalar_type()),
                "output must match inp in shape, device and dtype");
    if (n == 0) return output;  // a zero-block launch is an invalid configuration
    int threads = 256;
    my_empty_kernel<<<cdiv(n, threads), threads>>>(
        output.data_ptr<float>(), inp.data_ptr<float>(), n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

// Allocating variant: creates a tensor shaped like `inp` and passes it to my_empty_out.
torch::Tensor my_empty(const torch::Tensor& inp) {
    // CHECK_INPUT(inp);
    auto output = torch::empty_like(inp);
    my_empty_out(output, inp);
    return output;
}
'''

# Declarations for the C++ side so load_inline can generate the Python bindings.
cpp_src = """
torch::Tensor my_empty(const torch::Tensor& inp);
torch::Tensor my_empty_out(torch::Tensor output, const torch::Tensor& inp);
"""

# Make sure the build uses the ccache compiler wrappers even if this cell is run on its own.
import os
os.environ['CXX'] = '/usr/lib/ccache/g++-9'
os.environ['CC'] = '/usr/lib/ccache/gcc-9'

# build_directory='.' keeps the build inside the writable working directory.
empty_module = torch.utils.cpp_extension.load_inline(
    "test_ext_empty", cpp_src, cuda_src, 
    functions=['my_empty', 'my_empty_out'], extra_cuda_cflags=['--ptxas-options=-v'], verbose=True, build_directory='.')

The input conditions for extension module test_ext_empty have changed. Bumping to version 1 and re-building as test_ext_empty_v1...
Detected CUDA files, patching ldflags
Emitting ninja build file ./build.ninja...
Building extension module test_ext_empty_v1...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/lib/ccache/g++-9 -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=test_ext_empty_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -c /kaggle/working/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc  -ccbin /usr/lib/ccache/gcc-9 -DTORCH_EXTENSION_NAME=test_ext_empty_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site

Loading extension module test_ext_empty_v1...


In [36]:
# Time a call that launches a kernel with an empty body; `x` serves as both input and output.
%timeit empty_module.my_empty_out(x, x); torch.cuda.synchronize()

# Profile the same call, synchronizing after every launch.
with torch.profiler.profile() as prof:
    for i in range(10_000):
        empty_module.my_empty_out(x, x)
        torch.cuda.synchronize()

# Per-operation averages (CPU and CUDA time, number of calls).
print(prof.key_averages().table())

25.9 µs ± 144 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


STAGE:2024-05-18 02:38:03 34:34 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-18 02:38:03 34:34 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-18 02:38:03 34:34 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                        cudaLaunchKernel        72.88%      56.963ms        72.88%      56.963ms       5.696us       0.000us         0.00%       0.000us       0.000us         10000  
    my_empty_kernel(float*, float*, int)         0.00%       0.000us         0.00%       0.000us       0.000us     116.967ms       100.00%     116.967ms      11.697us         10000  
                   cudaDeviceSynchronize        27.12%      21.194ms        27.12%  

https://stackoverflow.com/questions/7024615/putting-a-for-loop-in-a-cuda-kernel
https://forums.developer.nvidia.com/t/cuda-kernel-for-loop-performance/81165/8
https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/

In [37]:
# Naive matrix multiplication: one thread per output element, all operands read from the
# pointers passed to the kernel.
cuda_src = cuda_begin + r'''
// Computes out = m @ n for row-major m (h x k) and n (k x w); out is h x w.
// Thread (r, c) produces out[r][c].
__global__ void simple_matmul_k(float* m, float* n, float* out, int h, int w, int k) {
    int r = blockIdx.y*blockDim.y + threadIdx.y;
    int c = blockIdx.x*blockDim.x + threadIdx.x;

    if (r>=h || c>=w) return;  // edge blocks may extend past the matrix
    float o = 0;
    // Each thread runs this loop sequentially on its own, accumulating the dot product of
    // row r of m with column c of n. At a given step i, threads that differ only in c read
    // the same m element and neighbouring n elements.
    for (int i = 0; i<k; ++i) o += m[r*k+i] * n[i*w+c];
    out[r*w+c] = o;
}

// Host wrapper: validates the shapes, allocates the result and launches the kernel.
torch::Tensor simple_matmul(const torch::Tensor& m, const torch::Tensor& n) {
    // CHECK_INPUT(m); CHECK_INPUT(n);
    TORCH_CHECK(m.dim() == 2 && n.dim() == 2, "Expected two 2-D matrices");
    int h = m.size(0);
    int w = n.size(1);
    int k = m.size(1);
    TORCH_CHECK(k==n.size(0), "Size mismatch!");
    // The kernel writes every element of the result, so no zero-fill is needed.
    auto output = torch::empty({h, w}, m.options());
    if (h == 0 || w == 0) return output;  // empty result; a zero-block launch is an invalid configuration

    dim3 tpb(16,16);
    dim3 blocks(cdiv(w, tpb.x), cdiv(h, tpb.y));
    simple_matmul_k<<<blocks, tpb>>>(
        m.data_ptr<float>(), n.data_ptr<float>(), output.data_ptr<float>(), h, w, k);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
'''

# Declaration for the C++ side so load_inline can generate the Python binding.
cpp_src = """
torch::Tensor simple_matmul(const torch::Tensor& m, const torch::Tensor& n);
"""

# build_directory='.' keeps the build inside the writable working directory.
simple_matmul_module = torch.utils.cpp_extension.load_inline(
    "test_ext_simple_matmul", cpp_src, cuda_src, 
    functions=['simple_matmul'], extra_cuda_cflags=['--ptxas-options=-v'], verbose=True, build_directory='.')

Detected CUDA files, patching ldflags
Emitting ninja build file ./build.ninja...
Building extension module test_ext_simple_matmul...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/lib/ccache/g++-9 -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=test_ext_simple_matmul -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -c /kaggle/working/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc  -ccbin /usr/lib/ccache/gcc-9 -DTORCH_EXTENSION_NAME=test_ext_simple_matmul -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/pytho

Loading extension module test_ext_simple_matmul...


In [38]:
a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")
%timeit simple_matmul_module.simple_matmul(a, b)

(simple_matmul_module.simple_matmul(a, b) - a@b).abs().max()

4.21 ms ± 1.37 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


tensor(0.0002, device='cuda:0')

In [None]:
# Tiled matrix multiplication: each block stages TILE_SIZE x TILE_SIZE tiles of both operands
# in shared memory and accumulates the product tile by tile.
cuda_src = cuda_begin + r"""
constexpr int TILE_SIZE = 16;

// Computes out = M @ N for row-major M (h x k) and N (k x w); out is h x w.
// Must be launched with TILE_SIZE x TILE_SIZE threads per block.
__global__ void tiled_matmul_kernel(float* out, float* M, float* N, int h, int w, int k) {
  __shared__ float M_tile[TILE_SIZE][TILE_SIZE];
  __shared__ float N_tile[TILE_SIZE][TILE_SIZE];
  
  // idxes into tile
  int ir = threadIdx.y;
  int ic = threadIdx.x;
  
  // row / column of the output element this thread is responsible for
  int r = blockIdx.y * blockDim.y + threadIdx.y;
  int c = blockIdx.x * blockDim.x + threadIdx.x;

  // note: cannot just exit if we want to do padding!
  // (out-of-range threads still load tile entries and must reach every __syncthreads)
  
  float res = 0.0f;
  for (int K_tileidx = 0; K_tileidx < (k + TILE_SIZE -1) / TILE_SIZE; K_tileidx++) {
    // note how threadIdx.x is the fastest moving bit --> coalesced memory access
    // Entries that fall outside the matrices are loaded as 0, which pads partial tiles.
    M_tile[ir][ic] = (((r < h) && (K_tileidx * TILE_SIZE + ic < k)) ? M[r * k + K_tileidx * TILE_SIZE + ic] : 0.f);
    N_tile[ir][ic] = ((((K_tileidx * TILE_SIZE + ir) < k) && (c < w)) ? N[(K_tileidx * TILE_SIZE + ir) * w + c] : 0.f);
    //M_tile[ir][ic] = M[r * k + K_tileidx * TILE_SIZE + ic];
    //N_tile[ir][ic] = N[(K_tileidx * TILE_SIZE + ir) * w + c];
    __syncthreads();  // both tiles must be completely written before any thread reads them
    for (int idx = 0; idx < TILE_SIZE; idx++) {
       res += M_tile[ir][idx] * N_tile[idx][ic];
    }
    // important! The next iteration overwrites both tiles; without this barrier a fast
    // thread could replace entries that slower threads of the block are still reading.
    __syncthreads();
  }
  if ((r < h) && (c < w)) {
    out[r * w + c] = res;
  }
}

// Host wrapper: validates the shapes, allocates the result and launches the kernel.
torch::Tensor tiled_matmul(const torch::Tensor& m, const torch::Tensor& n) {
    // CHECK_INPUT(m); CHECK_INPUT(n);
    TORCH_CHECK(m.dim() == 2 && n.dim() == 2, "Expected two 2-D matrices");
    int h = m.size(0);
    int w = n.size(1);
    int k = m.size(1);
    TORCH_CHECK(k==n.size(0), "Size mismatch");
    //TORCH_CHECK((k % TILE_SIZE == 0) && (h % TILE_SIZE == 0) && (w % TILE_SIZE == 0), "Padding not done");
    auto output = torch::empty({h, w}, m.options());
    if (h == 0 || w == 0) return output;  // empty result; a zero-block launch is an invalid configuration

    dim3 tpb(TILE_SIZE, TILE_SIZE);
    dim3 blocks(cdiv(w, tpb.x), cdiv(h, tpb.y));
    tiled_matmul_kernel<<<blocks, tpb>>>(
        output.data_ptr<float>(), m.data_ptr<float>(), n.data_ptr<float>(), h, w, k);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

"""
# Declaration for the C++ side so load_inline can generate the Python binding.
cpp_src = """
torch::Tensor tiled_matmul(const torch::Tensor& m, const torch::Tensor& n);
"""

# build_directory='.' keeps the build inside the writable working directory.
tiled_matmul_module = torch.utils.cpp_extension.load_inline(
    "test_ext_tiled_matmul", cpp_src, cuda_src, 
    functions=['tiled_matmul'], extra_cuda_cflags=['--ptxas-options=-v'], verbose=True, build_directory='.')

Detected CUDA files, patching ldflags
Emitting ninja build file ./build.ninja...
Building extension module test_ext_tiled_matmul...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/lib/ccache/g++-9 -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=test_ext_tiled_matmul -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -c /kaggle/working/main.cpp -o main.o 


In [None]:
%timeit tiled_matmul_module.tiled_matmul(a, b)

In [None]:
# 500, 200 and 1000 are not multiples of the 16-wide tile, so this exercises the
# zero-padded partial tiles at the matrix edges.
aa = torch.randn(500, 200, device="cuda")
bb = torch.randn(200, 1000, device="cuda")


# Largest absolute deviation from PyTorch's own matmul.
(tiled_matmul_module.tiled_matmul(aa, bb) - aa@bb).abs().max()