In [1]:
!nvidia-smi

Wed Mar  8 21:52:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A10          On   | 00000000:06:00.0 Off |                    0 |
|  0%   49C    P8    42W / 150W |      0MiB / 23028MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# import os
# import urllib
# import zipfile


# if not os.path.isfile("triton.zip"):
#   urllib.request.urlretrieve("https://github.com/openai/triton/archive/refs/heads/main.zip", "triton.zip")
# if not os.path.isdir("triton-repo"):
#   with zipfile.ZipFile("triton.zip", 'r') as zip_ref:
#     zip_ref.extractall("triton-repo")

# !pip install -e ./triton-repo/triton-main/python[tutorials]
!pip install triton matplotlib pandas tabulate # libraries besides triton for tutorial specifically

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
import torch

import triton
import triton.language as tl


@triton.jit
def add_kernel(
    x_ptr,  # *Pointer* to first input vector
    y_ptr,  # *Pointer* to second input vector
    output_ptr,  # *Pointer* to output vector
    n_elements,  # Size of the vector
    # pid_debug, # just dump all debug info here,
    offset_debug,
    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process
                 # NOTE: `constexpr` so it can be used as a shape value
   
):
    # There are multiple 'program's processing different data. We identify which program
    # we are here
    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0
    # tl.store(pid_debug, pid)
    # This program will process inputs that are offset from the initial data.
    # for instance, if you had a vector of length 256 and block_size of 64, the programs
    # would each access the elements [0:64, 64:128, 128:192, 192:256].
    # Note that offsets is a list of pointers
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask to guard memory operations against out-of-bounds accesses
    mask = offsets < n_elements
    tl.store(offset_debug + tl.arange(0, BLOCK_SIZE), offsets)
    # Load x and y from DRAM, masking out any extra elements in case the input is not a
    # multiple of the block size
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    output = x + y
    # Write x + y back to DRAM
    tl.store(output_ptr + offsets, output, mask=mask)


In [31]:
# def add(x: torch.Tensor, y: torch.Tensor, pid_debug: torch.Tensor):
def add(x: torch.Tensor, y: torch.Tensor):
    # We need to preallocate the output
    output = torch.empty_like(x)
    assert x.is_cuda and y.is_cuda and output.is_cuda
    n_elements = output.numel()
    # The SPMD launch grid denotes the number of kernel instances that run in parallel.
    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]
    # In this case, we use a 1D grid where the size is the number of blocks
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    # NOTE:
    #  - each torch.tensor object is implicitly converted into a pointer to its first element.
    #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
    #  - don't forget to pass meta-parameters as keywords arguments
    offset_debug = torch.zeros([1024], dtype=torch.float32).cuda()
    # add_kernel[grid](x, y, output, n_elements, pid_debug, offset_debug, BLOCK_SIZE=1024)
    add_kernel[grid](x, y, output, n_elements, offset_debug, BLOCK_SIZE=1024)
    # add_kernel[grid](x, y, output, n_elements, pid_debug, BLOCK_SIZE=1024)
    # add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
    # running asynchronously at this point.
    # print(f"pid_debug {pid_debug}")
    # return output, pid_debug
    print(f"offset_debug {offset_debug} and shape {offset_debug.shape}")
    # this is super inconsistent with what I expected?
    # e.g. the current example is offset_debug tensor([1024., 1025., 1026.,  ..., 1021., 1022., 1023.], device='cuda:0') and shape torch.Size([1024])
    # and that makes no sense to me
    return output



In [33]:
# torch.manual_seed(0)
size = 1024 * 30 + 1
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')
output_torch = x + y
# pid_debug = torch.zeros([1], dtype=int).cuda()

# output_triton, pid_debug = add(x, y, pid_debug)
output_triton = add(x, y)
print(output_torch)
print(output_triton)
# print(pid_debug) # pid debug
print(
    f'The maximum difference between torch and triton is '
    f'{torch.max(torch.abs(output_torch - output_triton))}'
)

offset_debug tensor([1024., 1025., 1026.,  ..., 1021., 1022., 1023.], device='cuda:0') and shape torch.Size([1024])
tensor([0.4436, 1.0880, 1.8005,  ..., 0.3637, 1.2854, 0.7171], device='cuda:0')
tensor([0.4436, 1.0880, 1.8005,  ..., 0.3637, 1.2854, 0.7171], device='cuda:0')
The maximum difference between torch and triton is 0.0


In [9]:
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=['size'],  # argument names to use as an x-axis for the plot
        x_vals=[
            2 ** i for i in range(12, 16, 1)
        ],  # different possible values for `x_name`
        x_log=True,  # x axis is logarithmic
        line_arg='provider',  # argument name whose value corresponds to a different line in the plot
        line_vals=['triton', 'torch'],  # possible values for `line_arg`
        line_names=['Triton', 'Torch'],  # label name for the lines
        styles=[('blue', '-'), ('green', '-')],  # line styles
        ylabel='GB/s',  # label name for the y-axis
        plot_name='vector-add-performance',  # name for the plot. Used also as a file name for saving the plot.
        args={},  # values for function arguments not in `x_names` and `y_name`
    )
)
def benchmark(size, provider):
    x = torch.rand(size, device='cuda', dtype=torch.float32)
    y = torch.rand(size, device='cuda', dtype=torch.float32)
    if provider == 'torch':
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
    if provider == 'triton':
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
    gbps = lambda ms: 12 * size / ms * 1e-6
    return gbps(ms), gbps(max_ms), gbps(min_ms)
benchmark.run(print_data=True, show_plots=True)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


tensor([1024., 1025., 1026.,  ..., 2045., 2046., 2047.], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 1.0210e+03, 1.0220e+03,
        1.0230e+03], device='cuda:0')
tensor([0.0000e+00, 1.0000e+00

KeyboardInterrupt: 

In [46]:
output_torch.shape

torch.Size([98432])

In [108]:
torch.manual_seed(0)


RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.