# Vector Addition Kernel
--------

In [1]:
# Show the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Wed Apr 23 14:11:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   54C    P8             18W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## CUDA

In [14]:
%%writefile vector_addition.cu

#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>

// GPU kernel: every thread adds at most one pair of elements, C[k] = A[k] + B[k].
__global__ void vectorAdd(const float* A, const float* B, float* C, int N)
{
    // Global element index of this thread: offset inside the block plus
    // the number of threads in all preceding blocks.
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads whose index lies at or beyond N have nothing to do.
    if (idx >= N)
    {
        return;
    }

    C[idx] = A[idx] + B[idx];
}

// Sequential host reference: writes A[k] + B[k] into C[k] for the first N elements.
void vectorAddCPU(const float* A, const float* B, float* C, int N)
{
    int idx = 0;
    while (idx < N)
    {
        C[idx] = A[idx] + B[idx];
        ++idx;
    }
}

// Fills the first N entries of A and B with pseudo-random values taken
// straight from rand(), after seeding the generator with the current time.
void initialiseVectors(float* A, float* B, int N)
{
    const unsigned int seed = static_cast<unsigned int>(time(0));
    srand(seed);

    int idx = 0;
    while (idx < N)
    {
        // Values are left unscaled; divide by RAND_MAX to normalise them if needed.
        A[idx] = static_cast<float>(rand());
        B[idx] = static_cast<float>(rand());
        ++idx;
    }
}

// Runs `func` exactly once and returns the elapsed wall time in milliseconds.
//
// std::chrono::steady_clock is used because it is monotonic; a clock that is
// not guaranteed to be steady may be adjusted while `func` runs, which would
// skew the measured interval or even make it negative.
//
// func: any callable invocable with no arguments; its return value is ignored.
template <typename Func>
double measureExecutionTime(Func func)
{
    using Clock = std::chrono::steady_clock;

    const Clock::time_point start = Clock::now();
    func();
    const Clock::time_point end = Clock::now();

    // Convert the tick count into fractional milliseconds.
    const std::chrono::duration<double, std::milli> elapsed = end - start;
    return elapsed.count();
}

// Element-wise comparison of two result vectors.
//
// A: reference values (labelled "CPU" in the diagnostic message).
// B: values under test (labelled "GPU" in the diagnostic message).
// N: number of elements to compare; N <= 0 compares nothing and returns true.
//
// Returns true when every pair differs by at most 1e-4 (absolute). On the
// first offending pair the index and both values are printed and false is
// returned.
bool compareResults(const float *A, const float *B, int N)
{
    const double tolerance = 1e-4;

    for (int i = 0; i < N; i++)
    {
        const float diff = std::fabs(A[i] - B[i]);

        // Written as !(diff <= tolerance) rather than (diff > tolerance):
        // every ordered comparison with NaN is false, so the latter would
        // silently accept a NaN result as a match.
        if (!(diff <= tolerance))
        {
            std::cout << "Mismatch at index " << i << ": CPU=" << A[i] << " GPU=" << B[i] << std::endl;
            return false;
        }
    }
    return true;
}

int main()
{
    int N = 1 << 25; // 1 million elements
    size_t size = N * sizeof(float); // Memory size needed to store the vectors for addition

    // Allocate memory on the host (CPU)
    float* A_host = (float*)malloc(size); // malloc return a void pointer
    float* B_host = (float*)malloc(size);
    float* C_host_cpu = (float*)malloc(size);
    float* C_host_gpu = (float*)malloc(size);

    initialiseVectors(A_host, B_host, N);

    // Measure CPU execution time for vector addition
    double cpu_time = measureExecutionTime([&]()
    {
        vectorAddCPU(A_host, B_host, C_host_cpu, N);
    });

    std::cout << "CPU execution time: " << cpu_time << "ms" << '\n';

    // Allocate memory on the device (GPU)
    float* A_device;
    float* B_device;
    float* C_device;

    cudaMalloc((void**)&A_device, size);
    cudaMalloc((void**)&B_device, size);
    cudaMalloc((void**)&C_device, size);

    cudaMemcpy(A_device, A_host, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_device, B_host, size, cudaMemcpyHostToDevice);

    int threads_per_block = 256;
    int blocks_per_grid = (N + threads_per_block - 1) / threads_per_block;
    std::cout << "Launching kernel with " << blocks_per_grid << " blocks of "
          << threads_per_block << " threads." << '\n';

    double gpu_time = measureExecutionTime([&]()
    {
        vectorAdd<<<blocks_per_grid, threads_per_block>>>(A_device, B_device, C_device, N);
        cudaDeviceSynchronize();
    });

    std::cout << "GPU execution time: " << gpu_time << "ms" << '\n';

    cudaMemcpy(C_host_gpu, C_device, size, cudaMemcpyDeviceToHost);

    bool success = compareResults(C_host_cpu, C_host_gpu, N);
    std::cout << (success ? "CPU and GPU results match!" : "Results mismatch!");

    cudaFree(A_device);
    cudaFree(B_device);
    cudaFree(C_device);

    free(A_host);
    free(B_host);
    free(C_host_cpu);
    free(C_host_gpu);

    return 0;
}

Overwriting vector_addition.cu


In [15]:
# Compile vector_addition.cu with nvcc and run the resulting binary.
# NOTE(review): -arch=sm_89 is hard-coded; presumably it matches the GPU
# reported by nvidia-smi — confirm before running on a different GPU.
!nvcc -arch=sm_89 vector_addition.cu -o vector_addition
!./vector_addition

CPU execution time: 166.674ms
Launching kernel with 131072 blocks of 256 threads.
GPU execution time: 1.6756ms
CPU and GPU results match!

## Triton

In [16]:
import torch
import triton
import triton.language as tl
import time

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

# Fixed seed so the benchmark inputs are identical on every Restart & Run All.
torch.manual_seed(0)

size = 1 << 25  # 33,554,432 elements per vector
x = torch.rand(size, device=DEVICE)
y = torch.rand(size, device=DEVICE)

# Host copies of the *same* data, so the CPU baseline adds exactly the numbers
# the GPU paths add and its result can be cross-checked against theirs.
x_cpu = x.cpu()
y_cpu = y.cpu()

@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, num_elements, BLOCK_SIZE: tl.constexpr):
  """Element-wise vector addition; each program handles one BLOCK_SIZE-wide tile.

  x_ptr, y_ptr: pointers to the two input vectors.
  output_ptr: pointer to the vector that receives x + y.
  num_elements: number of valid elements; lanes at or beyond it are masked off.
  BLOCK_SIZE: compile-time number of elements processed per program.
  """
  # Index of this program along the launch grid (the analogue of blockIdx.x).
  tile_index = tl.program_id(axis=0)

  # Global element indices covered by this tile
  # (the analogue of blockIdx.x * blockDim.x + threadIdx.x).
  element_ids = tile_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)

  # The last tile may extend past the end of the vectors when num_elements is
  # not a multiple of BLOCK_SIZE; the mask disables those lanes.
  in_bounds = element_ids < num_elements

  # Masked loads from both inputs, then a masked store of the sum.
  lhs = tl.load(x_ptr + element_ids, mask=in_bounds)
  rhs = tl.load(y_ptr + element_ids, mask=in_bounds)
  tl.store(output_ptr + element_ids, lhs + rhs, mask=in_bounds)

def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
  """Return ``x + y`` computed by the Triton ``add_kernel``.

  Args:
    x: first operand; must live on ``DEVICE``.
    y: second operand; must live on ``DEVICE`` and have the same shape as ``x``.

  Returns:
    A new tensor shaped like ``x`` holding the element-wise sum.

  Raises:
    AssertionError: if the shapes differ or a tensor is not on ``DEVICE``.

  NOTE(review): the kernel indexes memory linearly from the base pointers, so
  this presumably expects contiguous inputs — confirm before passing views.
  """
  # The kernel is bounded only by the element count of the output (shaped like
  # x); a shorter y would be read past its end, so reject mismatches up front.
  assert x.shape == y.shape, f"shape mismatch: {tuple(x.shape)} vs {tuple(y.shape)}"

  output = torch.empty_like(x)
  assert x.device == DEVICE and y.device == DEVICE and output.device == DEVICE

  num_elements = output.numel()
  # 1-D launch grid: one program per BLOCK_SIZE elements, rounded up.
  grid = lambda meta: (triton.cdiv(num_elements, meta["BLOCK_SIZE"]), )
  add_kernel[grid](x, y, output, num_elements, BLOCK_SIZE=256)

  return output


cuda:0


In [17]:
def _time_ms(fn, sync_gpu=True):
  """Call ``fn`` once and return ``(result, elapsed_milliseconds)``.

  With ``sync_gpu`` set, ``torch.cuda.synchronize()`` is called before the
  clock starts and again before it stops, so the interval covers the GPU work
  issued by ``fn`` and nothing queued before it.
  """
  if sync_gpu:
    torch.cuda.synchronize()
  start = time.perf_counter()
  result = fn()
  if sync_gpu:
    torch.cuda.synchronize()
  end = time.perf_counter()
  return result, (end - start) * 1000


# Warm up every path once (this also compiles and caches the Triton kernel) so
# that all three measurements below are taken under the same conditions.
for warm_up in (lambda: add(x, y), lambda: x + y, lambda: x_cpu + y_cpu):
  warm_up()

# Measure Triton execution time
output_triton, triton_time = _time_ms(lambda: add(x, y))

# Measure PyTorch GPU execution time
output_torch, pytorch_time = _time_ms(lambda: x + y)

# Measure PyTorch CPU execution time (no GPU work involved, so no synchronisation)
output_torch_cpu, pytorch_time_cpu = _time_ms(lambda: x_cpu + y_cpu, sync_gpu=False)

print(f"PyTorch CPU execution time: {pytorch_time_cpu:.5f}ms")
print(f"PyTorch GPU execution time: {pytorch_time:.5f}ms")
print(f"Triton  execution time: {triton_time:.5f}ms")
print(f'The maximum difference between torch and triton is '
      f'{torch.max(torch.abs(output_torch - output_triton))}')

PyTorch CPU execution time: 15.63703ms
PyTorch GPU execution time: 1.85400ms
Triton  execution time: 1.99292ms
The maximum difference between torch and triton is 0.0
