In [1]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmprgya6wsc".


In [42]:
%%cuda
#include <stdio.h>
#include <stdlib.h>


// Owns a pair of CUDA events and measures the time between them.
//
// Usage: Start(), enqueue the work to be timed, Stop(), then Elapsed().
// Both events are recorded on stream 0. Failures of the underlying CUDA
// runtime calls are reported on stderr instead of being silently dropped.
struct GpuTimer {
      cudaEvent_t start;
      cudaEvent_t stop;

      // Creates both events; a handle stays NULL if its creation fails.
      GpuTimer() : start(NULL), stop(NULL)
      {
            Check(cudaEventCreate(&start), "cudaEventCreate(start)");
            Check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
      }

      // The timer owns its event handles, so copying it would make two
      // objects destroy the same events. Forbid copies outright.
      GpuTimer(const GpuTimer &) = delete;
      GpuTimer &operator=(const GpuTimer &) = delete;

      // Destroys whichever events were successfully created.
      ~GpuTimer()
      {
            if (start != NULL)
                  Check(cudaEventDestroy(start), "cudaEventDestroy(start)");
            if (stop != NULL)
                  Check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
      }

      // Records the start event on stream 0.
      void Start()
      {
            Check(cudaEventRecord(start, 0), "cudaEventRecord(start)");
      }

      // Records the stop event on stream 0.
      void Stop()
      {
            Check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
      }

      // Waits for the stop event, then returns the time between the start
      // and stop events as reported by cudaEventElapsedTime (milliseconds).
      // Returns 0.0f if the wait or the query fails, rather than an
      // uninitialised value.
      float Elapsed()
      {
            float elapsed = 0.0f;
            if (Check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) {
                  if (!Check(cudaEventElapsedTime(&elapsed, start, stop),
                             "cudaEventElapsedTime"))
                        elapsed = 0.0f;
            }
            return elapsed;
      }

private:
      // Returns true on success; otherwise prints the failing call and the
      // CUDA error string to stderr and returns false.
      static bool Check(cudaError_t err, const char *what)
      {
            if (err == cudaSuccess)
                  return true;
            fprintf(stderr, "GpuTimer: %s failed: %s\n", what,
                    cudaGetErrorString(err));
            return false;
      }
};


// Aborts the program with file/line and the CUDA error string when a CUDA
// runtime call does not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,        \
                    __LINE__, cudaGetErrorString(err_));                  \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// CPU reference: writes c[i] = a[i] + b[i] for i in [0, n).
// The element count is an explicit parameter; the previous version relied on
// an `N` symbol that is not defined anywhere in this file.
void host_add(const int *a, const int *b, int *c, int n) {
    for (int idx = 0; idx < n; idx++)
        c[idx] = a[idx] + b[idx];
}

// Element-wise vector add: c[i] = a[i] + b[i] for i in [0, n).
//
// Launch layout: 1D grid of 1D blocks with at least n threads in total
// (one thread per element). Threads whose index is >= n do nothing, so n
// does not have to be a multiple of the block size. Uses no shared memory.
__global__ void device_add(const int *a, const int *b, int *c, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)
        c[index] = a[index] + b[index];
}

// Fills array[0..n) with array[i] = i.
void fill_array(int* array, int n) {
    for (int i = 0; i < n; i++) {
        array[i] = i; // Example fill operation
    }
}

// Returns the index of the first element where c[i] != a[i] + b[i],
// or -1 when all n elements match.
static int first_mismatch(const int *a, const int *b, const int *c, int n) {
    for (int i = 0; i < n; i++) {
        if (c[i] != a[i] + b[i])
            return i;
    }
    return -1;
}

// Benchmarks device_add over a range of vector lengths and block sizes,
// printing one timing line per (length, block size) combination.
// Returns 0 when every result matched the host-side check, EXIT_FAILURE
// otherwise (or on host allocation failure).
int main(void) {
    const int sizes[] = {512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576,
                         2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456};
    const int blocks[] = {16, 32, 64, 128, 256, 512};
    const int num_sizes = (int)(sizeof(sizes) / sizeof(sizes[0]));
    const int num_blocks = (int)(sizeof(blocks) / sizeof(blocks[0]));
    int status = 0;

    for (int s = 0; s < num_sizes; s++) {
        const int n = sizes[s];
        // Byte count computed in size_t so it cannot wrap a signed int.
        const size_t size = (size_t)n * sizeof(int);

        int *a = (int *)malloc(size);
        int *b = (int *)malloc(size);
        int *c = (int *)malloc(size);
        if (a == NULL || b == NULL || c == NULL) {
            fprintf(stderr, "Host allocation of %zu bytes failed for N = %d\n", size, n);
            free(a); free(b); free(c);
            return EXIT_FAILURE;
        }
        fill_array(a, n);
        fill_array(b, n);

        int *d_a = NULL, *d_b = NULL, *d_c = NULL;
        CUDA_CHECK(cudaMalloc((void **)&d_a, size));
        CUDA_CHECK(cudaMalloc((void **)&d_b, size));
        CUDA_CHECK(cudaMalloc((void **)&d_c, size));

        CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

        for (int t = 0; t < num_blocks; t++) {
            GpuTimer timer;

            const int threads_per_block = blocks[t];
            // Ceil-divide so the grid covers all n elements.
            const int no_of_blocks = (n + threads_per_block - 1) / threads_per_block;

            // Record the stop event right behind the kernel in the stream,
            // before any host-side wait, so the host's wake-up latency after
            // synchronisation is not counted as kernel time.
            timer.Start();
            device_add<<<no_of_blocks, threads_per_block>>>(d_a, d_b, d_c, n);
            timer.Stop();
            CUDA_CHECK(cudaGetLastError());      // launch-configuration errors
            CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while running
            const float elapsed_ms = timer.Elapsed();

            CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

            // Check the device result against the host inputs.
            const int bad = first_mismatch(a, b, c, n);
            if (bad >= 0) {
                fprintf(stderr, "Mismatch for N = %d, threads_per_block = %d at index %d: got %d, expected %d\n",
                        n, threads_per_block, bad, c[bad], a[bad] + b[bad]);
                status = EXIT_FAILURE;
            }

            printf("N = %d; no_of_blocks = %d; threads_per_block = %d; Elapsed time = %f ms\n", n, no_of_blocks, threads_per_block, elapsed_ms);
        }

        free(a); free(b); free(c);
        CUDA_CHECK(cudaFree(d_a));
        CUDA_CHECK(cudaFree(d_b));
        CUDA_CHECK(cudaFree(d_c));
    }

    return status;
}

N = 512; no_of_blocks = 32; threads_per_block = 16; Elapsed time = 0.163904 ms
N = 512; no_of_blocks = 16; threads_per_block = 32; Elapsed time = 0.015520 ms
N = 512; no_of_blocks = 8; threads_per_block = 64; Elapsed time = 0.011296 ms
N = 512; no_of_blocks = 4; threads_per_block = 128; Elapsed time = 0.011808 ms
N = 512; no_of_blocks = 2; threads_per_block = 256; Elapsed time = 0.011680 ms
N = 512; no_of_blocks = 1; threads_per_block = 512; Elapsed time = 0.011712 ms
N = 1024; no_of_blocks = 64; threads_per_block = 16; Elapsed time = 0.014336 ms
N = 1024; no_of_blocks = 32; threads_per_block = 32; Elapsed time = 0.012416 ms
N = 1024; no_of_blocks = 16; threads_per_block = 64; Elapsed time = 0.011168 ms
N = 1024; no_of_blocks = 8; threads_per_block = 128; Elapsed time = 0.011360 ms
N = 1024; no_of_blocks = 4; threads_per_block = 256; Elapsed time = 0.011776 ms
N = 1024; no_of_blocks = 2; threads_per_block = 512; Elapsed time = 0.011680 ms
N = 2048; no_of_blocks = 128; threads_per_block