In [1]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpn14fvycx".


In [11]:
%%cuda
#include <stdio.h>

struct GpuTimer {
      cudaEvent_t start;
      cudaEvent_t stop;

      GpuTimer()
      {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
      }

      ~GpuTimer()
      {
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
      }

      void Start()
      {
            cudaEventRecord(start, 0);
      }

      void Stop()
      {
            cudaEventRecord(stop, 0);
      }

      float Elapsed()
      {
            float elapsed;
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsed, start, stop);
            return elapsed;
      }
};




__global__ void device_add(int *a, int *b, int *c, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)
        c[index] = a[index] + b[index];
}

void fill_array(int* array, int n) {
    for (int i = 0; i < n; i++) {
        array[i] = i; // Example fill operation
    }
}

int main(void) {
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    int sizes[] = {512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576,
               2097152, 4194304, 8388608, 16777216};
    int blocks[] = { 32, 64, 128, 256, 512};
    int no_of_blocks = 0;

    printf("N; no_of_blocks; threads_per_block; Elapsed time\n");

    for(int s = 0; s < sizeof(sizes) / sizeof(sizes[0]); s++) {
        int n = sizes[s];
        int size = n * sizeof(int);

        a = (int *)malloc(size); fill_array(a, n);
        b = (int *)malloc(size); fill_array(b, n);
        c = (int *)malloc(size);

        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, size);

        cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

        for (int t = 0; t < sizeof(blocks) / sizeof(blocks[0]); t++) {
            GpuTimer timer;

            int threads_per_block = blocks[t];
            no_of_blocks = (n + threads_per_block - 1) / threads_per_block;

            timer.Start();
            device_add<<<no_of_blocks, threads_per_block>>>(d_a, d_b, d_c, n);
            cudaDeviceSynchronize();
            timer.Stop();

            cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

            printf("%d;%d;%d;%f\n", n, no_of_blocks, threads_per_block,timer.Elapsed());
        }

        free(a); free(b); free(c);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    }

    return 0;
}

N; no_of_blocks; threads_per_block; Elapsed time
512;16;32;0.200352
512;8;64;0.014976
512;4;128;0.010784
512;2;256;0.011648
512;1;512;0.011328
1024;32;32;0.016416
1024;16;64;0.017184
1024;8;128;0.012160
1024;4;256;0.011424
1024;2;512;0.010976
2048;64;32;0.018400
2048;32;64;0.014720
2048;16;128;0.014496
2048;8;256;0.013088
2048;4;512;0.014624
4096;128;32;0.014688
4096;64;64;0.012480
4096;32;128;0.012000
4096;16;256;0.014912
4096;8;512;0.012640
8192;256;32;0.013632
8192;128;64;0.012704
8192;64;128;0.011136
8192;32;256;0.012288
8192;16;512;0.010976
16384;512;32;0.011680
16384;256;64;0.018560
16384;128;128;0.013280
16384;64;256;0.012096
16384;32;512;0.011424
32768;1024;32;0.017856
32768;512;64;0.012480
32768;256;128;0.012416
32768;128;256;0.012704
32768;64;512;0.013312
65536;2048;32;0.025568
65536;1024;64;0.016352
65536;512;128;0.013248
65536;256;256;0.013024
65536;128;512;0.013408
131072;4096;32;0.026272
131072;2048;64;0.018656
131072;1024;128;0.015744
131072;512;256;0.014752
131072;256;5