In [None]:
!nvidia-smi

Tue Nov 19 15:54:08 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
%load_ext nvcc4jupyter

In [11]:
%%cuda

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <type_traits>

// Evaluates a CUDA runtime call and forwards its cudaError_t, together with the
// call site's file and line, to gpuAssert. The do { } while (0) wrapper makes the
// macro expand to exactly one statement, so `gpuErrorCheck(x);` stays well-formed
// inside an unbraced if/else.
#define gpuErrorCheck(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a failed CUDA runtime call and optionally terminates the process.
 *
 * @param ec    Status code returned by the CUDA call being checked.
 * @param file  Source file of the call site (normally __FILE__).
 * @param line  Source line of the call site (normally __LINE__).
 * @param abort When true (the default), exit with `ec` as the process exit code
 *              after printing the diagnostic; when false, only print it.
 */
inline void gpuAssert(cudaError_t ec, const char* file, int line, bool abort=true) {
    // Nothing to report for a successful call.
    if (ec == cudaSuccess) {
        return;
    }

    const char* description = cudaGetErrorString(ec);
    fprintf(stderr,  "GPUassert: %s %s %d\n", description, file, line);

    if (!abort) {
        return;
    }
    exit(ec);
}

/**
 * Element-wise sum of two square, row-major int32 matrices into an int64 matrix.
 *
 * Launch layout: any 1D or 2D grid/block configuration is valid. Each thread
 * starts at its own (row, column) and advances by the total number of launched
 * threads along each axis, so grids that are larger or smaller than the matrix
 * both produce a complete result with every element written exactly once.
 * No shared memory is used.
 *
 * @param m1    Device pointer to the first operand, size*size elements.
 * @param m2    Device pointer to the second operand, size*size elements.
 * @param r     Device pointer to the output, size*size elements.
 * @param size  Edge length of the matrices; non-positive values are a no-op.
 */
__global__ void sum_matrix(int32_t* m1, int32_t* m2, int64_t* r, int size) {
    if (size <= 0) {
        return;
    }

    // Index in 64 bits: size*size overflows int once size exceeds 46340.
    const size_t n = static_cast<size_t>(size);

    const size_t first_col  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t first_row  = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    const size_t col_stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t row_stride = static_cast<size_t>(gridDim.y) * blockDim.y;

    // Row and column are bounded separately so a thread whose column lies past
    // the matrix edge never wraps around into the following row.
    for (size_t row = first_row; row < n; row += row_stride) {
        const size_t row_offset = row * n;
        for (size_t col = first_col; col < n; col += col_stride) {
            const size_t gid = row_offset + col;
            // Widen before adding so the sum cannot overflow 32 bits.
            r[gid] = static_cast<int64_t>(m1[gid]) + m2[gid];
        }
    }
}

/**
 * Prints "<message> time : <seconds>" to stdout.
 *
 * @param start   clock() value taken before the measured section.
 * @param end     clock() value taken after the measured section.
 * @param message Label printed in front of the timing; a null pointer is
 *                printed as an empty label.
 */
void print_time(const clock_t& start, const clock_t& end, const char* message) {
    const double seconds = static_cast<double>(end - start) / CLOCKS_PER_SEC;
    // The label is passed as an argument, never as the format string itself,
    // so a '%' inside it is printed literally instead of being interpreted.
    printf("%s time : %4.6f \n", message ? message : "", seconds);
}

/**
 * Releases every given device allocation with cudaFree.
 *
 * Each cudaFree result is passed to gpuAssert individually, so a failure on any
 * pointer is reported (and, with gpuAssert's default, terminates the process),
 * not only a failure on the last one. Calling it with no arguments is a no-op.
 *
 * @param pointers Device pointers to free.
 */
template <typename... Args>
void device_free(Args*... pointers) {
    // Comma fold: one checked cudaFree per pointer, evaluated left to right.
    (gpuAssert(cudaFree(pointers), __FILE__, __LINE__), ...);
}

/**
 * Adds two 1024x1024 int32 matrices on the GPU, copies the int64 result back
 * and verifies it element by element against a host-side sum. The clock()
 * time of each stage is printed.
 *
 * @return 0 when every element matches, -1 on the first mismatch. Any failing
 *         CUDA call terminates the process through gpuErrorCheck.
 */
int main (int argc, char** argv) {
    (void)argc;
    (void)argv;

    const size_t size_x { 1024 }, size_y { 1024 };
    const size_t total_size { size_y * size_x };
    // sum_matrix receives a single edge length, so the matrix must be square.
    static_assert(size_x == size_y, "sum_matrix expects a square matrix");
    clock_t start{}, end{};

    srand((unsigned)time(NULL));

    // Cpu Data
    start = clock();
    std::vector<int32_t> matrix1(total_size);
    std::vector<int32_t> matrix2(total_size);
    std::vector<int64_t> results(total_size);
    // Draw a fresh value per element: the fill constructor would evaluate
    // rand() once and replicate it, which would hide indexing errors from the
    // validity check below.
    for (size_t i = 0; i < total_size; ++i) {
        matrix1[i] = static_cast<int32_t>(rand() & 0xFF);
        matrix2[i] = static_cast<int32_t>(rand() & 0xFF);
    }
    end = clock();
    print_time(start, end, "Vector Init");

    // Device Pointers
    int32_t *device_matrix1{}, *device_matrix2{};
    int64_t *device_results{};

    const size_t input_bytes  = sizeof(int32_t) * total_size;
    const size_t result_bytes = sizeof(int64_t) * total_size;

    // Device Allocation
    start = clock();
    gpuErrorCheck(cudaMalloc(&device_matrix1, input_bytes));
    gpuErrorCheck(cudaMalloc(&device_matrix2, input_bytes));
    gpuErrorCheck(cudaMalloc(&device_results, result_bytes));

    gpuErrorCheck(cudaMemcpy(device_matrix1, matrix1.data(), input_bytes, cudaMemcpyHostToDevice));
    gpuErrorCheck(cudaMemcpy(device_matrix2, matrix2.data(), input_bytes, cudaMemcpyHostToDevice));
    end = clock();
    print_time(start, end, "GPU Alloc & HtoD Data Transfer");

    dim3 block(32,32);
    // Ceil-division: exactly enough blocks to cover the matrix, with no extra
    // row/column of blocks when the size is a multiple of the block edge.
    dim3 grid(static_cast<unsigned int>((size_x + block.x - 1) / block.x),
              static_cast<unsigned int>((size_y + block.y - 1) / block.y));

    start = clock();
    sum_matrix<<<grid, block>>>(device_matrix1, device_matrix2, device_results, static_cast<int>(size_x));
    gpuErrorCheck(cudaGetLastError());       // invalid launch configuration
    gpuErrorCheck(cudaDeviceSynchronize());  // faults raised while the kernel ran
    end = clock();
    print_time(start, end, "Sum Matrix");

    start = clock();
    gpuErrorCheck(cudaMemcpy(results.data(), device_results, result_bytes, cudaMemcpyDeviceToHost));
    end = clock();
    print_time(start, end, "Device to Host Data Transfer");

    // Validity Check
    bool valid = true;
    for (size_t i = 0; i < total_size; ++i) {
        const int64_t expected = static_cast<int64_t>(matrix1[i]) + matrix2[i];
        if (results[i] != expected) {
            printf("!!! Validity Check is not confirmed !!!\n");
            // int64_t values are cast to long long to match the %lld specifier.
            printf("result[%zu] %lld != %lld\n", i,
                   static_cast<long long>(results[i]), static_cast<long long>(expected));
            valid = false;
            break;
        }
    }

    // Single cleanup path shared by the success and failure cases.
    device_free(device_matrix1, device_matrix2, device_results);
    gpuErrorCheck(cudaDeviceReset());

    if (!valid) {
        return -1;
    }
    printf("Validity Check is confirmed √\n");
    return 0;
}


Vector Init time : 0.017083 
GPU Alloc & HtoD Data Transfer time : 0.181355 
Sum Matrix time : 0.000307 
Device to Host Data Transfer time : 0.002030 
Validity Check is confirmed √

