<a href="https://colab.research.google.com/github/Kingsley-Yoimiya/cuda-learning/blob/main/simple_practice/CUDA_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Add nvcc support for Jupyter: install and load the nvcc4jupyter extension, then print the nvcc version.
!pip install nvcc4jupyter

%load_ext nvcc4jupyter

!nvcc --version

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpjxzhldmf".
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [4]:
%%cuda

#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>

#include <cuda.h>
#include <cuda_runtime.h>

using namespace std;

// Threads per block used by reduce_sum when it launches reduce_sum_kernel.
#define BLOCK_SIZE 256

/**
 * Accumulates a flat array of n * dim floats into dim output slots.
 *
 * Element input_vecs[i] is added atomically to output_vec[i % dim], so when
 * the input is read as n consecutive vectors of length dim, output_vec[j]
 * receives the sum of component j over all vectors.
 *
 * The kernel only accumulates: it never clears output_vec, so the result is
 * a plain sum only if output_vec holds zeros before the launch.
 *
 * Launch layout: 1-D grid of 1-D blocks. Any grid size is valid because each
 * thread walks the input with a stride of gridDim.x * blockDim.x. No shared
 * memory is used.
 *
 * @param input_vecs  device pointer to n * dim floats (read only)
 * @param n           number of vectors
 * @param dim         length of each vector / number of output slots
 * @param output_vec  device pointer to dim floats that are accumulated into
 */
__global__ void reduce_sum_kernel(const float* input_vecs, size_t n, size_t dim, float* output_vec) {
    const size_t total = n * dim;
    // blockIdx/blockDim/gridDim are 32-bit; widen BEFORE multiplying so the
    // products cannot wrap when the input has more than 2^32 elements.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t idx = first; idx < total; idx += stride) {
        atomicAdd(&output_vec[idx % dim], input_vecs[idx]);
    }
}

// Reports a failed CUDA runtime call on stderr and terminates the process.
static void reduce_sum_check(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

/**
 * Host wrapper: sums n host-resident vectors of length dim on the GPU.
 *
 * Copies input_vecs (n * dim floats) to the device, zeroes a dim-float
 * device buffer, launches reduce_sum_kernel over it, and copies the dim
 * sums back into output_vec. Device buffers are allocated and released
 * inside this call.
 *
 * Any failing CUDA call is reported on stderr and terminates the process,
 * because the void return type offers no other way to signal failure.
 *
 * @param input_vecs  host pointer to n * dim floats
 * @param n           number of vectors
 * @param dim         length of each vector
 * @param output_vec  host pointer receiving dim floats
 */
void reduce_sum(const float* input_vecs, size_t n, size_t dim, float* output_vec) {
    if (dim == 0) {
        return;  // nothing to write
    }
    if (n == 0) {
        // An empty sum is zero; a zero-block launch would be an invalid configuration.
        for (size_t j = 0; j < dim; ++j) {
            output_vec[j] = 0.0f;
        }
        return;
    }

    const size_t total = n * dim;
    const size_t input_size = total * sizeof(float);
    const size_t output_size = dim * sizeof(float);
    const size_t grid_size = (total + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // The launch takes a 32-bit grid dimension; refuse sizes the device
    // cannot express instead of letting the value be narrowed silently.
    int device = 0;
    int max_grid_x = 0;
    reduce_sum_check(cudaGetDevice(&device), "cudaGetDevice");
    reduce_sum_check(cudaDeviceGetAttribute(&max_grid_x, cudaDevAttrMaxGridDimX, device),
                     "cudaDeviceGetAttribute");
    if (grid_size > static_cast<size_t>(max_grid_x)) {
        std::cerr << "reduce_sum: input needs " << grid_size
                  << " blocks, device limit is " << max_grid_x << std::endl;
        std::exit(EXIT_FAILURE);
    }

    float* cu_input_vecs = nullptr;
    float* cu_output_vecs = nullptr;
    reduce_sum_check(cudaMalloc((void**) &cu_input_vecs, input_size), "cudaMalloc(input)");
    reduce_sum_check(cudaMalloc((void**) &cu_output_vecs, output_size), "cudaMalloc(output)");
    reduce_sum_check(cudaMemcpy(cu_input_vecs, input_vecs, input_size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");
    // The kernel only accumulates, so the output buffer must start at zero.
    reduce_sum_check(cudaMemset(cu_output_vecs, 0, output_size), "cudaMemset(output)");

    reduce_sum_kernel <<< static_cast<unsigned int>(grid_size), BLOCK_SIZE >>>(cu_input_vecs, n, dim, cu_output_vecs);
    reduce_sum_check(cudaGetLastError(), "reduce_sum_kernel launch");        // bad launch configuration
    reduce_sum_check(cudaDeviceSynchronize(), "reduce_sum_kernel execution"); // faults raised while running

    reduce_sum_check(cudaMemcpy(output_vec, cu_output_vecs, output_size, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(device to host)");
    reduce_sum_check(cudaFree(cu_input_vecs), "cudaFree(input)");
    reduce_sum_check(cudaFree(cu_output_vecs), "cudaFree(output)");
}

// Benchmark size: main() fills N floats and passes them to reduce_sum as
// N / T vectors of length T.
const long long N = 1000000000LL;
const int T = 1000;

// Source of the random input values: a Mersenne Twister seeded from the
// system clock, sampled through a uniform float distribution with bounds 0 and 1.
std::uniform_real_distribution<float> u(0.0f, 1.0f);
std::mt19937 rnd(std::chrono::system_clock::now().time_since_epoch().count());

/**
 * Benchmark driver.
 *
 * Fills N random floats, sums them as N / T vectors of length T once with
 * reduce_sum (GPU) and once with a plain host loop, prints the elapsed
 * seconds of each, reports the first component whose two sums differ by
 * more than 1, and finally prints the first GPU sum.
 */
int main() {
    // reduce_sum receives N / T whole vectors; the host reference below
    // covers exactly the same elements even if N is not a multiple of T.
    const long long rows = N / T;
    const long long used = rows * T;

    float* input_vecs = new float[N];
    float* output_vec = new float[T];
    float* correct_vec = new float[T];

    // 64-bit counter: N is a long long and need not fit in an int.
    for (long long i = 0; i < N; ++i) {
        input_vecs[i] = u(rnd);
    }

    std::cerr << "GENERATE OK!" << std::endl;

    // steady_clock measures elapsed time; clock() counts process CPU time,
    // which is not the same thing while the host waits for the device.
    auto st = std::chrono::steady_clock::now();
    reduce_sum(input_vecs, rows, T, output_vec);
    auto ed = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration<double>(ed - st).count() << std::endl;

    st = std::chrono::steady_clock::now();

    for (int i = 0; i < T; ++i) {
        correct_vec[i] = 0;
    }
    // NOTE: the reference is accumulated in float as well, so it carries
    // rounding error of its own; the check below uses an absolute tolerance of 1.
    for (long long i = 0; i < used; ++i) {
        correct_vec[i % T] += input_vecs[i];
    }

    ed = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration<double>(ed - st).count() << std::endl;

    for (int i = 0; i < T; ++i) {
        // std::fabs keeps the comparison in floating point regardless of
        // which abs overloads happen to be visible.
        if (std::fabs(correct_vec[i] - output_vec[i]) > 1) {
            std::cout << correct_vec[i] << " " << output_vec[i] << " ERROR!" << std::endl;
            break;
        }
    }

    std::cout << output_vec[0] << std::endl;
    delete[] input_vecs;
    delete[] output_vec;
    delete[] correct_vec;  // was leaked before

    return 0;
}


GENERATE OK!
0.991417
3.25612
500030

