In [None]:
!nvidia-smi

In [None]:
%%writefile parallel_reduction_seq_addressing.cu

#include <iostream>
#include <cuda_runtime.h>

__global__ void initVectors(int *vector, int n){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n){ // each thread initializes an element in the vector
        vector[idx] = idx;
    }
}

__global__ void reduce_seq_addressing(int *g_in_data, int *g_out_data, int n){
    extern __shared__ int sdata[];  // stored in the shared memory

    // Each thread loading one element from global onto shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Load or pad with 0 (handle edge cases where n is not a multiple of blockDim)
    sdata[tid] = (i < n) ? g_in_data[i] : 0;
    __syncthreads();

    // Reduction method -- occurs in shared memory
    for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        // reduce_seq_addressing -- check out the reverse loop above
        if (tid < s){   // then, we check threadID to do our computation
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0){
        g_out_data[blockIdx.x] = sdata[0];
    }
}

int main(){
    int N = 50000000;
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    int n = N;

    // CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int *d_vector;
    cudaMalloc(&d_vector, N * sizeof(int));

    // Time initialization
    cudaEventRecord(start);
    initVectors<<<blocksPerGrid, threadsPerBlock>>>(d_vector, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float initTime = 0;
    cudaEventElapsedTime(&initTime, start, stop);

    int *d_in = d_vector; 
    int *d_out;

    // Time reduction
    cudaEventRecord(start);

    int totalIterations = 0;
    while (n > 1) {
        int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

        cudaMalloc(&d_out, blocksPerGrid * sizeof(int));

        reduce_seq_addressing<<<blocksPerGrid, threadsPerBlock,
                threadsPerBlock * sizeof(int)>>>(d_in, d_out, n);

        cudaDeviceSynchronize();

        if (d_in != d_vector)
            cudaFree(d_in);

        d_in = d_out;
        n = blocksPerGrid;
        totalIterations++;
    }

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float reduce_seq_addressingTime = 0;
    cudaEventElapsedTime(&reduce_seq_addressingTime, start, stop);

    int result;
    cudaMemcpy(&result, d_in, sizeof(int), cudaMemcpyDeviceToHost);

    // Calculate bandwidth
    int totalDataRead = N * sizeof(int); // Initial read
    int totalDataWritten = N * sizeof(int); // Writes during reduction
    float totalDataGB = (totalDataRead + totalDataWritten) / (1024.0f * 1024.0f * 1024.0f);
    float totalTimeSeconds = (initTime + reduce_seq_addressingTime) / 1000.0f;
    float bandwidthGBps = totalDataGB / totalTimeSeconds;

    // Results
    std::cout << "========================================" << std::endl;
    std::cout << "CUDA Parallel Reduction Profiling" << std::endl;
    std::cout << "========================================" << std::endl;
    std::cout << "Array Size (N):          " << N << std::endl;
    std::cout << "Threads Per Block:       " << threadsPerBlock << std::endl;
    std::cout << "Reduction Iterations:    " << totalIterations << std::endl;
    std::cout << "----------------------------------------" << std::endl;
    std::cout << "Initialization Time:     " << initTime << " ms" << std::endl;
    std::cout << "Reduction Time:          " << reduce_seq_addressingTime << " ms" << std::endl;
    std::cout << "Total Time:              " << (initTime + reduce_seq_addressingTime) << " ms" << std::endl;
    std::cout << "----------------------------------------" << std::endl;
    std::cout << "Effective Bandwidth:     " << bandwidthGBps << " GB/s" << std::endl;
    std::cout << "========================================" << std::endl;
    std::cout << "Sum Result:              " << result << std::endl;
    std::cout << "Expected:  " << (N * (N - 1) / 2) << std::endl;
    std::cout << "========================================" << std::endl;

    // Cleanup
    cudaFree(d_vector);
    cudaFree(d_in);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

Overwriting parallel_reduction_seq_addressing.cu


In [10]:
!nvcc -arch=sm_75 parallel_reduction_seq_addressing.cu -o parallel_reduction_seq_addressing
!./parallel_reduction_seq_addressing

CUDA Parallel Reduction Profiling
Array Size (N):          50000000
Threads Per Block:       256
Reduction Iterations:    4
----------------------------------------
Initialization Time:     0.956416 ms
Reduction Time:          4.99706 ms
Total Time:              5.95347 ms
----------------------------------------
Effective Bandwidth:     62.5734 GB/s
Sum Result:              1283106752
Expected:  -864376896


**the improvement of computational time shows with bigger input array sizes** (5.95 ms for 50million input size) and interleaved addressing 8.61491 ms