<a href="https://colab.research.google.com/github/MichaelGelo/GRP2_DeepDive__CUDA/blob/main/DeepDive_CUDA_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Group 2 - DeepDive CUDA - Histogram counting (Atomic operations)**
## **GROUP 2 - S11**

**MEMBERS:**

- Alfred Bastin S. Agustines
- Allan David C. De Leon
- Michael Angelo Depasucat
- Kai Hiori J. Padilla


# Check if CUDA is present

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Feb 19 18:55:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# (1) C Histogram program

In [17]:
%%writefile C_histo.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>

#define NUM_BINS 10

void getHistogram(int* vector, int vectorSize, int* histogram) {
    for (int i = 0; i < vectorSize; i++) {
        int bIndex = vector[i] % 10;
        histogram[bIndex]++;
    }
}

int main() {
    const size_t ARRAY_SIZE = 1<<24;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);

    int32_t *vector;
    vector = (int32_t*)malloc(ARRAY_BYTES);
    int histogram[NUM_BINS] = { 0 };
    const size_t loope = 100;

    for (int i = 0; i < ARRAY_SIZE; i++) {
        vector[i] = i;
    }

    clock_t start, end;
    getHistogram(vector, ARRAY_SIZE, histogram);


    for (int i = 0; i < 10; i++) {
        histogram[i] = 0;
    }

    double elapse, time_taken;
    elapse = 0.0f;
    for (int i = 0; i < loope; i++) {
        for (int i = 0; i < 10; i++) {
            histogram[i] = 0;
        }
        start = clock();
        getHistogram(vector, ARRAY_SIZE, histogram);
        end = clock();
        time_taken = ((double)(end - start)) * 1E3 / CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
    }
    printf("Function (in C) average time for %lu loops is %f milliseconds to execute an array size %lu \n", loope, elapse / loope, ARRAY_SIZE);

    size_t err_count = 0;
    int32_t  test[NUM_BINS] = { 0 };

    for (int i = 0; i < ARRAY_SIZE; i++) {
        int bIndex = vector[i] % 10;
        test[bIndex]++;
    }
    for (int i = 0; i < 10; i++) {

        if (test[i] != histogram[i])
            err_count++;
    }
    printf("Error count (C program): %lu\n", err_count);

    for (int i = 0; i < NUM_BINS; i++) {
        printf("Bin %d: %d\n", i, histogram[i]);
    }

    free(vector);
	return 0;
}

Overwriting C_histo.c


In [18]:
%%shell
gcc C_histo.c -o C_histo



In [19]:
%%shell
./C_histo

Function (in C) average time for 100 loops is 69.097810 milliseconds to execute an array size 16777216 
Error count (C program): 0
Bin 0: 1677722
Bin 1: 1677722
Bin 2: 1677722
Bin 3: 1677722
Bin 4: 1677722
Bin 5: 1677722
Bin 6: 1677721
Bin 7: 1677721
Bin 8: 1677721
Bin 9: 1677721




# (2) CUDA program w/ Unified memory, pre-fetching and memadvise (Atomic Operations)

In [27]:
%%writefile cuda.cu
#include <stdio.h>
#include <stdlib.h>

#define VECTOR_SIZE (1 << 28)
#define NUM_LOOPE 30
#define NUM_BINS 10

// CUDA kernel using atomic operations
__global__ void histogramKernel(int *d_data, int *d_histogram, int size) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = index; i < size; i += stride) {
        int bin = d_data[i] % NUM_BINS;
        atomicAdd(&d_histogram[bin], 1);
    }
}

int main() {
    const size_t ARRAY_SIZE = VECTOR_SIZE;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    //number of times the program is to be executed
    const size_t loope = NUM_LOOPE;

    //declare array
    int *data, *histogram;
    cudaMallocManaged(&data, ARRAY_BYTES);
    cudaMallocManaged(&histogram, NUM_BINS * sizeof(int));

    //get gpu id
    int device = -1;
    cudaGetDevice(&device);

    // memory advise
    cudaMemAdvise(data, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(data, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);

    //"prefetch data" to create CPU page memory
    cudaMemPrefetchAsync(data, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(histogram, NUM_BINS * sizeof(int), device, NULL);

    // *** init array
    for (size_t i = 0; i < VECTOR_SIZE; i++) {
        data[i] = i;
    }

    //"Prefetch data" from CPU-GPU
    cudaMemPrefetchAsync(data, ARRAY_BYTES, device, NULL);
    cudaMemPrefetchAsync(histogram, NUM_BINS * sizeof(int), device, NULL);

    // *** setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (VECTOR_SIZE + numThreads - 1) / numThreads;

    printf("*** function = Histogram Counting\n");
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i = 0; i < loope; i++) {
        cudaMemset(histogram, 0, NUM_BINS * sizeof(int));
        histogramKernel<<<numBlocks, numThreads>>>(data, histogram, VECTOR_SIZE);
    }

    //barrier
    cudaDeviceSynchronize();

    //"Prefetch data" from GPU-CPU
    cudaMemPrefetchAsync(histogram, NUM_BINS * sizeof(int), cudaCpuDeviceId, NULL);

    //error checking
    size_t err_count = 0;
    int referenceHistogram[NUM_BINS] = {0};
    for (size_t i = 0; i < VECTOR_SIZE; i++) {
        int bin = data[i] % NUM_BINS;
        referenceHistogram[bin]++;
    }
    for (int i = 0; i < NUM_BINS; i++) {
        if (histogram[i] != referenceHistogram[i]) {
            printf("Mismatch at bin %d: CUDA=%d, CPU=%d\n", i, histogram[i], referenceHistogram[i]);
            err_count++;
        }
    }
    printf("Error count (CUDA program): %zu\n", err_count);

    // Print results
    for (int i = 0; i < NUM_BINS; i++) {
        printf("Bin %d: %d\n", i, histogram[i]);
    }

    //free memory
    cudaFree(data);
    cudaFree(histogram);

    return 0;
}

Overwriting cuda.cu


In [28]:
%%shell
nvcc cuda.cu -o cuda



In [29]:
%%shell
nvprof ./cuda

==12256== NVPROF is profiling process 12256, command: ./cuda
*** function = Histogram Counting
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count (CUDA program): 0
Bin 0: 26843546
Bin 1: 26843546
Bin 2: 26843546
Bin 3: 26843546
Bin 4: 26843546
Bin 5: 26843546
Bin 6: 26843545
Bin 7: 26843545
Bin 8: 26843545
Bin 9: 26843545
==12256== Profiling application: ./cuda
==12256== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.90893s        30  63.631ms  61.040ms  108.16ms  histogramKernel(int*, int*, int)
                    0.00%  52.479us        30  1.7490us  1.5680us  4.0000us  [CUDA memset]
      API calls:   76.81%  1.84828s        30  61.609ms  57.878us  108.18ms  cudaMemset
                   14.30%  344.17ms         5  68.833ms  28.015us  243.45ms  cudaMemPrefetchAsync
                    4.53%  109.03ms         2  54.517ms  69.544us  108.97ms  cudaMallocManaged
                    

