## Q1: Matrix Addition


In [12]:
%%writefile 1.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
#include <omp.h>

__global__ void matrix_Add_cuda(int *a, int *b, int *c, int n) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid < n * n) {
        int row = tid / n;
        int col = tid % n;
        c[row * n + col] = a[row * n + col] + b[row * n + col];
    }
}

void matrix_Add_OpenAcc(int *a, int *b, int *c, int n) {
    #pragma acc parallel loop collapse(2) copyin(a[0:n][0:n], b[0:n][0:n]) copyout(c[0:n][0:n])
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i * n + j] = a[i * n + j] + b[i * n + j];
        }
    }
}

void matrix_Add_cpu(int *a, int *b, int *c, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i * n + j] = a[i * n + j] + b[i * n + j];
        }
    }
}

int main() {
    int sizes[] = {1024, 2048, 4096};
    int n;

    for (int idx = 0; idx < 3; idx++) {
        n = sizes[idx];
        int size = n * n * sizeof(int);

        int *a = (int*)malloc(size);
        int *b = (int*)malloc(size);
        int *c = (int*)malloc(size);

        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                a[i * n + j] = j + 1;
                b[i * n + j] = j + 1;
            }
        }

        int *d_a, *d_b, *d_c;
        cudaMalloc((void**)&d_a, size);
        cudaMalloc((void**)&d_b, size);
        cudaMalloc((void**)&d_c, size);

        cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

        // CUDA Matrix Addition
        int threadsPerBlock = 1024;
        int blocksPerGrid = (n * n + threadsPerBlock - 1) / threadsPerBlock;

        auto start_cuda = std::chrono::high_resolution_clock::now();
        matrix_Add_cuda<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
        auto end_cuda = std::chrono::high_resolution_clock::now();
        cudaDeviceSynchronize();

        cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

        std::chrono::duration<double> duration_cuda = end_cuda - start_cuda;
        printf("CUDA Matrix Addition (size %d) took: %f seconds\n", n, duration_cuda.count());

        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);

        // OpenACC Matrix Addition
        auto start_openacc = std::chrono::high_resolution_clock::now();
        matrix_Add_OpenAcc(a, b, c, n);
        auto end_openacc = std::chrono::high_resolution_clock::now();

        std::chrono::duration<double> duration_openacc = end_openacc - start_openacc;
        printf("OpenACC Matrix Addition (size %d) took: %f seconds\n", n, duration_openacc.count());

        // CPU Sequential Matrix Addition
        auto start_cpu = std::chrono::high_resolution_clock::now();
        matrix_Add_cpu(a, b, c, n);
        auto end_cpu = std::chrono::high_resolution_clock::now();

        std::chrono::duration<double> duration_cpu = end_cpu - start_cpu;
        printf("CPU Sequential Matrix Addition (size %d) took: %f seconds\n", n, duration_cpu.count());

        // Display a small part of the result matrix
        printf("Result matrix for size %d (first 3x3 block):\n", n);
        for (int i = 0; i < 3 && i < n; i++) {
            for (int j = 0; j < 3 && j < n; j++) {
                printf("%d ", c[i * n + j]);
            }
            printf("\n");
        }
        printf("\n");

        free(a);
        free(b);
        free(c);
    }

    return 0;
}


Overwriting 1.cu


In [13]:
!nvcc -o 1 1.cu

In [14]:
!./1

CUDA Matrix Addition (size 1024) took: 0.000164 seconds
OpenACC Matrix Addition (size 1024) took: 0.003666 seconds
CPU Sequential Matrix Addition (size 1024) took: 0.003463 seconds
Result matrix for size 1024 (first 3x3 block):
2 4 6 
2 4 6 
2 4 6 

CUDA Matrix Addition (size 2048) took: 0.000017 seconds
OpenACC Matrix Addition (size 2048) took: 0.013858 seconds
CPU Sequential Matrix Addition (size 2048) took: 0.013915 seconds
Result matrix for size 2048 (first 3x3 block):
2 4 6 
2 4 6 
2 4 6 

CUDA Matrix Addition (size 4096) took: 0.000019 seconds
OpenACC Matrix Addition (size 4096) took: 0.057156 seconds
CPU Sequential Matrix Addition (size 4096) took: 0.055521 seconds
Result matrix for size 4096 (first 3x3 block):
2 4 6 
2 4 6 
2 4 6 



## Q2: Histogram of Array

In [51]:
%%writefile 2.cu

#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <time.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <openacc.h>

#define ARRAY_SIZE 10000000
#define MAX_VALUE 100

__global__ void compute_histogram_cuda(int *array, int *histogram, int size) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;

    if (id < size) {
        atomicAdd(&histogram[array[id]], 1);
    }
}

void compute_histogram_OpenAcc(int *array, int *histogram, int size) {
    #pragma acc parallel loop copyin(array[0:ARRAY_SIZE]) copyout(histogram[0:ARRAY_SIZE])
     for (int i = 0; i < size; i++) {
        int value = array[i];
        histogram[value]++;
    }
}

int main() {
    int *array = (int *)malloc(ARRAY_SIZE * sizeof(int));
    int *histogram_OpenAcc = (int *)calloc(MAX_VALUE, sizeof(int));
    int *histogram_cuda = (int *)calloc(MAX_VALUE, sizeof(int));

    if (!array || !histogram_OpenAcc || !histogram_cuda) {
        fprintf(stderr, "Memory allocation failed\n");
        return 1;
    }

    srand(time(NULL));
    for (int i = 0; i < ARRAY_SIZE; i++) {
        array[i] = rand() % MAX_VALUE;
    }

////////////////////OpenAcc Computation/////////////////////////////////////////////

    auto start = std::chrono::high_resolution_clock::now();
    compute_histogram_OpenAcc(array, histogram_OpenAcc, ARRAY_SIZE);
    auto end = std::chrono::high_resolution_clock::now();

    double elapsed_time_OpenAcc = std::chrono::duration<double, std::milli>(end - start).count();

    printf("For OpenAcc: \n");
    printf("Value: ");
    for (int i = 0; i < MAX_VALUE; i++) {
        printf("%d ", i);
    }

    printf("\nCount: ");
    for (int i = 0; i < MAX_VALUE; i++) {
        printf("%d ", histogram_OpenAcc[i]);
    }
    printf("\n");

    printf("OpenAcc Execution Time: %.3f ms\n", elapsed_time_OpenAcc);

////////////////CUDA Computation/////////////////////////////////////////////////////////////////

    // CUDA memory allocation
    int *d_array, *d_histogram;
    cudaMalloc((void **)&d_array, ARRAY_SIZE * sizeof(int));
    cudaMalloc((void **)&d_histogram, MAX_VALUE * sizeof(int));

    cudaMemcpy(d_array, array, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(d_histogram, 0, MAX_VALUE * sizeof(int));

    int threadsPerBlock = 256;
    int blocksPerGrid = (ARRAY_SIZE + threadsPerBlock - 1) / threadsPerBlock;

    cudaEvent_t start_event, stop_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);

    cudaEventRecord(start_event, 0);
    compute_histogram_cuda<<<blocksPerGrid, threadsPerBlock>>>(d_array, d_histogram, ARRAY_SIZE);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);

    float elapsed_time_cuda = 0;
    cudaEventElapsedTime(&elapsed_time_cuda, start_event, stop_event);

    cudaMemcpy(histogram_cuda, d_histogram, MAX_VALUE * sizeof(int), cudaMemcpyDeviceToHost);

    printf("\nFor Cuda: \n");
    printf("Value: ");
    for (int i = 0; i < MAX_VALUE; i++) {
        printf("%d ", i);
    }

    printf("\nCount: ");
    for (int i = 0; i < MAX_VALUE; i++) {
        printf("%d ", histogram_cuda[i]);
    }
    printf("\n");

    printf("CUDA Execution Time: %.3f ms\n", elapsed_time_cuda);

    free(array);
    free(histogram_cuda);
    free(histogram_OpenAcc);
    cudaFree(d_array);
    cudaFree(d_histogram);

    return 0;
}


Overwriting 2.cu


In [52]:
!nvcc -o 2 2.cu

In [53]:
!./2

For OpenAcc: 
Value: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 
Count: 100110 99942 100042 100036 99609 99946 99950 100011 99926 99445 100273 100182 99734 99953 100102 99846 100091 99962 99820 99989 100498 99853 100042 99486 100098 99878 100162 100768 100608 100697 99908 100301 100126 99504 99698 100541 99926 100305 99530 99840 99825 99480 100466 100046 99749 99822 99821 100650 99511 100209 99974 100117 99746 99638 99710 99734 100055 100262 100498 99951 99892 100396 99967 100219 100252 99980 99960 99195 100448 100285 99803 100771 100720 100320 99705 100078 99412 100385 99497 100459 99933 99725 100240 99487 99750 100065 100669 99680 99959 100056 100367 99613 100223 99443 100351 99274 100115 99690 99665 99949 
OpenAcc Execution Time: 25.624 ms
