<a href="https://colab.research.google.com/github/MichaelGelo/GRP2_CEPARCO_IP/blob/main/CEPARCO_IP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Group 2 - Implementing Hyyrö’s Bit Vector Algorithm Using CUDA SIMT**
## **GROUP 2 - S11**

**MEMBERS:**

- Alfred Bastin S. Agustines
- Allan David C. De Leon
- Michael Angelo Depasucat
- Kai Hiori J. Padilla


## Check if CUDA is present

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

## Integrating Project

In [None]:
%%writefile CUDA.cu

#include <stdio.h>
#include <stdlib.h>
// Prefetch + page creation + memadvise

// CUDA saxpy kernel: out[i] = a * x[i] + y[i] for every i in [0, n).
//
// Parameters:
//   n   - number of elements to process
//   a   - scalar multiplier applied to x
//   out - destination array, at least n elements
//   x,y - read-only input arrays, at least n elements each
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so any
// grid/block size (including a single thread) covers all n elements.
// No shared memory is used.
__global__
void saxpy(size_t n, float a, float *out, const float *x, const float *y){
    // Widen to size_t BEFORE multiplying: blockIdx.x * blockDim.x is a
    // 32-bit product otherwise, and an int counter compared against a
    // size_t bound can overflow for large n.
    const size_t first  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = first; i < n; i += stride)
        out[i] = a * x[i] + y[i];

}

// Fatal check: a required CUDA runtime call failed, so report it and stop.
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t status_ = (call);                                         \
    if (status_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error at line %d: %s (%s)\n", __LINE__,       \
              cudaGetErrorString(status_), #call);                        \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Non-fatal check for memory hints (advise / prefetch). A rejected hint is
// reported and the run continues. The recorded error is cleared so a later
// cudaGetLastError() after a kernel launch does not pick it up.
#define CUDA_HINT(call)                                                   \
  do {                                                                    \
    cudaError_t status_ = (call);                                         \
    if (status_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA warning at line %d: %s (%s)\n", __LINE__,     \
              cudaGetErrorString(status_), #call);                        \
      (void)cudaGetLastError();                                           \
    }                                                                     \
  } while (0)

// Driver program: fills two managed arrays on the host, runs the saxpy
// kernel several times on the GPU, verifies the result on the host and
// prints the number of mismatching elements. Returns 0 on completion;
// exits with EXIT_FAILURE if a required CUDA call fails.
int main(){
  const size_t ARRAY_SIZE = 1<<24;
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

  // number of times the kernel is launched
  const size_t numRuns = 10;

  // declare arrays in unified (managed) memory
  float *x, *y, *out;
  CUDA_CHECK(cudaMallocManaged(&x, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&y, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&out, ARRAY_BYTES));

  // get gpu id
  int device = -1;
  CUDA_CHECK(cudaGetDevice(&device));

  // memory advise for the two input arrays
  CUDA_HINT(cudaMemAdvise(x, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_HINT(cudaMemAdvise(x, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
  CUDA_HINT(cudaMemAdvise(y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_HINT(cudaMemAdvise(y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));

  // "prefetch data" to create CPU page memory
  CUDA_HINT(cudaMemPrefetchAsync(x, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_HINT(cudaMemPrefetchAsync(y, ARRAY_BYTES, cudaCpuDeviceId, NULL));

  // "prefetch data" to create GPU page memory
  CUDA_HINT(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL));

  // *** init array
  for (size_t i=0; i<ARRAY_SIZE; i++){
     x[i] = (float)i;
     y[i] = (float)(ARRAY_SIZE - i);
  }
  float a = 2.0f;

  // "Prefetch data" from CPU-GPU
  CUDA_HINT(cudaMemPrefetchAsync(x, ARRAY_BYTES, device, NULL));
  CUDA_HINT(cudaMemPrefetchAsync(y, ARRAY_BYTES, device, NULL));

  // *** setup CUDA kernel (ceil-div so every element has a thread)
  size_t numThreads = 1024;
  size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function = SAXPY\n");
  // %zu is the portable conversion for size_t
  printf("numElements = %zu\n", ARRAY_SIZE);
  printf("numBlocks = %zu, numThreads = %zu \n",numBlocks,numThreads);
  for (size_t run=0; run<numRuns; run++){
    saxpy<<<numBlocks, numThreads>>>(ARRAY_SIZE, a, out, x, y);
    // a launch returns nothing; a bad configuration shows up only here
    CUDA_CHECK(cudaGetLastError());
  }

  // barrier: also reports faults raised while the kernels executed
  CUDA_CHECK(cudaDeviceSynchronize());

  // "Prefetch data" from GPU-CPU, and wait for it before the host reads out
  CUDA_HINT(cudaMemPrefetchAsync(out, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_CHECK(cudaDeviceSynchronize());

  // error checking
  // NOTE(review): exact float equality relies on host and device rounding
  // identically for these inputs; use a tolerance if the inputs change.
  size_t err_count = 0;
  for (size_t i=0; i<ARRAY_SIZE; i++){
    if(out[i] != (a * x[i] + y[i]))
      err_count++;
  }
  printf("Error count(CUDA program): %zu\n", err_count);

  // free memory
  CUDA_CHECK(cudaFree(x));
  CUDA_CHECK(cudaFree(y));
  CUDA_CHECK(cudaFree(out));

  return 0;
}

In [None]:
%%shell
# Compile CUDA.cu with nvcc into an executable named CUDA.
nvcc CUDA.cu -o CUDA

In [None]:
%%shell
# Run the compiled CUDA executable under the nvprof profiler.
nvprof ./CUDA