### Name:
> Castillo, Marvien Angel C. <br>
> Herrera, Mikhaela Gabrielle B. <br>
> Regindin, Sean Adrien I. <br>

# Setup Environment

In [2]:
import os

# Add the directory containing the executable to the PATH
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin"

# Check if the directory is added to the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin


# Check if CUDA is present

In [3]:
%%bash
nvcc --version
nvprof --version
nsys --version
ncu --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Apr__9_19:24:57_PDT_2025
Cuda compilation tools, release 12.9, V12.9.41
Build cuda_12.9.r12.9/compiler.35813241_0
nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2025 NVIDIA Corporation
Release version 12.9.19 (21)
NVIDIA Nsight Systems version 2025.1.3.140-251335620677v0
NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2025 NVIDIA Corporation
Version 2025.2.0.0 (build 35613519) (public-release)


In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Nov  6 06:01:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.51.03              Driver Version: 575.51.03      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-PCIE-32GB           Off |   00000000:00:10.0 Off |                    0 |
| N/A   27C    P0             22W /  250W |       0MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Variant 1 - C Program

In [25]:
%%writefile C_var1.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
// ***C function version
void kernel_C(float A[], float B[], float C[], size_t n, int idx[]) {
	for (int i = 0; i < n; i++) {
		if (A[i] >= B[i]) {
			C[i] = A[i];
			idx[i] = 0;
		}
		else {
			C[i] = B[i];
			idx[i] = 1;
		}
	}
}

int main(int argc, char** argv)
{
   const size_t ARRAY_SIZE = 1<<24;
   const size_t INT_ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
   const size_t FLOAT_ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
//number of times the program is to be executed
   const size_t loope = 10;
//declare array
   float *C,*A,*B;
   int *idx,a;
   A = (float*)malloc(FLOAT_ARRAY_BYTES);
   B = (float*)malloc(FLOAT_ARRAY_BYTES);
   C = (float*)malloc(FLOAT_ARRAY_BYTES);
   idx = (int*)malloc(INT_ARRAY_BYTES);
   a=2;
//timer variables
  clock_t start, end;
// ***--- initialize your array here ---------
   int i;
	for (i = 0; i < ARRAY_SIZE; i++) {
		A[i] = sin(i * 0.0005) * 100.0 + 50.0;
		B[i] = cos(i * 0.0003) * 100.0 + 50.0;
	}
// fill-in cache
    kernel_C(A,B,C,ARRAY_SIZE,idx);
//time here
  double elapse, time_taken;
  elapse = 0.0f;
  for (int i=0; i<loope; i++){
    start = clock();
      kernel_C(A,B,C,ARRAY_SIZE,idx );
    end = clock();
    time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
    elapse = elapse + time_taken;
  }
  printf("Function (in C) average time for %lu loops is %f milliseconds to execute an array size %lu \n", loope, elapse/loope, ARRAY_SIZE);

// error checking routine here --
   size_t err_count = 0;
   int sanity_checker = 0;
   for (int i = 0; i < ARRAY_SIZE; i++) { 
        float expected_C = (A[i] >= B[i]) ? A[i] : B[i];
        int expected_idx = (A[i] >= B[i]) ? 0 : 1;
    
        if (fabs(C[i] - expected_C) > 1e-5 || idx[i] != expected_idx) {
            sanity_checker++;
        }
    }
  printf("Sanity Checker = %d",sanity_checker);
  
  // Free memory
  free(A);
  free(B);
  free(C);
  free(idx);
  return 0;
}

Overwriting C_var1.c


In [26]:
%%bash
gcc C_var1.c -o C_var1 -lm

In [27]:
%%bash
./C_var1

Function (in C) average time for 10 loops is 216.433500 milliseconds to execute an array size 16777216 
Sanity Checker = 0

# Variant 2 - Grid Stride Loop

In [7]:
%%writefile CUDA_var2.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
//Grid stride loop

//*** CUDA kernel
__global__
void kernel_C(size_t n, float A[],float B[],float C[],int idx[]){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
    {
        if (A[i] >= B[i]) {
            C[i] = A[i];
            idx[i] = 0;
        }
        else {
            C[i] = B[i];
            idx[i] = 1;
        }
    }
}

int main(){
  const size_t ARRAY_SIZE = 1<<24;
  const size_t INT_ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
  const size_t FLOAT_ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
 //number of times the program is to be executed
   const size_t loope = 10;
//declare array
  float *A,*B,*C; 
  int *idx;
  cudaMallocManaged(&A, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&B, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&C, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&idx, INT_ARRAY_BYTES);
// *** init array
  int i;
  for (i = 0; i < ARRAY_SIZE; i++) {
    A[i] = sin(i * 0.0005) * 100.0 + 50.0;
    B[i] = cos(i * 0.0003) * 100.0 + 50.0;
  }

// *** setup CUDA kernel
  size_t numThreads = 1024;
  size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function = Float kernel_C\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks,numThreads);
  for (size_t i=0; i<loope;i++)
    kernel_C <<<numBlocks, numThreads>>> (ARRAY_SIZE,A,B,C,idx);
//barrier
    cudaDeviceSynchronize();
  int sanity_checker = 0;
  for (int i = 0; i < ARRAY_SIZE; i++) { 
        float expected_C = (A[i] >= B[i]) ? A[i] : B[i];
        int expected_idx = (A[i] >= B[i]) ? 0 : 1;
    
        if (fabs(C[i] - expected_C) > 1e-5 || idx[i] != expected_idx) {
            sanity_checker++;
        }
    }
  printf("Sanity Checker = %d",sanity_checker);
//free memory
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  cudaFree(idx);
  return 0;
}


Overwriting CUDA_var2.cu


In [8]:
%%bash
nvcc CUDA_var2.cu -o CUDA_var2 -Wno-deprecated-gpu-targets

In [9]:
%%bash
nvprof ./CUDA_var2

==1029411== NVPROF is profiling process 1029411, command: ./CUDA_var2


*** function = Float kernel_C
numElements = 16777216
numBlocks = 16384, numThreads = 1024 
Error count(CUDA program): 0
Sanity Checker = 0

==1029411== Profiling application: ./CUDA_var2
==1029411== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  121.64ms        10  12.164ms  343.23us  118.54ms  kernel_C(unsigned long, float*, float*, float*, int*)
      API calls:   92.34%  1.84365s         4  460.91ms  96.183us  1.84151s  cudaMallocManaged
                    6.08%  121.47ms         1  121.47ms  121.47ms  121.47ms  cudaDeviceSynchronize
                    1.38%  27.578ms         4  6.8945ms  6.8165ms  7.0277ms  cudaFree
                    0.12%  2.4785ms        10  247.85us  13.916us  2.2560ms  cudaLaunchKernel
                    0.04%  819.71us       114  7.1900us     132ns  320.00us  cuDeviceGetAttribute
                    0.02%  321.05us         1  321.05us  321.05us  321.05us  cuDeviceGetName
                    0.01%  169.63us         2  84.816us  6.8810us  162.75us  cuDeviceGet
                    0.01%  110.88us         1  110.88us 

# Variant 3.0 - Grid Stride Loop with Prefetch

In [10]:
%%writefile CUDA_var3.cu
// prefetch
#include <stdio.h>
#include <stdlib.h>

//CUDA kernel
__global__
void kernel_C(size_t n, float A[],float B[],float C[],int idx[]){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
    {
        if (A[i] >= B[i]) {
            C[i] = A[i];
            idx[i] = 0;
        }
        else {
            C[i] = B[i];
            idx[i] = 1;
        }
    }
}

int main(){
  const size_t ARRAY_SIZE = 1<<24;
  const size_t INT_ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
  const size_t FLOAT_ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
 //number of times the program is to be executed
   const size_t loope = 10;
//declare array
  float *A,*B,*C; 
  int *idx;
  cudaMallocManaged(&A, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&B, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&C, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&idx, INT_ARRAY_BYTES);

//get gpu id
  int device = -1;
  cudaGetDevice(&device);

// ****init array
  int i;
  for (i = 0; i < ARRAY_SIZE; i++) {
    A[i] = sin(i * 0.0005) * 100.0 + 50.0;
    B[i] = cos(i * 0.0003) * 100.0 + 50.0;
  }
 //"Prefetch data" from CPU-GPU
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,device,NULL);

// setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function ***\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks, numThreads);
  for (size_t i=0; i<loope;i++)
    kernel_C <<<numBlocks, numThreads>>> (ARRAY_SIZE,A,B,C,idx);
//barrier
    cudaDeviceSynchronize(); 

//"Prefetch data" from GPU-CPU
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,cudaCpuDeviceId,NULL);

  int sanity_checker = 0;
  for (int i = 0; i < ARRAY_SIZE; i++) { 
        float expected_C = (A[i] >= B[i]) ? A[i] : B[i];
        int expected_idx = (A[i] >= B[i]) ? 0 : 1;
    
        if (fabs(C[i] - expected_C) > 1e-5 || idx[i] != expected_idx) {
            sanity_checker++;
        }
    }
  printf("Sanity Checker = %d",sanity_checker);
//free memory
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  cudaFree(idx);
  return 0;
}

Overwriting CUDA_var3.cu


In [11]:
%%bash
nvcc CUDA_var3.cu -o CUDA_var3 -Wno-deprecated-gpu-targets

In [12]:
%%bash
nvprof ./CUDA_var3

==1029474== NVPROF is profiling process 1029474, command: ./CUDA_var3


*** function ***
numElements = 16777216
numBlocks = 16384, numThreads = 1024 
Error count(CUDA program): 0
Sanity Checker = 0

==1029474== Profiling application: ./CUDA_var3
==1029474== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.5372ms        10  353.72us  351.49us  357.69us  kernel_C(unsigned long, float*, float*, float*, int*)
      API calls:   86.38%  1.34499s         4  336.25ms  49.601us  1.34403s  cudaMallocManaged
                   10.70%  166.56ms         8  20.820ms  17.280us  50.832ms  cudaMemPrefetchAsync
                    1.70%  26.521ms         4  6.6303ms  5.2011ms  7.4492ms  cudaFree
                    0.96%  14.982ms        10  1.4982ms  16.385us  14.784ms  cudaLaunchKernel
                    0.22%  3.3656ms         1  3.3656ms  3.3656ms  3.3656ms  cudaDeviceSynchronize
                    0.03%  438.96us       114  3.8500us     116ns  169.75us  cuDeviceGetAttribute
                    0.01%  119.81us         1  119.81us  119.81us  119.81us  cuDeviceGetName
                    0.00%  24.232us         1  

# Variant 4.0 - Grid Stride Loop with Prefetch and Page Creation

In [13]:
%%writefile CUDA_var4.cu
//prefetch + page creation
// page creation responsible gpu fault page
#include <stdio.h>
#include <stdlib.h>

//CUDA kernel
__global__
void kernel_C(size_t n, float A[],float B[],float C[],int idx[]){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
    {
        if (A[i] >= B[i]) {
            C[i] = A[i];
            idx[i] = 0;
        }
        else {
            C[i] = B[i];
            idx[i] = 1;
        }
    }
}


int main(){
  const size_t ARRAY_SIZE = 1<<24;
  const size_t INT_ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
  const size_t FLOAT_ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
 //number of times the program is to be executed
   const size_t loope = 10;
//declare array
  float *A,*B,*C; 
  int *idx;
  cudaMallocManaged(&A, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&B, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&C, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&idx, INT_ARRAY_BYTES);
//get gpu id
  int device = -1;
  cudaGetDevice(&device);

//"prefetch data" to create CPU page memory
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
//"prefetch data" to create GPU page memory
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,device,NULL);

// ****init array
  int i;
  for (i = 0; i < ARRAY_SIZE; i++) {
    A[i] = sin(i * 0.0005) * 100.0 + 50.0;
    B[i] = cos(i * 0.0003) * 100.0 + 50.0;
  }
//"Prefetch data" from CPU-GPU
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,device,NULL);

// setup CUDA kernel
    size_t numThreads = 1024;   
    size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function ***\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks, numThreads);
  for (size_t i=0; i<loope;i++)
    kernel_C <<<numBlocks, numThreads>>> (ARRAY_SIZE,A,B,C,idx);
//barrier
    cudaDeviceSynchronize();

  //"Prefetch data" from GPU-CPU
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,cudaCpuDeviceId,NULL);

  int sanity_checker = 0;
  for (int i = 0; i < ARRAY_SIZE; i++) { 
        float expected_C = (A[i] >= B[i]) ? A[i] : B[i];
        int expected_idx = (A[i] >= B[i]) ? 0 : 1;
    
        if (fabs(C[i] - expected_C) > 1e-5 || idx[i] != expected_idx) {
            sanity_checker++;
        }
    }
  printf("Sanity Checker = %d",sanity_checker);

//free memory
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  cudaFree(idx);
  return 0;
}

Overwriting CUDA_var4.cu


In [14]:
%%bash
nvcc CUDA_var4.cu -o CUDA_var4 -Wno-deprecated-gpu-targets

In [15]:
%%bash
nvprof ./CUDA_var4

==1029542== NVPROF is profiling process 1029542, command: ./CUDA_var4


*** function ***
numElements = 16777216
numBlocks = 16384, numThreads = 1024 
Error count(CUDA program): 0
Sanity Checker = 0

==1029542== Profiling application: ./CUDA_var4
==1029542== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.6539ms        10  365.39us  363.52us  368.93us  kernel_C(unsigned long, float*, float*, float*, int*)
      API calls:   81.20%  1.61421s         4  403.55ms  61.586us  1.61294s  cudaMallocManaged
                   16.87%  335.35ms        12  27.946ms  24.579us  82.191ms  cudaMemPrefetchAsync
                    0.99%  19.707ms         4  4.9268ms  3.9704ms  6.4406ms  cudaFree
                    0.71%  14.101ms        10  1.4101ms  10.534us  13.927ms  cudaLaunchKernel
                    0.18%  3.5237ms         1  3.5237ms  3.5237ms  3.5237ms  cudaDeviceSynchronize
                    0.03%  614.96us       114  5.3940us     127ns  241.68us  cuDeviceGetAttribute
                    0.01%  240.02us         1  240.02us  240.02us  240.02us  cuDeviceGetName
                    0.01%  136.75us         1  

# Variant 5.0 - Grid Stride Loop with Prefetch and Page Creation + mem advise

In [20]:
%%writefile CUDA_var5.cu
//prefetch + page creation + memadvise

#include <stdio.h>
#include <stdlib.h>

//CUDA kernel
__global__
void kernel_C(size_t n, float A[],float B[],float C[],int idx[]){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
    {
        if (A[i] >= B[i]) {
            C[i] = A[i];
            idx[i] = 0;
        }
        else {
            C[i] = B[i];
            idx[i] = 1;
        }
    }
}

int main(){
  const size_t ARRAY_SIZE = 1<<24;
  const size_t INT_ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
  const size_t FLOAT_ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
 //number of times the program is to be executed
   const size_t loope = 10;
//declare array
  float *A,*B,*C; 
  int *idx;
  cudaMallocManaged(&A, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&B, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&C, FLOAT_ARRAY_BYTES);
  cudaMallocManaged(&idx, INT_ARRAY_BYTES);

//get gpu id
  int device = -1;
  cudaGetDevice(&device);

// memory advise
   cudaMemAdvise(A, FLOAT_ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
   cudaMemAdvise(A, FLOAT_ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);
   cudaMemAdvise(B, FLOAT_ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
   cudaMemAdvise(B, FLOAT_ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);

//"prefetch data" to create CPU page memory
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
//"prefetch data" to create GPU page memory
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,device,NULL);

// ****init array
  int i;
  for (i = 0; i < ARRAY_SIZE; i++) {
    A[i] = sin(i * 0.0005) * 100.0 + 50.0;
    B[i] = cos(i * 0.0003) * 100.0 + 50.0;
  }


 //"Prefetch data" from CPU-GPU
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,device,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,device,NULL);

// setup CUDA kernel
    size_t numThreads = 1024;   // what if numThreads>1024, numThreads<1024?
    size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function ***\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks, numThreads);
  for (size_t i=0; i<loope;i++)
    kernel_C <<<numBlocks, numThreads>>> (ARRAY_SIZE,A,B,C,idx);
//barrier
    cudaDeviceSynchronize();

  //"Prefetch data" from GPU-CPU
  cudaMemPrefetchAsync(C,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(A,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(B,FLOAT_ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(idx,INT_ARRAY_BYTES,cudaCpuDeviceId,NULL);


  int sanity_checker = 0;
  for (int i = 0; i < ARRAY_SIZE; i++) { 
        float expected_C = (A[i] >= B[i]) ? A[i] : B[i];
        int expected_idx = (A[i] >= B[i]) ? 0 : 1;
    
        if (fabs(C[i] - expected_C) > 1e-5 || idx[i] != expected_idx) {
            sanity_checker++;
        }
    }
  printf("Sanity Checker = %d",sanity_checker);

//free memory
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  cudaFree(idx);
  return 0;

}


Overwriting CUDA_var5.cu


In [22]:
%%bash
nvcc CUDA_var5.cu -o CUDA_var5 -Wno-deprecated-gpu-targets

In [23]:
%%bash
nvprof ./CUDA_var5

==1030060== NVPROF is profiling process 1030060, command: ./CUDA_var5


*** function ***
numElements = 16777216
numBlocks = 16384, numThreads = 1024 
Sanity Checker = 0

==1030060== Profiling application: ./CUDA_var5
==1030060== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.6554ms        10  365.54us  362.75us  369.47us  kernel_C(unsigned long, float*, float*, float*, int*)
      API calls:   81.04%  1.48778s         4  371.95ms  87.888us  1.48727s  cudaMallocManaged
                   16.51%  303.05ms        12  25.254ms  196.99us  76.991ms  cudaMemPrefetchAsync
                    2.03%  37.232ms         4  9.3079ms  6.8002ms  12.693ms  cudaFree
                    0.20%  3.5807ms         1  3.5807ms  3.5807ms  3.5807ms  cudaDeviceSynchronize
                    0.13%  2.4616ms        10  246.16us  7.6480us  2.3594ms  cudaLaunchKernel
                    0.07%  1.2301ms       114  10.790us     159ns  655.91us  cuDeviceGetAttribute
                    0.01%  192.43us         1  192.43us  192.43us  192.43us  cuDeviceGetName
                    0.01%  176.62us         4  

# Variant 6.0 - CUDA classic MEMCPY

In [38]:
%%writefile CUDA_var6.cu
//grid stride loop + memcpy
// BFF PALANG TOH SO FAR + onting research sa websites HJSHAS
// UNFINISHED

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

//CUDA kernel
__global__
void kernel_C(size_t n, float A[],float B[],float C[],int idx[]){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
    {
        if (A[i] >= B[i]) {
            C[i] = A[i];
            idx[i] = 0;
        }
        else {
            C[i] = B[i];
            idx[i] = 1;
        }
    }
}

int main(){
  const size_t ARRAY_SIZE = 1<<24;
  const size_t INT_ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
  const size_t FLOAT_ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
 //number of times the program is to be executed
   const size_t loope = 10;

float *h_A = (float*)malloc(FLOAT_ARRAY_BYTES);
float *h_B = (float*)malloc(FLOAT_ARRAY_BYTES);
float *h_C = (float*)malloc(FLOAT_ARRAY_BYTES);
int *h_idx = (int*)malloc(INT_ARRAY_BYTES);

// ****init host array
  int i;
  for (i = 0; i < ARRAY_SIZE; i++) {
    h_A[i] = sinf(i * 0.0005) * 100.0 + 50.0;
    h_B[i] = cosf(i * 0.0003) * 100.0 + 50.0;
  }

// device allocations
  float *d_A,*d_B,*d_C;
  int *d_idx;
  cudaMalloc(&d_A, FLOAT_ARRAY_BYTES); //we dont use cudaMallocManaged with cudaMemcpy
  cudaMalloc(&d_B, FLOAT_ARRAY_BYTES);
  cudaMalloc(&d_C, FLOAT_ARRAY_BYTES);
  cudaMalloc(&d_idx, INT_ARRAY_BYTES);

//get gpu id
  int device = -1;
  cudaGetDevice(&device);

// Copy data to device
    cudaMemcpy(d_A, h_A, FLOAT_ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, FLOAT_ARRAY_BYTES, cudaMemcpyHostToDevice);

// Kernel launch configuration
    int threadsPerBlock = 256;
    int blocksPerGrid = (ARRAY_SIZE + threadsPerBlock - 1) / threadsPerBlock;
    
  for (size_t i=0; i<loope;i++){
    kernel_C <<<blocksPerGrid, threadsPerBlock>>> (ARRAY_SIZE, d_A, d_B, d_C, d_idx);
    cudaDeviceSynchronize();
  }

    
// Copy results back to host
    cudaMemcpy(h_C, d_C, FLOAT_ARRAY_BYTES, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_idx, d_idx, INT_ARRAY_BYTES, cudaMemcpyDeviceToHost);

    // Verification loop
    int errors = 0;
    for (int i = 0; i < ARRAY_SIZE; i++) { 
        float expected_C = (h_A[i] >= h_B[i]) ? h_A[i] : h_B[i];
        int expected_idx = (h_A[i] >= h_B[i]) ? 0 : 1;
    
        if (fabs(h_C[i] - expected_C) > 1e-5 || h_idx[i] != expected_idx) {
            errors++;
        }
    }
    printf("Errors = %d", errors);

// Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_idx);

// Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_idx);

  return 0;
}

Overwriting CUDA_var6.cu


In [39]:
%%bash
nvcc CUDA_var6.cu -o CUDA_var6 -Wno-deprecated-gpu-targets

In [40]:
!./CUDA_var6

Errors = 0

In [41]:
%%bash
nvprof ./CUDA_var6

==1025309== NVPROF is profiling process 1025309, command: ./CUDA_var6


Errors = 0

==1025309== Profiling application: ./CUDA_var6
==1025309== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   73.64%  507.42ms         2  253.71ms  247.51ms  259.91ms  [CUDA memcpy DtoH]
                   25.86%  178.22ms         2  89.109ms  63.545ms  114.67ms  [CUDA memcpy HtoD]
                    0.50%  3.4523ms        10  345.23us  343.58us  349.02us  kernel_C(unsigned long, float*, float*, float*, int*)
      API calls:   62.67%  1.19623s         4  299.06ms  209.42us  1.19512s  cudaMalloc
                   36.78%  702.08ms         4  175.52ms  65.015ms  265.80ms  cudaMemcpy
                    0.19%  3.5435ms        10  354.35us  335.69us  410.42us  cudaDeviceSynchronize
                    0.18%  3.4627ms         4  865.68us  498.18us  1.3809ms  cudaFree
                    0.12%  2.2218ms        10  222.18us  12.424us  2.0468ms  cudaLaunchKernel
                    0.04%  714.79us       114  6.2700us     124