<a href="https://colab.research.google.com/github/Neel-Dandiwala/CUDA-Programs/blob/master/clockCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
%%writefile clock.cu

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#include <cuda_runtime.h>

// checkCudaErrors(call): evaluates the CUDA runtime expression `call` exactly
// once. If the result is not cudaSuccess, the source file, line number and the
// runtime's error string are written to stderr and the process terminates
// with EXIT_FAILURE.
//
// - `call` is parenthesized so any expression can be passed safely.
// - The temporary carries a macro-specific name so that an argument which
//   itself mentions a variable called `err` is not shadowed by it.
// - The do { ... } while (0) wrapper makes the macro behave as one statement.
#define checkCudaErrors(call)                                          \
  do {                                                                 \
    const cudaError_t checkCudaErrors_status_ = (call);                \
    if (checkCudaErrors_status_ != cudaSuccess) {                      \
      fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(checkCudaErrors_status_));            \
      exit(EXIT_FAILURE);                                              \
    }                                                                  \
  } while (0)



// Per-block minimum reduction instrumented with clock() readings.
//
// Every block loads the first 2 * blockDim.x floats of `input` into dynamic
// shared memory, folds them down to their minimum and stores that value in
// output[blockIdx.x]. Thread 0 of each block records clock() on entry in
// timer[blockIdx.x] and again on exit in timer[blockIdx.x + gridDim.x].
//
// Launch requirements visible from the code:
//   - dynamic shared memory of at least 2 * blockDim.x * sizeof(float);
//   - `input` holds at least 2 * blockDim.x elements;
//   - `output` holds gridDim.x elements, `timer` holds 2 * gridDim.x.
// NOTE(review): the halving loop only visits every element when blockDim.x is
// a power of two -- confirm callers respect that.
__global__
static void timedReduction(const float *input, float *output, clock_t *timer){
    extern __shared__ float scratch[];

    const int localId = threadIdx.x;
    const int blockId = blockIdx.x;
    const int width = blockDim.x;
    const bool isFirstThread = (localId == 0);

    // Start-of-block timestamp.
    if (isFirstThread) {
        timer[blockId] = clock();
    }

    // Each thread stages two elements, one from each half of the input.
    scratch[localId] = input[localId];
    scratch[localId + width] = input[localId + width];

    int span = width;
    while (span > 0) {
        // Make the previous round's shared-memory writes visible before
        // any thread reads them.
        __syncthreads();

        if (localId < span) {
            const float lower = scratch[localId];
            const float upper = scratch[localId + span];

            // Keep the smaller of the pair in the lower slot.
            if (upper < lower) {
                scratch[localId] = upper;
            }
        }

        span /= 2;
    }

    // scratch[0] was last written by thread 0 itself, so it can publish it.
    if (isFirstThread) {
        output[blockId] = scratch[0];
    }
    __syncthreads();

    // End-of-block timestamp, stored after all the start timestamps.
    if (isFirstThread) {
        timer[blockId + gridDim.x] = clock();
    }
}

// Launch configuration for timedReduction. The kernel consumes
// 2 * NUM_THREADS input floats per block and its halving loop expects
// NUM_THREADS to be a power of two.
#define NUM_BLOCKS 32
#define NUM_THREADS 256

/*
 * Host driver for timedReduction.
 *
 * Fills a host array with the values 0 .. 2*NUM_THREADS-1, copies it to the
 * device, launches NUM_BLOCKS blocks of NUM_THREADS threads, copies the
 * per-block clock() readings back and prints the mean (end - start)
 * difference across blocks.
 *
 * The timer array holds the NUM_BLOCKS start readings followed by the
 * NUM_BLOCKS end readings. The reduction result in `doutput` is allocated
 * for the kernel to write into but is not copied back to the host.
 *
 * Returns EXIT_SUCCESS; any CUDA failure terminates the process through
 * checkCudaErrors. argc/argv are unused.
 */
int main(int argc, char **argv) {

    const size_t inputBytes = sizeof(float) * NUM_THREADS * 2;
    const size_t outputBytes = sizeof(float) * NUM_BLOCKS;
    const size_t timerBytes = sizeof(clock_t) * NUM_BLOCKS * 2;

    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;

    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++) {
        input[i] = (float)i;
    }

    checkCudaErrors(cudaMalloc((void **)&dinput, inputBytes));
    checkCudaErrors(cudaMalloc((void **)&doutput, outputBytes));
    checkCudaErrors(cudaMalloc((void **)&dtimer, timerBytes));

    checkCudaErrors(
        cudaMemcpy(dinput, input, inputBytes, cudaMemcpyHostToDevice)
    );

    // The third launch parameter sizes the kernel's dynamic shared buffer:
    // two floats per thread, i.e. the same byte count as the input array.
    timedReduction<<<NUM_BLOCKS, NUM_THREADS, inputBytes>>>(dinput, doutput, dtimer);

    // A kernel launch returns no status of its own; query it explicitly so an
    // invalid configuration is reported instead of averaging garbage timers.
    checkCudaErrors(cudaGetLastError());

    // Blocking copy on the default stream: it completes only after the
    // kernel has finished, so the timer values are final when read below.
    checkCudaErrors(
        cudaMemcpy(timer, dtimer, timerBytes, cudaMemcpyDeviceToHost)
    );

    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));

    long double avgElapsedClocks = 0;

    // timer[i] is block i's start reading, timer[i + NUM_BLOCKS] its end.
    for (int i = 0; i < NUM_BLOCKS; i++) {
        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
    }

    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
    printf("Average Elapsed Time per block: %Lf\n", avgElapsedClocks);

    return EXIT_SUCCESS;

}


Overwriting clock.cu


In [1]:
%%shell

# Report the GPU, driver version and CUDA version visible to this runtime.
nvidia-smi

Sun May 29 09:38:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces



In [35]:
%%shell

# Compile the source written by the %%writefile cell into an executable named "clock".
nvcc clock.cu -o clock
# Run it once directly; it prints the average per-block clock difference.
./clock
# Run it again under nvprof to get the GPU activity / API call breakdown.
nvprof ./clock

Average Elapsed Time per block: 2120.093750
==1538== NVPROF is profiling process 1538, command: ./clock
Average Elapsed Time per block: 2099.250000
==1538== Profiling application: ./clock
==1538== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.63%  5.4400us         1  5.4400us  5.4400us  5.4400us  timedReduction(float const *, float*, long*)
                   22.03%  2.0800us         1  2.0800us  2.0800us  2.0800us  [CUDA memcpy DtoH]
                   20.34%  1.9200us         1  1.9200us  1.9200us  1.9200us  [CUDA memcpy HtoD]
      API calls:   99.61%  279.48ms         3  93.160ms  2.8520us  279.47ms  cudaMalloc
                    0.18%  512.33us       101  5.0720us     173ns  308.26us  cuDeviceGetAttribute
                    0.14%  406.77us         1  406.77us  406.77us  406.77us  cuDeviceTotalMem
                    0.03%  87.839us         3  29.279us  3.1560us  78.928us  cudaFree
                    0.0

