In [1]:
%%writefile vector_add_bandwidth.cu
#include <stdio.h>
#include <cuda.h>
#include <chrono>

#define N 1024

// Statically defined device arrays
__device__ int d_A[N];
__device__ int d_B[N];
__device__ int d_C[N];

// Kernel: Add vectors
__global__ void vectorAdd() {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N) {
        d_C[i] = d_A[i] + d_B[i];
    }
}

// Host arrays
int h_A[N], h_B[N], h_C[N];

int main() {
    // Fill host arrays
    for (int i = 0; i < N; ++i) {
        h_A[i] = i;
        h_B[i] = 2 * i;
    }

    // Copy host arrays to statically defined device memory
    cudaMemcpyToSymbol(d_A, h_A, sizeof(int) * N);
    cudaMemcpyToSymbol(d_B, h_B, sizeof(int) * N);

    // Timing kernel execution
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;

    cudaEventRecord(start);
    vectorAdd<<<numBlocks, blockSize>>>();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Copy result back
    cudaMemcpyFromSymbol(h_C, d_C, sizeof(int) * N);

    // Print sample result
    printf("Sample output: h_C[100] = %d\n", h_C[100]);

    // --- Device Properties ---
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    float memClock = prop.memoryClockRate; // kHz
    int memBusWidth = prop.memoryBusWidth; // bits

    float theoreticalBW = 2.0 * memClock * memBusWidth / 8.0 / 1e6; // GB/s
    printf("Theoretical Bandwidth: %.2f GB/s\n", theoreticalBW);

    // --- Measured Bandwidth ---
    int bytesRead = sizeof(int) * N * 2; // A and B
    int bytesWritten = sizeof(int) * N; // C
    float seconds = milliseconds / 1000.0;
    float measuredBW = (bytesRead + bytesWritten) / seconds / 1e9; // GB/s
    printf("Measured Bandwidth: %.2f GB/s\n", measuredBW);

    return 0;
}


Writing vector_add_bandwidth.cu


In [2]:
!nvcc -arch=sm_75 vector_add_bandwidth.cu -o vector_add_bandwidth
!./vector_add_bandwidth

Sample output: h_C[100] = 300
Theoretical Bandwidth: 320.06 GB/s
Measured Bandwidth: 0.17 GB/s


In [3]:
!nvprof ./vector_add_bandwidth

==462== NVPROF is profiling process 462, command: ./vector_add_bandwidth
Sample output: h_C[100] = 300
Theoretical Bandwidth: 320.06 GB/s
Measured Bandwidth: 0.37 GB/s
==462== Profiling application: ./vector_add_bandwidth
==462== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   43.15%  3.3280us         1  3.3280us  3.3280us  3.3280us  vectorAdd(void)
                   30.29%  2.3360us         1  2.3360us  2.3360us  2.3360us  [CUDA memcpy DtoH]
                   26.56%  2.0480us         2  1.0240us  1.0240us  1.0240us  [CUDA memcpy HtoD]
      API calls:   99.56%  88.483ms         2  44.242ms  7.7480us  88.475ms  cudaMemcpyToSymbol
                    0.18%  161.04us         1  161.04us  161.04us  161.04us  cudaGetDeviceProperties
                    0.15%  132.99us       114  1.1660us     107ns  55.893us  cuDeviceGetAttribute
                    0.03%  26.320us         1  26.320us  26.320us  26.320us  cudaLaunchKer

In [None]:
#nvprof- NVIDIA's command line profiler for CUDA programs. Think of it like a detective that watches your gpu code run and tells you stuff like:
#Which kernels are being launched
##How long they take
#How much memory they're using
#Whether your GPU is sitting there twiddling its thumbs