In [1]:
!nvidia-smi

Mon Jan 26 12:39:59 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [29]:
%%writefile vector_add.cu
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

// Fills the two input vectors on the device: a[i] = i and b[i] = 2 * i
// (stored as float) for every i in [0, n).
//
// Each thread computes one global index from the x dimension of the launch
// and writes that single element of both arrays. Threads whose index falls
// at or beyond n exit without touching memory.
__global__ void initVectors(float *a, float *b, int n){
    const int gid = threadIdx.x + (blockIdx.x * blockDim.x);
    if (gid >= n){
        return;  // surplus thread in the last block
    }

    a[gid] = gid * 1.0f;
    b[gid] = gid * 2.0f;
}

// Element-wise sum of two vectors: c[i] = a[i] + b[i] for every i in [0, n).
//
// Each thread computes one global index from the x dimension of the launch
// and produces that single output element. Threads whose index falls at or
// beyond n exit without reading or writing.
__global__ void addVectors(const float *a, const float *b, float *c, int n){
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= n){
        return;  // surplus thread in the last block
    }

    const float lhs = a[gid];
    const float rhs = b[gid];
    c[gid] = lhs + rhs;
}

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the error string with file/line context to stderr and terminates the program.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Initializes two N-element vectors on the device, adds them, and copies the
// result into pinned host memory. Both kernels and the device-to-host copy are
// timed separately with CUDA events. The result is then checked against
// i * 3.0f on the host.
//
// Returns EXIT_SUCCESS when every element matches and EXIT_FAILURE when a
// mismatch is found. Any failing CUDA runtime call terminates the program
// through CUDA_CHECK.
int main(){
    const int N = 50000000;
    // Keep the byte count in size_t so a larger N cannot overflow a 32-bit int.
    const size_t size = static_cast<size_t>(N) * sizeof(float);

    // Allocate pinned (page-locked) host memory for the result
    float *h_c = nullptr;
    CUDA_CHECK(cudaMallocHost(&h_c, size));

    // Allocate device memory
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    const int threadsPerBlock = 256;
    // Ceiling division so the last, partially filled block is still launched.
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Separate timing for each kernel and for the copy
    cudaEvent_t start1, stop1, start2, stop2, start3, stop3;
    CUDA_CHECK(cudaEventCreate(&start1)); CUDA_CHECK(cudaEventCreate(&stop1));
    CUDA_CHECK(cudaEventCreate(&start2)); CUDA_CHECK(cudaEventCreate(&stop2));
    CUDA_CHECK(cudaEventCreate(&start3)); CUDA_CHECK(cudaEventCreate(&stop3));

    // Time initVectors kernel
    CUDA_CHECK(cudaEventRecord(start1));
    initVectors<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, N);
    CUDA_CHECK(cudaEventRecord(stop1));
    // A kernel launch returns no status itself; a bad launch configuration is
    // only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Time addVectors kernel
    CUDA_CHECK(cudaEventRecord(start2));
    addVectors<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    CUDA_CHECK(cudaEventRecord(stop2));
    CUDA_CHECK(cudaGetLastError());

    // Time memory copy
    CUDA_CHECK(cudaEventRecord(start3));
    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop3));

    // Wait for the last event before reading any elapsed time.
    CUDA_CHECK(cudaEventSynchronize(stop3));

    float ms1 = 0.0f, ms2 = 0.0f, ms3 = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms1, start1, stop1));
    CUDA_CHECK(cudaEventElapsedTime(&ms2, start2, stop2));
    CUDA_CHECK(cudaEventElapsedTime(&ms3, start3, stop3));

    printf("\n=== Timing Results ===\n");
    printf("initVectors kernel:  %f ms\n", ms1);
    printf("addVectors kernel:   %f ms\n", ms2);
    printf("Memory copy D->H:    %f ms\n", ms3);
    printf("Total kernel time:   %f ms\n", ms1 + ms2);
    printf("Total time:          %f ms\n", ms1 + ms2 + ms3);

    // Show the first few results
    for (int i = 0; i < 10; i++){
        std::cout << "h_c[" << i << "] = " << h_c[i] << std::endl;
    }

    // Verify every element against the host-computed reference i * 3.0f,
    // stopping at the first mismatch.
    bool success = true;
    for (int i = 0; i < N; i++){
        const float expected = i * 3.0f;
        if (std::fabs(h_c[i] - expected) > 1e-5){
            success = false;
            std::cout << "Error at idx " << i << ": expected " << expected << ", got " << h_c[i] << std::endl;
            break;
        }
    }
    if (success)
        std::cout << "true!" << std::endl;
    else
        std::cout << "false" << std::endl;

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start1)); CUDA_CHECK(cudaEventDestroy(stop1));
    CUDA_CHECK(cudaEventDestroy(start2)); CUDA_CHECK(cudaEventDestroy(stop2));
    CUDA_CHECK(cudaEventDestroy(start3)); CUDA_CHECK(cudaEventDestroy(stop3));
    CUDA_CHECK(cudaFree(d_a)); CUDA_CHECK(cudaFree(d_b)); CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFreeHost(h_c));

    // Report verification failure through the exit status as well.
    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}

Overwriting vector_add.cu


In [30]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add
!./vector_add


=== Timing Results ===
initVectors kernel:  1.867808 ms
addVectors kernel:   2.290432 ms
Memory copy D->H:    16.049984 ms
Total kernel time:   4.158240 ms
Total time:          20.208223 ms
h_c[0] = 0
h_c[1] = 3
h_c[2] = 6
h_c[3] = 9
h_c[4] = 12
h_c[5] = 15
h_c[6] = 18
h_c[7] = 21
h_c[8] = 24
h_c[9] = 27
true!


# CUDA Vector Addition Analysis

## What This Code Does
Parallel vector addition on GPU: creates two 50M-element vectors, adds them, and measures performance.

## The Kernels

**initVectors**: Initializes vectors on GPU
```cuda
a[i] = i;      // [0, 1, 2, 3, ...]
b[i] = 2*i;    // [0, 2, 4, 6, ...]
```

**addVectors**: Adds vectors element-wise in each thread
```cuda
c[i] = a[i] + b[i];  // [0, 3, 6, 9, ...]
```

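**Launch config**: 195,313 blocks × 256 threads ≈ 50M parallel threads (50,000,128 are launched; the 128 surplus threads in the last block are skipped by the `idx < n` bounds check)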

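Using pinned (page-locked, non-pageable) host memory lets the GPU transfer data directly to and from host RAM via DMA, without an intermediate staging copy, which makes the device-to-host copy considerably faster.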