

# Frist cuda program!



In [72]:
%%writefile vector_addition.cu

#include <iostream>
#include <cuda_runtime.h>

__global__ void vector_addition(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    const int N = 10000;
    float A[N], B[N], C[N];

    // Initialize input arrays
    for (int i = 0; i < N; i++) {
        A[i] = static_cast<float>(i);
        B[i] = static_cast<float>(i * 2);
    }

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_c, N * sizeof(float));

    cudaMemcpy(d_a, A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, B, N * sizeof(float), cudaMemcpyHostToDevice);

    int blocksize = 256;
    int gridsize = (N + blocksize - 1) / blocksize;

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record the start event
    cudaEventRecord(start);

    // Launch the kernel
    vector_addition<<<gridsize, blocksize>>>(d_a, d_b, d_c, N);

    // Record the stop event
    cudaEventRecord(stop);

    // Wait for the stop event to complete
    cudaEventSynchronize(stop);

    // Calculate elapsed time in milliseconds
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "GPU kernel execution time: " << milliseconds << " ms" << std::endl;

    // Copy result back from device to host
    cudaMemcpy(C, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Optionally, print the results
    // for (int i = 0; i < N; i++) {
    //     std::cout << "C[" << i << "] = " << C[i] << std::endl;
    // }

    // Clean up
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Overwriting vector_addition.cu


In [73]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [74]:
!nvidia-smi

Tue Mar 11 17:27:05 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [75]:
!nvcc vector_addition.cu -o vector_addition

In [76]:
!./vector_addition

GPU kernel execution time: 7.82541 ms


# C program as comparision

In [63]:
%%writefile vector_addition_cpu.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Function to perform vector addition on the CPU
void vector_addition(const float* A, const float* B, float* C, int N) {
    for (int i = 0; i < N; i++) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    const int N = 10000; // Use a large size for better timing comparisons
    float *A = (float*) malloc(N * sizeof(float));
    float *B = (float*) malloc(N * sizeof(float));
    float *C = (float*) malloc(N * sizeof(float));

    // Initialization of arrays (not included in the timed section)
    for (int i = 0; i < N; i++) {
        A[i] = (float) i;
        B[i] = (float)(i * 2);
    }

    // Start timer for the vector addition only
    clock_t start = clock();

    vector_addition(A, B, C, N);

    clock_t end = clock();
    double cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("CPU vector addition (excluding initialization) took %f seconds.\n", cpu_time_used);

    // Optionally, check a few results for correctness
    // for (int i = 0; i < 10; i++) {
    //     printf("C[%d] = %f\n", i, C[i]);
    // }

    free(A);
    free(B);
    free(C);

    return 0;
}


Overwriting vector_addition_cpu.c


In [64]:
!gcc --version

gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [65]:
!gcc -o vector_addition_cpu vector_addition_cpu.c

In [66]:
!./vector_addition_cpu

CPU vector addition (excluding initialization) took 0.000056 seconds.
