<a href="https://colab.research.google.com/github/Gurjot-Singh-2002/UCS645-Lab-Assignments/blob/main/Assignment%206/Assignment_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Gurjot Singh 102203582 3CO14

# Program 1

In [16]:
%%writefile sqrt_cuda.cu
#include <stdio.h>
#include <math.h>
#include <cuda.h>

// CUDA kernel to compute the square root of each element in array A and store in C
__global__ void sqrtKernel(float *A, float *C, int N) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;  // Global thread index
    if (idx < N)
        C[idx] = sqrtf(A[idx]);  // Compute square root of A[idx] and store in C[idx]
}

// Function to allocate memory, copy data, launch kernel, and measure time
void runSqrt(int N) {
    float *A, *C;        // Host arrays
    float *d_A, *d_C;    // Device arrays

    size_t size = N * sizeof(float);  // Total memory size

    // Allocate memory on host
    A = (float *)malloc(size);
    C = (float *)malloc(size);

    // Initialize array A with values 1 to N
    for (int i = 0; i < N; i++)
        A[i] = (float)(i + 1);

    // Allocate memory on device (GPU)
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_C, size);

    // Copy input array A from host to device
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);

    // Set the number of threads per block and blocks per grid
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Start recording time
    cudaEventRecord(start);

    // Launch the CUDA kernel
    sqrtKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, N);

    // Stop recording time
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);  // Wait for kernel to finish

    // Calculate elapsed time in milliseconds
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Print the size and execution time
    printf("N = %d, Time = %.4f ms\n", N, milliseconds);

    // Copy result array C from device to host
    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    // Free allocated memory
    cudaFree(d_A);
    cudaFree(d_C);
    free(A);
    free(C);

    // Destroy CUDA events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

// Main function to test with different input sizes
int main() {
    runSqrt(50000);
    runSqrt(500000);
    runSqrt(5000000);
    runSqrt(50000000);
    return 0;
}


Overwriting sqrt_cuda.cu


Program Compilation---

In [17]:
!nvcc --gpu-architecture=sm_70 sqrt_cuda.cu -o sqrt_cuda


Program Execution---

In [18]:
!./sqrt_cuda

N = 50000, Time = 0.1689 ms
N = 500000, Time = 0.0274 ms
N = 5000000, Time = 0.1908 ms
N = 50000000, Time = 1.8307 ms
