**Q1. Write a simple CUDA kernel that takes an array of integers and doubles each element.**

In [3]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Source files will be saved in "/var/folders/pf/fm0v0g_52r754cfv3r4lqv980000gn/T/tmpefn0a7_d".


In [8]:
! touch add_basic.cu
"""
#include <stdio.h>
#include <cuda_runtime.h>

using namespace std;

// Doubles, in place, every element of a device array holding `count` ints.
// Each thread handles at most one element: the one at its flat 1D global
// index. Threads whose index is >= count return without touching memory,
// so the launch may round the grid up past `count`.
// Despite the name `add_basic`, the kernel multiplies each element by two.
__global__ void add_basic(int *data, int count)
{
    // Flat position of this thread across the whole 1D grid
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have nothing to do
    if (gid >= count) {
        return;
    }

    data[gid] = data[gid] * 2;
}

// Prints a diagnostic to stderr when a CUDA runtime call has failed.
// `what` names the step being checked and appears in the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills an array with 0..n-1, doubles it on the GPU with
// add_basic, and prints the first few results.
// Returns 0 on success and 1 if any allocation, copy, or kernel step fails.
int main()
{
    const int n = 1024;                    // Size of the array
    const size_t bytes = n * sizeof(int);  // Size of each buffer in bytes

    // Allocate host memory
    int *h_data = (int*)malloc(bytes);
    if (h_data == NULL) {
        fprintf(stderr, "Failed to allocate host memory for %d ints\n", n);
        return 1;
    }

    // Initialize host array
    for (int i = 0; i < n; i++) {
        h_data[i] = i;  // Example data
    }

    // Allocate device memory
    int *d_data = NULL;
    if (!cudaOk(cudaMalloc((void**)&d_data, bytes), "cudaMalloc")) {
        free(h_data);
        return 1;
    }

    // Copy data from host to device
    bool ok = cudaOk(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice),
                     "host-to-device copy");

    if (ok) {
        // Launch the kernel; the grid is rounded up so every element is covered
        const int threadsPerBlock = 256;
        const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
        add_basic<<<blocksPerGrid, threadsPerBlock>>>(d_data, n);

        // A launch returns no status of its own: cudaGetLastError() reports
        // launch failures, and the synchronize reports errors raised while the
        // kernel ran. Only copy the results back once both are clean.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaDeviceSynchronize(), "kernel execution")
          && cudaOk(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost),
                    "device-to-host copy");
    }

    if (ok) {
        // Example output: print the first 10 elements (or fewer if n is smaller)
        const int preview = n < 10 ? n : 10;
        for (int i = 0; i < preview; i++) {
            printf("%d ", h_data[i]);
        }
        printf("\n");
    }

    // Release both buffers on every path, successful or not
    cudaFree(d_data);
    free(h_data);

    return ok ? 0 : 1;
}

"""

'\n#include <iostream>\n#include <cuda_runtime.h>\n\nusing namespace std;\n\n__global__ void add_basic()\n{\n    // COMPLETE THIS\n}\n\nint main()\n{\n    // COMPLETE THIS\n\n    // Wait for GPU to finish before accessing on host\n    cudaDeviceSynchronize();\n\n    return 0;\n}\n'

**Q2. Write a CUDA kernel that initializes an array of integers so that each element holds its own index.**

In [6]:
! touch add_basic.cu
"""
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

// Stores each thread's flat 1D global index into the matching slot of `array`.
// There is no bounds check: every launched thread writes one int, so the
// launch must not create more threads than `array` has elements.
__global__ void initialize_array(int *array)
{
    // Flat position of this thread across the whole 1D grid
    const int slot = threadIdx.x + blockDim.x * blockIdx.x;

    // Element i receives the value i
    array[slot] = slot;
}

// Prints a diagnostic to std::cerr when a CUDA runtime call has failed.
// `what` names the step being checked and appears in the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Host driver: has the GPU fill a 10-element array with its own indices via
// initialize_array, copies the result back, and prints it.
// Returns 0 on success and 1 if any allocation, kernel, or copy step fails.
int main()
{
    const int array_size = 10;
    const int threadsPerBlock = 5; // Example: 5 threads per block
    const int blocksPerGrid = (array_size + threadsPerBlock - 1) / threadsPerBlock;

    // initialize_array takes no element count and writes from every launched
    // thread, so a rounded-up grid would write past the end of the buffer.
    static_assert(blocksPerGrid * threadsPerBlock == array_size,
                  "array_size must be a multiple of threadsPerBlock");

    // Allocate memory on GPU
    int *d_array = nullptr;
    if (!cudaCallOk(cudaMalloc((void**)&d_array, array_size * sizeof(int)), "cudaMalloc")) {
        return 1;
    }

    // Launch the CUDA kernel to initialize the array
    initialize_array<<<blocksPerGrid, threadsPerBlock>>>(d_array);

    // A launch returns no status of its own: cudaGetLastError() reports launch
    // failures, and the synchronize reports errors raised while the kernel
    // ran. Wait for the GPU to finish before copying the data back to the host.
    int h_array[array_size];
    const bool ok = cudaCallOk(cudaGetLastError(), "kernel launch")
                 && cudaCallOk(cudaDeviceSynchronize(), "kernel execution")
                 && cudaCallOk(cudaMemcpy(h_array, d_array, array_size * sizeof(int),
                                          cudaMemcpyDeviceToHost),
                               "device-to-host copy");

    if (ok) {
        // Print the initialized array
        std::cout << "Initialized Array:" << std::endl;
        for (int i = 0; i < array_size; ++i) {
            std::cout << h_array[i] << " ";
        }
        std::cout << std::endl;
    }

    // Free GPU memory on every path, successful or not
    cudaFree(d_array);

    return ok ? 0 : 1;
}

"""

'\n#include <iostream>\n#include <cuda_runtime.h>\n\nusing namespace std;\n\n__global__ void initialize_array(int *array)\n{\n    // Calculate the index for the current thread\n    int index = blockIdx.x * blockDim.x + threadIdx.x;\n\n     // Initialize the array element at the calculated index with its index value\n    // Hint: Use the \'index\' variable to assign the value\n    // COMPLETE THIS\n\n}\n\nint main()\n{\n    const int array_size = 10;\n    int *d_array;\n\n    // Allocate memory on GPU\n    cudaMalloc((void**)&d_array, array_size * sizeof(int));\n\n    // Launch the CUDA kernel to initialize the array\n    // Hint: Use the <<<...>>> syntax to specify the grid and block dimensions\n    // COMPLETE THIS\n\n    // Copy data from device to host\n    int h_array[array_size];\n    cudaMemcpy(h_array, d_array, array_size * sizeof(int), cudaMemcpyDeviceToHost);\n\n    // Print the initialized array\n    cout << "Initialized Array:" << endl;\n    for (int i = 0; i < array_size; +

**Q3 [OPTIONAL]. How do you check for and handle errors in CUDA API calls and kernel launches?**