## CUDA Core Tutorial - Low-Level GPU Programming
### Table of Contents

7. Exercise: Vector Operations

### 7. Exercise: Vector Operations
Write a CUDA kernel that performs element-wise multiplication of two vectors.

In [17]:
# Compile your kernel here
from cuda.core.experimental import Device, Program

# Select GPU 0 and make it the current device before compiling anything
device = Device(0)
device.set_current()

# CUDA C++ source for the element-wise product kernel; `extern "C"` keeps the
# symbol name unmangled so it can be looked up as "vector_multiply" below.
multiply_kernel_source = """
// TODO: Implement vector multiplication kernel
extern "C" __global__ void vector_multiply(float *a, float *b, float *c, int n) {
    // Each thread calculates its unique index
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    
    // Make sure we don't go beyond our array bounds
    if (i < n) {
        c[i] = a[i] * b[i];  // Multiply corresponding elements
    }
}
"""

# Build a Program from the source text and compile it to a cubin
program = Program(multiply_kernel_source, code_type="c++")
compiled_program = program.compile(target_type="cubin")

# Look up the kernel entry point by name in the compiled module
kernel = compiled_program.get_kernel("vector_multiply")

print("Kernel compiled successfully!")


Kernel compiled successfully!


In [18]:
# Launch your kernel here
import cupy as cp
from cuda.core.experimental import launch, LaunchConfig

def vector_multiply(a, b):
    """Multiply two arrays element-wise on the GPU.

    Copies both inputs to the device, launches the module-level ``kernel``
    (the compiled ``vector_multiply`` CUDA kernel) on a freshly created
    stream, and copies the product back to the host.

    Args:
        a: First operand; any array-like convertible to float32.
        b: Second operand; must have the same shape as ``a``.

    Returns:
        numpy.ndarray: float32 array shaped like the (converted) inputs,
        where each element is ``a[i] * b[i]``.

    Raises:
        ValueError: If the input shapes differ, or if the element count does
            not fit in the kernel's 32-bit ``int n`` parameter.
    """
    # Step 1: Normalize the inputs. The kernel reads raw `float*` buffers with
    # a linear index, so the host data must be contiguous float32.
    a_host = np.ascontiguousarray(a, dtype=np.float32)
    b_host = np.ascontiguousarray(b, dtype=np.float32)
    if a_host.shape != b_host.shape:
        raise ValueError(
            f"shape mismatch: a has shape {a_host.shape}, b has shape {b_host.shape}"
        )

    n = a_host.size
    if n > np.iinfo(np.int32).max:
        raise ValueError(f"too many elements for a 32-bit kernel index: {n}")
    print(f"Input arrays have {n} elements each")

    # Nothing to compute; also avoids launching with a zero-sized grid.
    if n == 0:
        return np.empty(a_host.shape, dtype=np.float32)

    # Step 2: Initialize device and create a stream for the launch
    device = Device(0)
    device.set_current()
    stream = device.create_stream()

    try:
        # Step 3: Copy input data from CPU to GPU and allocate the output.
        # Every output element is written by the kernel, so no fill is needed.
        d_a = cp.asarray(a_host)
        d_b = cp.asarray(b_host)
        d_c = cp.empty(a_host.shape, dtype=cp.float32)

        # Step 4: Configure how to launch the kernel
        block_size = 256  # Number of threads per block
        grid_size = (n + block_size - 1) // block_size  # Ceil-divide: blocks needed

        print(f"Launch configuration: {grid_size} blocks of {block_size} threads each")
        print(f"Total threads: {grid_size * block_size}")

        config = LaunchConfig(grid=(grid_size,), block=(block_size,))
        # Pass `n` as an explicit 32-bit integer to match the kernel's `int n`.
        ker_args = (d_a.data.ptr, d_b.data.ptr, d_c.data.ptr, np.int32(n))

        # CuPy did its allocations/copies on its own stream, not on `stream`;
        # wait for the device so the inputs are ready before the kernel runs.
        device.sync()

        # Step 5: Launch the kernel and wait for it to finish
        launch(stream, config, kernel, *ker_args)
        stream.sync()
        print("Kernel launched and executed")

        # Step 6: Copy the result back from GPU to CPU
        c = cp.asnumpy(d_c)
        print("Results copied back to CPU")
    finally:
        # Release the stream even if the copy or launch raised.
        stream.close()

    return c

# Run the element-wise multiplication on a 1000-element float32 ramp
a = np.arange(1000, dtype=np.float32)
result = vector_multiply(a, a)

# Check the GPU output against the NumPy reference product
expected = np.multiply(a, a)
success = np.allclose(result, expected)
print(f"Kernel execution successful: {success}")
print(f"First 10 results: {result[:10]}")
print(f"Expected first 10: {expected[:10]}")

Input arrays have 1000 elements each
Launch configuration: 4 blocks of 256 threads each
Total threads: 1024
Kernel launched and executed
Results copied back to CPU
Kernel execution successful: True
First 10 results: [ 0.  1.  4.  9. 16. 25. 36. 49. 64. 81.]
Expected first 10: [ 0.  1.  4.  9. 16. 25. 36. 49. 64. 81.]
