In [12]:
import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule

# CUDA kernel: rotate an int array left by one position.
# Element i of `in` is written to slot (i - 1) mod size of `out`, so in[0]
# wraps around to out[size - 1]. `in` and `out` must not alias: each thread
# reads one slot and writes a different one, so an in-place call would race.
kernel_code = """
__global__ void left_rotation(const int *in, int *out, int size) {
    // Global thread index. For a single-block launch blockIdx.x is 0, so this
    // equals threadIdx.x; for a multi-block launch it keeps covering the array.
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < size) {
        // Adding size before the modulo keeps the operand non-negative at idx == 0.
        int shifted_idx = (idx - 1 + size) % size;
        out[shifted_idx] = in[idx];
    }
}
"""

# Compile the CUDA source and fetch a Python-callable handle to the kernel
mod = SourceModule(kernel_code)
left_rotation = mod.get_function("left_rotation")


In [14]:

# Host-side input: 1024 random values in [0, 10), stored as int32.
# 1024 is the largest size used in this notebook, so it gives the most
# elements to compare before and after the rotation.
input_size = 1024
host_array = np.random.randint(0, 10, size=input_size, dtype=np.int32)
print(host_array, input_size, sep="\n")

[4 7 1 ... 2 6 0]
1024


In [15]:

# Reserve two GPU buffers of the same byte size as the host array:
# one for the kernel's input, one for its output.
buffer_nbytes = host_array.nbytes
device_array = cuda.mem_alloc(buffer_nbytes)
device_output = cuda.mem_alloc(buffer_nbytes)

# Upload the host data into the input buffer (host -> device)
cuda.memcpy_htod(device_array, host_array)


In [16]:
# Define block and grid size
# This launch uses a single block with one thread per array element, so the
# element count must fit within one block. Check that up front so a bad size
# produces a readable error instead of an opaque launch failure.
MAX_THREADS_PER_BLOCK = 1024  # per-block thread limit assumed by this notebook
if not 0 < input_size <= MAX_THREADS_PER_BLOCK:
    raise ValueError(
        f"input_size must be between 1 and {MAX_THREADS_PER_BLOCK} for a "
        f"single-block launch, got {input_size}"
    )
block_size = input_size
grid_size = 1

# Launch kernel
# The size is wrapped in np.int32 so it is passed as a 32-bit integer,
# matching the kernel's `int size` parameter.
left_rotation(
    device_array,
    device_output,
    np.int32(input_size),
    block=(block_size, 1, 1),
    grid=(grid_size, 1),
)



In [17]:
# Copy result back to CPU
host_output = np.empty_like(host_array)
cuda.memcpy_dtoh(host_output, device_output)

# Verify against a NumPy reference instead of eyeballing truncated prints:
# np.roll(a, -1) shifts every element one slot toward index 0 and wraps a[0]
# around to the end, which is exactly a left rotation by one.
np.testing.assert_array_equal(host_output, np.roll(host_array, -1))

# Print result
print("Original array:", host_array)
print("Array after left rotation:", host_output)

Original array: [4 7 1 ... 2 6 0]
Array after left rotation: [7 1 4 ... 6 0 4]
