In [13]:
# Task 1

import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
import time 

# Build the input array. The default of 1024 elements fills exactly one
# maximum-size thread block, which is the configuration this task targets.
# Input creation is deliberately kept outside the timed region.
input_size = 1024
host_array = np.random.randint(low=0, high=10, size=input_size, dtype=np.int32)

# Upper bound on threads per block used to size the launch. 1024 is the limit
# documented for current CUDA compute capabilities; lower it for older hardware.
MAX_THREADS_PER_BLOCK = 1024

# CUDA events marking the boundaries of each timed phase.
start_full_gpu = cuda.Event()
end_full_gpu = cuda.Event()
end_kernel_gpu = cuda.Event()
start_computation_gpu = cuda.Event()
end_computation_gpu = cuda.Event()

#################### Start GPU timing
start_full_gpu.record()

# CUDA kernel for left rotation: element i of `in` is written to index i-1 of
# `out`, and element 0 wraps around to the last slot.
kernel_code = """
__global__ void left_rotation(int *in, int *out, int size) {
    // Global thread index, so the kernel stays correct when the launch
    // spans more than one block.
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Guard: the last block may hold threads past the end of the array.
    if (idx < size) {
        // Adding size keeps the left operand of % non-negative for idx == 0.
        int shifted_idx = (idx - 1 + size) % size;
        out[shifted_idx] = in[idx];
    }
}
"""

# Compile the CUDA kernel
mod = SourceModule(kernel_code)
left_rotation = mod.get_function("left_rotation")

#################### End kernel GPU timing
end_kernel_gpu.record()

# Allocate memory on GPU (separate input and output buffers, so threads never
# read a slot another thread is writing)
device_array = cuda.mem_alloc(host_array.nbytes)
device_output = cuda.mem_alloc(host_array.nbytes)

# Copy data to GPU
cuda.memcpy_htod(device_array, host_array)

# Define block and grid size.
# For the default input this is one block of 1024 threads; larger inputs get
# as many full-size blocks as needed (ceiling division).
block_size = min(input_size, MAX_THREADS_PER_BLOCK)
grid_size = (input_size + block_size - 1) // block_size

#################### Start computation GPU timing
start_computation_gpu.record()
# Launch kernel
left_rotation(device_array, device_output, np.int32(input_size), block=(block_size, 1, 1), grid=(grid_size, 1))
#################### End computation GPU timing
end_computation_gpu.record()

# Copy result back to CPU
host_output = np.empty_like(host_array)
cuda.memcpy_dtoh(host_output, device_output)

#################### End GPU timing
end_full_gpu.record()
# Wait for all outstanding GPU work so every recorded event is complete
# before the elapsed times are read.
cuda.Context.synchronize()
gpu_full_time = start_full_gpu.time_till(end_full_gpu)
# Measured from the very start, so it covers kernel source creation + compile.
gpu_kernel_time = start_full_gpu.time_till(end_kernel_gpu)
computation_full_time = start_computation_gpu.time_till(end_computation_gpu)
print("Full elapsed time using GPU (ms): ", gpu_full_time)
print("---------------------")
print("Kernel creation and compile time using GPU (ms): ", gpu_kernel_time)
print("---------------------")
print("Computation time using GPU (ms): ", computation_full_time)
print("---------------------")
# Remainder of the full interval: device allocation, both copies, and the
# host-side output buffer creation.
print("Memory management time using GPU (ms): ", gpu_full_time-gpu_kernel_time-computation_full_time)
print("---------------------")

# Print result
#print("Original array:", host_array)
#print("Array after left rotation:", host_output)

Full elapsed time using GPU (ms):  0.9042239785194397
---------------------
Kernel creation and compile time using GPU (ms):  0.5723519921302795
---------------------
Computation time using GPU (ms):  0.10035199671983719
---------------------
Memory management time using GPU (ms):  0.23151998966932297
---------------------


In [2]:
# CPU sequential implementation (naive): move every element one slot to the
# left with an explicit Python loop.

#################### Start CPU timing
# perf_counter is the high-resolution clock intended for short intervals;
# time.time() can be too coarse for a sub-millisecond measurement.
start_cpu = time.perf_counter()

# Allocate with the same shape and dtype as the input so the int32 values are
# not silently stored as float64.
naive_out = np.empty_like(host_array)
for idx, item in enumerate(host_array):
    # Python's % is non-negative for a positive modulus, so idx == 0 maps to
    # the last index.
    naive_out[(idx-1)%input_size] = item

#################### End CPU timing
end_cpu = time.perf_counter()
cpu_time = (end_cpu - start_cpu)*1000  # seconds -> milliseconds
print("Elapsed time using sequential for-loop (ms): ", cpu_time)
print("---------------------")

#print(naive_out)

Elapsed time using sequential for-loop (ms):  0.3058910369873047
---------------------


In [3]:
# CPU sequential implementation (pythonic): build the rotated array from two
# slices instead of looping element by element.

#################### Start CPU timing
# perf_counter is the high-resolution clock intended for short intervals;
# time.time() can be too coarse for a sub-millisecond measurement.
start_cpu = time.perf_counter()

# Everything after the first element, followed by the first element itself.
pythonic_out = np.concatenate((host_array[1:], host_array[:1]))

#################### End CPU timing
end_cpu = time.perf_counter()
cpu_time = (end_cpu - start_cpu)*1000  # seconds -> milliseconds
print("Elapsed time using pythonic way (ms): ", cpu_time)
print("---------------------")

#print(pythonic_out)

Elapsed time using pythonic way (ms):  0.08845329284667969
---------------------


In [4]:
# Task 2
# Compare the different implementations.
# np.array_equal returns False (rather than failing) when shapes differ, and
# comparing naive_out by value without an int32 cast means a non-integer or
# out-of-range entry cannot be truncated into a false match.
gpu_matches_pythonic = np.array_equal(host_output, pythonic_out)
pythonic_matches_naive = np.array_equal(pythonic_out, naive_out)
if gpu_matches_pythonic and pythonic_matches_naive:
    print("Output OK")
else:
    print("Output do not match, please investigate!")
    

Output OK
