Part 3

In [1]:
import cupy as cp

# Part 3: basic CuPy vector operations on two small integer arrays.
# cp.asnumpy() copies each result back to host memory for printing.
x = cp.array([2, 4, 6, 8])
y = cp.array([1, 3, 5, 7])

# Task 1: element-wise product -> array with the same length as x and y.
xy_product = x * y
print("Task 1")
print("Result of element-wise multiplication:", cp.asnumpy(xy_product))

# Task 2: dot product of the two 1-D arrays. This is a single scalar
# reduction, not an element-wise result, so the label says so and the value
# gets its own name instead of overwriting the Task 1 result.
xy_dot = cp.dot(x, y)
print("Task 2")
print("Result of dot product:", cp.asnumpy(xy_dot))

# Task 3: square of every element of x.
x_squared = cp.square(x)
print("Task 3")
print("Square of each element:", cp.asnumpy(x_squared))

ModuleNotFoundError: No module named 'cupy'

Part 6A & 6B

In [None]:
# Part 6A: raw CUDA kernel computing y[i] = |x[i] - z[i]| on float32 vectors.
# Each thread handles one index; the `idx < N` guard in the kernel keeps the
# threads of the final, partially filled block from touching memory past N.
kernel = cp.RawKernel(r'''
extern "C" __global__
void abs_diff_kernel(const float *x, const float *z, float *y, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        y[idx] = fabsf(x[idx] - z[idx]);
    }
}
''', 'abs_diff_kernel')

N = 10**6
x = cp.arange(1, N + 1, dtype=cp.float32)  # 1, 2, ..., N
z = cp.arange(N, 0, -1, dtype=cp.float32)  # N, N-1, ..., 1
y = cp.zeros_like(x)                       # output buffer, same shape/dtype as x

threads_per_block = 256
# Ceiling division so that blocks * threads covers all N elements.
blocks_per_grid = (N + threads_per_block - 1) // threads_per_block
# CuPy passes a plain Python int to a raw kernel as a 64-bit `long long`,
# while the kernel declares `int N`; wrap it in cp.int32 so the types match.
kernel((blocks_per_grid,), (threads_per_block,), (x, z, y, cp.int32(N)))

print("Part A")
print("First 10 results of y_output:", cp.asnumpy(y[:10]))

# Part B: `dot_product` was printed without ever being defined in this cell,
# which raises NameError on a fresh kernel.
# NOTE(review): the operands are assumed to be the two input vectors x and z;
# confirm against the assignment statement.
dot_product = cp.dot(x, z)
print("Part B")
print("Dot product result:", cp.asnumpy(dot_product))

Part 6C

In [None]:
import cupy as cp

# Part 6C: naive matrix multiplication C = A @ B as a raw CUDA kernel.
# A is M x K, B is K x N, C is M x N, all float32 and indexed as flat
# row-major buffers. Each thread computes one element C[row, col]; the bounds
# check discards threads that fall outside the M x N output.
matmul_kernel = cp.RawKernel(r'''
extern "C" __global__
void matmul(const float* A, const float* B, float* C,
            int M, int K, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; ++k) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
''', 'matmul')

M, K, N = 4, 3, 5

# arange(...).reshape(...) and zeros(...) produce row-major arrays here, which
# is the layout the kernel's flat indexing expects.
A = cp.arange(M * K, dtype=cp.float32).reshape(M, K)
B = cp.arange(K * N, dtype=cp.float32).reshape(K, N)
C = cp.zeros((M, N), dtype=cp.float32)

threads_per_block = (16, 16)
# The kernel maps the grid's x dimension to columns (N) and its y dimension to
# rows (M); ceiling division makes the grid cover the whole output.
blocks_per_grid = ((N + threads_per_block[0] - 1) // threads_per_block[0],
                   (M + threads_per_block[1] - 1) // threads_per_block[1])

# The arrays are passed directly rather than through .ravel(): ravel() may
# return a copy for a non-contiguous array, in which case the kernel's writes
# would never reach C. The sizes are wrapped in cp.int32 because CuPy passes a
# plain Python int as a 64-bit `long long`, while the kernel declares `int`.
matmul_kernel(blocks_per_grid, threads_per_block,
              (A, B, C, cp.int32(M), cp.int32(K), cp.int32(N)))

# Sanity check of the hand-written kernel against CuPy's built-in matmul.
assert bool(cp.allclose(C, A @ B)), "matmul kernel result differs from A @ B"

print("A:\n", cp.asnumpy(A))
print("B:\n", cp.asnumpy(B))
print("C = A @ B:\n", cp.asnumpy(C))

Part 7

In [None]:
import numpy as np
import time
import cupy as cp

# Part 7: compare the wall-clock time of squaring a large float32 array on the
# CPU (NumPy) and on the GPU (CuPy).

# Define a large input array on CPU.
# NOTE(review): the original size was 20**7 = 1.28e9 elements, i.e. about 5 GB
# per float32 array (input and output, on both host and device). That is
# assumed to be a typo for 10**7; confirm the intended size.
N = 10**7  # Size of the array
cpu_x = np.arange(N, dtype=np.float32)

# Measure the time taken to square each element on CPU.
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() can jump if the system clock is adjusted.
cpu_start = time.perf_counter()
y_cpu = cpu_x**2
cpu_time = time.perf_counter() - cpu_start

print(f"CPU Time: {cpu_time:.6f} seconds")

gpu_x = cp.arange(N, dtype=cp.float32)

# Warm-up run: the first use of an operation on the GPU includes one-time
# costs (kernel compilation/loading), which would otherwise be charged to the
# measurement below.
gpu_warmup = gpu_x**2
cp.cuda.Stream.null.synchronize()
del gpu_warmup

# GPU work is launched asynchronously, so synchronize before reading the
# clock at the end; otherwise only the launch overhead would be measured.
gpu_start = time.perf_counter()
gpu_y = gpu_x**2
cp.cuda.Stream.null.synchronize()
gpu_time = time.perf_counter() - gpu_start

print(f"GPU Time: {gpu_time:.6f} seconds")