In [8]:
import numpy as np
import cupy as cp
import time

# Hàm cũ sử dụng NumPy
def weightss_to_lambda_numpy(weightss: np.ndarray, lambdas: np.ndarray) -> np.ndarray:
    num_qubits = weightss.shape[1]
    new_lambdas = np.zeros((4**num_qubits))
    for j, weights in enumerate(weightss):
        combinations = np.stack(np.meshgrid(*weights, indexing='ij'), axis=-1).reshape(-1, len(weights))
        new_lambdas += lambdas[j] * np.prod(combinations, axis=1)
    return new_lambdas

# Hàm mới sử dụng CuPy
def weightss_to_lambda_cupy(weightss: cp.ndarray, lambdas: cp.ndarray) -> cp.ndarray:
    num_terms, num_qubits, _ = weightss.shape
    new_lambdas = cp.zeros(4**num_qubits)
    for j in range(num_terms):
        weights = weightss[j]
        products = weights[0]
        for k in range(1, num_qubits):
            products = cp.outer(products, weights[k]).ravel()
        new_lambdas += lambdas[j] * products
    return new_lambdas

# Thiết lập tham số
num_qubits = 10  # 4^10 = 1,048,576 phần tử
num_terms = 200

# Tạo dữ liệu ngẫu nhiên cho NumPy
weightss_np = np.random.rand(num_terms, num_qubits, 4)
lambdas_np = np.random.rand(num_terms)

# Chuyển dữ liệu sang CuPy
weightss_cp = cp.asarray(weightss_np)
lambdas_cp = cp.asarray(lambdas_np)

# Benchmark hàm NumPy
start_time = time.time()
result_numpy = weightss_to_lambda_numpy(weightss_np, lambdas_np)
numpy_time = time.time() - start_time
print(f"Thời gian NumPy (CPU): {numpy_time:.4f} giây")

# Benchmark hàm CuPy
start_time = time.time()
result_cupy = weightss_to_lambda_cupy(weightss_cp, lambdas_cp)
cupy_time = time.time() - start_time
print(f"Thời gian CuPy (GPU): {cupy_time:.4f} giây")

# Benchmark hàm CuPy
start_time = time.time()
result_vip = weightss_to_lambda_vip(weightss_cp, lambdas_cp)
vip_time = time.time() - start_time
print(f"Thời gian CuPyvip_time (GPU): {vip_time:.4f} giây")

# So sánh kết quả
result_vip = cp.asnumpy(result_cupy)
if np.allclose(result_numpy, result_vip):
    print("Kết quả giống nhau.")
else:
    print("Kết quả khác nhau.")

# Tính tốc độ tăng tốc
speedup = numpy_time / cupy_time
print(f"Tăng tốc: {speedup:.2f}x")
# Tính tốc độ tăng tốc
speedup = numpy_time / vip_time
print(f"Tăng tốc: {speedup:.2f}x")

Thời gian NumPy (CPU): 18.9341 giây
Thời gian CuPy (GPU): 0.2089 giây
Thời gian CuPyvip_time (GPU): 0.0010 giây
Kết quả giống nhau.
Tăng tốc: 90.64x
Tăng tốc: 18930.94x


In [3]:
import cupy as cp

# Định nghĩa CUDA kernel
weightss_to_lambda_kernel = cp.RawKernel(r'''
extern "C" __global__
void weightss_to_lambda(const float* weightss, const float* lambdas, float* new_lambdas, 
                        int num_terms, int num_qubits, int output_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  
    if (idx < output_size) {
        float sum = 0.0f;
        
        for (int j = 0; j < num_terms; j++) {
            float product = lambdas[j];  
            
            int temp_idx = idx;
            for (int k = num_qubits - 1; k >= 0; k--) {
                int qubit_val = temp_idx % 4; 
                temp_idx /= 4;
                
                int weight_offset = j * num_qubits * 4 + k * 4 + qubit_val;
                product *= weightss[weight_offset];
            }
            sum += product;
        }
        new_lambdas[idx] = sum;
    }
}
''', 'weightss_to_lambda')

# Hàm wrapper để gọi kernel
def weightss_to_lambda_vip(weightss: cp.ndarray, lambdas: cp.ndarray) -> cp.ndarray:
    num_terms, num_qubits, _ = weightss.shape
    output_size = 4 ** num_qubits
    
    # Chuyển weightss thành mảng phẳng để truy cập dễ dàng trong kernel
    weightss_flat = weightss.reshape(num_terms * num_qubits * 4)
    
    # Tạo mảng đầu ra trên GPU
    new_lambdas = cp.zeros(output_size, dtype=cp.float32)
    
    # Cấu hình block và grid
    block_size = 256
    grid_size = (output_size + block_size - 1) // block_size
    
    # Gọi kernel
    weightss_to_lambda_kernel((grid_size,), (block_size,), 
                             (weightss_flat, lambdas, new_lambdas, num_terms, num_qubits, output_size))
    
    return new_lambdas

# # Ví dụ sử dụng
# num_terms = 3
# num_qubits = 2
# weightss = cp.array([[[1, 2, 3, 4], [1, 2, 3, 4]], 
#                      [[1, 2, 3, 4], [1, 2, 3, 4]], 
#                      [[1, 2, 3, 4], [1, 2, 3, 4]]], dtype=cp.float32)
# lambdas = cp.array([1.0, 1.0, 1.0], dtype=cp.float32)

# new_lambdas = weightss_to_lambda(weightss, lambdas)
# print("Kết quả new_lambdas:", new_lambdas)

In [10]:
import cupy as cp

# Định nghĩa CUDA kernel cho N lần chạy
weightss_to_lambda_kernel_2d = cp.RawKernel(r'''
extern "C" __global__
void weightss_to_lambda_2d(const float* weightss, const float* lambdas, float* new_lambdas, 
                           int N, int num_terms, int num_qubits, int output_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  
    int n = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < output_size && n < N) {
        float sum = 0.0f;
        
        int lambda_offset = n * num_terms;
        int output_offset = n * output_size;
        for (int j = 0; j < num_terms; j++) {
            float product = lambdas[lambda_offset + j]; 
            
            int temp_idx = idx;
            for (int k = num_qubits - 1; k >= 0; k--) {
                int qubit_val = temp_idx % 4;
                temp_idx /= 4;
                
                int weight_offset = n * num_terms * num_qubits * 4 + 
                                  j * num_qubits * 4 + 
                                  k * 4 + 
                                  qubit_val;
                product *= weightss[weight_offset];
            }
            sum += product;
        }
        new_lambdas[output_offset + idx] = sum;
    }
}
''', 'weightss_to_lambda_2d')

# Hàm wrapper để gọi kernel
def weightss_to_lambda_2d(weightss: cp.ndarray, lambdas: cp.ndarray) -> cp.ndarray:
    N, num_terms, num_qubits, _ = weightss.shape
    output_size = 4 ** num_qubits
    
    # Chuyển weightss thành mảng phẳng
    weightss_flat = weightss.reshape(N * num_terms * num_qubits * 4)
    
    # Chuyển lambdas thành mảng phẳng
    lambdas_flat = lambdas.reshape(N * num_terms)
    
    # Tạo mảng đầu ra trên GPU
    new_lambdas = cp.zeros((N, output_size), dtype=cp.float32)
    new_lambdas_flat = new_lambdas.reshape(N * output_size)
    
    # Cấu hình block và grid 2D
    block_size_x = 16  # Số luồng trên chiều x
    block_size_y = 16  # Số luồng trên chiều y
    grid_size_x = (output_size + block_size_x - 1) // block_size_x
    grid_size_y = (N + block_size_y - 1) // block_size_y
    
    # Gọi kernel
    weightss_to_lambda_kernel_2d((grid_size_x, grid_size_y), (block_size_x, block_size_y), 
                                (weightss_flat, lambdas_flat, new_lambdas_flat, 
                                 N, num_terms, num_qubits, output_size))
    
    return new_lambdas

# Ví dụ sử dụng
N = 2  # Số lần chạy
num_terms = 3
num_qubits = 2
weightss = cp.array([
    [[[1, 2, 3, 4], [1, 2, 3, 4]], [[1, 2, 3, 4], [1, 2, 3, 4]], [[1, 2, 3, 4], [1, 2, 3, 4]]],
    [[[2, 3, 4, 5], [2, 3, 4, 5]], [[2, 3, 4, 5], [2, 3, 4, 5]], [[2, 3, 4, 5], [2, 3, 4, 5]]]
], dtype=cp.float32)
lambdas = cp.array([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]], dtype=cp.float32)

new_lambdas = weightss_to_lambda_2d(weightss, lambdas)
print("Kết quả new_lambdas (shape:", new_lambdas.shape, "):")
print(new_lambdas)

Kết quả new_lambdas (shape: (2, 16) ):
[[  3.   6.   9.  12.   6.  12.  18.  24.   9.  18.  27.  36.  12.  24.
   36.  48.]
 [ 24.  36.  48.  60.  36.  54.  72.  90.  48.  72.  96. 120.  60.  90.
  120. 150.]]
