In [5]:
import numpy as np
import cupy as cp
import time

# Hàm cũ sử dụng NumPy
def weightss_to_lambda_numpy(weightss: np.ndarray, lambdas: np.ndarray) -> np.ndarray:
    num_qubits = weightss.shape[1]
    new_lambdas = np.zeros((4**num_qubits))
    for j, weights in enumerate(weightss):
        combinations = np.stack(np.meshgrid(*weights, indexing='ij'), axis=-1).reshape(-1, len(weights))
        new_lambdas += lambdas[j] * np.prod(combinations, axis=1)
    return new_lambdas

# Hàm mới sử dụng CuPy
def weightss_to_lambda_cupy(weightss: cp.ndarray, lambdas: cp.ndarray) -> cp.ndarray:
    num_terms, num_qubits, _ = weightss.shape
    new_lambdas = cp.zeros(4**num_qubits)
    for j in range(num_terms):
        weights = weightss[j]
        products = weights[0]
        for k in range(1, num_qubits):
            products = cp.outer(products, weights[k]).ravel()
        new_lambdas += lambdas[j] * products
    return new_lambdas

# Thiết lập tham số
num_qubits = 10  # 4^10 = 1,048,576 phần tử
num_terms = 200

# Tạo dữ liệu ngẫu nhiên cho NumPy
weightss_np = np.random.rand(num_terms, num_qubits, 4)
lambdas_np = np.random.rand(num_terms)

# Chuyển dữ liệu sang CuPy
weightss_cp = cp.asarray(weightss_np)
lambdas_cp = cp.asarray(lambdas_np)

# Benchmark hàm NumPy
start_time = time.time()
result_numpy = weightss_to_lambda_numpy(weightss_np, lambdas_np)
numpy_time = time.time() - start_time
print(f"Thời gian NumPy (CPU): {numpy_time:.4f} giây")

# Benchmark hàm CuPy
start_time = time.time()
result_cupy = weightss_to_lambda_cupy(weightss_cp, lambdas_cp)
cupy_time = time.time() - start_time
print(f"Thời gian CuPy (GPU): {cupy_time:.4f} giây")

if np.allclose(result_numpy, result_cupy):
    print("Kết quả giống nhau.")
else:
    print("Kết quả khác nhau.")

# Tính tốc độ tăng tốc
speedup = numpy_time / cupy_time
print(f"Tăng tốc: {speedup:.2f}x")

Thời gian NumPy (CPU): 19.0819 giây
Thời gian CuPy (GPU): 0.3014 giây
Kết quả giống nhau.
Tăng tốc: 63.32x


In [73]:
from gqimax.mapper import weightsss_to_lambdas
num_qubits = 3  # 4^10 = 1,048,576 phần tử
num_terms = 5
ks = [np.random.randint(1, num_terms) for _ in range(num_qubits)]

weightsss_np = [cp.random.rand(ks[i], num_qubits, 4) * (cp.random.rand(ks[i], num_qubits, 4) > 0.5) for i in range(num_qubits)]
lambdass_np = [cp.random.rand(ks[i]) for i in range(num_qubits)]
for i in range(num_qubits):
	print(weightsss_np[i].shape, lambdass_np[i].shape)
mapped_lambdass, non_zeros_indicess = weightsss_to_lambdas(weightsss_np, lambdass_np)

(2, 3, 4) (2,)
(1, 3, 4) (1,)
(1, 3, 4) (1,)


In [75]:
for i in range(num_qubits):
	print(f"lambdass[{i}]: {mapped_lambdass[i]}")
	print(f"non_zeros_indicess: {non_zeros_indicess[i]}")

lambdass[0]: [0.00144713 0.00110885 0.00139623 0.00053744]
non_zeros_indicess: [ 2  6 10 14]
lambdass[1]: [0.17136025 0.3747416  0.18828798 0.41176026 0.09784203 0.21396724]
non_zeros_indicess: [ 9 11 41 43 57 59]
lambdass[2]: [0.0037968  0.00178773 0.00043901 0.00839594 0.00395323 0.00097079
 0.02371438 0.01116593 0.002742   0.05244004 0.02469142 0.00606344]
non_zeros_indicess: [ 8  9 11 12 13 15 56 57 59 60 61 63]


In [81]:

# Hàm wrapper để gọi kernel
def cuda_map_cx(words_array, control, target):
    """
    Áp dụng map_cx trên mảng k word bằng CUDA kernel.
    Args:
        words_array: cp.ndarray shape (k, n), dtype=cp.int8
        control: int, chỉ số control
        target: int, chỉ số target
    Returns:
        lambdas: cp.ndarray shape (k,), dtype=cp.int8
        new_words_array: cp.ndarray shape (k, n), dtype=cp.int8
    """
    k, n = words_array.shape
    new_words_array = cp.empty_like(words_array, dtype=cp.int8)
    lambdas = cp.empty(k, dtype=cp.int8)

    block_size = 256
    grid_size = (k + block_size - 1) // block_size

    map_cx_kernel((grid_size,), (block_size,), 
                  (words_array, new_words_array, lambdas, k, n, control, target))

    return lambdas, new_words_array

def flatten_ragged_matrix_cupy(ragged_matrix):
    lengths = cp.array([len(row) for row in ragged_matrix], dtype=cp.int8)
    starts = cp.concatenate((cp.array([0]), cp.cumsum(lengths)), dtype=cp.int8)
    flatten_vector = cp.concatenate(ragged_matrix, dtype=cp.int8)
    return flatten_vector, starts[:-1]

def unflatten_ragged_matrix_cupy(flatten_vector, starts):
    return cp.vsplit(flatten_vector, starts[1:].tolist())

def map_indices_to_weighted(ragged_lambdas, ragged_tensor, control, target):
    """
    --- First, I encode the n-qubit Pauli word (index) as list of n int8 array
    Ex: 0(III) --> [0, 0, 0]
    For n-stabilizer, we have n x k indices, so the encoded tensor will be n x k x n (ragged tensor)
    Original tensor: [
		array([1, 2]),
		array([3]),
		array([4]),
	]
    --- Next step, I flatten this tensor to 1D array (each element is still n-dim array)
    Flatten vector: [1,2,3,4] and following starts (variable) = [0, 2, 3]
    --- Map this array to the new array using map_indices_to_weighted kernel
    Mapped flatten vector: [
        [0, 0, 1],
        [0, 0, 2],
        [0, 0, 3],
        [0, 1, 0],
    ] (n x k x n)
    
    --- Finally, I unflatten the mapped array to the original shape (ragged tensor)
    --- Obviously, this function requires starts variable (the start index of each row in the flatten vector)
    
    Out ragged tensor (with starts = [0, 2, 3]): [
        array([[0, 0, 1], [0, 0, 2]]),
		array([[0, 0, 3]]),
		array([[0, 1, 0]]),
    ]

    """
    flatten_vector, starts = flatten_ragged_matrix_cupy(ragged_tensor)
    lambdas_sign, mapped_flatten_vector = cuda_map_cx(flatten_vector, control, target)
    starts = starts[1:].tolist()
	# Convert flatten vector to ragged tensor
    ragged_tensor = cp.vsplit(mapped_flatten_vector, starts)
    lambdas_sign = cp.split(lambdas_sign, starts)
	# OP: lambdas_sign * ragged_lambdas
    # This operator can be implemented in CUDA kernel (in file notebook)
    # But I see there is no different between two methods
    return [cp.multiply(m1, m2) for m1, m2 in zip(ragged_lambdas, lambdas_sign)], ragged_tensor



In [82]:
flatten_ragged_matrix_cupy(non_zeros_indicess)

(array([ 2,  6, 10, 14,  9, 11, 41, 43, 57, 59,  8,  9, 11, 12, 13, 15, 56,
        57, 59, 60, 61, 63], dtype=int8),
 array([ 0,  4, 10], dtype=int8))

In [80]:
non_zeros_indicess

[array([ 2,  6, 10, 14], dtype=int64),
 array([ 9, 11, 41, 43, 57, 59], dtype=int64),
 array([ 8,  9, 11, 12, 13, 15, 56, 57, 59, 60, 61, 63], dtype=int64)]

In [28]:
weightsss = cp.array([
    [[0,0, 1,4], [1,2,3,4]], 
    [[0,1,0,0], [1,2,0,0]], 
    [[1,0,0,4], [0,0, 1,4]]] )

In [63]:

from gqimax.mapper import weightss_to_lambda
from gqimax.utils import index_to_indices, index_to_word

lambdas = weightss_to_lambda(weightsss, lambdass_np[0])
lambdas

(array([ 0.5873697 ,  2.34947881,  0.69422608,  1.38845216,  0.58182811,
         1.16365622,  1.74548433,  2.32731244,  2.32731244,  4.65462487,
         9.33141613, 18.70716501]),
 array([ 2,  3,  4,  5,  8,  9, 10, 11, 12, 13, 14, 15], dtype=int64))

In [62]:
indicess = []
for i in range(2):	
    indices = cp.nonzero(lambdas)[0]
    lambdas = lambdas[indices]
    new_indices = []
    for index in indices:
        new_indices.append(index_to_indices(int(index), 2))
    indicess.append(cp.array(new_indices))
print(indicess)

[array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3],
       [1, 0],
       [1, 1],
       [1, 2],
       [1, 3],
       [2, 0],
       [2, 1],
       [2, 2],
       [2, 3]]), array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3],
       [1, 0],
       [1, 1],
       [1, 2],
       [1, 3],
       [2, 0],
       [2, 1],
       [2, 2],
       [2, 3]])]
