In [2]:
import cupy as cp

def vectorized_map_cx(words_array, control, target):
    """
    Áp dụng hàm map_cx trên mảng k word sử dụng CuPy và CUDA.
    Args:
        words_array: cp.ndarray shape (k, n), dtype=cp.int8, giá trị 0-3
        control: int, chỉ số control (0 <= control < n)
        target: int, chỉ số target (0 <= target < n)
    Returns:
        lambdas: cp.ndarray shape (k,), dtype=cp.int8, giá trị 1 hoặc -1
        new_words_array: cp.ndarray shape (k, n), dtype=cp.int8, mảng sau biến đổi
    """
    # Bảng tra cứu
    new_control_table = cp.array([
        [0, 0, 3, 3],
        [1, 1, 2, 2],
        [2, 2, 1, 1],
        [3, 3, 0, 0]
    ], dtype=cp.int8)
    
    new_target_table = cp.array([
        [0, 1, 2, 3],
        [1, 0, 3, 2],
        [1, 2, 3, 2],
        [0, 1, 2, 3]
    ], dtype=cp.int8)
    
    lambda_table = cp.array([
        [1, 1, 1, 1],
        [1, 1, 1, -1],
        [1, 1, -1, 1],
        [1, 1, 1, 1]
    ], dtype=cp.int8)

    # Trích xuất giá trị tại control và target
    control_values = words_array[:, control]
    target_values = words_array[:, target]

    # Tra cứu giá trị mới
    new_control_values = new_control_table[control_values, target_values]
    new_target_values = new_target_table[control_values, target_values]
    lambdas = lambda_table[control_values, target_values]

    # Cập nhật mảng
    new_words_array = words_array.copy()
    new_words_array[:, control] = new_control_values
    new_words_array[:, target] = new_target_values

    return lambdas, new_words_array

# Ví dụ sử dụng
words_array = cp.array([
    [0, 1, 2],  # "ixy"
    [1, 2, 3]   # "xyz"
], dtype=cp.int8)
control, target = 0, 1
lambdas, new_words_array = vectorized_map_cx(words_array, control, target)
print("Lambdas:", lambdas)           # [1, 1]
print("New words array:", new_words_array)  # [[0, 1, 2], [1, 0, 3]]

Lambdas: [1 1]
New words array: [[0 1 2]
 [2 3 3]]


In [54]:
import cupy as cp

# Định nghĩa kernel CUDA
map_cx_kernel = cp.RawKernel(r'''
extern "C" __global__
void map_cx_kernel(
    const char* words_array,  
    char* new_words_array,     
    char* lambdas,            
    int k,                    
    int n,              
    int control,                
    int target                 
) {
    // Lookup tables
    const char new_control_table[4][4] = {
        {0, 0, 3, 3},
        {1, 1, 2, 2},
        {2, 2, 1, 1},
        {3, 3, 0, 0}
    };
    const char new_target_table[4][4] = {
        {0, 1, 2, 3},
        {1, 0, 3, 2},
        {1, 2, 3, 2},
        {0, 1, 2, 3}
    };
    const char lambda_table[4][4] = {
        {1, 1, 1, 1},
        {1, 1, 1, -1},
        {1, 1, -1, 1},
        {1, 1, 1, 1}
    };

    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= k) return;

    int offset = idx * n;

    for (int i = 0; i < n; i++) {
        new_words_array[offset + i] = words_array[offset + i];
    }

    char control_val = words_array[offset + control];
    char target_val = words_array[offset + target];

    new_words_array[offset + control] = new_control_table[control_val][target_val];
    new_words_array[offset + target] = new_target_table[control_val][target_val];
    lambdas[idx] = lambda_table[control_val][target_val];
}
''', 'map_cx_kernel')

# Hàm wrapper để gọi kernel
def cuda_map_cx(words_array, control, target):
    """
    Áp dụng map_cx trên mảng k word bằng CUDA kernel.
    Args:
        words_array: cp.ndarray shape (k, n), dtype=cp.int8
        control: int, chỉ số control
        target: int, chỉ số target
    Returns:
        lambdas: cp.ndarray shape (k,), dtype=cp.int8
        new_words_array: cp.ndarray shape (k, n), dtype=cp.int8
    """
    k, n = words_array.shape
    new_words_array = cp.empty_like(words_array, dtype=cp.int8)
    lambdas = cp.empty(k, dtype=cp.int8)

    block_size = 256
    grid_size = (k + block_size - 1) // block_size

    map_cx_kernel((grid_size,), (block_size,), 
                  (words_array, new_words_array, lambdas, k, n, control, target))

    return lambdas, new_words_array

words_array = cp.array([
[0, 1, 3],
 [2 ,3, 0],
 [3, 1, 1],
 [2, 1 ,3],
 [3 ,0, 1],
 [2, 2 ,3],
 [3, 3 ,1],
 [3 ,3, 3]
], dtype=cp.int8)
control, target = 0, 1
lambdas, new_words_array = cuda_map_cx(words_array, control, target)
print("Lambdas:", lambdas)           # [1, 1]
print("New words array:", new_words_array)  # [[0, 1, 2], [1, 0, 3]]

Lambdas: [ 1  1  1  1  1 -1  1  1]
New words array: [[0 1 3]
 [1 2 0]
 [3 1 1]
 [2 2 3]
 [3 0 1]
 [1 3 3]
 [0 3 1]
 [0 3 3]]


In [67]:
ragged_tensor = [
	cp.array([[0, 1, 2],[0, 2, 2], [0, 3, 2]], dtype=cp.int8), 
	cp.array([[1, 2, 3]], dtype=cp.int8), 
	cp.array([[2, 3, 1], [2,2,1]], dtype=cp.int8)    
]

In [78]:
def flatten_ragged_matrix_cupy(ragged_matrix):
    lengths = cp.array([len(row) for row in ragged_matrix], dtype=cp.int8)
    starts = cp.concatenate((cp.array([0]), cp.cumsum(lengths)), dtype=cp.int8)
    flatten_vector = cp.concatenate(ragged_matrix, dtype=cp.int8)
    return flatten_vector, starts[:-1]

def unflatten_ragged_matrix_cupy(flatten_vector, starts):
    return cp.vsplit(flatten_vector, starts[1:].tolist())

def map_cx(ragged_lambdas, ragged_tensor, control, target):
	flatten_vector, starts = flatten_ragged_matrix_cupy(ragged_tensor)

	lambdas_sign, mapped_flatten_vector = cuda_map_cx(flatten_vector, control, target)

	starts = starts[1:].tolist()
 
	# Convert flatten vector to ragged tensor
	ragged_tensor = cp.vsplit(mapped_flatten_vector, starts)
	lambdas_sign = cp.split(lambdas_sign, starts)
	# lambdas_sign * ragged_lambdas
	return [cp.multiply(m1, m2) for m1, m2 in zip(ragged_lambdas, lambdas_sign)], ragged_tensor

In [87]:

lambdas = [
    cp.array([2, 3.5, 1]),
    cp.array([1]),
    cp.array([1, 1j])
]
lambdas_out, tensor_out = map_cx(lambdas, ragged_tensor, control, target)

In [88]:
lambdas_out

[array([2. , 3.5, 1. ]), array([1], dtype=int32), array([ 1.+0.j, -0.-1.j])]

In [81]:
tensor_out

[array([[0, 1, 2],
        [3, 2, 2],
        [3, 3, 2]], dtype=int8),
 array([[2, 3, 3]], dtype=int8),
 array([[1, 2, 1],
        [1, 3, 1]], dtype=int8)]

In [82]:
import cupy as cp




flatten_vector, starts = flatten_ragged_matrix_cupy(ragged_tensor)
print("Flatten vector:", flatten_vector)
lambdas, mapped_flatten_vector = cuda_map_cx(flatten_vector, 0, 1)
print("Mapped flatten vector:", mapped_flatten_vector)
print(lambdas)
matrix = unflatten_ragged_matrix_cupy(mapped_flatten_vector, starts)
print(starts)
print(matrix)

Flatten vector: [[0 1 2]
 [0 2 2]
 [0 3 2]
 [1 2 3]
 [2 3 1]
 [2 2 1]]
Mapped flatten vector: [[0 1 2]
 [3 2 2]
 [3 3 2]
 [2 3 3]
 [1 2 1]
 [1 3 1]]
[ 1  1  1  1  1 -1]
[0 3 4]
[array([[0, 1, 2],
       [3, 2, 2],
       [3, 3, 2]], dtype=int8), array([[2, 3, 3]], dtype=int8), array([[1, 2, 1],
       [1, 3, 1]], dtype=int8)]


In [4]:
import cupy as cp
import time
import random

# Define the CUDA kernel for element-wise multiplication
elementwise_mult_kernel = cp.RawKernel(r'''
extern "C" __global__
void elementwise_mult(const char* a, const char* b, char* c, 
                      const int* lengths, const int n_rows) {
    int row = blockIdx.x;  // Each block handles one row
    if (row >= n_rows) return;

    int len = lengths[row];  // Length of the current row
    int tid = threadIdx.x;   // Thread index within the block

    if (tid < len) {
        c[row * len + tid] = a[row * len + tid] * b[row * len + tid];
    }
}
''', 'elementwise_mult')

# Generate large ragged matrices
n_rows = 3  # Number of rows
max_length = 5  # Maximum length of a row
matrix1 = []
matrix2 = []
lengths = []

for _ in range(n_rows):
    length = random.randint(1, max_length)  # Random length for each row
    lengths.append(length)
    matrix1.append(cp.array([random.randint(-127, 127) for _ in range(length)], dtype=cp.int8))
    matrix2.append(cp.array([random.randint(-127, 127) for _ in range(length)], dtype=cp.int8))

lengths = cp.array(lengths, dtype=cp.int32)

# Method 1: Without CUDA Kernel (using cp.multiply)
def multiply_without_kernel(matrix1, matrix2):
    start_time = time.time()
    result = [cp.multiply(m1, m2) for m1, m2 in zip(matrix1, matrix2)]
    end_time = time.time()
    return result, end_time - start_time

# Method 2: With CUDA Kernel
def multiply_with_kernel(matrix1, matrix2, lengths):
    flat_matrix1 = cp.concatenate(matrix1)
    flat_matrix2 = cp.concatenate(matrix2)
    result = cp.zeros_like(flat_matrix1)

    start_time = time.time()
    block_size = 256
    grid_size = n_rows
    elementwise_mult_kernel((grid_size,), (block_size,), 
                            (flat_matrix1, flat_matrix2, result, lengths, n_rows))
    end_time = time.time()

    # Split result back into ragged form (optional, excluded from timing)
    result_split = []
    offset = 0
    for length in lengths.get():
        result_split.append(result[offset:offset + length].copy())
        offset += length

    return result_split, end_time - start_time

# Run benchmarks
print("Running benchmark...")
result_no_kernel, time_no_kernel = multiply_without_kernel(matrix1, matrix2)
result_kernel, time_kernel = multiply_with_kernel(matrix1, matrix2, lengths)
# Print results
print(f"Time without CUDA kernel: {time_no_kernel:.6f} seconds")
print(f"Time with CUDA kernel: {time_kernel:.6f} seconds")

Running benchmark...
Time without CUDA kernel: 0.000000 seconds
Time with CUDA kernel: 0.000000 seconds


In [5]:
result_no_kernel

[array([  -8, -102,   -4,  -34,  -20], dtype=int8),
 array([ 88,  43, 112,  96,  26], dtype=int8),
 array([40, 96], dtype=int8)]