In [7]:
import cupy as cp
import time

# -------------------------------------------------------------------
# CUDA kernel for base-4 encoding (pre-compiled only once)
# -------------------------------------------------------------------
base4_kernel = cp.RawKernel(r'''
extern "C" __global__
void encode_base4_all(
    const int* shapes,        
    const int* data,          
    const int* offsets,        
    const int* offsets_result, 
    const int* powers,         
    int* output,               
    int N,                    
    int n_dim,               
    int total_size             
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= total_size) return;

    int i = 0;
    while (i < N - 1 && idx >= offsets_result[i + 1]) i++;
    int local_idx = idx - offsets_result[i];

    int temp = local_idx;
    int indices[3]; 
    for (int j = n_dim - 1; j >= 0; j--) {
        int s = shapes[i * n_dim + j];
        indices[j] = temp % s;
        temp /= s;
    }

    int acc = 0;
    for (int j = 0; j < n_dim; j++) {
        int offset = offsets[i * n_dim + j];
        int index = indices[j];
        acc += data[offset + index] * powers[j];
    }
    output[idx] = acc;
}
''', 'encode_base4_all')

# -------------------------------------------------------------------
# Data preparation helper function
# -------------------------------------------------------------------
def prepare_data(arrayss):
    """
    Prepares input data for the encode_base4_all kernel.
    
    Parameters:
      arrayss: A list of arrays (each job) where each job is a list of cp.ndarray.
               Each inner list represents a multi-dimension set of arrays.
               
    Returns:
      shapes, data, offsets, offsets_result, powers, sizes
    """
    N = len(arrayss)           # Number of batches in the job
    n_dim = len(arrayss[0])    # Number of dimensions per batch item

    # Build shapes from the lengths of each array
    shapes = cp.array([len(arr) for arrays in arrayss for arr in arrays], dtype=cp.int32)

    # Concatenate data arrays into one large array
    data = cp.concatenate([arr for arrays in arrayss for arr in arrays])

    # Compute the offset for each array in the flattened data
    lengths = shapes.reshape(N, n_dim)
    offsets = cp.cumsum(cp.concatenate([cp.array([0], dtype=cp.int32), lengths.flatten()[:-1]]))

    # Compute the total number of elements for each batch and the result offsets
    sizes = cp.prod(lengths.reshape(N, n_dim), axis=1)
    offsets_result = cp.cumsum(cp.concatenate([cp.array([0], dtype=cp.int32), sizes[:-1]]))

    # Pre-calculate weights (powers of 4) for each dimension: 4^(n_dim-1), ..., 4^0.
    powers = cp.array([4 ** (n_dim - i - 1) for i in range(n_dim)], dtype=cp.int32)

    return shapes, data, offsets, offsets_result, powers, sizes

# -------------------------------------------------------------------
# Original function to encode arrays to base-4 on the GPU
# -------------------------------------------------------------------
def encode_base4s_gpu(arrayss):
    """
    Encode base-4 for all batches in arrayss using a CUDA kernel.
    
    Parameters:
      arrayss: List of batch items, each batch item is a list of cp.ndarray.
      
    Returns:
      A tuple (output, offsets_result) where output is the flattened encoded result.
    """
    shapes, data, offsets, offsets_result, powers, sizes = prepare_data(arrayss)
    N = len(arrayss)
    n_dim = len(arrayss[0])
    total_size = int(sizes.sum())

    output = cp.empty(total_size, dtype=cp.int32)
    threads_per_block = 256
    blocks_per_grid = (total_size + threads_per_block - 1) // threads_per_block

    base4_kernel((blocks_per_grid,), (threads_per_block,),
                 (shapes, data, offsets, offsets_result, powers, output, N, n_dim, total_size))
    return output, offsets_result

# -------------------------------------------------------------------
# Optimized batched version using asynchronous streams
# -------------------------------------------------------------------
def run_batch_encode_base4s_gpu_optimized(arrayss_list):
    """
    Run multiple encode_base4s_gpu jobs concurrently on the GPU.
    
    Parameters:
      arrayss_list: List of arrayss variables (each representing one job)
                    where each arrayss is a list of batch items (list of cp.ndarray).
                    
    Returns:
      results: List of lists for each job. Each job's result is a list of encoded arrays
               (obtained after splitting the flattened output using offsets_result).
    """
    n_jobs = len(arrayss_list)
    # Create one non-blocking stream per job.
    streams = [cp.cuda.Stream(non_blocking=True) for _ in range(n_jobs)]
    results = [None] * n_jobs

    for i, arrayss in enumerate(arrayss_list):
        with streams[i]:
            shapes, data, offsets, offsets_result, powers, sizes = prepare_data(arrayss)
            N = len(arrayss)
            n_dim = len(arrayss[0])
            total_size = int(sizes.sum())

            # Allocate output array on GPU.
            output = cp.empty(total_size, dtype=cp.int32)
            threads_per_block = 256
            blocks_per_grid = (total_size + threads_per_block - 1) // threads_per_block

            # Launch the kernel asynchronously in the current stream.
            base4_kernel((blocks_per_grid,), (threads_per_block,),
                         (shapes, data, offsets, offsets_result, powers, output, N, n_dim, total_size))
            
            # Split the flattened result into separate arrays using the calculated offsets.
            results[i] = cp.split(output, offsets_result[1:].tolist())

    # Synchronize all streams to make sure all jobs are finished.
    for stream in streams:
        stream.synchronize()

    return results

# -------------------------------------------------------------------
# Example usage of the optimized batched function
# -------------------------------------------------------------------
if __name__ == "__main__":
    # Let's say we want to run 10 jobs concurrently.
    n_jobs = 10

    # Prepare a list of arrayss for each job.
    # Each job has 5000 batch items and each batch item is a list of 4 arrays.
    arrayss_list = [
        [
            [cp.array([1, 2, 3], dtype=cp.int32), 
             cp.array([1], dtype=cp.int32), 
             cp.array([2, 3], dtype=cp.int32),
             cp.array([1, 3], dtype=cp.int32)]
            for _ in range(5000)
        ]
        for _ in range(n_jobs)
    ]

    start = time.time()
    all_results = run_batch_encode_base4s_gpu_optimized(arrayss_list)
    # cp.cuda.Stream.null.synchronize()  # Ensure completion of all asynchronous jobs
    print("Total execution time for {} jobs: {:.4f} seconds".format(n_jobs, time.time() - start))

    # For example, access the results of the first job.
    # all_results[0] is a list, where each element corresponds to one batch's encoded array.
    # Uncomment below to verify shapes for the first 3 batches:
    # for i, res in enumerate(all_results):
    #     print(f"Job 0, batch {i}: {res}")


Total execution time for 10 jobs: 0.1055 seconds
