#### Testing parallelized combination indices generation with pycuda

These indices will form the inner loop in combinations.

In [1]:
import pycuda.autoinit
import pycuda
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import *
import numpy

In [2]:
# Compile the CUDA kernel `comb_indices`, which fills two flattened
# len x len index tables, where len = stop[0] - start[0].
#
# Each thread handles one (i, j) pair: i comes from the x-dimension and j from
# the y-dimension of the launch configuration. For i < len and j < len it
# writes, at flat position i*len + j:
#     left[...]  = i + start[0]
#     right[...] = j
# Threads that fall outside the len x len square do nothing.
#
# `start` and `stop` are passed as int pointers; only element 0 of each is read.
#
# NOTE(review): `left` is offset by start[0] but `right` is not. The two agree
# when start == 0 (the only case exercised in this notebook) -- confirm whether
# `right` is meant to be offset as well before using a non-zero start.
mod = SourceModule('''
__global__ void comb_indices(int* left,int* right, int* start, int* stop)
{
    int len = stop[0]-start[0];
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int j = threadIdx.y + blockDim.y*blockIdx.y;
    if ( i<len && j< len) 
    {
        left[i*(len) + j] = i + start[0];
        right[i*(len)+j] = j;
    }
}
''')
# Python-side handle used to launch the compiled kernel.
func = mod.get_function("comb_indices")

In [3]:
# Bounds of the index range, held as one-element int32 arrays because the
# kernel takes them as int* arguments and reads element 0.
start = numpy.array([0], dtype=numpy.int32)
stop = numpy.array([8], dtype=numpy.int32)

# One slot per (i, j) pair: (stop - start) squared entries in each flat table.
length = numpy.square(stop - start)
left = numpy.zeros(length, dtype=numpy.int32)
right = numpy.zeros(length, dtype=numpy.int32)

In [4]:
# Reserve one device buffer per host array, each sized to its host counterpart.
d_start, d_stop, d_left, d_right = [
    cuda.mem_alloc(host_array.nbytes)
    for host_array in (start, stop, left, right)
]

# Upload the current host contents into the matching device buffers.
for device_buffer, host_array in (
    (d_start, start),
    (d_stop, stop),
    (d_left, left),
    (d_right, right),
):
    cuda.memcpy_htod(device_buffer, host_array)

# Keep the notebook namespace limited to the d_* handles.
del device_buffer, host_array

In [5]:
# Launch geometry: fixed 8x8 thread blocks, with enough blocks along each axis
# to cover the (stop - start) x (stop - start) index grid. For the 8-wide range
# set up in this notebook this is exactly one block, i.e. grid=(1, 1).
side = int(stop[0] - start[0])
threads_per_axis = 8
blocks_per_axis = max(1, -(-side // threads_per_axis))  # ceiling division

# Run the kernel on the device buffers that were already allocated and filled
# (d_left, d_right, d_start, d_stop). Wrapping the host arrays in
# cuda.InOut / cuda.In instead would make PyCUDA allocate and copy a second,
# redundant set of device buffers and leave the d_* buffers unused.
func(d_left, d_right, d_start, d_stop,
     block=(threads_per_axis, threads_per_axis, 1),
     grid=(blocks_per_axis, blocks_per_axis))

# Copy the filled index tables back so the host arrays can be inspected.
cuda.memcpy_dtoh(left, d_left)
cuda.memcpy_dtoh(right, d_right)

In [6]:
# Column index j of every (i, j) pair, laid out row by row:
# for the 8-wide range this should be 0..7 repeated 8 times.
print(right)

[0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4
 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7]


In [7]:
# Row index i (plus start) of every (i, j) pair, laid out row by row:
# for the 8-wide range starting at 0 each value 0..7 should appear 8 times in a run.
print(left)

[0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 4 4 4 4 4
 4 4 4 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
