In [1]:
import pycuda.driver as cuda
import pycuda.autoinit  # noqa
from pycuda.compiler import SourceModule
import numpy

Define the functions in C
The nvcc compiler is called.

In [2]:
mod3 = SourceModule("""
    #include <stdio.h>
    
    __global__ void add3(float *z, int n, float a, float *x, float *y)
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = blockDim.x * gridDim.x;
        for (int i = index; i < n; i += stride)
          z[i] = a * x[i] + y[i];
    }
    """)




In [3]:
N=1<<20
blockSize = 256;
numBlocks = int((N + blockSize - 1) / blockSize)

In [4]:
    for power in range(10, 25): # 24
        size = 1<<power
        print (power,size)

10 1024
11 2048
12 4096
13 8192
14 16384
15 32768
16 65536
17 131072
18 262144
19 524288
20 1048576
21 2097152
22 4194304
23 8388608
24 16777216


In [5]:
print(blockSize)
print(numBlocks)

256
4096


In [6]:
xh = numpy.random.randn(N)
yh = numpy.random.randn(N)

In [7]:
xh.astype(numpy.float32)
yh.astype(numpy.float32)

array([ 0.14177604, -0.50945115,  0.67235315, ..., -1.98463249,
       -1.46721137,  0.01760701], dtype=float32)

In [8]:
zh = numpy.zeros_like(xh)
zh.astype(numpy.float32)

array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)

In [9]:
xh.itemsize
yh.itemsize

8

In [10]:
len(xh)

1048576

Allocate memory space

In [11]:
x_gpu = cuda.mem_alloc(xh.size * xh.dtype.itemsize)
cuda.memcpy_htod(x_gpu, xh)

In [12]:
y_gpu = cuda.mem_alloc(yh.size * yh.dtype.itemsize)
cuda.memcpy_htod(y_gpu, yh)

In [13]:
z_gpu = cuda.mem_alloc(zh.size * zh.dtype.itemsize)
cuda.memcpy_htod(z_gpu, zh)

In [14]:
func = mod3.get_function("add3")

In [15]:
numBlocks

4096

In [16]:
blockSize

256

In [17]:
n = numpy.int32(N)
A=1
a = numpy.float32(A)

In [18]:
#func(N, x_gpu, y_gpu, block=(256, 1, 1), grid=(1, 1), shared=0) 
func(z_gpu, n, a, x_gpu, y_gpu,block=(256, 1, 1), grid=(numBlocks,1), shared=0)

In [19]:
cuda.memcpy_dtoh(zh, z_gpu)

In [20]:
len(zh)

1048576

In [21]:
zh

array([  3.01068092e+01,   2.19559544e-10,   9.63087047e-06, ...,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00])