# PyCUDA

Reference: PyCUDA tutorial — https://documen.tician.de/pycuda/tutorial.html

In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [2]:
import numpy

# Host-side input: a 4x4 matrix of standard-normal samples, converted to
# float32 so the element type matches the kernel's `float *` parameter.
a = numpy.random.randn(4, 4).astype(numpy.float32)

# Reserve a device buffer with the same byte size as `a` and copy the host
# data into it (htod = host to device).
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# Each thread doubles exactly one element of the buffer. The flat index uses
# blockDim.x as the row stride instead of a hard-coded 4, so the kernel stays
# correct for any single-block launch whose block is (columns, rows, 1).
mod = SourceModule("""
  __global__ void doublify(float *a)
  {
    int idx = threadIdx.x + threadIdx.y*blockDim.x;
    a[idx] *= 2;
  }
  """)

func = mod.get_function("doublify")

# One thread per element: x spans the columns, y spans the rows. The block
# shape is derived from `a` so it cannot drift out of sync with the data.
func(a_gpu, block=(a.shape[1], a.shape[0], 1))

# Copy the result back to the host (dtoh = device to host) and show it next
# to the untouched input.
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print(a_doubled)
print(a)

# Sanity check: the values read back from the device must be twice the input.
numpy.testing.assert_allclose(a_doubled, 2 * a)

[[ 1.1402913   0.05243977 -2.6309035  -3.0563636 ]
 [-1.5327526   2.3234503  -1.917323   -0.46605432]
 [-1.3096534   0.94649047 -0.4462073   1.7413996 ]
 [ 0.78382814 -0.80751884  1.209512   -0.7679626 ]]
[[ 0.57014567  0.02621988 -1.3154517  -1.5281818 ]
 [-0.7663763   1.1617252  -0.9586615  -0.23302716]
 [-0.6548267   0.47324523 -0.22310366  0.8706998 ]
 [ 0.39191407 -0.40375942  0.604756   -0.3839813 ]]


In [5]:
# Launch configuration for the prepared-call interface: a 1x1 grid with a
# 4x4x1 thread block.
grid, block = (1, 1), (4, 4, 1)

# Declare the kernel's argument signature once up front.
# NOTE(review): "P" is presumably the struct-style type code for a single
# pointer argument (the device buffer) — confirm against the PyCUDA docs.
func.prepare("P")

# Launch `func` on the same device buffer; the result is not copied back to
# the host in this cell.
func.prepared_call(grid, block, a_gpu)