In [4]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 4.0 MB/s 
[?25hCollecting pytools>=2011.2
  Downloading pytools-2021.2.8.tar.gz (63 kB)
[K     |████████████████████████████████| 63 kB 2.1 MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2021.1-cp37-cp37m-linux_x86_64.whl size=627558 sha256=04fa53b1d94d18276b119688df9ec5c90fd1d4ff28cddeadb0e0b0a53803e568
  Stored in directory: /root/.cache/pip/wheels/c4/ef/49/dc6a5feb8d980b37c83d465ecab24949a6aa19458522a9e001
  Building wheel for pytools (setup.py) ... [?25l[?25hdo

In [29]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np


def iDivUp(a, b):
    """Return the integer ceiling of ``a / b``.

    Used to compute how many blocks of size ``b`` are needed to cover ``a``
    elements. Unlike ``a // b + 1``, this does not add a superfluous block
    when ``a`` is an exact multiple of ``b``.

    Args:
        a: Number of elements to cover (integer).
        b: Block size (non-zero integer).

    Returns:
        The smallest integer ``k`` such that ``k * b >= a`` (for positive
        ``b``). Note that this is 0 when ``a`` is 0.

    Raises:
        ZeroDivisionError: If ``b`` is 0.
    """
    # Floor division of the negated numerator rounds toward -inf, so negating
    # the result again yields the ceiling without going through floats.
    return -(-a // b)


# Problem size and number of threads per block.
count = 100
BLOCKSIZE = 256

# Pair of CUDA events used further down to time the kernel call.
start, end = cuda.Event(), cuda.Event()

# Host inputs: two (1, count) row vectors of standard-normal samples,
# converted to float32 to match the `float` pointers the kernel takes.
a = np.random.randn(1, count).astype(np.float32)
b = np.random.randn(1, count).astype(np.float32)

# Output buffer with the same shape and dtype as `a`; contents are
# uninitialised until the kernel result is written into it.
c = np.empty_like(a)

# Build a PyCUDA SourceModule from the CUDA C source below.
# The `vectorAdd` kernel handles one element per thread: each thread derives
# its global index `tid` from its block and thread indices, returns at once
# when `tid >= N` (threads past the end of the data), and otherwise stores
# d_a[tid] + d_b[tid] into d_c[tid].
mod = SourceModule("""
__global__ void vectorAdd(float * __restrict__ d_c, const float * __restrict__ d_a, 
                                                    const float * __restrict__ d_b,
                                                    const int N)
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid >= N) return;
  d_c[tid] = d_a[tid] + d_b[tid];
}
""")

# Fetch the compiled kernel and pick a 1-D launch configuration with enough
# blocks of BLOCKSIZE threads to cover all `count` elements.
# The element count is taken from `count` (the length the host arrays were
# created with) rather than an `N` that this cell never defines, so the cell
# also works on a fresh kernel.
vectorAdd = mod.get_function("vectorAdd")
blockDim  = (BLOCKSIZE, 1, 1)
gridDim   = (iDivUp(count, BLOCKSIZE), 1, 1)

# Time the call with the two CUDA events. The arrays are passed through
# cuda.In / cuda.Out wrappers, so the measured interval is that of the whole
# call, not necessarily of the kernel alone.
start.record()
vectorAdd(cuda.Out(c), cuda.In(a), cuda.In(b), np.int32(count),
          block=blockDim, grid=gridDim)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3  # scale the event interval to seconds
print("Processing time = %fs" % (secs))

# Check the GPU result `c` against the NumPy reference `a + b`.
# "Expected" shows the CPU reference and "obtained" shows the GPU output.
if np.array_equal(c, a + b):
    print("Пройдено!")
    print("Ожидаемый результат: ", str(a + b))
    print("Полученный реузьтат: ", str(c))
else:
    print("Потрачено!")

cuda.Context.synchronize()

Processing time = 0.000321s
Пройдено!
Ожидаемый результат:  [[ 1.640908   -0.4235562   0.4149223   1.0976689   1.2136481  -2.0167303
  -0.01769257  1.2943896  -1.8662503  -0.8984314  -1.4006885   2.885817
   0.2002386  -1.0647409  -2.8455367   3.077952   -1.6369033  -0.20380542
   0.20725742  0.2013802   1.7580473   0.9404578  -2.1393392  -1.3064051
   2.6392066   0.27668273  0.3479379   2.585082    0.6241794  -0.596244
   0.7655045   1.620623   -3.750266    1.7375662  -0.62586963  0.47455567
   0.4064255   0.18493378  0.15715837 -1.0630178  -0.984003    0.33918372
  -1.7522651  -0.67725754 -1.7896042  -0.8586312  -0.10624605  0.94693154
  -0.284414    2.3688328   1.328336    1.5043439   1.2305505   2.5134468
   2.4283137   1.2092813  -0.73825336  2.535647    0.11161119  0.87147975
   2.4980168  -0.9616691   0.29873002 -0.17109784  1.2031877  -0.58028924
   1.2928725  -0.71906716 -0.36815047  2.6832426   0.90943944 -0.6747913
  -0.5514878   0.42715865 -0.74280506 -0.65223545 -1.5380421