In [1]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2021.2.8.tar.gz (63 kB)
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
Collecting mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 3.6 MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2021.1-cp37-cp37m-linux_x86_64.whl size=627558 sha256=f9912f60edb4c96e6d2d81d43f58f51b32840e7ae021d68a03281d51f144c7a2
  Stored in directory: /root/.cache/pip/wheels/c4/ef/49/dc6a5feb8d980b37c83d465ecab24949a6aa19458522a9e001
  Building wheel for pytools (setup.py) ... [?25l[?25hdone
  C

In [26]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Matrix dimensions: a is n x m, b is m x p, and the product c is n x p.
# They are stored as 32-bit integers because they are later passed to the
# kernel's `int` parameters.
n, m, p = (np.int32(dim) for dim in (3, 4, 5))

# Host data: random 0/1 operands converted to float32, plus a zeroed
# float32 buffer that will receive the result.
a = np.random.randint(2, size=(n, m)).astype(np.float32)
b = np.random.randint(2, size=(m, p)).astype(np.float32)
c = np.zeros((n, p), dtype=np.float32)

# One device buffer per matrix, sized to the byte count of its host array.
a_gpu, b_gpu, c_gpu = (cuda.mem_alloc(host.nbytes) for host in (a, b, c))

# Upload only the two operands; nothing is copied into c_gpu here.
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

# CUDA kernel computing c = a * b for row-major matrices a (n x m) and
# b (m x p). It indexes with threadIdx only (no blockIdx), so it has to be
# launched as a single block with at least n threads along x and p along y.
mod = SourceModule("""
    __global__ void multiply
      ( int n, int m, int p,
        float *a, float *b, float *c )
    {
        // Each thread produces one element c[row][col].
        int row = threadIdx.x;
        int col = threadIdx.y;

        // Threads that fall outside the n x p result must not touch memory.
        if (row >= n || col >= p)
            return;

        // Accumulate the dot product in a local variable and store it once,
        // instead of re-reading and re-writing c[idx] on every iteration.
        float sum = 0.0f;
        for (int k = 0; k < m; k++)
            sum += a[m*row + k] * b[k*p + col];

        c[p*row + col] = sum;
    }
    """)

# Look up the compiled kernel and launch it as a single block of n x p threads.
func = mod.get_function("multiply")
func(n, m, p, a_gpu, b_gpu, c_gpu,
     # Use the built-in int() for the block dimensions: the np.int alias was
     # deprecated in NumPy 1.20 and removed in 1.24.
     block=(int(n), int(p), 1),
     grid=(1, 1), shared=0)

# Copy the result from the device buffer back into the host array c.
cuda.memcpy_dtoh(c, c_gpu)


# Verify the GPU product against NumPy's reference result, computed once on
# the host.
expected = a.dot(b)
if np.array_equal(c, expected):
    print("Пройдено!")
    # "Ожидаемый" = expected (NumPy reference); "Полученный" = obtained (GPU).
    # The two values were previously printed under each other's label.
    print("Ожидаемый результат:\n", expected)
    print("Полученный реузьтат:\n", c)
else:
    print("Потрачено!")



Пройдено!
Ожидаемый результат:
 [[1. 0. 0. 0. 0.]
 [1. 0. 1. 1. 0.]
 [1. 0. 1. 1. 0.]]
Полученный реузьтат:
 [[1. 0. 0. 0. 0.]
 [1. 0. 1. 1. 0.]
 [1. 0. 1. 1. 0.]]
