In [1]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2021.2.8.tar.gz (63 kB)
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
Collecting mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 4.6 MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2021.1-cp37-cp37m-linux_x86_64.whl size=627558 sha256=807b2eb224e6d25985c63ce7850cdc65d42a207aae29e81faba0df5e2e9606ed
  Stored in directory: /root/.cache/pip/wheels/c4/ef/49/dc6a5feb8d980b37c83d465ecab24949a6aa19458522a9e001
  Building wheel for pytools (setup.py) ... [?25l[?25hdone
  C

In [52]:
from pycuda import compiler, gpuarray, tools
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np



# Multiply two square float32 matrices on the GPU with a hand-written CUDA
# kernel and check the result against NumPy's CPU implementation.

MATRIX_SIZE = 10
BLOCK_SIZE = 32  # threads per block along x and y (32 * 32 = 1024 per block)

# Launch enough blocks along each axis to cover the whole matrix (ceiling
# division). The kernel's bounds check makes the surplus threads do nothing.
# NOTE: `grid` was previously used without being defined in this cell, so the
# launch only worked if a stale value was left over in the kernel namespace.
blocks_per_axis = (MATRIX_SIZE + BLOCK_SIZE - 1) // BLOCK_SIZE
grid = (blocks_per_axis, blocks_per_axis, 1)

# Random host-side inputs, float32 to match the kernel's `float *` parameters.
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)

# Copy the inputs to device memory.
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# CPU reference result and an uninitialised device buffer for the GPU result.
c_cpu = np.dot(a_cpu, b_cpu)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

# Each thread computes one element c[ty][tx] as the dot product of row `ty`
# of `a` and column `tx` of `b`; matrices are indexed as flat row-major arrays.
mod = SourceModule("""
__global__ void matrixBlockMulti(int matrixsize,float *a, float *b, float *c)
{
    int tx = blockDim.x*blockIdx.x + threadIdx.x; 
    int ty = blockDim.y*blockIdx.y + threadIdx.y; 
  
    if((ty <matrixsize) && (tx < matrixsize))
    {
    float Pvalue = 0;
    for(int k=0; k<matrixsize;++k)
    {
    float Aelement = a[ty*matrixsize +k];
    float Belement = b[k*matrixsize +tx];
    Pvalue += Aelement * Belement;
    }
    c[ty * matrixsize + tx] = Pvalue;
    }
}
""")


matrixBlockMul = mod.get_function("matrixBlockMulti")

# The kernel declares `int matrixsize`, so pass a signed 32-bit scalar.
matrixBlockMul(
    np.int32(MATRIX_SIZE),
    a_gpu, b_gpu,
    c_gpu,
    grid=grid,
    block=(BLOCK_SIZE, BLOCK_SIZE, 1),
)

# Copy the result back to the host once and reuse it for every check below.
c_from_gpu = c_gpu.get()

if np.allclose(c_cpu, c_from_gpu):
    print("Пройдено!")
    print("Ожидаемый результат:\n", c_cpu)
    print("Полученный реузьтат:\n", c_from_gpu)
else:
    print("Потрачено!")

# Difference between the CPU reference and the GPU result
print("Разница:\n", c_cpu - c_from_gpu)



Пройдено!
Ожидаемый результат:
 [[-5.481274    1.4328903  -3.5691638   1.324232   -2.2422214  -0.34533882
   2.9829378   2.4787016  -0.43767893  3.3674662 ]
 [ 7.2742763   3.559244    0.58913684 -0.36974904 -2.1070461   1.4966358
  -2.110726   -3.0599186  -3.5032144  -0.31216198]
 [-1.2095232  -1.9447328  -3.2166414  -3.4611216   2.5929718   1.2310638
  -3.2009146   0.44320536  0.82281286 -6.6093144 ]
 [ 5.6732354  -1.0002084  -2.9190116   2.561654   -1.3176036  -2.1442366
   7.3452673   0.3148358  -0.54799706 -0.54057175]
 [-5.3538165  -2.6092594   5.0403743  -4.6408405   4.041741    2.9719968
  -1.7079093  -1.2749777   1.3857522   0.13560368]
 [-3.9868197   1.0768305  -3.2123795  -2.901809   -0.54469204  4.120506
   0.5314719  -0.979011    0.3125621  -5.611764  ]
 [ 0.26511407  0.02635884 -0.57915604 -1.0514169   0.8716409   6.3904076
  -0.29652524  2.1826146  -2.9577425  -8.836997  ]
 [ 1.0654223   1.1790355  -3.6938496   3.9172459  -3.8834655   1.3317904
   8.18461    -0.26096037  