## 

In [64]:

from numba import cuda
import numba
import numpy as np

MX = 2048
MY = 2048
TILE_DIM = 32
BLOCK_SIZE = 8


In [65]:

@cuda.jit
def transpose(idata, odata):
     tile = cuda.shared.array((TILE_DIM, TILE_DIM), numba.types.float32)
     x = cuda.blockIdx.x  * TILE_DIM + cuda.threadIdx.x
     y = cuda.blockIdx.y  * TILE_DIM + cuda.threadIdx.y
     w = cuda.gridDim.x * TILE_DIM

     if x >= MX or y >=MY: return

     for i in range(0, TILE_DIM, BLOCK_SIZE):
         tile[cuda.threadIdx.y + i, cuda.threadIdx.x] = idata[y + i, x]

     cuda.synchronize()
    # 转置
     x = cuda.blockIdx.y  * TILE_DIM + cuda.threadIdx.x
     y = cuda.blockIdx.x  * TILE_DIM + cuda.threadIdx.y
     for i in range(0, TILE_DIM, BLOCK_SIZE):
        #  tile[cuda.threadIdx.y + i, cuda.threadIdx.x] = idata[y + i, x]
        odata[y + i, x] = tile[cuda.threadIdx.x, cuda.threadIdx.y + i]

In [69]:
threads = (TILE_DIM, BLOCK_SIZE)
blocks = (MX + TILE_DIM - 1) // TILE_DIM,   (MY + TILE_DIM - 1) // TILE_DIM
a_in = cuda.to_device(np.arange(MX*MY, dtype=np.float32).reshape((MX, MY)))
a_out = cuda.device_array_like(a_in)

In [70]:
print(a_out.copy_to_host())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [73]:
# %timeit在pycham无法使用,需要在jupyter ipython下才可使用
%timeit transpose[blocks, threads](a_in, a_out);   cuda.synchronize()

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Unknown attribute 'synchronize' of type Module(<module 'numba.cuda' from '/home/hcq/anaconda3/envs/torch/lib/python3.6/site-packages/numba/cuda/__init__.py'>)

File "<ipython-input-65-109f2a6c9421>", line 13:
def transpose(idata, odata):
    <source elided>

     cuda.synchronize()
     ^

During: typing of get attribute at <ipython-input-65-109f2a6c9421> (13)

File "<ipython-input-65-109f2a6c9421>", line 13:
def transpose(idata, odata):
    <source elided>

     cuda.synchronize()
     ^


In [74]:
print(a_out.copy_to_host())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [75]:
# 验证
from numpy import testing
res = np.transpose(a_in.copy_to_host())
testing.assert_almost_equal(res, a_out.copy_to_host())

AssertionError: 
Arrays are not almost equal to 7 decimals

Mismatched elements: 4194303 / 4194304 (100%)
Max absolute difference: 4194303.
Max relative difference: inf
 x: array([[0.000000e+00, 2.048000e+03, 4.096000e+03, ..., 4.188160e+06,
        4.190208e+06, 4.192256e+06],
       [1.000000e+00, 2.049000e+03, 4.097000e+03, ..., 4.188161e+06,...
 y: array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],...