cupy: GPU 加速  
https://docs-cupy.chainer.org/en/stable/install.html#install-cupy  
numexpr: Intel VML加速  
https://numexpr.readthedocs.io/en/latest/user_guide.html

In [7]:
import numpy as np
import cupy as cp
import numexpr as ne
# Side length of the square N x N arrays that the benchmark cells below create.
N = 512

# numpy 与 cupy速度比较

In [92]:
%%timeit
# NumPy baseline: matrix product (np.dot) of a with (sin(b) + 1) on N x N
# float32 inputs.
# The arrays are created inside the timed body, so the reported time also
# includes seeding, random generation and the float32 conversion.
np.random.seed(0)
a = np.random.rand(N,N).astype('float32')
b = np.random.rand(N,N).astype('float32')
res0= np.dot(a,np.sin(b)+1)

9.98 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [49]:
%%time
a1 = cp.random.rand(N,N).astype('float32')
a2 = cp.random.rand(N,N).astype('float32')
res1= cp.dot(a1, cp.sin(a2)+10)

Wall time: 997 µs


cupy 在第一次运算时速度较慢（首次调用需要初始化 CUDA 上下文并即时编译内核，之后会使用缓存）。

# numpy 与 numexpr速度比较

In [50]:
%%timeit
# numexpr with its default settings: evaluates a * (sin(b) + 1) from an
# expression string on N x N float32 inputs.
# NOTE(review): '*' here is an element-wise product, while the NumPy baseline
# earlier in the notebook uses np.dot (a matrix product), so the two timings
# do not measure the same operation.
# Array creation is inside the timed body and is included in the result.
np.random.seed(0)
a = np.random.rand(N,N).astype('float32')
b = np.random.rand(N,N).astype('float32')
res2= ne.evaluate('a* (sin(b)+1)')

6.51 ms ± 52.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [51]:
# Request numexpr's 'fast' VML accuracy mode before re-timing the expression.
# NOTE(review): presumably this only has an effect when numexpr was built
# against Intel VML — confirm with ne.use_vml.
ne.set_vml_accuracy_mode('fast')

In [91]:
%%timeit
# Same numexpr workload, re-timed after ne.set_vml_accuracy_mode('fast').
# NOTE(review): the saved execution counts are out of order (this cell was
# last run after the thread-setting cells further down), so the recorded
# timing may not isolate the accuracy-mode change.
np.random.seed(0)
a = np.random.rand(N,N).astype('float32')
b = np.random.rand(N,N).astype('float32')
res2= ne.evaluate('a*(sin(b)+1)')

7.36 ms ± 239 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [70]:
# Set the thread count used for VML operations to the detected number of cores.
# NOTE(review): presumably a no-op when numexpr is not using VML — confirm.
ne.set_vml_num_threads(ne.detect_number_of_cores())

In [71]:
%%timeit
# Same numexpr workload, re-timed after ne.set_vml_num_threads(...).
# Array creation is inside the timed body and is included in the result.
np.random.seed(0)
a = np.random.rand(N,N).astype('float32')
b = np.random.rand(N,N).astype('float32')
res2= ne.evaluate('a*(sin(b)+1)')

6.66 ms ± 83.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [72]:
# Set numexpr's own worker-thread count to the detected number of cores.
# set_num_threads returns the previous thread count, which is the value the
# notebook displays as this cell's output.
ne.set_num_threads(ne.detect_number_of_cores())

6

In [87]:
%%timeit
# Same numexpr workload, re-timed after ne.set_num_threads(...).
# Array creation is inside the timed body and is included in the result.
np.random.seed(0)
a = np.random.rand(N,N).astype('float32')
b = np.random.rand(N,N).astype('float32')
res2= ne.evaluate('a*(sin(b)+1)')

7.54 ms ± 268 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [55]:
# Cython

In [4]:
%load_ext Cython

In [89]:
%%cython
import numpy as np
cimport numpy as np
# cimport cython
def f(int N):
    """Return ``a * (sin(b) + 1)`` for two N x N float32 arrays of ones.

    Parameters
    ----------
    N : int
        Side length of the square arrays (converted to a C int on entry).

    Returns
    -------
    numpy.ndarray
        An N x N float32 array; every element equals ``sin(1) + 1``.

    Notes
    -----
    All arithmetic is delegated to whole-array NumPy calls, so the typed
    buffer declarations are not expected to change the speed of this function.
    The inputs are constant ones, not random values.
    """
    # Allocate float32 directly instead of building float64 and converting.
    cdef np.ndarray[np.float32_t, ndim=2] a = np.ones((N, N), dtype=np.float32)
    cdef np.ndarray[np.float32_t, ndim=2] b = np.ones((N, N), dtype=np.float32)

    return a * (np.sin(b) + 1)

In [90]:
timeit res = f(N)

4.25 ms ± 238 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
