In [None]:
from numba import cuda
# List the CUDA devices Numba can see and report whether each is supported.
cuda.detect()

Found 1 CUDA devices
id 0             b'Tesla T4'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 4
                              PCI Bus ID: 0
                                    UUID: GPU-dc868b2c-06d9-61d9-6220-79531df43e05
                                Watchdog: Disabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

In [None]:
import cupy as cp
import numpy as np

In [None]:
# Build a 2000x2000 matrix of random integers in host (CPU) memory.
# `high` is exclusive, so the values fall in [0, 254].
cpu_array = np.random.randint(low=0, high=255, size=(2000, 2000))
cpu_array

array([[ 33, 182,   1, ..., 231, 238,  28],
       [190, 239,  97, ...,  33, 162,  15],
       [244, 186, 166, ...,  17,  57, 212],
       ...,
       [121, 170, 218, ..., 118, 210, 132],
       [228,   2,   7, ...,  49, 218,  65],
       [216,  62,  43, ..., 234, 134, 222]])

In [None]:
cpu_array.ndim  # number of array dimensions

2

In [None]:
cpu_array.shape  # (rows, columns)

(2000, 2000)

In [None]:
cpu_array.itemsize  # size of one element in bytes

8

In [None]:
cpu_array.nbytes /1e6  # total size in MB (1 MB = 1e6 bytes): 2000 * 2000 * 8 bytes = 32 MB

32.0

In [None]:
# Convert the host NumPy array into a CuPy array (see the type() checks below:
# numpy.ndarray vs cupy.ndarray).
gpu_array = cp.asarray(cpu_array)
gpu_array

array([[ 33, 182,   1, ..., 231, 238,  28],
       [190, 239,  97, ...,  33, 162,  15],
       [244, 186, 166, ...,  17,  57, 212],
       ...,
       [121, 170, 218, ..., 118, 210, 132],
       [228,   2,   7, ...,  49, 218,  65],
       [216,  62,  43, ..., 234, 134, 222]])

In [None]:
gpu_array.shape  # same shape as the NumPy source array

(2000, 2000)

In [None]:
%%timeit
cp.asarray(cpu_array)  # time the NumPy -> CuPy conversion; more data takes longer to pass to the GPU

5.85 ms ± 56.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
type(cpu_array)  # the host array is a NumPy ndarray

numpy.ndarray

In [None]:
type(gpu_array)  # the converted array is a CuPy ndarray

cupy.ndarray

In [None]:
# Two tiny operands for a (2x3) @ (3x2) matrix product on the CPU.
cpu_array_x = np.ones(shape=(2, 3))
cpu_array_y = np.full(shape=(3, 2), fill_value=2)


In [None]:
%%timeit
# CPU baseline: product of the tiny (2x3) and (3x2) NumPy operands.
np.matmul(cpu_array_x, cpu_array_y)

1.81 µs ± 25.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [None]:
# The same tiny (2x3) and (3x2) operands, this time created with CuPy.
gpu_array_x = cp.ones(shape=(2, 3))
gpu_array_y = cp.full(shape=(3, 2), fill_value=2)

In [None]:
%%timeit
cp.dot(gpu_array_x, gpu_array_y)

The slowest run took 4.98 times longer than the fastest. This could mean that an intermediate result is being cached.
73.2 µs ± 57.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
from scipy import fft


In [None]:
%%timeit
# CPU baseline: N-dimensional FFT of the host array with SciPy.
fft.fftn(cpu_array)

57.7 ms ± 705 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# SciPy's FFT runs on the host, and gpu_array is not in host memory, so this
# call raises TypeError. The error is the point of the cell: catch and show it
# so the notebook still runs top to bottom (timing a failing call is
# meaningless, hence no %%timeit here).
try:
    fft.fftn(gpu_array)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: ignored

In [None]:
# Import the GPU implementation of SciPy's fft module that ships with CuPy.
from cupyx.scipy import fft as fft_gpu

In [None]:
%%timeit
fft_gpu.fftn(gpu_array)

111 µs ± 54.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Run the FFT on the GPU and copy the result back to the host as a NumPy
# array, run the same FFT on the CPU, then check both results agree within
# floating-point tolerance.
fft_return_back = cp.asnumpy(
    fft_gpu.fftn(gpu_array)
)
cpu_fft = fft.fftn(cpu_array)
np.allclose(cpu_fft, fft_return_back)

True

In [None]:
# The CuPy FFT rejects a host (NumPy) array with TypeError. Catch and display
# the error so the demonstration does not abort a full notebook run.
try:
    fft_gpu.fft(cpu_array)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: ignored

In [None]:
# The SciPy (CPU) FFT rejects a CuPy array with TypeError. Catch and display
# the error so the demonstration does not abort a full notebook run.
try:
    fft.fftn(gpu_array)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: ignored

In [None]:
# Write the host array to disk. np.save appends ".npy" when the name has no
# such extension, so the file created is "hostFile.npy".
np.save("hostFile", cpu_array)

In [None]:
# np.save appended ".npy" to the name it was given, so the file on disk is
# "hostFile.npy"; loading plain "hostFile" raises FileNotFoundError.
cp.load("hostFile.npy")

FileNotFoundError: ignored

In [None]:
# Two independent 2000x2000 random integer matrices in host memory
# (`high` is exclusive, so values fall in [0, 254]).
cpu_array1 = np.random.randint(low=0, high=255, size=(2000, 2000))
cpu_array2 = np.random.randint(low=0, high=255, size=(2000, 2000))


In [None]:
%%timeit
# CPU baseline: product of the two 2000x2000 integer matrices with NumPy.
np.dot(cpu_array1, cpu_array2)

12.4 s ± 285 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
# NOTE(review): both operands are NumPy (host) arrays, not CuPy arrays. The
# recorded timing is in the same range as the np.dot cell, which suggests this
# call does not run on the GPU -- confirm that is the intended demonstration.
cp.dot(cpu_array1, cpu_array2)

15.1 s ± 1.7 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Two 2000x2000 random integer matrices generated with CuPy's random module
# (`high` is exclusive).
gpu_array1 = cp.random.randint(low=0, high=255, size=(2000, 2000))
gpu_array2 = cp.random.randint(low=0, high=255, size=(2000, 2000))

In [None]:
%%timeit
cp.dot(gpu_array1, gpu_array2)

33.8 µs ± 19.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
#SVD on CPU
%%timeit
np.linalg.svd(cpu_array1)

4.6 s ± 465 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
#SVD on GPU
%%timeit
cp.linalg.svd(gpu_array1)

3.62 s ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Compare how long it takes to allocate and fill a 1000x1000x1000 array of
# ones with NumPy (CPU) and with CuPy (GPU). Each section prints the elapsed
# time in seconds.
import time

### NumPy and CPU
# perf_counter is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
start = time.perf_counter()
x_cpu = np.ones((1000,1000,1000))
print(time.perf_counter() - start)

### CuPy and GPU
start = time.perf_counter()
x_gpu = cp.ones((1000,1000,1000))
# Wait for the device to finish before reading the clock; otherwise only the
# asynchronous launch would be timed.
cp.cuda.Stream.null.synchronize()
print(time.perf_counter() - start)

2.285921812057495
0.26053929328918457
