### cupy
``` 
Run some FFTs using CuPy on the GPUs and NumPy on the CPU, and compare the timings.

Define and run an inline elementwise kernel.
```

In [1]:
import cupy as cp
import numpy as np
import time
import sys

In [2]:
# Print CuPy's build/runtime configuration (library versions and the GPUs it can see)
cp.show_config()

OS                           : Linux-3.10.0-1062.9.1.el7.x86_64-x86_64-with-glibc2.10
CuPy Version                 : 8.6.0
NumPy Version                : 1.19.2
SciPy Version                : 1.4.1
Cython Build Version         : 0.29.23
CUDA Root                    : /nopt/nrel/apps/cuda/10.0.130
CUDA Build Version           : 10000
CUDA Driver Version          : 10020
CUDA Runtime Version         : 10000
cuBLAS Version               : 10000
cuFFT Version                : 10000
cuRAND Version               : 10000
cuSOLVER Version             : (10, 0, 0)
cuSPARSE Version             : 10000
NVRTC Version                : (10, 0)
Thrust Version               : 100903
CUB Build Version            : <unknown>
cuDNN Build Version          : None
cuDNN Version                : None
NCCL Build Version           : None
NCCL Runtime Version         : None
cuTENSOR Version             : None
Device 0 Name                : Tesla V100-PCIE-16GB
Device 0 Compute Capability  : 70
Device 1 Name    

In [3]:
# Get the CuPy configuration as a dictionary.
# cp.show_config() only prints its report, so capture what it writes to stdout
# and parse the "key : value" lines.
# NOTE: if only the device count is needed, cp.cuda.runtime.getDeviceCount()
# returns it directly.
from contextlib import redirect_stdout
from io import StringIO

result = StringIO()
# redirect_stdout puts sys.stdout back even if show_config() raises
with redirect_stdout(result):
    cp.show_config()
result_string = result.getvalue()

config = {}
for line in result_string.splitlines():
    # Split on the first ":" only, so a value that itself contains ":" is kept whole
    key, sep, value = line.partition(":")
    if sep:
        config[key.strip()] = value.strip()

nd = 0
for k in config:
    # find number of devices: each GPU contributes one "Device N Name" entry
    if "Name" in k:
        nd = nd + 1
    print(k, ":", config[k])


OS : Linux-3.10.0-1062.9.1.el7.x86_64-x86_64-with-glibc2.10
CuPy Version : 8.6.0
NumPy Version : 1.19.2
SciPy Version : 1.4.1
Cython Build Version : 0.29.23
CUDA Root : /nopt/nrel/apps/cuda/10.0.130
CUDA Build Version : 10000
CUDA Driver Version : 10020
CUDA Runtime Version : 10000
cuBLAS Version : 10000
cuFFT Version : 10000
cuRAND Version : 10000
cuSOLVER Version : (10, 0, 0)
cuSPARSE Version : 10000
NVRTC Version : (10, 0)
Thrust Version : 100903
CUB Build Version : <unknown>
cuDNN Build Version : None
cuDNN Version : None
NCCL Build Version : None
NCCL Runtime Version : None
cuTENSOR Version : None
Device 0 Name : Tesla V100-PCIE-16GB
Device 0 Compute Capability : 70
Device 1 Name : Tesla V100-PCIE-16GB
Device 1 Compute Capability : 70


In [4]:
# Dump every attribute CuPy reports for each of the nd devices counted above
for device_id in range(nd):
    print("Device ", device_id)
    for attr_name, attr_value in cp.cuda.Device(device_id).attributes.items():
        print(attr_name, ":", attr_value)

Device  0
AsyncEngineCount : 7
CanFlushRemoteWrites : 0
CanMapHostMemory : 1
CanUseHostPointerForRegisteredMem : 1
ClockRate : 1380000
ComputeMode : 0
ComputePreemptionSupported : 1
ConcurrentKernels : 1
ConcurrentManagedAccess : 1
CooperativeLaunch : 1
CooperativeMultiDeviceLaunch : 1
DirectManagedMemAccessFromHost : 0
EccEnabled : 1
GlobalL1CacheSupported : 1
GlobalMemoryBusWidth : 4096
GpuOverlap : 1
HostNativeAtomicSupported : 0
HostRegisterSupported : 1
Integrated : 0
IsMultiGpuBoard : 0
KernelExecTimeout : 0
L2CacheSize : 6291456
LocalL1CacheSupported : 1
ManagedMemory : 1
MaxBlockDimX : 1024
MaxBlockDimY : 1024
MaxBlockDimZ : 64
MaxGridDimX : 2147483647
MaxGridDimY : 65535
MaxGridDimZ : 65535
MaxPitch : 2147483647
MaxRegistersPerBlock : 65536
MaxRegistersPerMultiprocessor : 65536
MaxSharedMemoryPerBlock : 49152
MaxSharedMemoryPerBlockOptin : 98304
MaxSharedMemoryPerMultiprocessor : 98304
MaxSurface1DLayeredLayers : 2048
MaxSurface1DLayeredWidth : 32768
MaxSurface1DWidth : 32768


In [5]:
### NumPy and CPU allocation times
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is a wall clock and can jump if the system time changes.
s = time.perf_counter()
x_cpu = np.ones((1000,1000,1000))
e = time.perf_counter()
print(e - s)
### CuPy and GPU
s = time.perf_counter()
x_gpu = cp.ones((1000,1000,1000))
# GPU work is queued asynchronously; wait for it to finish before stopping the clock
cp.cuda.Stream.null.synchronize()
e = time.perf_counter()
print(e - s)

2.094177007675171
0.24394774436950684


In [6]:
### Time some large ffts on GPU and CPU

In [7]:
# 10240 x 10240 single-precision random matrix, generated directly by CuPy
a = cp.random.rand(10240, 10240, dtype=cp.float32)
# Wait for generation to finish so it does not leak into the FFT timing below
cp.cuda.Stream.null.synchronize()

In [8]:
# CuPy queues GPU work asynchronously, so synchronize inside the timed statement;
# otherwise the measurement is not guaranteed to cover the full FFT.
%timeit -n 200 b=cp.fft.fft2(a); cp.cuda.Stream.null.synchronize()

13.4 ms ± 3.49 ms per loop (mean ± std. dev. of 7 runs, 200 loops each)


In [9]:
# CPU baseline: NumPy FFT of a matrix with the same shape as the GPU case.
# Note: np.random.rand returns float64 data, while the GPU array `a` was created
# as float32, so the two timings do not compare like-for-like precision.
al=np.random.rand(10240,10240)
%timeit -n 2 b=np.fft.fft2(al)
# No device synchronization is needed here: this cell only calls NumPy.

4.7 s ± 2.75 ms per loop (mean ± std. dev. of 7 runs, 2 loops each)


In [10]:
# Shell out to nvidia-smi to see per-GPU memory use and utilization at this point
!nvidia-smi

Fri Apr 23 13:07:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:37:00.0 Off |                    0 |
| N/A   43C    P0    37W / 250W |  11757MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   38C    P0    29W / 250W |     12MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

In [11]:
# Make GPU 1 the current device; this stays in effect for later cells until changed
cp.cuda.runtime.setDevice(1)
a=cp.random.rand(10240,10240,dtype = 'float32')
cp.cuda.Stream.null.synchronize()
# Synchronize inside the timed statement so the asynchronous FFT is fully measured
%timeit -n 200 b=cp.fft.fft2(a); cp.cuda.Stream.null.synchronize()

13.4 ms ± 3.56 ms per loop (mean ± std. dev. of 7 runs, 200 loops each)


In [12]:
# Run nvidia-smi again to see memory use and utilization now that GPU 1 has been used
!nvidia-smi

Fri Apr 23 13:08:11 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:37:00.0 Off |                    0 |
| N/A   43C    P0    37W / 250W |  11757MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   51C    P0   175W / 250W |   3977MiB / 16160MiB |    100%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

### Another way to select which GPU to use: the `cp.cuda.Device` context manager

In [13]:
# Everything inside the with-block runs with GPU 1 as the current device
with cp.cuda.Device(1):
    a=cp.random.rand(10240,10240,dtype = 'float32')
    cp.cuda.Stream.null.synchronize()
    # Synchronize inside the timed statement so the asynchronous FFT is fully measured
    %timeit -n 200 b=cp.fft.fft2(a); cp.cuda.Stream.null.synchronize()

13.4 ms ± 3.81 ms per loop (mean ± std. dev. of 7 runs, 200 loops each)


In [14]:
# Everything inside the with-block runs with GPU 0 as the current device
with cp.cuda.Device(0):
    a=cp.random.rand(10240,10240,dtype = 'float32')
    cp.cuda.Stream.null.synchronize()
    # Synchronize inside the timed statement so the asynchronous FFT is fully measured
    %timeit -n 200 b=cp.fft.fft2(a); cp.cuda.Stream.null.synchronize()

13.3 ms ± 3.8 ms per loop (mean ± std. dev. of 7 runs, 200 loops each)


### A linear solve

In [15]:
# Build a random 10240 x 10240 matrix and a length-10240 right-hand side with CuPy,
# then solve the linear system a @ c = b for c.
a=cp.random.rand(10240,10240)
b=cp.random.rand(10240)
c=cp.linalg.solve(a, b)
# Bare last expression so the notebook displays the solution vector
c

array([ 98.50158885,  32.14185012, -34.70445343, ...,  31.43735105,
        21.59285238,  26.02087605])

### A definition of an elementwise kernel consists of four parts: 
- an input argument list
- an output argument list
- a loop body code
- and the kernel name. 

##### A kernel that computes a squared difference 



In [16]:

# The four parts of the elementwise kernel definition
inargs = 'float32 x, float32 y'    # input argument list
outargs = 'float32 z'              # output argument list
body = 'z = (x - y) * (x - y)'     # loop body, applied to each element
name = 'squared_diff'              # kernel name

squared_diff = cp.ElementwiseKernel(
    in_params=inargs,
    out_params=outargs,
    operation=body,
    name=name,
)


In [17]:
# x is a 2 x 5 matrix holding 0..9 and y a length-5 vector holding 0..4;
# the kernel is applied elementwise with y broadcast across both rows of x.
x = cp.arange(10, dtype=np.float32).reshape(2, 5)
y = cp.arange(5, dtype=np.float32)
squared_diff(x, y)

array([[ 0.,  0.,  0.,  0.,  0.],
       [25., 25., 25., 25., 25.]], dtype=float32)

In [18]:
# Display the first kernel input for comparison with the result above
x

array([[0., 1., 2., 3., 4.],
       [5., 6., 7., 8., 9.]], dtype=float32)

In [19]:
# Display the second kernel input (the vector that was broadcast over the rows of x)
y

array([0., 1., 2., 3., 4.], dtype=float32)