In [1]:
#This file is to compare speed ups and how to use numba and cp 

from numba import njit
import numpy as np 
import cupy as cp 

In [2]:

#Just apply the @njit decorator to python functions that use simple python packages to improve speed up 
@njit
def monte_carlo_pi_numba(nsamples):
    acc = 0
    for i in range(nsamples):
        x = np.random.random()
        y = np.random.random()
        if (x ** 2 + y ** 2) < 1.0:
            acc += 1
    return 4.0 * acc / nsamples

def monte_carlo_pi(nsamples):
    acc = 0
    for i in range(nsamples):
        x = np.random.random()
        y = np.random.random()
        if (x ** 2 + y ** 2) < 1.0:
            acc += 1
    return 4.0 * acc / nsamples

In [3]:
%%timeit 
monte_carlo_pi(1000)

655 μs ± 2.74 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [4]:
%%timeit 
monte_carlo_pi_numba(1000)

6.43 μs ± 987 ns per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
#The njit improved speedup by 99% and used as much if not less memory. 

In [6]:
#These are memory pools used to see stats of GPU memory 
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()

In [7]:
# Create an array on CPU.
# np allocates 40000 bytes in CPU (not managed by cp memory pool).
a_cpu = np.ndarray(shape=(100,100), dtype=np.float32)
print(a_cpu.nbytes)                      # 40000

# You can access statistics of these memory pools.
print(mempool.used_bytes())              # 0
print(mempool.total_bytes())             # 0
print(pinned_mempool.n_free_blocks())    # 0

40000
0
0
0


In [8]:


# Transfer the array from CPU to GPU. Note that this takes time. 
# This allocates 400 bytes from the device memory pool, and another 400
# bytes from the pinned memory pool.  The allocated pinned memory will be
# released just after the transfer is complete.  Note that the actual
# allocation size may be rounded to larger value than the requested size
# for performance.
a = cp.array(a_cpu)
print(a.nbytes)                          # 400
print(mempool.used_bytes())              # 512
print(mempool.total_bytes())             # 512
print(pinned_mempool.n_free_blocks())    # 1
type(a)

40000
40448
40448
0


cupy.ndarray

In [9]:

# When the array goes out of scope, the allocated device memory is released
# and kept in the pool for future reuse.
a = None  # (or `del a`)
print(mempool.used_bytes())              # 0
print(mempool.total_bytes())             # 512
print(pinned_mempool.n_free_blocks())    # 1


0
40448
0


In [10]:

# You can clear the memory pool by calling `free_all_blocks`.
mempool.free_all_blocks()
pinned_mempool.free_all_blocks()
print(mempool.used_bytes())              # 0
print(mempool.total_bytes())             # 0
print(pinned_mempool.n_free_blocks())    # 0

0
0
0


In [11]:
#Lets test the effeciency of GPU blocks 
from scipy import fft 

a_cpu = np.random.randint(0, 255, size=(1000,1000))
a_gpu = cp.array(a_cpu)


In [12]:
%%timeit 
fft.fftn(a_cpu)

45.5 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
#You have to import custom cupy functions that resemble the same functions in scipy to apply on the cupy arrays
from cupyx.scipy import fft as fft_gpu 

In [14]:
%%timeit 
fft_gpu.fftn(a_gpu)

4.86 ms ± 22.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [15]:
#One thing to note is that the bigger the tensor, the bigger the difference in speed between gpu and cpu compilation 

In [16]:
#This shows that the tensors are very similar to each other 
fft_cpu = fft.fftn(a_cpu)
fft_sent_back = cp.asnumpy(fft_gpu.fftn(a_gpu))
np.allclose(fft_cpu, fft_sent_back)

True

In [17]:
#Suggest watching https://www.youtube.com/watch?v=9bBsvpg-Xlk 

In [18]:
#This function uses a cuda just in time decorator to increment all values in a given matrix by 1 
from numba import cuda

cuda.detect()

Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 4060'                              [SUPPORTED]
                      Compute Capability: 8.9
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-01a4cc78-4e20-c926-c2c8-2fdf09d794ab
                                Watchdog: Enabled
                            Compute Mode: WDDM
             FP32/FP64 Performance Ratio: 64
Summary:
	1/1 devices are supported


True

In [19]:
a_cpu = np.random.randint(0, 10, size=(2000, 2000))

#Add numpy array to gpu via cuda instead of cupy 
a_cuda = cuda.to_device(a_cpu)
type(a_cuda)

numba.cuda.cudadrv.devicearray.DeviceNDArray

In [20]:
#Adds a cupy array wrapper to cuda object (Numba works with cupy arrays as well)
a_gpu = cp.asarray(a_cuda)
type(a_gpu)

cupy.ndarray

In [21]:
#Naive cuda implementation of matrix multiplication 
@cuda.jit
def matmul_1(A, B, C):
    #C = AB
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]: 
        tmp = 0 
    for k in range(A.shape[1]):
        tmp += A[i, k] *B[k, j]
    C[i,j] = tmp 

In [66]:
cp.random.seed(1)
SIZE = (3, 3)
A = np.random.uniform(1, 10, size=SIZE)
B = np.random.uniform(1, 10, size=SIZE)

A_gpu = cp.asarray(A)
B_gpu = cp.asarray(B)
C_gpu = cp.zeros(SIZE, dtype=np.float64)

In [67]:
threadsperblock = (16, 16) 
blockspergrid_x = int(np.ceil(C.shape[0]/threadsperblock[0]))
blockspergrid_y = int(np.ceil(C.shape[1]/threadsperblock[1]))
blockspergrid = (blockspergrid_x , blockspergrid_y)
print(f"blocks in grid {blockspergrid}")
print(f"Matrix multiplication works for {threadsperblock[0]*blockspergrid_x} by {threadsperblock[1]*blockspergrid_y}")

blocks in grid (1, 1)
Matrix multiplication works for 16 by 16


In [68]:
#excute cuda kernel function 
matmul_1[blockspergrid, threadsperblock](A, B, C_gpu)
C = np.dot(A, B)
np.allclose(C, C_gpu)



array(False)

In [69]:
type(C), type(C_gpu)

(numpy.ndarray, cupy.ndarray)

In [70]:
C

array([[ 59.92982297, 111.8885066 ,  75.98230458],
       [ 54.83285418,  81.38135179,  79.5280989 ],
       [ 46.20612758,  68.34816442,  71.49490658]])

In [71]:
C_gpu

array([[ 59.92982297, 111.8885066 ,  75.98230458],
       [ 28.28848932,  81.38135179,  79.5280989 ],
       [ 17.50708962,  68.34816442,  14.63911448]])