In [2]:
import cupy as cp 
import numpy as np 
import time

In [3]:
print(f"Cupy version {cp.__version__}")
print(f"Numpy version {np.__version__}")

Cupy version 13.6.0
Numpy version 2.0.2


#### Test 1 - Creating a large array on GPU and CPU and timing how long a simple array operation will take

##### Conclusion - As expected, GPU performs better. The constraint is memory here. Throw in a couple more zeros to the array sizes and GPU will run out of memory. 

In [8]:
a = cp.random.randn(100000000)
b = cp.random.randn(100000000)
cp.cuda.Device().synchronize()
start = time.perf_counter()

c = a + b

cp.cuda.Device().synchronize()
print(f"Cupy - Total time taken for this ops {time.perf_counter()-start}")

Cupy - Total time taken for this ops 0.010318666999864945


In [16]:
##These are all directly CPU operations
x = np.random.randn(100000000)
y = np.random.randn(100000000)

cpu_start = time.perf_counter()
z = x + y
print(f"Total time taken for this ops {time.perf_counter()-cpu_start}")

Total time taken for this ops 0.11178816099982214


In [23]:
##Delete all objects to free up memory for the nezt operation

#CPU objects
del x, y, z

##GPU objects
del a, b , c  ##Apparently this is not simply enough
cp.get_default_memory_pool().free_all_blocks()

In [None]:
Test 2 - Copying arrays between CPU <-> GPU (vice versa)


In [4]:
a = np.random.rand(1000,1000)

print(a[0][0])

##Copying it to GPU now
x = cp.asarray(a)
x = x +1
print(x[0][0])

##Copy it back to CPU - Note there is no "Movement"  without creating a copy
b = cp.asnumpy(x)
print(b[0][0])


0.12983321461312702
1.129833214613127
1.129833214613127


In [17]:
b

array([[1.12983321, 1.5632718 , 1.62070485, ..., 1.15965652, 1.18105834,
        1.8283314 ],
       [1.44659989, 1.28061619, 1.87149547, ..., 1.86483263, 1.65253083,
        1.17313518],
       [1.5828872 , 1.93567005, 1.10073777, ..., 1.38458885, 1.65162186,
        1.22649382],
       ...,
       [1.29626014, 1.2667818 , 1.5265712 , ..., 1.30447462, 1.23690257,
        1.036798  ],
       [1.49899814, 1.99735562, 1.89764818, ..., 1.51902181, 1.82455071,
        1.2614754 ],
       [1.42317631, 1.49999285, 1.78378434, ..., 1.11834344, 1.33218027,
        1.50729309]])

In [20]:
p = cp.dot(b,b)
print(type(p))
print(p.device)

<class 'numpy.ndarray'>
cpu


In [23]:
cp.get_default_memory_pool().used_bytes()


16000512

In [22]:
cp.get_default_memory_pool().free_all_blocks()

In [None]:
## Creating an elementwise kernel



In [24]:
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y',
    'add_kernel'
)

x = cp.arange(10, dtype=cp.float32)
y = cp.arange(10, dtype=cp.float32)

z = add_kernel(x, y)
print(z)


[ 0.  2.  4.  6.  8. 10. 12. 14. 16. 18.]


In [25]:
del x,y

In [28]:
subtract_kernel = cp.ElementwiseKernel(
    in_params  = "int64 x, int64 y",
    out_params = "int64 z",
    operation = "z = x - y",
    name = "subtract_kernel" )

a = cp.arange(20)
b = cp.arange(20)

c = subtract_kernel(a,b)
print(a)
print(b)
print(c)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [30]:
k = np.array((1,2,3))
k > 2

array([False, False,  True])

In [58]:
## Naive cupy version

SIZE = 1e8

input_array = cp.random.uniform(low = -5, high = 5, size=int(SIZE))
print(f"Min and max of the input array {input_array.min()} {input_array.max()}")
mu = cp.mean(input_array)
sigma = cp.std(input_array)

cp.cuda.Device().synchronize()

start = time.perf_counter()
output_array = (input_array - mu) / sigma
output_array = cp.clip(output_array, -3,3)

cp.cuda.Device().synchronize()

print(f"Time taken {time.perf_counter()-start}")

print(f"Min and max of the output array {output_array.min(), output_array.max()}")

print(f"Device {input_array.device, output_array.device}")






Min and max of the input array -4.999999998470756 4.999999906157157
Time taken 0.020057753999935812
Min and max of the output array (array(-1.73179891), array(1.73216906))
Device (<CUDA Device 0>, <CUDA Device 0>)


In [78]:
## Using the Kernel version lets see if the performance improves


normalize_and_clip  = cp.ElementwiseKernel(
    in_params = "float64 in_array, float64 mu, float64 sigma",
    out_params = "float64 output_y",
    operation = """
    float z = (in_array - mu) / sigma;
    if (z > 3.0f) {
       output_y = 3.0f; }
    else if (z < -3.0f) {
        output_y = -3.0f; }
    else {
        output_y = z;
    }
    """,
    name = "normalize_and_clip")


In [81]:
cp.cuda.Device().synchronize()
start_1 = time.perf_counter()

k = normalize_and_clip(input_array, mu, sigma)
cp.cuda.Device().synchronize()

print(f"Total time for execution via Kernel {time.perf_counter() - start_1}")

Total time for execution via Kernel 0.006841743001132272


In [80]:
k.min(), k.max()

(array(-1.73179889), array(1.73216903))

In [38]:
cp.random.uniform(low = -10, high = 10, size=50).min()


array(-9.03049321)

In [49]:
int(1e10)

10000000000