In [1]:
import cupy as cp
import numpy as np
import time

In [2]:
SIZE = 1e8

input_array = cp.random.uniform(low = -5, high = 5, size=int(SIZE))
print(f"Min and max of the input array {input_array.min()} {input_array.max()}")
mu = cp.mean(input_array)
sigma = cp.std(input_array)



Min and max of the input array -4.999999939743684 4.999999575097256


In [3]:
cp.cuda.Device().synchronize()

start = time.perf_counter()
output_array = (input_array - mu) / sigma
output_array = cp.clip(output_array, -3,3)

cp.cuda.Device().synchronize()

print(f"Time taken {time.perf_counter()-start}")

print(f"Min and max of the output array {output_array.min(), output_array.max()}")



Time taken 0.02140147799946135
Min and max of the output array (array(-1.73204568), array(1.7319533))


In [4]:
normalize_and_clip  = cp.ElementwiseKernel(
    in_params = "float64 in_array, float64 mu, float64 sigma",
    out_params = "float64 output_y",
    operation = """
    float z = (in_array - mu) / sigma;
    if (z > 3.0f) {                     
       output_y = 3.0f; }
    else if (z < -3.0f) {
        output_y = -3.0f; }
    else {
        output_y = z;
    }
    """,
    name = "normalize_and_clip")


In [5]:
cp.cuda.Device().synchronize()
start_1 = time.perf_counter()

k = normalize_and_clip(input_array, mu, sigma)
cp.cuda.Device().synchronize()

print(f"Total time for execution via Kernel {time.perf_counter() - start_1}")

Total time for execution via Kernel 0.00775575499937986


In [None]:
## A few caveats in this version 
# 1) mu and sigma are calculated outside the kernel not inside the kernel
    ##In Deep learning loads it is not uncommon to compute aggregates statistics
    ## To accomplish this use reduction kernels

# 2) The if else condition creates a warp divergence, where else canot execute when if is still running
##So apparently c++ has a way to accomplish eitehr via ternary operators (which wont solve the problem of warp divergence) or fmaxf

In [15]:
normalize_and_clip_inter = cp.ElementwiseKernel(

        in_params = "float64 in_array, float64 mu, float64 sigma",
        out_params = "float64 out_array",
        operation = """
        float z = (in_array - mu) / sigma;
        out_array = fminf(3.0f, fmaxf(-3.0f, z));
        """,
        name = "normalize_and_clip_inter")



In [17]:
cp.cuda.Device().synchronize()
start_2 = time.perf_counter()

l = normalize_and_clip_inter(input_array, mu, sigma)
cp.cuda.Device().synchronize()

print(f"Time taken for this operation {time.perf_counter() - start_2}")
print(f"Min and Max are as follows {l.min(), l.max()}")


##slighlt better - I would probably call it as marginally better

Time taken for this operation 0.00716631400064216
Min and Max are as follows (array(-1.73204565), array(1.73195326))


In [38]:
##Reducing kernesls 

##Reduing Mean
mean_kernel = cp.ReductionKernel(

    in_params = "float64 x, int64 N",
    out_params = "float64 mu",
    map_expr = "x", 
    reduce_expr = "a + b",
    post_map_expr = "mu = a / N",
    identity = "0.0f",
    name = "mean_kernel"
)



In [43]:
##Variance kernel

variance_kernel = cp.ReductionKernel(

    in_params  = "float64 x, int64 N, float64 mu",
    out_params = "float64 variance",
    map_expr = "(x-mu) * (x-mu)",
    reduce_expr = "a+b",
    post_map_expr = "variance = a / N",
    identity = "0.0f",
    name = "variance_kernel"
)



In [46]:
cp.sqrt(variance), sigma

(array(2.88683674), array(2.88683674))

In [52]:
##Final kernel

cp.cuda.Device().synchronize()

start_final = time.perf_counter()
reduce_mu = mean_kernel(input_array, len(input_array))

cp.cuda.Device().synchronize()

variance = variance_kernel(input_array, len(input_array), reduce_mu)
cp.cuda.Device().synchronize()

clip_values = normalize_and_clip_inter(input_array, reduce_mu, cp.sqrt(variance))
cp.cuda.Device().synchronize()

print(f"Total time {time.perf_counter()-start_final}")

Total time 0.15043037500072387


In [53]:
clip_values.min(), clip_values.max()

(array(-1.73204565), array(1.73195326))

In [51]:
clip_values

array([ 0.55200857,  0.26896426,  0.08677603, ...,  0.03925672,
       -0.26806849,  0.28067935])