In [1]:
import cupy as cp
import numpy as np
import time

In [2]:
SIZE = 1e8

input_array = cp.random.uniform(low = -5, high = 5, size=int(SIZE))
print(f"Min and max of the input array {input_array.min()} {input_array.max()}")
mu = cp.mean(input_array)
sigma = cp.std(input_array)



Min and max of the input array -4.999999994746547 4.999999955955936


In [3]:
cp.cuda.Device().synchronize()

start = time.perf_counter()
output_array = (input_array - mu) / sigma
output_array = cp.clip(output_array, -3,3)

cp.cuda.Device().synchronize()

print(f"Time taken {time.perf_counter()-start}")

print(f"Min and max of the output array {output_array.min(), output_array.max()}")



Time taken 0.021100837000631145
Min and max of the output array (array(-1.73205186), array(1.73199842))


In [4]:
normalize_and_clip  = cp.ElementwiseKernel(
    in_params = "float64 in_array, float64 mu, float64 sigma",
    out_params = "float64 output_y",
    operation = """
    float z = (in_array - mu) / sigma;
    if (z > 3.0f) {                     
       output_y = 3.0f; }
    else if (z < -3.0f) {
        output_y = -3.0f; }
    else {
        output_y = z;
    }
    """,
    name = "normalize_and_clip")


In [5]:
cp.cuda.Device().synchronize()
start_1 = time.perf_counter()

k = normalize_and_clip(input_array, mu, sigma)
cp.cuda.Device().synchronize()

print(f"Total time for execution via Kernel {time.perf_counter() - start_1}")

Total time for execution via Kernel 0.007087576999765588


In [6]:
## A few caveats in this version 
# 1) mu and sigma are calculated outside the kernel not inside the kernel
    ##In Deep learning loads it is not uncommon to compute aggregates statistics
    ## To accomplish this use reduction kernels

# 2) The if else condition creates a warp divergence, where else canot execute when if is still running
##So apparently c++ has a way to accomplish eitehr via ternary operators (which wont solve the problem of warp divergence) or fmaxf

In [7]:
normalize_and_clip_inter = cp.ElementwiseKernel(

        in_params = "float64 in_array, float64 mu, float64 sigma",
        out_params = "float64 out_array",
        operation = """
        float z = (in_array - mu) / sigma;
        out_array = fminf(3.0f, fmaxf(-3.0f, z));
        """,
        name = "normalize_and_clip_inter")



In [8]:
cp.cuda.Device().synchronize()
start_2 = time.perf_counter()

l = normalize_and_clip_inter(input_array, mu, sigma)
cp.cuda.Device().synchronize()

print(f"Time taken for this operation {time.perf_counter() - start_2}")
print(f"Min and Max are as follows {l.min(), l.max()}")


##slighlt better - I would probably call it as marginally better

Time taken for this operation 0.007398813000691007
Min and Max are as follows (array(-1.73205185), array(1.73199844))


In [9]:
##Reducing kernesls 

##Reduing Mean
mean_kernel = cp.ReductionKernel(

    in_params = "float64 x, int64 N",
    out_params = "float64 mu",
    map_expr = "x", 
    reduce_expr = "a + b",
    post_map_expr = "mu = a / N",
    identity = "0.0f",
    name = "mean_kernel"
)



In [10]:
##Variance kernel

variance_kernel = cp.ReductionKernel(

    in_params  = "float64 x, int64 N, float64 mu",
    out_params = "float64 variance",
    map_expr = "(x-mu) * (x-mu)",
    reduce_expr = "a+b",
    post_map_expr = "variance = a / N",
    identity = "0.0f",
    name = "variance_kernel"
)



In [11]:
##Final kernel

cp.cuda.Device().synchronize()

start_final = time.perf_counter()
reduce_mu = mean_kernel(input_array, len(input_array))
variance = variance_kernel(input_array, len(input_array), reduce_mu)
clip_values = normalize_and_clip_inter(input_array, reduce_mu, cp.sqrt(variance))
cp.cuda.Device().synchronize()

print(f"Total time {time.perf_counter()-start_final}")

Total time 0.15016697000100976


In [12]:
## Custom activation functions = f(x) = x * sigmoid(x); where sigmoid is 1/(1+exp(-x)

custom_sigmoid_kernel = cp.ElementwiseKernel(
    in_params = "float32 input_array",
    out_params = "float32 activations",
    operation = """
        activations = input_array / (1.0f + exp(-input_array));
    """,
    name = "custom_sigmoid_kernel")

x = cp.random.uniform(low=2, high=10, size=1000,dtype=cp.float32)
swish_kernel_values = custom_sigmoid_kernel(x)

In [13]:
x = cp.random.uniform(low=2, high=10, size=1000,dtype=cp.float32)
swish_kernel_values = custom_sigmoid_kernel(x)

In [14]:
## Clipped Relu - Clip the activation between 0 and 6

clipped_relu = cp.ElementwiseKernel(
    in_params = "float32 in_array",
    out_params = "float32 clipped_activations",
    operation = "clipped_activations = fminf(fmaxf(0.0f,in_array), 6.0f);",
    name = "clipped_relu")

test_val1 = cp.array((-2,0,3,6,10), dtype=cp.float32)
print(clipped_relu(test_val1))

[0. 0. 3. 6. 6.]


In [15]:
## (x-mean) / (std + epsilon); where epsilon is 1e-8

test_val2 = cp.array([1,2,3,4,5], dtype=cp.float32)

mean = cp.mean(test_val2)
std = cp.std(test_val2)
eps = 1e-8

normalize_with_epsilon = cp.ElementwiseKernel(

    in_params = "float32 in_array, float32 mu, float32 sigma, float32 eps",
    out_params = "float32 out_array",
    operation = "out_array = (in_array - mu) / (sigma+eps);",
    name = "normalize_with_epsilon" )

print(normalize_with_epsilon(test_val2, mean, std, eps))



[-1.4142135  -0.70710677  0.          0.70710677  1.4142135 ]


In [16]:
## Polynomial Feature - f(x,y) = x^^2 + 2xy + y^^2

x = cp.array([1.0, 2.0, 3.0, 4.0], dtype=cp.float32)
y = cp.array([2.0, 3.0, 4.0, 5.0], dtype=cp.float32)

polynomial_kernel = cp.ElementwiseKernel(
    in_params = "float32 x, float32 y",
    out_params = "float32 out",
    operation = """
    out = x*x + 2*x*y + y*y
    """,
    name = "polynomial_kernel")

print(polynomial_kernel(x,y))


[ 9. 25. 49. 81.]


In [17]:
##Reduction Kernels

##sqrt(mean(x²)) = sqrt(sum(x²) / n) - root mean square

rmse_kernel = cp.ReductionKernel(

    in_params = "float32 x, float32 n",
    out_params = "float32 y",
    map_expr = "x*x",
    reduce_expr = "a+b",
    post_map_expr = "y=sqrt(a/n)",  ##Note _in_ind.size() doesnt work
    identity = "0",
    name = "rmse_kernel")



data = cp.array([3.0, 4.0], dtype=cp.float32)
print(rmse_kernel(data, len(data)))


3.535534


In [18]:
##sum(values * weights)


weighted_sum = cp.ReductionKernel(

    in_params = "float32 values, float32 weights",
    out_params = "float32 weighted_sum",
    map_expr = "values * weights",
    reduce_expr = "a+b",
    post_map_expr = "weighted_sum=a",
    identity = "0",
    name = "weighted_sum"
)



values = cp.array([1.0, 2.0, 3.0, 4.0], dtype=cp.float32)
weights = cp.array([0.1, 0.2, 0.3, 0.4], dtype=cp.float32)

print(weighted_sum(values, weights))

3.0
