In [1]:
from numba import njit
import numpy as np
from numba import prange
import time

#### Experiment 1 -
Check times of cold start run (which includes compilation) against a 
hot path

In [2]:
@njit
def l2_norm(x):
    s = 0.0
    for i in range(x.size):
        s += x[i] * x[i]
        return np.sqrt(s)

In [3]:
x = np.random.rand(10000)

start = time.perf_counter()
print(l2_norm(x))
print(f"Time taken {time.perf_counter()-start}")


0.8400269295694986
Time taken 0.25345543306320906


In [4]:
start = time.perf_counter()
print(l2_norm(x))
print(f"Time taken {time.perf_counter()-start}")

##Notice the difference in time taken between the two runs. This is the difference between
#running the fucntion the first tim

0.8400269295694986
Time taken 0.00017721299082040787


#### Experiment 2

Checking data types- See if you run the same function with inputs of 2 different types
will it still have an impact

In [5]:
##maybe switch it and see with prange

@njit
def add_one(x):
    return x + 1

In [6]:
start = time.perf_counter()
print(add_one(x))
print(f"Time taken {time.perf_counter()-start}")


[1.84002693 1.04955028 1.26622096 ... 1.08578059 1.1769469  1.76485895]
Time taken 0.29375286726281047


In [7]:
x_int = np.random.randint(1, 10000000, size=10000)

start = time.perf_counter()
print(add_one(x_int))
print(f"Time taken {time.perf_counter()-start}")

#the time is not as big as calling the fucntionm with float; but still high



[7101745 1260937 1811662 ... 7110128 4386913 1487734]
Time taken 0.1820190278813243


In [8]:
#Call the same function again with int

start = time.perf_counter()
print(add_one(x_int))
print(f"Time taken {time.perf_counter()-start}") 

[7101745 1260937 1811662 ... 7110128 4386913 1487734]
Time taken 0.0004934952594339848


In [9]:
##Call the float function now for the second time

start = time.perf_counter()
print(add_one(x))
print(f"Time taken {time.perf_counter()-start}")

[1.84002693 1.04955028 1.26622096 ... 1.08578059 1.1769469  1.76485895]
Time taken 0.0006331950426101685


#### Experiment 3 

Check the benefits of parallelizing using njit/jit against a simple python fucntion

In [10]:
##For sake of comparison - let's just use the looping way; Because njit/jit prefers looping way
##Intentionally not using vectorization

def py_sum_of_squares(x):
    sum = 0.0
    for i in range(x.size):
        sum += x[i]*x[i]
    return sum

In [11]:
#NJIT version

@njit
def njit_py_sum_of_squares(x):
    sum = 0.0
    for i in range(x.size):
        sum += x[i]
    return x

In [12]:
# N = 10000

In [13]:
start = time.perf_counter()
print(py_sum_of_squares(x))
print(f"Time taken {time.perf_counter()-start}") 

3290.5742462906123
Time taken 0.0032963789999485016


In [14]:
start = time.perf_counter()
print(njit_py_sum_of_squares(x))
print(f"Time taken {time.perf_counter()-start}")  ##Slower the first time

[0.84002693 0.04955028 0.26622096 ... 0.08578059 0.1769469  0.76485895]
Time taken 0.07895788084715605


In [15]:
start = time.perf_counter()
print(njit_py_sum_of_squares(x))
print(f"Time taken {time.perf_counter()-start}")   

##The secone run is ~8 times faster to the run with non NJIT version of the same function

[0.84002693 0.04955028 0.26622096 ... 0.08578059 0.1769469  0.76485895]
Time taken 0.00040259724482893944


#### Experiment 4

Try parallel loops - Split the loops across CPU cores -> this is quite cool tbh. This is not the same as n_jobs/num_threads; prange is just a way of saying that a particular loop is safe to run parallely


In [16]:
y = np.random.rand(int(1e8))

##An array that is a bit larger than previous examples

In [17]:
##A simple example to normalize the array (x-mean)/std

##Dont use prange
@njit
def normalize_array(x):
    mean = 0.0
    n = x.size
    for i in range(n):
        mean += x[i]
    mean /= n

    var = 0.0
    for i in range(n):
        var += (x[i] - mean) ** 2

    std = np.sqrt(var/n)

    for i in range(n):
        
        x[i] = (x[i]-mean)/std

    return x


In [18]:
start = time.perf_counter()
print(normalize_array(y))
print(f"Time taken {time.perf_counter()-start}") ##WIthout using prange and parallelization of loops

[-1.40289599  1.13721189 -0.70605161 ... -0.83637167  0.3548388
  1.51364748]
Time taken 0.5963368453085423


In [19]:
@njit(parallel=True)
def prange_normalize_array(x):
    mean = 0.0
    n = x.size
    for i in prange(n):
        mean += x[i]
    mean /= n

    var = 0.0
    for i in prange(n):
        var += (x[i] - mean) ** 2

    std = np.sqrt(var/n)

    for i in prange(n):
        
        x[i] = (x[i]-mean)/std

    return x


In [20]:
start = time.perf_counter()
print(normalize_array(y))
print(f"Time taken {time.perf_counter()-start}")  ##Much better. Would be only more better on the second run

[-1.40289599  1.13721189 -0.70605161 ... -0.83637167  0.3548388
  1.51364748]
Time taken 0.43836397817358375


### Performance related threads

#### np.ascontiguousarray

In [21]:
x_new = x.reshape(100,100)
x_new

array([[0.84002693, 0.04955028, 0.26622096, ..., 0.64368238, 0.28019758,
        0.26487337],
       [0.29216408, 0.38090816, 0.7612505 , ..., 0.66164906, 0.64780212,
        0.41967419],
       [0.07342131, 0.53605332, 0.14116536, ..., 0.96748222, 0.44041243,
        0.98363245],
       ...,
       [0.21814183, 0.41072274, 0.39061127, ..., 0.82919449, 0.22918948,
        0.04752365],
       [0.08974953, 0.70727792, 0.18043474, ..., 0.99616713, 0.45107289,
        0.26834287],
       [0.7635695 , 0.69208128, 0.18903997, ..., 0.08578059, 0.1769469 ,
        0.76485895]], shape=(100, 100))

In [22]:
##if you access the first column; x[:,0] pretty much first element in all rows;

x_new[:,0] ##This is not good

array([0.84002693, 0.29216408, 0.07342131, 0.9592694 , 0.41520116,
       0.54478693, 0.90575041, 0.67521907, 0.34270014, 0.34040091,
       0.01241144, 0.13196554, 0.06820829, 0.06420004, 0.48110108,
       0.83639009, 0.76227609, 0.16222955, 0.93939819, 0.05724285,
       0.50647702, 0.85978434, 0.08614795, 0.59211464, 0.60060353,
       0.7562504 , 0.52440416, 0.84940249, 0.43469953, 0.88896669,
       0.86912972, 0.09221459, 0.64569887, 0.11687875, 0.06494714,
       0.53331022, 0.7906279 , 0.91415838, 0.37798401, 0.12262134,
       0.75823205, 0.90433479, 0.90015477, 0.23976939, 0.75160517,
       0.73973466, 0.38775275, 0.99333585, 0.59270018, 0.37746222,
       0.64651581, 0.91450097, 0.20707014, 0.27605826, 0.80239138,
       0.45914674, 0.41125951, 0.20199466, 0.74469776, 0.47087775,
       0.67259551, 0.17383563, 0.5241743 , 0.62087663, 0.21725733,
       0.32372106, 0.28264144, 0.47864475, 0.69909042, 0.36654552,
       0.43151456, 0.66623722, 0.33984558, 0.54374899, 0.17016

In [23]:
#Lets check this with and example

@njit
def add_cols_noncontiguous(x):
    col_size = x.shape[1]
    col_sum = 0.0
    
    for i in range(col_size):
        col_sum += np.sum(x[:, i])  

    return col_sum
        

In [24]:
start = time.perf_counter()
add_cols_noncontiguous(x_new)
print(f"Time taken {time.perf_counter()-start}")

Time taken 0.3007206697948277


In [25]:
@njit
def add_cols_contiguous(x):
    col_size = x.shape[1]
    col_sum = 0.0
    
    for i in range(col_size):
        col_sum += np.sum(np.ascontiguousarray(x[:, i]))  

    return col_sum
        

In [26]:
start = time.perf_counter()
add_cols_contiguous(x_new)

print(f"Time taken {time.perf_counter()-start}")

Time taken 0.3501463648863137


In [27]:
x_new.flags["C_CONTIGUOUS"], x_new.flags["F_CONTIGUOUS"]

##C_CONTIGUOUS is row major matrix whereas the other one is F_CONTIGUOUS

(True, False)

In [28]:
##The version without np.ascontiguous does a lot better than the onw with it
#reason could be that the function pretty much copies the result of the operation and is 
# not "free". So in this case the non-contiguous version is a lot better than the other one

## DO NOT put np.ascontiguousarray within a loop. Take it outside if possible

##Note:- It was also checked if x_new was row or column major as that would impact how the
#indices of a particular matrix would be accessed. In this case it looks the x_new is indeed
##row major


In [29]:
x_intsimple = np.random.randint(1,100, size=20)

### Using Fastmath - > You will loose some precision but it more speedier.

In [30]:
@njit(fastmath=True, parallel=True)
def product_of_numbers(x):
    asize = x.size
    product = 1.0

    for i in prange(asize):
        product *= x[i]

    return product
        

In [31]:
start = time.perf_counter()
print(product_of_numbers(x_intsimple))

print(f"Time taken {time.perf_counter()-start}")

1.1379539146689362e+30
Time taken 0.4133854638785124


In [32]:
@njit(fastmath=False)
def product_of_numbers_wo_fastmath(x):
    asize = x.size
    product = 1.0

    for i in range(asize):
        product *= x[i]

    return product
        

In [33]:
start = time.perf_counter()
print(product_of_numbers_wo_fastmath(x_intsimple))

print(f"Time taken {time.perf_counter()-start}")

1.1379539146689362e+30
Time taken 0.08239627769216895


In [34]:
##notice how the time didnt change but the results are slightly different

In [35]:
##Some basic stuff

#1) Avoid allocations within numba loops/functions

#2) Avoid out of place or vectorized type of operations when working with numba njit and jit

#3) With @jit decorators (give that they are flexible in nature) even if they cannot run some code
#using jit compiler, they switch to plain python. So you wont know what was used underneath/
# Use @njit -> So that it will raise a typing error if something cannot be executed

#4) Use njit or jit(nopython=True) because with just jit - If your code doesnt compile as 
#requirements of jit it will simply switch to python without any warning




