In [4]:
import os

# Disable the linear-algebra backend's multithreading for demonstration reasons.
# These variables are read when the BLAS/OpenMP runtime initialises, so they have to be
# set BEFORE numpy is imported (restart the kernel if numpy was already imported).
# MKL_NUM_THREADS only affects MKL builds of numpy; the other two cover OpenBLAS/OpenMP builds.
os.environ['MKL_NUM_THREADS']='1'
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['OMP_NUM_THREADS']='1'

import numpy as np

# number of independent results computed by the loops below
num_results=1000

# `output` receives one scalar per index; `arr` is the random matrix shared by every calculation
output=np.zeros((num_results,))
arr=np.random.rand(200,200)

    
def expensive_calculation(idx):
    """Return the determinant of the inverse of `arr + idx`.

    The inverse is recomputed ten times from the same inputs; the repetition
    only adds run time, the result is that of the last (identical) inversion.

    `arr` is read from module scope; `idx` is added to every element of it.
    """
    for _ in range(10):
        shifted=arr+idx
        inverse=np.linalg.inv(shifted)
    return np.linalg.det(inverse)


def expensive_loop(output):
    """Serially fill `output` in place: entry `k` gets `expensive_calculation(k)`.

    The number of entries is taken from the first dimension of `output`;
    nothing is returned.
    """
    n_slots=output.shape[0]
    for slot in range(n_slots):
        output[slot]=expensive_calculation(slot)
        
        
# time one call on its own, then the full serial loop that fills `output`
%timeit expensive_calculation(0)
%timeit expensive_loop(output)

9.25 ms ± 93 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
9.24 s ± 4.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
import cProfile

# profile one full run of the serial loop
pr=cProfile.Profile()
pr.enable()
try:
    expensive_loop(output)
finally:
    # always stop the profiler, even if the loop raises; otherwise it would stay
    # enabled and keep collecting data while later cells run
    pr.disable()

# print the collected per-function statistics
pr.print_stats()

In [13]:
import multiprocessing as mp

# `mp.cpu_count()` reports logical CPUs; for CPUs that use hyperthreading there are half as many
# physical cores, and trying to use each core twice may be slower.
# Clamp to at least 1: on a single-CPU machine the division yields 0 and `mp.Pool(0)` raises ValueError.
num_cpus=max(1,mp.cpu_count()//2)

def expensive_loop_mp(output):
    """Fill `output` in place using a pool of `num_cpus` worker processes.

    Entry `k` of `output` receives `expensive_calculation(k)`; the number of
    entries is taken from the first dimension of `output`. Nothing is returned.
    """
    n_items=output.shape[0]

    with mp.Pool(num_cpus) as workers:
        # `expensive_calculation` runs in separate processes which all have their own copies
        # of `arr`; the alternative would be to place `arr` in shared memory and pass it along.
        # `map` hands each index to one call and returns the results in index order.
        results=workers.map(expensive_calculation,range(n_items))

    # copy the results into the caller's array; `output` could instead have been shared
    # with the workers and passed as an argument
    for slot,value in enumerate(results):
        output[slot]=value
        
# time the multiprocessing version, which fills the same `output` array
%timeit expensive_loop_mp(output)

1.6 s ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
