In [1]:
from multiprocessing import Pool, shared_memory
from functools import partial
import numpy as np

# Baseline: not parallelized (single process)

In [26]:
# Stack of two 10000 x 10000 float32 matrices used as the benchmark input.
shape = (2, 10000, 10000)
# np.empty would return uninitialized memory, so the norms computed from it
# would be arbitrary and not reproducible between runs. Zero-fill instead so
# the benchmark input has defined contents.
array = np.zeros(shape, dtype=np.float32)

def single_core(data=None):
    """Compute the norm of every leading-axis slice sequentially, in this process.

    Parameters
    ----------
    data : array-like, optional
        Stack whose first axis is iterated. When omitted, the module-level
        ``array`` is used, so the existing zero-argument call
        ``single_core()`` keeps working.

    Returns
    -------
    list
        One ``np.linalg.norm`` result per slice along the first axis, in order.
    """
    stack = array if data is None else data
    # Iterate the stack itself instead of range(shape[0]): the slice count then
    # always matches the data being reduced, even if the module-level `shape`
    # has been rebound by another cell in the meantime.
    return [np.linalg.norm(matrix) for matrix in stack]

# Time the sequential baseline with IPython's %timeit line magic.
%timeit single_core()

953 µs ± 5.75 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Attempt 1: pass the shared-memory-backed NumPy array itself to the workers

In [18]:
def work(i, shm):
    """Return the norm of the ``i``-th slice of ``shm``.

    Despite the parameter name, ``shm`` is indexed directly (``shm[i]``), so
    it must be an indexable array rather than a SharedMemory handle.
    """
    selected = shm[i]
    return np.linalg.norm(selected)

shape = (2, 1000, 1000)
dtype = np.float32

shm = shared_memory.SharedMemory(create=True, size=np.prod(shape) * 4) # 4bytes per float32
array = np.ndarray(shape, dtype=dtype,buffer=shm.buf)

workers = Pool()
%time result = workers.map(partial(work, shm=array), range(shape[0]))

CPU times: user 1.01 s, sys: 660 ms, total: 1.67 s
Wall time: 3.86 s


# Attempt 2: pass the SharedMemory handle and rebuild the array view inside each worker

In [19]:
def work(i, shm):
    """Return the norm of the ``i``-th slice of the stack stored in ``shm``.

    ``shm`` must expose a ``buf`` buffer. An ndarray view is laid over that
    buffer using the module-level ``shape`` and ``dtype``, so both names must
    be defined in the process that runs this function.
    NOTE(review): presumably the workers see them because the pool is forked
    after they are assigned; confirm the start method in use.
    """
    view = np.ndarray(shape=shape, dtype=dtype, buffer=shm.buf)
    selected = view[i]
    return np.linalg.norm(selected)

shape = (2, 10000, 10000)
dtype = np.float32

shm = shared_memory.SharedMemory(create=True, size=np.prod(shape) * 4) # 4bytes per float32
array = np.ndarray(shape, dtype=dtype,buffer=shm.buf)

workers = Pool()
%time result = workers.map(partial(work, shm=shm), range(shape[0]))

CPU times: user 390 µs, sys: 293 µs, total: 683 µs
Wall time: 78.5 ms
