In [2]:
import time

def costly_compute(data, column):
    """Emulate a costly function by sleeping, then return one column.

    Parameters
    ----------
    data : 2-D array
        Array indexed as ``data[row, column]``.
    column : int
        Index of the column to extract.

    Returns
    -------
    The selected column, i.e. ``data[:, column]``.
    """
    time.sleep(2)  # stand-in for an expensive computation
    # ``data[column]`` would pick a *row* of a 2-D array; slice along the
    # second axis so the requested column is what gets returned.
    return data[:, column]

def data_processing_mean(data, column):
    """Return the mean of the values yielded by ``costly_compute(data, column)``."""
    selected = costly_compute(data, column)
    return selected.mean()

In [5]:
import numpy as np
# Seeded generator so the synthetic data set is identical on every run.
rng = np.random.RandomState(seed=42)
# 10000 rows by 4 columns of standard-normal samples.
data = rng.randn(10000, 4)
data.shape

(10000, 4)

In [6]:
# Baseline: process the columns one after another, without any caching.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments (unlike time.time()).
start = time.perf_counter()
results = [data_processing_mean(data, col) for col in range(data.shape[1])]
stop = time.perf_counter()

print('\nSequential processing')
print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))


Sequential processing
Elapsed time for the entire processing: 8.01 s


`costly_compute` is expensive and is used as an intermediate step in `data_processing_mean`. It is therefore worthwhile to cache the intermediate results of `costly_compute` with `joblib.Memory`, so that repeated calls with the same arguments do not pay the cost again.


In [8]:
from joblib import Memory

# Directory handed to joblib.Memory as its storage location.
location = './cachedir'
memory = Memory(location, verbose=0)
# Wrap costly_compute so calls go through the Memory cache.
# NOTE(review): if ./cachedir already holds results from an earlier run, the
# "first round" timing further down will be served from cache rather than
# computed; consider memory.clear(warn=False) for a cold start -- confirm.
costly_compute_cached = memory.cache(costly_compute)

In [9]:
def data_processing_mean_using_cache(data, column):
    """Return the mean of ``costly_compute_cached(data, column)``.

    Same computation as ``data_processing_mean``, except that the
    intermediate result is obtained through the cached wrapper.
    """
    cached_values = costly_compute_cached(data, column)
    return cached_values.mean()

In [11]:
from joblib import Parallel, delayed

# First pass through the cached pipeline: one delayed task per column.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments (unlike time.time()).
start = time.perf_counter()
results = Parallel(n_jobs=-1)(
    delayed(data_processing_mean_using_cache)(data, col)
    for col in range(data.shape[1]))
stop = time.perf_counter()

print('\nFirst round - caching the data')
print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))


First round - caching the data
Elapsed time for the entire processing: 0.62 s
