In [101]:
import numpy as np
import copy
import time

from joblib import Memory
from joblib import Parallel, delayed

In [24]:
# Location handed to joblib.Memory for its on-disk cache (here: the current
# working directory).
cachedir = './'
# Shared Memory instance; its `.cache` decorator is applied to the functions
# defined below. verbose=False is passed to keep joblib's own logging out of
# the cell output.
memory = Memory(cachedir, verbose=False)

### Use joblib.Memory to cache results of a simple function taking Python primitive arguments
With small, primitive inputs like this, it would be sufficient to use the memoize decorator (http://code.activestate.com/recipes/52201/)

In [73]:
@memory.cache
def say(s, n):
    """Return `s` followed by a newline, repeated `n` times.

    A "Running say(...)" line is printed every time the body executes, which
    makes it visible whether a call was computed or answered from the cache.
    """
    print('Running say(\'%s\', %d)' % (s, n))
    # Lazy generator: nothing is concatenated when n <= 0, and the result is ''.
    return ''.join(s + '\n' for _ in range(n))

# Each (s, n) pair is looked up in the cache; "Running say(...)" appears only
# when the function body actually executes. The notes below assume the cache
# starts out empty.
demo_calls = [
    ('Hello world', 1),            # first call with these arguments -> runs
    ('Hello world', 2),            # second argument differs -> runs
    ('Hello world', 1),            # same as the first call -> not re-run
    ('Polly wants a cracker', 2),  # first call with these arguments -> runs
    ('Polly wants a cracker', 2),  # same as the previous call -> not re-run
]
for text, count in demo_calls:
    print(say(text, count))

Running say('Hello world', 1)
Hello world

Running say('Hello world', 2)
Hello world
Hello world

Hello world

Running say('Polly wants a cracker', 2)
Polly wants a cracker
Polly wants a cracker

Polly wants a cracker
Polly wants a cracker



### Use Memory to cache results from a function accepting a mutable argument (e.g. list)
This is one of the stated use cases of Memory over simple memoizers, which hash their input arguments and therefore can't accept mutable (unhashable) arguments. However, a slightly more complicated memoizer can pickle the input into a string representation, so this isn't a huge win for Memory.

In [84]:
@memory.cache
def list_add(l, n):
    """Return a copy of `l` in which `n` has been added to every element.

    The input list itself is left untouched: all work happens on a deep copy.
    A "Running list_add(...)" line is printed whenever the body executes.
    """
    print('Running list_add(\'%s\', %d)' % (l, n))
    shifted = copy.deepcopy(l)  # work on a copy so the caller's list is not modified
    for idx in range(len(shifted)):
        shifted[idx] += n
    return shifted

l = [1, 2, 3, 4, 5]

# Two identical calls in a row: starting from an empty cache, only the first
# one executes the function body; the second is answered from the cache.
for attempt in range(2):
    print(list_add(l, 1))
    print('\n')

# Mutating the list changes the argument, so the function runs again.
l[0] = 11
print(list_add(l, 1))
print('\n')


Running list_add('[1, 2, 3, 4, 5]', 1)
[2, 3, 4, 5, 6]


[2, 3, 4, 5, 6]


Running list_add('[11, 2, 3, 4, 5]', 1)
[12, 3, 4, 5, 6]




### Use Memory to cache results from a function accepting numpy array arguments.
* Handling numpy arrays is another one of the stated use cases of Memory over memoize. Could you just pickle the array to string to solve this?  
* Test how Memory handles very small changes to a value in the numpy array. 

In [72]:
@memory.cache
def print_array(array):
    """Announce the call on stdout and hand `array` back unchanged.

    The printed "Running print_array(...)" line only appears when the body
    executes, so it doubles as a cache-miss indicator.
    """
    rendered = np.array2string(array)
    print('Running print_array(%s)' % rendered)
    return array

# First call on this argument: starting from an empty cache, the body runs.
array = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print(print_array(array))
print('\n')

# Identical argument: answered from the cache, the body does not run.
print(print_array(array))
print('\n')

# `array` has an integer dtype, so "adding" 1e-200 cannot change it:
# 1 + 1e-200 is exactly 1.0 in float64, and storing it back into the integer
# array yields 1 again. Plain assignment is used instead of `+=` because an
# in-place float add into an integer array is rejected by NumPy's default
# 'same_kind' casting rule. The array is unchanged, so this is a cache hit.
array[0, 0] = array[0, 0] + 1e-200
print(print_array(array))
print('\n')

# First call on this argument (an integer array).
print(print_array(np.array([1])))
print('\n')

# This one DOES run. The value is still exactly 1.0 (1e-200 is far below
# float64 precision), but the array is now float64 instead of integer, and
# the dtype is part of what distinguishes one argument from another.
print(print_array(np.array([1 + 1e-200])))
print('\n')

# This one does not run: 1 + 1e-100 is also exactly 1.0 in float64, so the
# array is identical to the previous one and the cached result is reused.
print(print_array(np.array([1 + 1e-100])))
print('\n')

Running print_array([[1 2 3 4]
 [5 6 7 8]])
[[1 2 3 4]
 [5 6 7 8]]


[[1 2 3 4]
 [5 6 7 8]]


[[1 2 3 4]
 [5 6 7 8]]


Running print_array([1])
[1]


Running print_array([1.])
[1.]


[1.]




### Memoize a costly helper function, which is called in other functions

In [95]:
@memory.cache
def costly_preprocess(x):
    """Stand-in for an expensive step: report the call, sleep 2 seconds, return `x`."""
    announcement = 'Running costly_preprocess(%s)' % (str(x),)
    print(announcement)
    time.sleep(2)
    return x

def array_add(array, n):
    """Return the preprocessed `array` plus `n`.

    This outer function is not cached and prints a line on every call; only
    the inner `costly_preprocess` call is memoized.
    """
    print('Running array_add({0}, {1})'.format(array, n))
    preprocessed = costly_preprocess(array)
    return np.add(preprocessed, n)

def array_minus(array, n):
    """Return the preprocessed `array` minus `n`.

    This outer function is not cached and prints a line on every call; only
    the inner `costly_preprocess` call is memoized.
    """
    print('Running array_minus({0}, {1})'.format(array, n))
    preprocessed = costly_preprocess(array)
    return np.subtract(preprocessed, n)



array = np.array([1, 2, 3, 4])

# Time three calls that all share the same memoized inner step:
#   1. array_add   - the inner costly_preprocess has to run (empty cache)
#   2. array_add   - the outer function runs again, the inner call is cached
#   3. array_minus - a different outer function still benefits from the cache
for func in (array_add, array_add, array_minus):
    start = time.time()
    print(func(array, 1))
    stop = time.time()
    print('{0} seconds'.format(round(stop-start, 4)))
    print('\n')




Running array_add([1 2 3 4], 1)
Running costly_preprocess([1 2 3 4])
[2 3 4 5]
2.0088 seconds


Running array_add([1 2 3 4], 1)
[2 3 4 5]
0.0032 seconds


Running array_minus([1 2 3 4], 1)
[0 1 2 3]
0.0018 seconds




### Use joblib.Parallel for embarrassingly parallel loops
The default usage is to rewrite the loop as a generator expression (much like a list comprehension) of delayed calls and pass it to Parallel.

In [104]:
def costly_preprocess(x):
    """Simulate a slow preprocessing step: sleep for one second, then return `x` unchanged.

    Note: this reuses the name of the earlier `@memory.cache`-decorated
    `costly_preprocess`. This version carries no decorator, so every call
    sleeps.
    """
    time.sleep(1)
    return x

def array_add(array, n):
    """Run the slow preprocessing step on `array`, then return the result plus `n`."""
    return np.add(costly_preprocess(array), n)


array = np.array([5, 6, 7, 8, 9])
many_arrays = [array]*10

# Serial baseline: one call after another in this process.
start = time.time()
result1 = []
for item in many_arrays:
    result1.append(array_add(item, 1))
stop = time.time()
print('Serial: {0} seconds'.format(round(stop-start, 4)))
print('\n')

# Parallel version: the same calls wrapped in delayed(...) and handed to
# Parallel; n_jobs=-1 asks joblib to use all available CPUs.
start = time.time()
tasks = (delayed(array_add)(item, 1) for item in many_arrays)
result2 = Parallel(n_jobs=-1)(tasks)
stop = time.time()
print('Parallel: {0} seconds'.format(round(stop-start, 4)))
print('\n')


Serial: 10.0344 seconds


Parallel: 0.0226 seconds


