From 4e23405311f40ae000da1b6a7ec7b31f54f64845 Mon Sep 17 00:00:00 2001
From: Jakob Jordan <jakobjordan@posteo.de>
Date: Wed, 13 Jan 2021 18:08:34 +0100
Subject: [PATCH] Allow custom functions to be used for computing cache keys

---
 cgp/utils.py                    | 107 +++++++++++++++++---------------
 examples/example_caching.py     |  20 +++---
 examples/example_fec_caching.py |  13 ++--
 test/conftest.py                |   7 +++
 test/test_utils.py              |  89 ++++++++++++++++++--------
 5 files changed, 148 insertions(+), 88 deletions(-)

diff --git a/cgp/utils.py b/cgp/utils.py
index 59ddd223..74a6f8d2 100644
--- a/cgp/utils.py
+++ b/cgp/utils.py
@@ -55,22 +55,45 @@ def __find_args_and_return_value_for_consistency_check(fn: str) -> Union[Dict[st
     return None
 
 
-def __compute_key_from_args(*args: Any, **kwargs: Any) -> str:
-    """Compute a key from the arguments passed to the decorated
-    function.
+def compute_key_from_sympy_expr_and_args(*args: Any, **kwargs: Any) -> str:
+    """Compute a key from the sympy expression encoded in an individual
+    and the remaining arguments passed to the decorated function.
 
     """
 
-    s: str = str(args) + str(kwargs)
+    if not (
+        isinstance(args[0], IndividualSingleGenome) or isinstance(args[0], IndividualMultiGenome)
+    ):
+        raise ValueError("first argument of decorated function must be an Individual instance")
+
+    s: str = str(args[0].to_sympy()) + str(args[1:]) + str(kwargs)
     return hashlib.sha1(s.encode("utf-8")).hexdigest()
 
 
-def __compute_key_from_evaluation_and_args(
-    seed: int, min_value: float, max_value: float, batch_size: int, *args: Any, **kwargs: Any
+def compute_key_from_numpy_evaluation_and_args(
+    *args: Any,
+    _seed: int = 0,
+    _min_value: float = -100.0,
+    _max_value: float = 100.0,
+    _batch_size: int = 10,
+    **kwargs: Any,
 ) -> str:
-    """Compute a key for the function encoded in an individual by
-    evaluating it's NumPy expression on random input samples and
-    hashing the output values.
+    """Compute a key from the function encoded in an individual
+    and the remaining arguments passed to the decorated function.
+
+    The function is evaluated on random inputs and a key is generated
+    by hashing the corresponding output values.
+
+    Parameters
+    ----------
+    _seed : int, optional
+        Seed value for fec. Defaults to 0.
+    _min_value : float, optional
+        Minimal value for fec input samples. Defaults to -100.0.
+    _max_value : float, optional
+        Maximal value for fec input samples. Defaults to 100.0.
+    _batch_size : int, optional
+        Number of fec input samples. Defaults to 10.
 
     """
 
@@ -79,18 +102,18 @@ def __compute_key_from_evaluation_and_args(
     ):
         raise ValueError("first argument of decorated function must be an Individual instance")
 
-    rng = np.random.RandomState(seed=seed)
+    rng = np.random.RandomState(seed=_seed)
     ind = args[0]
     if isinstance(ind, IndividualSingleGenome):
         f_single = ind.to_numpy()
-        x = rng.uniform(min_value, max_value, (batch_size, ind.genome._n_inputs))
+        x = rng.uniform(_min_value, _max_value, (_batch_size, ind.genome._n_inputs))
         y = f_single(x)
         s = np.array_str(y, precision=15)
     elif isinstance(ind, IndividualMultiGenome):
         f_multi = ind.to_numpy()
         s = ""
         for i in range(len(ind.genome)):
-            x = rng.uniform(min_value, max_value, (batch_size, ind.genome[i]._n_inputs))
+            x = rng.uniform(_min_value, _max_value, (_batch_size, ind.genome[i]._n_inputs))
             y = f_multi[i](x)
             s += np.array_str(y, precision=15)
     else:
@@ -147,39 +170,37 @@ def __store_new_cache_entry(
 def disk_cache(
     fn: str,
     *,
-    use_fec: bool = False,
-    fec_seed: int = 0,
-    fec_min_value: float = -100.0,
-    fec_max_value: float = 100.0,
-    fec_batch_size: int = 10,
+    compute_key: Callable[..., str] = compute_key_from_numpy_evaluation_and_args,
     file_lock: Union[None, "mp.synchronize.Lock"] = None,
 ) -> Callable[[Callable[..., float]], Callable[..., float]]:
     """Cache function return values on disk.
 
-    Decorator that caches a function's return values on disk. Next time the
-    decorated function is called with the same arguments it returns the stored
-    values from disk instead of executing the function.
+    Decorator that caches a function's return values on disk. Next
+    time the decorated function is called with the same arguments it
+    returns the stored values from disk instead of executing the
+    function. The first argument of the decorated function *must* be
+    an IndividualBase instance.
 
     Consistency of the cache is checked upon decorating the function
     by making sure the it returns the same value as the first
     argument/keyword argument combination found in the cache.
 
-    If `use_fec` is `False` (default) the arguments of the decorated
-    function are used to compute a hash. If `use_fec` is `True` the
-    decorator uses functional equivalance checking [Real et al.,
-    2020]: It generates a NumPy-compatible expression from the
-    function's first argument (*must* be an `IndividualSingleGenome`
-    or `IndividualMultiGenome` instance) and evaluates it on randomly
-    generated values. The output values are then used to compute a
-    hash.
+    The `compute_key` parameter is a function receiving the arguments
+    and keyword arguments of the decorated function and must return a
+    unique key. By default, the decorator uses functional equivalence
+    checking [Real et al., 2020]: It generates a NumPy-compatible
+    expression from the function's first argument (*must* be an
+    `IndividualSingleGenome` or `IndividualMultiGenome` instance) and
+    evaluates it on randomly generated values. The output values are
+    then used to compute a hash.
 
     WARNING: this implementation is neither optimized for speed nor storage
     space and does not limit the size of the cache file.
 
     WARNING: the consistency check may pass incorrectly if the
     decorated function happens to return a consistent value for the
-    first argument from the cache although it returns different values
-    for other arguments.
+    arguments from the cache although it returns different values for
+    other arguments.
 
     WARNING: avoid using the decorator on nested functions as the
     consistency check will be applied on each decoration thus doubling
@@ -195,16 +216,10 @@ def disk_cache(
     ----------
     fn : str
         Name of the cache file.
-    use_fec : bool, optional
-        Whether to use functional equivalance checking. Defaults to False.
-    fec_seed : int, optional
-        Seed value for fec. Defaults to 0.
-    fec_min_value : float, optional
-        Minimal value for fec input samples. Defaults to -100.0.
-    fec_max_value : float, optional
-        Maximal value for fec input samples. Defaults to 100.0.
-    fec_batch_size : int, optional
-        Number of fec input samples. Defaults to 10.
+    compute_key : Callable[..., str], optional
+        Function to compute a unique key from an individual and the
+        remaining function arguments. Defaults to
+        `compute_key_from_numpy_evaluation_and_args`.
     file_lock : multiprocessing.synchronize.Lock, optional
         Lock to make sure only a single process reads from/write to
         cache file. Defaults to None.
@@ -220,15 +235,9 @@ def decorator(func: Callable[..., float]) -> Callable[..., float]:
         __check_cache_consistency(fn, func)
 
         @functools.wraps(func)
-        def wrapper(*args: Any, **kwargs: Any) -> Union[float, None]:
-
-            key: str
-            if use_fec:
-                key = __compute_key_from_evaluation_and_args(
-                    fec_seed, fec_min_value, fec_max_value, fec_batch_size, *args, **kwargs
-                )
-            else:
-                key = __compute_key_from_args(*args, **kwargs)
+        def wrapper(*args: Any, **kwargs: Any) -> float:
+
+            key: str = compute_key(*args, **kwargs)
 
             if file_lock is not None:
                 file_lock.acquire()
diff --git a/examples/example_caching.py b/examples/example_caching.py
index 4594beed..67eff2ce 100644
--- a/examples/example_caching.py
+++ b/examples/example_caching.py
@@ -30,18 +30,20 @@ def f_target(x):
 # function to compute (or retrieve from cache) the fitness of the individual.
 
 
-@cgp.utils.disk_cache("example_caching_cache.pkl")
-def inner_objective(expr):
+@cgp.utils.disk_cache(
+    "example_caching_cache.pkl", compute_key=cgp.utils.compute_key_from_sympy_expr_and_args
+)
+def inner_objective(ind):
     """The caching decorator uses the function parameters to identify
     identical function calls. Here, as many different genotypes
-    produce the same simplified SymPy expression we can use such
-    expressions as an argument to the decorated function to avoid
-    reevaluating functionally identical individuals.
-    Note that caching only makes sense for deterministic objective
-    functions, as it assumes that identical expressions will always
-    return the same fitness values.
+    produce the same simplified SymPy expression we can use these to
+    avoid reevaluating functionally identical individuals. Note that
+    caching only makes sense for deterministic objective functions, as
+    it assumes that identical expressions will always return the same
+    fitness values.
 
     """
+    expr = ind.to_sympy()
     loss = []
     for x0 in np.linspace(-2.0, 2.0, 100):
         y = float(expr[0].subs({"x_0": x0}).evalf())
@@ -56,7 +58,7 @@ def objective(individual):
     if not individual.fitness_is_None():
         return individual
 
-    individual.fitness = -inner_objective(individual.to_sympy())
+    individual.fitness = -inner_objective(individual)
 
     return individual
 
diff --git a/examples/example_fec_caching.py b/examples/example_fec_caching.py
index 4b94c752..28076892 100644
--- a/examples/example_fec_caching.py
+++ b/examples/example_fec_caching.py
@@ -10,6 +10,7 @@
 
 """
 
+import functools
 import multiprocessing as mp
 import time
 
@@ -38,11 +39,13 @@ def f_target(x):
 
 @cgp.utils.disk_cache(
     "example_fec_caching_cache.pkl",
-    use_fec=True,
-    fec_seed=12345,
-    fec_min_value=-10.0,
-    fec_max_value=10.0,
-    fec_batch_size=5,
+    compute_key=functools.partial(
+        cgp.utils.compute_key_from_numpy_evaluation_and_args,
+        _seed=12345,
+        _min_value=-10.0,
+        _max_value=10.0,
+        _batch_size=5,
+    ),
     file_lock=mp.Lock(),
 )
 def inner_objective(ind):
diff --git a/test/conftest.py b/test/conftest.py
index f819fffb..d80614be 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -73,3 +73,10 @@ def n_offsprings():
 @fixture
 def tournament_size():
     return 2
+
+
+@fixture
+def individual(genome_params, rng):
+    g = cgp.Genome(**genome_params)
+    g.randomize(rng)
+    return cgp.IndividualSingleGenome(g)
diff --git a/test/test_utils.py b/test/test_utils.py
index c68157af..a32233f3 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -22,7 +22,8 @@ def test_cache_decorator_produces_identical_history(
     def f_target(x):
         return x[0] - x[1]
 
-    def inner_objective(expr):
+    def inner_objective(ind):
+        expr = ind.to_sympy()
         np.random.seed(rng_seed)
 
         if individual_type == "SingleGenome":
@@ -39,13 +40,15 @@ def inner_objective(expr):
             ) ** 2
         return loss
 
-    @cgp.utils.disk_cache(tempfile.mkstemp()[1])
-    def inner_objective_decorated(expr):
-        return inner_objective(expr)
+    @cgp.utils.disk_cache(
+        tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_sympy_expr_and_args
+    )
+    def inner_objective_decorated(ind):
+        return inner_objective(ind)
 
     def evolve(inner_objective):
         def objective(ind):
-            ind.fitness = -inner_objective(ind.to_sympy())
+            ind.fitness = -inner_objective(ind)
             return ind
 
         if individual_type == "SingleGenome":
@@ -101,7 +104,9 @@ def inner_objective(ind):
             loss += (f_target(x) - f(x)[0]) ** 2
         return loss
 
-    @cgp.utils.disk_cache(tempfile.mkstemp()[1], use_fec=True, fec_seed=rng_seed)
+    @cgp.utils.disk_cache(
+        tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args
+    )
     def inner_objective_decorated(ind):
         return inner_objective(ind)
 
@@ -138,7 +143,9 @@ def recording_callback(pop):
         assert fitness == pytest.approx(fitness_decorated)
 
 
-@cgp.utils.disk_cache(tempfile.mkstemp()[1], use_fec=True)
+@cgp.utils.disk_cache(
+    tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args
+)
 def _fec_cache_decorator_with_multiple_inputs_multiple_outputs_objective(ind):
     f = ind.to_numpy()
     x = np.array([[1.0, 2.0], [3.0, 4.0]])
@@ -210,19 +217,19 @@ def test_fec_cache_decorator_with_multiple_inputs_multiple_outputs(genome_params
 
 
 @cgp.utils.disk_cache(tempfile.mkstemp()[1])
-def _cache_decorator_objective_single_process(s, sleep_time):
+def _cache_decorator_objective_single_process(ind, sleep_time):
     time.sleep(sleep_time)  # simulate long execution time
-    return s
+    return 0.0
 
 
 @cgp.utils.disk_cache(tempfile.mkstemp()[1], file_lock=mp.Lock())
-def _cache_decorator_objective_two_processes(s, sleep_time):
+def _cache_decorator_objective_two_processes(ind, sleep_time):
     time.sleep(sleep_time)  # simulate long execution time
-    return s
+    return 0.0
 
 
 @pytest.mark.parametrize("n_processes", [1, 2])
-def test_cache_decorator(n_processes):
+def test_cache_decorator(n_processes, individual):
     def evaluate_objective_on_list(x):
         if n_processes == 1:
             objective = functools.partial(
@@ -237,7 +244,7 @@ def evaluate_objective_on_list(x):
                 return list(executor.map(objective, x))
 
     sleep_time = 1.0
-    x = [0, 1]
+    x = [individual, individual]
 
     # WARNING: below the number of processes is *not* taken into
     # account in the timing; one would expect a two-fold speedup when
@@ -263,38 +270,38 @@ def evaluate_objective_on_list(x):
     assert (time.time() - t0) < (0.4 * sleep_time)
 
 
-def test_cache_decorator_consistency():
+def test_cache_decorator_consistency(individual):
 
     cache_fn = tempfile.mkstemp()[1]
     x = 2
 
     @cgp.utils.disk_cache(cache_fn)
-    def objective_f(x):
+    def objective_f(ind):
         return x
 
     # call objective_f once to initialize the cache
-    assert objective_f(x) == pytest.approx(x)
+    assert objective_f(individual) == pytest.approx(x)
 
     # decorating a different function with different output using same
     # filename should raise an error
     with pytest.raises(RuntimeError):
 
         @cgp.utils.disk_cache(cache_fn)
-        def objective_g(x):
+        def objective_g(ind):
             return x ** 2
 
     # decorating a different function with identical output using the
     # same filename should NOT raise an error
     @cgp.utils.disk_cache(cache_fn)
-    def objective_h(x):
+    def objective_h(ind):
         return x
 
 
-def test_cache_decorator_does_not_compare_infinite_return_values():
+def test_cache_decorator_does_not_compare_infinite_return_values(individual):
     cache_fn = tempfile.mkstemp()[1]
 
     @cgp.utils.disk_cache(cache_fn)
-    def objective_f(x):
+    def objective_f(ind, x):
         try:
             return 1.0 / x
         except ZeroDivisionError:
@@ -303,17 +310,17 @@ def objective_f(x):
     # first call produces infinite return value, identical to
     # objective_g although in general their return values are
     # different
-    objective_f(0.0)
+    objective_f(individual, 0.0)
     # second call produces a finite return value which should be used
     # to check consistency
-    objective_f(2.0)
+    objective_f(individual, 2.0)
 
     # since the consistency check uses the finite return value it
     # should detect that the two objectives are indeed different
     with pytest.raises(RuntimeError):
 
         @cgp.utils.disk_cache(cache_fn)
-        def objective_g(x):
+        def objective_g(ind, x):
             try:
                 return 2.0 / x
             except ZeroDivisionError:
@@ -322,7 +329,7 @@ def objective_g(x):
 
 def test_cache_decorator_does_nothing_for_nonexistent_file():
     @cgp.utils.disk_cache("nonexistent_file.pkl")
-    def objective(x):
+    def objective(ind, x):
         return x
 
 
@@ -386,7 +393,9 @@ def test_fec_cache_decorator_with_additional_arguments(genome_params, rng, rng_s
     def f_target(x):
         return x[0] - x[1]
 
-    @cgp.utils.disk_cache(tempfile.mkstemp()[1], use_fec=True, fec_seed=rng_seed)
+    @cgp.utils.disk_cache(
+        tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args
+    )
     def inner_objective(ind, n_samples):
         np.random.seed(rng_seed)
 
@@ -407,3 +416,33 @@ def inner_objective(ind, n_samples):
     y1 = inner_objective(ind, 10)
 
     assert y0 != pytest.approx(y1)
+
+
+def test_custom_compute_key_for_disk_cache(individual, rng):
+    @cgp.utils.disk_cache(
+        tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args
+    )
+    def inner_objective(ind):
+        return ind.to_func()([1.0, 2.0])[0]
+
+    def my_compute_key(ind):
+        return 0
+
+    @cgp.utils.disk_cache(tempfile.mkstemp()[1], compute_key=my_compute_key)
+    def inner_objective_custom_compute_key(ind):
+        return ind.to_func()([1.0, 2.0])[0]
+
+    individual0 = individual.clone()
+    individual0.genome.randomize(rng)
+    individual1 = individual.clone()
+    individual1.genome.randomize(rng)
+
+    loss0 = inner_objective(individual0)
+    loss1 = inner_objective(individual1)
+
+    assert loss0 != pytest.approx(loss1)
+
+    loss0 = inner_objective_custom_compute_key(individual0)
+    loss1 = inner_objective_custom_compute_key(individual1)
+
+    assert loss0 == pytest.approx(loss1)