From 4e23405311f40ae000da1b6a7ec7b31f54f64845 Mon Sep 17 00:00:00 2001 From: Jakob Jordan Date: Wed, 13 Jan 2021 18:08:34 +0100 Subject: [PATCH] Allow custom functions to be used for computing cache keys --- cgp/utils.py | 107 +++++++++++++++++--------------- examples/example_caching.py | 20 +++--- examples/example_fec_caching.py | 13 ++-- test/conftest.py | 7 +++ test/test_utils.py | 89 ++++++++++++++++++-------- 5 files changed, 148 insertions(+), 88 deletions(-) diff --git a/cgp/utils.py b/cgp/utils.py index 59ddd223..74a6f8d2 100644 --- a/cgp/utils.py +++ b/cgp/utils.py @@ -55,22 +55,45 @@ def __find_args_and_return_value_for_consistency_check(fn: str) -> Union[Dict[st return None -def __compute_key_from_args(*args: Any, **kwargs: Any) -> str: - """Compute a key from the arguments passed to the decorated - function. +def compute_key_from_sympy_expr_and_args(*args: Any, **kwargs: Any) -> str: + """Compute a key from the sympy expression encoded in an individual + and the remaining arguments passed to the decorated function. """ - s: str = str(args) + str(kwargs) + if not ( + isinstance(args[0], IndividualSingleGenome) or isinstance(args[0], IndividualMultiGenome) + ): + raise ValueError("first argument of decorated function must be an Individual instance") + + s: str = str(args[0].to_sympy()) + str(args[1:]) + str(kwargs) return hashlib.sha1(s.encode("utf-8")).hexdigest() -def __compute_key_from_evaluation_and_args( - seed: int, min_value: float, max_value: float, batch_size: int, *args: Any, **kwargs: Any +def compute_key_from_numpy_evaluation_and_args( + *args: Any, + _seed: int = 0, + _min_value: float = -100.0, + _max_value: float = 100.0, + _batch_size: int = 10, + **kwargs: Any, ) -> str: - """Compute a key for the function encoded in an individual by - evaluating it's NumPy expression on random input samples and - hashing the output values. 
+ """Compute a key from the function encoded in an individual + and the remaining arguments passed to the decorated function. + + The function is evaluated on random inputs and a key is generated + by hashing the corresponding output values. + + Parameters + ---------- + _seed : int, optional + Seed value for fec. Defaults to 0. + _min_value : float, optional + Minimal value for fec input samples. Defaults to -100.0. + _max_value : float, optional + Maximal value for fec input samples. Defaults to 100.0. + _batch_size : int, optional + Number of fec input samples. Defaults to 10. """ @@ -79,18 +102,18 @@ def __compute_key_from_evaluation_and_args( ): raise ValueError("first argument of decorated function must be an Individual instance") - rng = np.random.RandomState(seed=seed) + rng = np.random.RandomState(seed=_seed) ind = args[0] if isinstance(ind, IndividualSingleGenome): f_single = ind.to_numpy() - x = rng.uniform(min_value, max_value, (batch_size, ind.genome._n_inputs)) + x = rng.uniform(_min_value, _max_value, (_batch_size, ind.genome._n_inputs)) y = f_single(x) s = np.array_str(y, precision=15) elif isinstance(ind, IndividualMultiGenome): f_multi = ind.to_numpy() s = "" for i in range(len(ind.genome)): - x = rng.uniform(min_value, max_value, (batch_size, ind.genome[i]._n_inputs)) + x = rng.uniform(_min_value, _max_value, (_batch_size, ind.genome[i]._n_inputs)) y = f_multi[i](x) s += np.array_str(y, precision=15) else: @@ -147,39 +170,37 @@ def __store_new_cache_entry( def disk_cache( fn: str, *, - use_fec: bool = False, - fec_seed: int = 0, - fec_min_value: float = -100.0, - fec_max_value: float = 100.0, - fec_batch_size: int = 10, + compute_key: Callable[..., str] = compute_key_from_numpy_evaluation_and_args, file_lock: Union[None, "mp.synchronize.Lock"] = None, ) -> Callable[[Callable[..., float]], Callable[..., float]]: """Cache function return values on disk. - Decorator that caches a function's return values on disk. 
Next time the - decorated function is called with the same arguments it returns the stored - values from disk instead of executing the function. + Decorator that caches a function's return values on disk. Next + time the decorated function is called with the same arguments it + returns the stored values from disk instead of executing the + function. The first argument of the decorated function *must* be + an IndividualBase instance. Consistency of the cache is checked upon decorating the function by making sure the it returns the same value as the first argument/keyword argument combination found in the cache. - If `use_fec` is `False` (default) the arguments of the decorated - function are used to compute a hash. If `use_fec` is `True` the - decorator uses functional equivalance checking [Real et al., - 2020]: It generates a NumPy-compatible expression from the - function's first argument (*must* be an `IndividualSingleGenome` - or `IndividualMultiGenome` instance) and evaluates it on randomly - generated values. The output values are then used to compute a - hash. + The `compute_key` parameter is a function receiving the arguments + and keyword arguments of the decorated function and must return a + unique key. By default, the decorator uses functional equivalence + checking [Real et al., 2020]: It generates a NumPy-compatible + expression from the function's first argument (*must* be an + `IndividualSingleGenome` or `IndividualMultiGenome` instance) and + evaluates it on randomly generated values. The output values are + then used to compute a hash. WARNING: this implementation is neither optimized for speed nor storage space and does not limit the size of the cache file. WARNING: the consistency check may pass incorrectly if the decorated function happens to return a consistent value for the - first argument from the cache although it returns different values - for other arguments.
+ arguments from the cache although it returns different values for + other arguments. WARNING: avoid using the decorator on nested functions as the consistency check will be applied on each decoration thus doubling @@ -195,16 +216,10 @@ def disk_cache( ---------- fn : str Name of the cache file. - use_fec : bool, optional - Whether to use functional equivalance checking. Defaults to False. - fec_seed : int, optional - Seed value for fec. Defaults to 0. - fec_min_value : float, optional - Minimal value for fec input samples. Defaults to -100.0. - fec_max_value : float, optional - Maximal value for fec input samples. Defaults to 100.0. - fec_batch_size : int, optional - Number of fec input samples. Defaults to 10. + compute_key : Callable[..., str], optional + Function to compute a unique key from an individual and the + remaining function arguments. Defaults to + `compute_key_from_numpy_evaluation_and_args`. file_lock : multiprocessing.synchronize.Lock, optional Lock to make sure only a single process reads from/write to cache file. Defaults to None. @@ -220,15 +235,9 @@ def decorator(func: Callable[..., float]) -> Callable[..., float]: __check_cache_consistency(fn, func) @functools.wraps(func) - def wrapper(*args: Any, **kwargs: Any) -> Union[float, None]: - - key: str - if use_fec: - key = __compute_key_from_evaluation_and_args( - fec_seed, fec_min_value, fec_max_value, fec_batch_size, *args, **kwargs - ) - else: - key = __compute_key_from_args(*args, **kwargs) + def wrapper(*args: Any, **kwargs: Any) -> float: + + key: str = compute_key(*args, **kwargs) if file_lock is not None: file_lock.acquire() diff --git a/examples/example_caching.py b/examples/example_caching.py index 4594beed..67eff2ce 100644 --- a/examples/example_caching.py +++ b/examples/example_caching.py @@ -30,18 +30,20 @@ def f_target(x): # function to compute (or retrieve from cache) the fitness of the individual. 
-@cgp.utils.disk_cache("example_caching_cache.pkl") -def inner_objective(expr): +@cgp.utils.disk_cache( + "example_caching_cache.pkl", compute_key=cgp.utils.compute_key_from_sympy_expr_and_args +) +def inner_objective(ind): """The caching decorator uses the function parameters to identify identical function calls. Here, as many different genotypes - produce the same simplified SymPy expression we can use such - expressions as an argument to the decorated function to avoid - reevaluating functionally identical individuals. - Note that caching only makes sense for deterministic objective - functions, as it assumes that identical expressions will always - return the same fitness values. + produce the same simplified SymPy expression we can use these to + avoid reevaluating functionally identical individuals. Note that + caching only makes sense for deterministic objective functions, as + it assumes that identical expressions will always return the same + fitness values. """ + expr = ind.to_sympy() loss = [] for x0 in np.linspace(-2.0, 2.0, 100): y = float(expr[0].subs({"x_0": x0}).evalf()) @@ -56,7 +58,7 @@ def objective(individual): if not individual.fitness_is_None(): return individual - individual.fitness = -inner_objective(individual.to_sympy()) + individual.fitness = -inner_objective(individual) return individual diff --git a/examples/example_fec_caching.py b/examples/example_fec_caching.py index 4b94c752..28076892 100644 --- a/examples/example_fec_caching.py +++ b/examples/example_fec_caching.py @@ -10,6 +10,7 @@ """ +import functools import multiprocessing as mp import time @@ -38,11 +39,13 @@ def f_target(x): @cgp.utils.disk_cache( "example_fec_caching_cache.pkl", - use_fec=True, - fec_seed=12345, - fec_min_value=-10.0, - fec_max_value=10.0, - fec_batch_size=5, + compute_key=functools.partial( + cgp.utils.compute_key_from_numpy_evaluation_and_args, + _seed=12345, + _min_value=-10.0, + _max_value=10.0, + _batch_size=5, + ), file_lock=mp.Lock(), ) def
inner_objective(ind): diff --git a/test/conftest.py b/test/conftest.py index f819fffb..d80614be 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -73,3 +73,10 @@ def n_offsprings(): @fixture def tournament_size(): return 2 + + +@fixture +def individual(genome_params, rng): + g = cgp.Genome(**genome_params) + g.randomize(rng) + return cgp.IndividualSingleGenome(g) diff --git a/test/test_utils.py b/test/test_utils.py index c68157af..a32233f3 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -22,7 +22,8 @@ def test_cache_decorator_produces_identical_history( def f_target(x): return x[0] - x[1] - def inner_objective(expr): + def inner_objective(ind): + expr = ind.to_sympy() np.random.seed(rng_seed) if individual_type == "SingleGenome": @@ -39,13 +40,15 @@ def inner_objective(expr): ) ** 2 return loss - @cgp.utils.disk_cache(tempfile.mkstemp()[1]) - def inner_objective_decorated(expr): - return inner_objective(expr) + @cgp.utils.disk_cache( + tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_sympy_expr_and_args + ) + def inner_objective_decorated(ind): + return inner_objective(ind) def evolve(inner_objective): def objective(ind): - ind.fitness = -inner_objective(ind.to_sympy()) + ind.fitness = -inner_objective(ind) return ind if individual_type == "SingleGenome": @@ -101,7 +104,9 @@ def inner_objective(ind): loss += (f_target(x) - f(x)[0]) ** 2 return loss - @cgp.utils.disk_cache(tempfile.mkstemp()[1], use_fec=True, fec_seed=rng_seed) + @cgp.utils.disk_cache( + tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args + ) def inner_objective_decorated(ind): return inner_objective(ind) @@ -138,7 +143,9 @@ def recording_callback(pop): assert fitness == pytest.approx(fitness_decorated) -@cgp.utils.disk_cache(tempfile.mkstemp()[1], use_fec=True) +@cgp.utils.disk_cache( + tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args +) def 
_fec_cache_decorator_with_multiple_inputs_multiple_outputs_objective(ind): f = ind.to_numpy() x = np.array([[1.0, 2.0], [3.0, 4.0]]) @@ -210,19 +217,19 @@ def test_fec_cache_decorator_with_multiple_inputs_multiple_outputs(genome_params @cgp.utils.disk_cache(tempfile.mkstemp()[1]) -def _cache_decorator_objective_single_process(s, sleep_time): +def _cache_decorator_objective_single_process(ind, sleep_time): time.sleep(sleep_time) # simulate long execution time - return s + return 0.0 @cgp.utils.disk_cache(tempfile.mkstemp()[1], file_lock=mp.Lock()) -def _cache_decorator_objective_two_processes(s, sleep_time): +def _cache_decorator_objective_two_processes(ind, sleep_time): time.sleep(sleep_time) # simulate long execution time - return s + return 0.0 @pytest.mark.parametrize("n_processes", [1, 2]) -def test_cache_decorator(n_processes): +def test_cache_decorator(n_processes, individual): def evaluate_objective_on_list(x): if n_processes == 1: objective = functools.partial( @@ -237,7 +244,7 @@ def evaluate_objective_on_list(x): return list(executor.map(objective, x)) sleep_time = 1.0 - x = [0, 1] + x = [individual, individual] # WARNING: below the number of processes is *not* taken into # account in the timing; one would expect a two-fold speedup when @@ -263,38 +270,38 @@ def evaluate_objective_on_list(x): assert (time.time() - t0) < (0.4 * sleep_time) -def test_cache_decorator_consistency(): +def test_cache_decorator_consistency(individual): cache_fn = tempfile.mkstemp()[1] x = 2 @cgp.utils.disk_cache(cache_fn) - def objective_f(x): + def objective_f(ind): return x # call objective_f once to initialize the cache - assert objective_f(x) == pytest.approx(x) + assert objective_f(individual) == pytest.approx(x) # decorating a different function with different output using same # filename should raise an error with pytest.raises(RuntimeError): @cgp.utils.disk_cache(cache_fn) - def objective_g(x): + def objective_g(ind): return x ** 2 # decorating a different function with 
identical output using the # same filename should NOT raise an error @cgp.utils.disk_cache(cache_fn) - def objective_h(x): + def objective_h(ind): return x -def test_cache_decorator_does_not_compare_infinite_return_values(): +def test_cache_decorator_does_not_compare_infinite_return_values(individual): cache_fn = tempfile.mkstemp()[1] @cgp.utils.disk_cache(cache_fn) - def objective_f(x): + def objective_f(ind, x): try: return 1.0 / x except ZeroDivisionError: @@ -303,17 +310,17 @@ def objective_f(x): # first call produces infinite return value, identical to # objective_g although in general their return values are # different - objective_f(0.0) + objective_f(individual, 0.0) # second call produces a finite return value which should be used # to check consistency - objective_f(2.0) + objective_f(individual, 2.0) # since the consistency check uses the finite return value it # should detect that the two objectives are indeed different with pytest.raises(RuntimeError): @cgp.utils.disk_cache(cache_fn) - def objective_g(x): + def objective_g(ind, x): try: return 2.0 / x except ZeroDivisionError: @@ -322,7 +329,7 @@ def objective_g(x): def test_cache_decorator_does_nothing_for_nonexistent_file(): @cgp.utils.disk_cache("nonexistent_file.pkl") - def objective(x): + def objective(ind, x): return x @@ -386,7 +393,9 @@ def test_fec_cache_decorator_with_additional_arguments(genome_params, rng, rng_s def f_target(x): return x[0] - x[1] - @cgp.utils.disk_cache(tempfile.mkstemp()[1], use_fec=True, fec_seed=rng_seed) + @cgp.utils.disk_cache( + tempfile.mkstemp()[1], compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args + ) def inner_objective(ind, n_samples): np.random.seed(rng_seed) @@ -407,3 +416,33 @@ def inner_objective(ind, n_samples): y1 = inner_objective(ind, 10) assert y0 != pytest.approx(y1) + + +def test_custom_compute_key_for_disk_cache(individual, rng): + @cgp.utils.disk_cache( + tempfile.mkstemp()[1], 
compute_key=cgp.utils.compute_key_from_numpy_evaluation_and_args + ) + def inner_objective(ind): + return ind.to_func()([1.0, 2.0])[0] + + def my_compute_key(ind): + return 0 + + @cgp.utils.disk_cache(tempfile.mkstemp()[1], compute_key=my_compute_key) + def inner_objective_custom_compute_key(ind): + return ind.to_func()([1.0, 2.0])[0] + + individual0 = individual.clone() + individual0.genome.randomize(rng) + individual1 = individual.clone() + individual1.genome.randomize(rng) + + loss0 = inner_objective(individual0) + loss1 = inner_objective(individual1) + + assert loss0 != pytest.approx(loss1) + + loss0 = inner_objective_custom_compute_key(individual0) + loss1 = inner_objective_custom_compute_key(individual1) + + assert loss0 == pytest.approx(loss1)