Implement targeted PBT (#2006)
Zac-HD committed Oct 1, 2019
2 parents 7766099 + 81b8ab0 commit 08089a1
Showing 9 changed files with 336 additions and 37 deletions.
12 changes: 12 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,12 @@
RELEASE_TYPE: minor

This release adds the :func:`hypothesis.target` function, which implements
**experimental** support for :ref:`targeted property-based testing <targeted-search>`
(:issue:`1779`).

By calling :func:`~hypothesis.target` in your test function, Hypothesis can
do a hill-climbing search for bugs. If you can calculate a suitable metric
such as the load factor or length of a queue, this can help you find bugs with
inputs that are highly improbable under unguided generation - however good our
heuristics, example diversity, and deduplication logic might be. After all,
those features are at work in targeted PBT too!
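For orientation, a minimal sketch of the new API (not part of this diff; the test and its property are hypothetical) - reporting the input's length steers generation towards longer lists:

from hypothesis import given, strategies as st, target

@given(st.lists(st.integers()))
def test_sort_is_idempotent(ls):
    # target() guides the search towards larger observations, so Hypothesis
    # will favour longer lists, where bugs are more likely to hide.
    target(float(len(ls)), label="length of input list")
    assert sorted(sorted(ls)) == sorted(ls)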
28 changes: 28 additions & 0 deletions hypothesis-python/docs/details.rst
@@ -382,6 +382,34 @@ Check :ref:`the notes on framework compatibility <framework-compatibility>`
to see how this affects other testing libraries you may be using.

.. _targeted-search:

---------------------------
Targeted example generation
---------------------------

Targeted property-based testing combines the advantages of both search-based
and property-based testing. Instead of being completely random, T-PBT uses
a search-based component to guide the input generation towards values that
have a higher probability of falsifying a property. This explores the input
space more effectively and requires fewer tests to find a bug or achieve a
high confidence in the system being tested than random PBT.
(`Löscher and Sagonas <http://proper.softlab.ntua.gr/Publications.html>`__)

This is not *always* a good idea - for example calculating the search metric
might take time better spent running more uniformly-random test cases - but
Hypothesis has **experimental** support for targeted PBT you may wish to try.

.. autofunction:: hypothesis.target

We recommend that users also skim the papers introducing targeted PBT,
from `ISSTA 2017 <http://proper.softlab.ntua.gr/papers/issta2017.pdf>`__
and `ICST 2018 <http://proper.softlab.ntua.gr/papers/icst2018.pdf>`__.
For the curious, the initial implementation in Hypothesis uses hill-climbing
search via a mutating fuzzer, with some tactics inspired by simulated
annealing to avoid getting stuck and endlessly mutating a local maximum.
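As a hedged sketch of the ``label`` behaviour documented above (the test name, bounds, and metrics are illustrative, not from this commit), two statistics of the same dataset can be optimised independently by giving each its own label:

from statistics import mean, stdev

from hypothesis import given, strategies as st, target

@given(st.lists(st.floats(-1e6, 1e6), min_size=2))
def test_summary_statistics_are_consistent(xs):
    # Distinct labels are tracked and optimised separately.
    target(mean(xs), label="mean")
    target(stdev(xs), label="standard deviation")
    assert min(xs) <= mean(xs) <= max(xs)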
.. _custom-function-execution:
-------------------------
3 changes: 2 additions & 1 deletion hypothesis-python/src/hypothesis/__init__.py
@@ -32,7 +32,7 @@
settings,
unlimited,
)
from hypothesis.control import assume, event, note, reject
from hypothesis.control import assume, event, note, reject, target
from hypothesis.core import example, find, given, reproduce_failure, seed
from hypothesis.internal.entropy import register_random
from hypothesis.utils.conventions import infer
@@ -56,6 +56,7 @@
"event",
"infer",
"register_random",
"target",
"__version__",
"__version_info__",
]
61 changes: 60 additions & 1 deletion hypothesis-python/src/hypothesis/control.py
@@ -17,11 +17,14 @@

from __future__ import absolute_import, division, print_function

import math
import traceback

from hypothesis import Verbosity, settings
from hypothesis.errors import CleanupFailed, InvalidArgument, UnsatisfiedAssumption
from hypothesis.reporting import report
from hypothesis.internal.compat import string_types
from hypothesis.internal.validation import check_type
from hypothesis.reporting import report, verbose_report
from hypothesis.utils.dynamicvariables import DynamicVariable

if False:
@@ -125,3 +128,59 @@ def event(value):

if context.data is not None:
context.data.note_event(value)


def target(observation, label=""):
# type: (float, str) -> None
"""Calling this function with a ``float`` observation gives it feedback
with which to guide our search for inputs that will cause an error, in
addition to all the usual heuristics. Observations must always be finite.
Hypothesis will try to maximize the observed value over several examples;
almost any metric will work so long as it makes sense to increase it.
For example, ``-abs(error)`` is a metric that increases as ``error``
approaches zero.
Example metrics:
- Number of elements in a collection, or tasks in a queue
- Mean or maximum runtime of a task (or both, if you use ``label``)
- Compression ratio for data (perhaps per-algorithm or per-level)
- Number of steps taken by a state machine
The optional ``label`` argument can be used to distinguish between
and therefore separately optimise distinct observations, such as the
mean and standard deviation of a dataset. It is an error to call
``target()`` with any label more than once per test case.
.. note::
**The more examples you run, the better this technique works.**
As a rule of thumb, the targeting effect is noticeable above
:obj:`max_exmples=1000 <hypothesis.settings.max_examples>`,
and immediately obvious by around ten thousand examples
*per label* used by your test.
.. note::
``hypothesis.target`` is considered experimental, and may be radically
changed or even removed in a future version. If you find it useful,
please let us know so we can share and build on that success!
"""
check_type(float, observation, "observation")
if math.isinf(observation) or math.isnan(observation):
raise InvalidArgument("observation=%r must be a finite float." % observation)
check_type(string_types, label, "label")

context = _current_build_context.value
if context is None:
raise InvalidArgument("Calling target() outside of a test is invalid.")
verbose_report("Saw target(observation=%r, label=%r)" % (observation, label))

if context.data is not None:
if label in context.data.target_observations:
raise InvalidArgument(
"Calling target(%r, label=%r) would overwrite target(%r, label=%r)"
% (observation, label, context.data.target_observations[label], label)
)
else:
context.data.target_observations[label] = observation
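A minimal sketch of the validation rules implemented above (the test body is hypothetical, but each error case mirrors a check in the code): non-finite observations, reusing a label within one test case, and calling outside a test all raise InvalidArgument:

import pytest

from hypothesis import given, strategies as st, target
from hypothesis.errors import InvalidArgument

# Outside a test there is no build context, so target() is rejected.
with pytest.raises(InvalidArgument):
    target(1.0)

@given(st.floats(allow_nan=False, allow_infinity=False))
def test_target_validation(x):
    target(x, label="value")  # fine: a finite float with a fresh label
    with pytest.raises(InvalidArgument):
        target(x, label="value")  # the same label twice in one test case
    with pytest.raises(InvalidArgument):
        target(float("inf"), label="inf")  # observations must be finite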
16 changes: 16 additions & 0 deletions hypothesis-python/src/hypothesis/internal/compat.py
@@ -42,6 +42,22 @@
except ImportError:
import collections as abc # type: ignore

try:
from itertools import accumulate
except ImportError:

def accumulate(iterable, func=lambda a, b: a + b):
it = iter(iterable)
try:
total = next(it)
except StopIteration:
return
yield total
for element in it:
total = func(total, element)
yield total


if False:
from typing import Type, Tuple # noqa

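As an aside (not part of the diff), a quick demonstration of what the backport yields - Python 3's itertools.accumulate defaults to addition, which is exactly what the engine below needs for its cumulative weights:

from itertools import accumulate  # or the Python 2 fallback defined above

weights = tuple(accumulate(0.5 ** i for i in range(5)))
assert weights == (1.0, 1.5, 1.75, 1.875, 1.9375)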
4 changes: 4 additions & 0 deletions hypothesis-python/src/hypothesis/internal/conjecture/data.py
@@ -756,6 +756,10 @@ def __init__(self, max_length, draw_bytes, observer=None):

self.__result = None

# Observations used for targeted search. They'll be aggregated in
# ConjectureRunner.generate_new_examples and fed to TargetSelector.
self.target_observations = {}

# Normally unpopulated but we need this in the niche case
# that self.as_result() is Overrun but we still want the
# examples for reporting purposes.
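For orientation, the new attribute is an ordinary dict mapping each target() label to the float observed during that test case; the values below are hypothetical:

# e.g. after a test that called target(17.0) and target(3.0, label="queue size")
target_observations = {"": 17.0, "queue size": 3.0}
assert all(isinstance(v, float) for v in target_observations.values())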
115 changes: 80 additions & 35 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -17,6 +17,8 @@

from __future__ import absolute_import, division, print_function

from bisect import bisect
from collections import defaultdict
from enum import Enum
from random import Random, getrandbits
from weakref import WeakKeyDictionary
@@ -28,6 +30,7 @@
from hypothesis.internal.cache import LRUReusedCache
from hypothesis.internal.compat import (
Counter,
accumulate,
ceil,
hbytes,
hrange,
@@ -676,39 +679,39 @@ def should_generate_more():
)

count = 0
while should_generate_more() and (
count < 10
or self.health_check_state is not None
# If we have not found a valid prefix yet, the target selector will
# be empty and the mutation stage will fail with a very rare internal
# error. We therefore continue this initial random generation step
# until we have found at least one prefix to mutate.
or len(self.target_selector) == 0
):
prefix = self.generate_novel_prefix()

def draw_bytes(data, n):
if data.index < len(prefix):
result = prefix[data.index : data.index + n]
# We always draw prefixes as a whole number of blocks
assert len(result) == n
else:
result = uniform(self.random, n)
return self.__zero_bound(data, result)

last_data = self.new_conjecture_data(draw_bytes)
self.test_function(last_data)
last_data.freeze()

count += 1

mutations = 0
mutator = self._new_mutator()

zero_bound_queue = []

while should_generate_more():
if zero_bound_queue:
if (
count < 10
or self.health_check_state is not None
# If we have not found a valid prefix yet, the target selector will
# be empty and the mutation stage will fail with a very rare internal
# error. We therefore continue this initial random generation step
# until we have found at least one prefix to mutate.
or len(self.target_selector) == 0
# For long-running tests, if we are not currently dealing with an
# overrun we want a small chance to generate an entirely novel buffer.
or not (zero_bound_queue or self.random.randrange(20))
):
prefix = self.generate_novel_prefix()

def draw_bytes(data, n):
if data.index < len(prefix):
result = prefix[data.index : data.index + n]
# We always draw prefixes as a whole number of blocks
assert len(result) == n
else:
result = uniform(self.random, n)
return self.__zero_bound(data, result)

data = self.new_conjecture_data(draw_bytes)
self.test_function(data)
data.freeze()
count += 1
elif zero_bound_queue:
# Whenever we generated an example and it hits a bound
# which forces zero blocks into it, this creates a weird
# distortion effect by making certain parts of the data
@@ -961,9 +964,12 @@ class TargetSelector(object):
counting INTERESTING, which is special).
2. We preferentially return examples we've never returned before when
select() is called.
3. The number of retained examples is never more than self.pool_size, with
past examples discarded automatically, preferring ones that we have
already explored from.
If ``target()`` is in use, we maintain a pool of the best examples
for each known label, choose a random label, and choose an example
from that pool with higher-ranked examples more probable.
3. The number of retained examples is never more than self.pool_size, with
older examples discarded automatically, preferring ones that we have
already explored from.
These invariants are fairly heavily prone to change - they're not
especially well validated as being optimal, and are mostly just a decent
@@ -974,14 +980,20 @@ def __init__(self, random, pool_size=MUTATION_POOL_SIZE):
self.random = random
self.best_status = Status.OVERRUN
self.pool_size = pool_size
self.weights = tuple(accumulate(0.5 ** i for i in range(10)))
self.reset()

def __len__(self):
return len(self.fresh_examples) + len(self.used_examples)
return (
len(self.fresh_examples)
+ len(self.used_examples)
+ sum(len(pool) for pool in self.scored_examples.values())
)

def reset(self):
self.fresh_examples = []
self.used_examples = []
self.scored_examples = defaultdict(list)

def add(self, data):
if data.status == Status.INTERESTING:
@@ -992,12 +1004,45 @@ def add(self, data):
self.best_status = data.status
self.reset()

self.fresh_examples.append(data)
for label, score in data.target_observations.items():
pool = self.scored_examples[label]
negative_scores = [-d.target_observations[label] for d in pool]
pool.insert(bisect(negative_scores, -score), data)
if len(pool) > len(self.weights):
pool.pop()
self.maybe_discard_one()

if not self.scored_examples:
self.fresh_examples.append(data)
self.maybe_discard_one()

def maybe_discard_one(self):
if len(self) > self.pool_size:
pop_random(self.random, self.used_examples or self.fresh_examples)
assert self.pool_size == len(self)
if self.used_examples or self.fresh_examples:
pop_random(self.random, self.used_examples or self.fresh_examples)
else:
# Get an arbitrary label from those with the longest pools,
# discard the lowest-scored example from that pool,
# and discard the label if it had no other examples.
label, pool = max(
self.scored_examples.items(), key=lambda kv: len(kv[1])
)
pool.pop()
if not pool:
self.scored_examples.pop(label)
assert len(self) == self.pool_size

def select(self):
# If we have feedback from targeted PBT, choose a label and then an example
# from that pool. Each example is twice as likely as the next-highest-ranked.
if self.scored_examples: # pragma: no cover
# For some reason this clause is often showing up as uncovered,
# though tests/cover/test_targeting.py definitely executes it :-/
pool = self.random.choice(list(self.scored_examples.values()))
stop_at = self.random.random() * self.weights[len(pool) - 1]
return pool[bisect(self.weights, stop_at)]

# Otherwise, prefer first-time mutations to previously-mutated examples
if self.fresh_examples:
result = pop_random(self.random, self.fresh_examples)
self.used_examples.append(result)
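A standalone sketch (made-up pool contents, not from the test suite) of the rank-weighted choice that select() performs above: bisecting the cumulative 0.5 ** i weights makes each example roughly twice as likely to be chosen as the next-best one:

from bisect import bisect
from collections import Counter
from itertools import accumulate
from random import Random

random = Random(0)
weights = tuple(accumulate(0.5 ** i for i in range(10)))  # (1.0, 1.5, 1.75, ...)
pool = ["best", "second", "third"]  # sorted by descending score, as add() keeps it

counts = Counter()
for _ in range(10000):
    stop_at = random.random() * weights[len(pool) - 1]
    counts[pool[bisect(weights, stop_at)]] += 1

print(counts)  # roughly a 4:2:1 split between best, second and third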
24 changes: 24 additions & 0 deletions hypothesis-python/tests/cover/test_conjecture_engine.py
@@ -1346,6 +1346,7 @@ def x(data):
class FakeData(object):
status = attr.ib(default=Status.VALID)
global_identifer = attr.ib(init=False)
target_observations = attr.ib(factory=dict)

def __attrs_post_init__(self):
global fake_data_counter
@@ -1361,6 +1362,29 @@ def test_target_selector_will_maintain_a_bounded_pool():
assert len(selector) == min(i + 1, 3)


def test_target_selector_can_discard_labels():
selector = TargetSelector(random=Random(0), pool_size=2)
for i in range(10):
selector.add(FakeData(target_observations={str(i): 0.0}))
assert len(selector) <= 2


@pytest.mark.parametrize("pool_size", [5, 25])
def test_target_selector_will_maintain_a_bounded_size_with_scores(pool_size):
selector = TargetSelector(random=Random(0), pool_size=pool_size)
selector.add(FakeData())

for i in range(100):
selector.add(FakeData(target_observations={str(i // 3 == 0): float(i % 30)}))
assert len(selector) <= pool_size
for label, pool in selector.scored_examples.items():
scores = [ex.target_observations[label] for ex in pool]
assert scores == sorted(scores, reverse=True)

selector.add(FakeData())
assert len(selector) <= pool_size


def test_target_selector_will_use_novel_examples_preferentially():
selector = TargetSelector(random=Random(0), pool_size=3)
seen = set()
