Implement targeted PBT (#2006)
Zac-HD committed Oct 1, 2019
2 parents 7766099 + 81b8ab0 commit 08089a1
Showing 9 changed files with 336 additions and 37 deletions.
12 changes: 12 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,12 @@
RELEASE_TYPE: minor

This release adds the :func:`hypothesis.target` function, which implements
**experimental** support for :ref:`targeted property-based testing <targeted-search>`
(:issue:`1779`).

By calling :func:`~hypothesis.target` in your test function, Hypothesis can
do a hill-climbing search for bugs. If you can calculate a suitable metric
such as the load factor or length of a queue, this can help you find bugs with
inputs that are highly improbable under unguided generation - however good our
heuristics, example diversity, and deduplication logic might be. After all,
those features are at work in targeted PBT too!
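For orientation, a minimal sketch of the new API (not part of this diff; the test and its property are hypothetical) - reporting the input's length steers generation towards longer lists:

from hypothesis import given, strategies as st, target

@given(st.lists(st.integers()))
def test_sort_is_idempotent(ls):
    # target() guides the search towards larger observations, so Hypothesis
    # will favour longer lists, where bugs are more likely to hide.
    target(float(len(ls)), label="length of input list")
    assert sorted(sorted(ls)) == sorted(ls)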
28 changes: 28 additions & 0 deletions hypothesis-python/docs/details.rst
@@ -382,6 +382,34 @@ Check :ref:`the notes on framework compatibility <framework-compatibility>`
to see how this affects other testing libraries you may be using.

.. _targeted-search:

---------------------------
Targeted example generation
---------------------------

Targeted property-based testing combines the advantages of both search-based
and property-based testing. Instead of being completely random, T-PBT uses
a search-based component to guide the input generation towards values that
have a higher probability of falsifying a property. This explores the input
space more effectively and requires fewer tests to find a bug or achieve a
high confidence in the system being tested than random PBT.
(`Löscher and Sagonas <http://proper.softlab.ntua.gr/Publications.html>`__)

This is not *always* a good idea - for example calculating the search metric
might take time better spent running more uniformly-random test cases - but
Hypothesis has **experimental** support for targeted PBT you may wish to try.

.. autofunction:: hypothesis.target

We recommend that users also skim the papers introducing targeted PBT,
from `ISSTA 2017 <http://proper.softlab.ntua.gr/papers/issta2017.pdf>`__
and `ICST 2018 <http://proper.softlab.ntua.gr/papers/icst2018.pdf>`__.
For the curious, the initial implementation in Hypothesis uses hill-climbing
search via a mutating fuzzer, with some tactics inspired by simulated
annealing to avoid getting stuck and endlessly mutating a local maximum.
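As a hedged sketch of the ``label`` behaviour documented above (the test name, bounds, and metrics are illustrative, not from this commit), two statistics of the same dataset can be optimised independently by giving each its own label:

from statistics import mean, stdev

from hypothesis import given, strategies as st, target

@given(st.lists(st.floats(-1e6, 1e6), min_size=2))
def test_summary_statistics_are_consistent(xs):
    # Distinct labels are tracked and optimised separately.
    target(mean(xs), label="mean")
    target(stdev(xs), label="standard deviation")
    assert min(xs) <= mean(xs) <= max(xs)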
.. _custom-function-execution:
-------------------------
3 changes: 2 additions & 1 deletion hypothesis-python/src/hypothesis/__init__.py
@@ -32,7 +32,7 @@
settings,
unlimited,
)
from hypothesis.control import assume, event, note, reject
from hypothesis.control import assume, event, note, reject, target
from hypothesis.core import example, find, given, reproduce_failure, seed
from hypothesis.internal.entropy import register_random
from hypothesis.utils.conventions import infer
@@ -56,6 +56,7 @@
"event",
"infer",
"register_random",
"target",
"__version__",
"__version_info__",
]
61 changes: 60 additions & 1 deletion hypothesis-python/src/hypothesis/control.py
@@ -17,11 +17,14 @@

from __future__ import absolute_import, division, print_function

import math
import traceback

from hypothesis import Verbosity, settings
from hypothesis.errors import CleanupFailed, InvalidArgument, UnsatisfiedAssumption
from hypothesis.reporting import report
from hypothesis.internal.compat import string_types
from hypothesis.internal.validation import check_type
from hypothesis.reporting import report, verbose_report
from hypothesis.utils.dynamicvariables import DynamicVariable

if False:
@@ -125,3 +128,59 @@ def event(value):

if context.data is not None:
context.data.note_event(value)


def target(observation, label=""):
# type: (float, str) -> None
"""Calling this function with a ``float`` observation gives it feedback
with which to guide our search for inputs that will cause an error, in
addition to all the usual heuristics. Observations must always be finite.
Hypothesis will try to maximize the observed value over several examples;
almost any metric will work so long as it makes sense to increase it.
For example, ``-abs(error)`` is a metric that increases as ``error``
approaches zero.
Example metrics:
- Number of elements in a collection, or tasks in a queue
- Mean or maximum runtime of a task (or both, if you use ``label``)
- Compression ratio for data (perhaps per-algorithm or per-level)
- Number of steps taken by a state machine
The optional ``label`` argument can be used to distinguish between
and therefore separately optimise distinct observations, such as the
mean and standard deviation of a dataset. It is an error to call
``target()`` with any label more than once per test case.
.. note::
**The more examples you run, the better this technique works.**
As a rule of thumb, the targeting effect is noticeable above
:obj:`max_exmples=1000 <hypothesis.settings.max_examples>`,
and immediately obvious by around ten thousand examples
*per label* used by your test.
.. note::
``hypothesis.target`` is considered experimental, and may be radically
changed or even removed in a future version. If you find it useful,
please let us know so we can share and build on that success!
"""
check_type(float, observation, "observation")
if math.isinf(observation) or math.isnan(observation):
raise InvalidArgument("observation=%r must be a finite float." % observation)
check_type(string_types, label, "label")

context = _current_build_context.value
if context is None:
raise InvalidArgument("Calling target() outside of a test is invalid.")
verbose_report("Saw target(observation=%r, label=%r)" % (observation, label))

if context.data is not None:
if label in context.data.target_observations:
raise InvalidArgument(
"Calling target(%r, label=%r) would overwrite target(%r, label=%r)"
% (observation, label, context.data.target_observations[label], label)
)
else:
context.data.target_observations[label] = observation
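A minimal sketch of the validation rules implemented above (the test body is hypothetical, but each error case mirrors a check in the code): non-finite observations, reusing a label within one test case, and calling outside a test all raise InvalidArgument:

import pytest

from hypothesis import given, strategies as st, target
from hypothesis.errors import InvalidArgument

# Outside a test there is no build context, so target() is rejected.
with pytest.raises(InvalidArgument):
    target(1.0)

@given(st.floats(allow_nan=False, allow_infinity=False))
def test_target_validation(x):
    target(x, label="value")  # fine: a finite float with a fresh label
    with pytest.raises(InvalidArgument):
        target(x, label="value")  # the same label twice in one test case
    with pytest.raises(InvalidArgument):
        target(float("inf"), label="inf")  # observations must be finite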
16 changes: 16 additions & 0 deletions hypothesis-python/src/hypothesis/internal/compat.py
@@ -42,6 +42,22 @@
except ImportError:
import collections as abc # type: ignore

try:
from itertools import accumulate
except ImportError:

def accumulate(iterable, func=lambda a, b: a + b):
it = iter(iterable)
try:
total = next(it)
except StopIteration:
return
yield total
for element in it:
total = func(total, element)
yield total


if False:
from typing import Type, Tuple # noqa

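As an aside (not part of the diff), a quick demonstration of what the backport yields - Python 3's itertools.accumulate defaults to addition, which is exactly what the engine below needs for its cumulative weights:

from itertools import accumulate  # or the Python 2 fallback defined above

weights = tuple(accumulate(0.5 ** i for i in range(5)))
assert weights == (1.0, 1.5, 1.75, 1.875, 1.9375)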
4 changes: 4 additions & 0 deletions hypothesis-python/src/hypothesis/internal/conjecture/data.py
@@ -756,6 +756,10 @@ def __init__(self, max_length, draw_bytes, observer=None):

self.__result = None

# Observations used for targeted search. They'll be aggregated in
# ConjectureRunner.generate_new_examples and fed to TargetSelector.
self.target_observations = {}

# Normally unpopulated but we need this in the niche case
# that self.as_result() is Overrun but we still want the
# examples for reporting purposes.
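For orientation, the new attribute is an ordinary dict mapping each target() label to the float observed during that test case; the values below are hypothetical:

# e.g. after a test that called target(17.0) and target(3.0, label="queue size")
target_observations = {"": 17.0, "queue size": 3.0}
assert all(isinstance(v, float) for v in target_observations.values())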
115 changes: 80 additions & 35 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -17,6 +17,8 @@

from __future__ import absolute_import, division, print_function

from bisect import bisect
from collections import defaultdict
from enum import Enum
from random import Random, getrandbits
from weakref import WeakKeyDictionary
@@ -28,6 +30,7 @@
from hypothesis.internal.cache import LRUReusedCache
from hypothesis.internal.compat import (
Counter,
accumulate,
ceil,
hbytes,
hrange,
@@ -676,39 +679,39 @@ def should_generate_more():
)

count = 0
while should_generate_more() and (
count < 10
or self.health_check_state is not None
# If we have not found a valid prefix yet, the target selector will
# be empty and the mutation stage will fail with a very rare internal
# error. We therefore continue this initial random generation step
# until we have found at least one prefix to mutate.
or len(self.target_selector) == 0
):
prefix = self.generate_novel_prefix()

def draw_bytes(data, n):
if data.index < len(prefix):
result = prefix[data.index : data.index + n]
# We always draw prefixes as a whole number of blocks
assert len(result) == n
else:
result = uniform(self.random, n)
return self.__zero_bound(data, result)

last_data = self.new_conjecture_data(draw_bytes)
self.test_function(last_data)
last_data.freeze()

count += 1

mutations = 0
mutator = self._new_mutator()

zero_bound_queue = []

while should_generate_more():
if zero_bound_queue:
if (
count < 10
or self.health_check_state is not None
# If we have not found a valid prefix yet, the target selector will
# be empty and the mutation stage will fail with a very rare internal
# error. We therefore continue this initial random generation step
# until we have found at least one prefix to mutate.
or len(self.target_selector) == 0
# For long-running tests, if we are not currently dealing with an
# overrun we want a small chance to generate an entirely novel buffer.
or not (zero_bound_queue or self.random.randrange(20))
):
prefix = self.generate_novel_prefix()

def draw_bytes(data, n):
if data.index < len(prefix):
result = prefix[data.index : data.index + n]
# We always draw prefixes as a whole number of blocks
assert len(result) == n
else:
result = uniform(self.random, n)
return self.__zero_bound(data, result)

data = self.new_conjecture_data(draw_bytes)
self.test_function(data)
data.freeze()
count += 1
elif zero_bound_queue:
# Whenever we generated an example and it hits a bound
# which forces zero blocks into it, this creates a weird
# distortion effect by making certain parts of the data
@@ -961,9 +964,12 @@ class TargetSelector(object):
counting INTERESTING, which is special).
2. We preferentially return examples we've never returned before when
select() is called.
3. The number of retained examples is never more than self.pool_size, with
past examples discarded automatically, preferring ones that we have
already explored from.
If ``target()`` is in use, we maintain a pool of the best examples
for each known label, choose a random label, and choose an example
from that pool with higher-ranked examples more probable.
3. The number of retained examples is never more than self.pool_size, with
older examples discarded automatically, preferring ones that we have
already explored from.
These invariants are fairly heavily prone to change - they're not
especially well validated as being optimal, and are mostly just a decent
@@ -974,14 +980,20 @@ def __init__(self, random, pool_size=MUTATION_POOL_SIZE):
self.random = random
self.best_status = Status.OVERRUN
self.pool_size = pool_size
self.weights = tuple(accumulate(0.5 ** i for i in range(10)))
self.reset()

def __len__(self):
return len(self.fresh_examples) + len(self.used_examples)
return (
len(self.fresh_examples)
+ len(self.used_examples)
+ sum(len(pool) for pool in self.scored_examples.values())
)

def reset(self):
self.fresh_examples = []
self.used_examples = []
self.scored_examples = defaultdict(list)

def add(self, data):
if data.status == Status.INTERESTING:
@@ -992,12 +1004,45 @@ def add(self, data):
self.best_status = data.status
self.reset()

self.fresh_examples.append(data)
for label, score in data.target_observations.items():
pool = self.scored_examples[label]
negative_scores = [-d.target_observations[label] for d in pool]
pool.insert(bisect(negative_scores, -score), data)
if len(pool) > len(self.weights):
pool.pop()
self.maybe_discard_one()

if not self.scored_examples:
self.fresh_examples.append(data)
self.maybe_discard_one()

def maybe_discard_one(self):
if len(self) > self.pool_size:
pop_random(self.random, self.used_examples or self.fresh_examples)
assert self.pool_size == len(self)
if self.used_examples or self.fresh_examples:
pop_random(self.random, self.used_examples or self.fresh_examples)
else:
# Get an arbitrary label from those with the longest pools,
# discard the lowest-scored example from that pool,
# and discard the label if it had no other examples.
label, pool = max(
self.scored_examples.items(), key=lambda kv: len(kv[1])
)
pool.pop()
if not pool:
self.scored_examples.pop(label)
assert len(self) == self.pool_size

def select(self):
# If we have feedback from targeted PBT, choose a label and then an example
# from that pool. Each example is twice as likely as the next-highest-ranked.
if self.scored_examples: # pragma: no cover
# For some reason this clause is often showing up as uncovered,
# though tests/cover/test_targeting.py definitely executes it :-/
pool = self.random.choice(list(self.scored_examples.values()))
stop_at = self.random.random() * self.weights[len(pool) - 1]
return pool[bisect(self.weights, stop_at)]

# Otherwise, prefer first-time mutations to previously-mutated examples
if self.fresh_examples:
result = pop_random(self.random, self.fresh_examples)
self.used_examples.append(result)
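A standalone sketch (made-up pool contents, not from the test suite) of the rank-weighted choice that select() performs above: bisecting the cumulative 0.5 ** i weights makes each example roughly twice as likely to be chosen as the next-best one:

from bisect import bisect
from collections import Counter
from itertools import accumulate
from random import Random

random = Random(0)
weights = tuple(accumulate(0.5 ** i for i in range(10)))  # (1.0, 1.5, 1.75, ...)
pool = ["best", "second", "third"]  # sorted by descending score, as add() keeps it

counts = Counter()
for _ in range(10000):
    stop_at = random.random() * weights[len(pool) - 1]
    counts[pool[bisect(weights, stop_at)]] += 1

print(counts)  # roughly a 4:2:1 split between best, second and third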
24 changes: 24 additions & 0 deletions hypothesis-python/tests/cover/test_conjecture_engine.py
@@ -1346,6 +1346,7 @@ def x(data):
class FakeData(object):
status = attr.ib(default=Status.VALID)
global_identifer = attr.ib(init=False)
target_observations = attr.ib(factory=dict)

def __attrs_post_init__(self):
global fake_data_counter
@@ -1361,6 +1362,29 @@ def test_target_selector_will_maintain_a_bounded_pool():
assert len(selector) == min(i + 1, 3)


def test_target_selector_can_discard_labels():
selector = TargetSelector(random=Random(0), pool_size=2)
for i in range(10):
selector.add(FakeData(target_observations={str(i): 0.0}))
assert len(selector) <= 2


@pytest.mark.parametrize("pool_size", [5, 25])
def test_target_selector_will_maintain_a_bounded_size_with_scores(pool_size):
selector = TargetSelector(random=Random(0), pool_size=pool_size)
selector.add(FakeData())

for i in range(100):
selector.add(FakeData(target_observations={str(i // 3 == 0): float(i % 30)}))
assert len(selector) <= pool_size
for label, pool in selector.scored_examples.items():
scores = [ex.target_observations[label] for ex in pool]
assert scores == sorted(scores, reverse=True)

selector.add(FakeData())
assert len(selector) <= pool_size


def test_target_selector_will_use_novel_examples_preferentially():
selector = TargetSelector(random=Random(0), pool_size=3)
seen = set()
