HypothesisWorks · Zac-HD · May 29, 2024 · May 1, 2024 · May 2, 2024 · May 2, 2024
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,7 @@
+RELEASE_TYPE: minor
+
+This release migrates the shrinker to our new internal representation, called the IR layer (:pull:`3962`). This improves the shrinker's performance in the majority of cases. For example, on the Hypothesis test suite, shrinking is a median of 1.38x faster.
+
+It is possible this release regresses performance while shrinking certain strategies. If you encounter strategies which reliably shrink more slowly than they used to (or shrink slowly at all), please open an issue!
+
+You can read more about the IR layer at :issue:`3921`.
diff --git a/hypothesis-python/benchmark/README.md b/hypothesis-python/benchmark/README.md
@@ -0,0 +1,14 @@
+This directory contains code for benchmarking Hypothesis' shrinking. This was written for [pull/3962](https://github.com/HypothesisWorks/hypothesis/pull/3962) and is a manual process at the moment, though we may eventually integrate it more closely with ci for automated benchmarking.
+
+To run a benchmark:
+
+* Add the contents of `conftest.py` to the bottom of `hypothesis-python/tests/conftest.py`
+* In `hypothesis-python/tests/common/debug.py`, change `derandomize=True` to `derandomize=False` (if you are running more than one trial)
+* Run the tests: `pytest hypothesis-python/tests/`
+  * Note that the benchmarking script does not currently support xdist, so do not use `-n 8` or similar.
+
+When pytest finishes the output will contain a dictionary of the benchmarking results. Add that as a new entry in `data.json`. Repeat for however many trials you want; n=5 seems reasonable.
+
+Also repeat for both your baseline ("old") and your comparison ("new") code.
+
+Then run `python graph.py` to generate a graph comparing the old and new results.
diff --git a/hypothesis-python/benchmark/conftest.py b/hypothesis-python/benchmark/conftest.py
@@ -0,0 +1,66 @@
+# This file is part of Hypothesis, which may be found at
+# https://github.com/HypothesisWorks/hypothesis/
+#
+# Copyright the Hypothesis Authors.
+# Individual contributors are listed in AUTHORS.rst and the git log.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at https://mozilla.org/MPL/2.0/.
+
+import inspect
+import json
+from collections import defaultdict
+
+import pytest
+from _pytest.monkeypatch import MonkeyPatch
+
+# we'd like to support xdist here for parallelism, but a session-scope fixture won't
+# be enough: https://github.com/pytest-dev/pytest-xdist/issues/271. need a lockfile
+# or equivalent.
+shrink_calls = defaultdict(list)
+
+
+def pytest_collection_modifyitems(config, items):
+    skip = pytest.mark.skip(reason="Does not call minimal()")
+    for item in items:
+        # is this perfect? no. but it is cheap!
+        if " minimal(" in inspect.getsource(item.obj):
+            continue
+        item.add_marker(skip)
+
+
+@pytest.fixture(scope="function", autouse=True)
+def _benchmark_shrinks():
+    from hypothesis.internal.conjecture.shrinker import Shrinker
+
+    monkeypatch = MonkeyPatch()
+
+    def record_shrink_calls(calls):
+        name = None
+        for frame in inspect.stack():
+            if frame.function.startswith("test_"):
+                name = f"{frame.filename.split('/')[-1]}::{frame.function}"
+        # some minimal calls happen at collection-time outside of a test context
+        # (maybe something we should fix/look into)
+        if name is None:
+            return
+
+        shrink_calls[name].append(calls)
+
+    old_shrink = Shrinker.shrink
+
+    def shrink(self, *args, **kwargs):
+        v = old_shrink(self, *args, **kwargs)
+        record_shrink_calls(self.engine.call_count - self.initial_calls)
+        return v
+
+    monkeypatch.setattr(Shrinker, "shrink", shrink)
+    yield
+
+    # start teardown
+    Shrinker.shrink = old_shrink
+
+
+def pytest_sessionfinish(session, exitstatus):
+    print(f"\nshrinker profiling:\n{json.dumps(shrink_calls)}")
diff --git a/hypothesis-python/benchmark/data.json b/hypothesis-python/benchmark/data.json
@@ -0,0 +1,4 @@
+{
+    "old": [],
+    "new": []
+}
diff --git a/hypothesis-python/benchmark/graph.py b/hypothesis-python/benchmark/graph.py
@@ -0,0 +1,114 @@
+# This file is part of Hypothesis, which may be found at
+# https://github.com/HypothesisWorks/hypothesis/
+#
+# Copyright the Hypothesis Authors.
+# Individual contributors are listed in AUTHORS.rst and the git log.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at https://mozilla.org/MPL/2.0/.
+
+import json
+import statistics
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+data_path = Path(__file__).parent / "data.json"
+with open(data_path) as f:
+    data = json.loads(f.read())
+
+old_runs = data["old"]
+new_runs = data["new"]
+all_runs = old_runs + new_runs
+
+# every run should involve the same functions
+names = set()
+for run in all_runs:
+    names.add(frozenset(run.keys()))
+
+intersection = frozenset.intersection(*names)
+diff = frozenset.union(*[intersection.symmetric_difference(n) for n in names])
+
+print(f"skipping these tests which were not present in all runs: {', '.join(diff)}")
+names = list(intersection)
+
+# the similar invariant for number of minimal calls per run is not true: functions
+# may make a variable number of minimal() calls.
+# it would be nice to compare identically just the ones which don't vary, to get
+# a very fine grained comparison instead of averaging.
+# sizes = []
+# for run in all_runs:
+#     sizes.append(tuple(len(value) for value in run.values()))
+# assert len(set(sizes)) == 1
+
+new_names = []
+for name in names:
+    if all(all(x == 0 for x in run[name]) for run in all_runs):
+        print(f"no shrinks for {name}, skipping")
+        continue
+    new_names.append(name)
+names = new_names
+
+
+# name : average calls
+old_values = {}
+new_values = {}
+for name in names:
+
+    # mean across the different minimal() calls in a single test function, then
+    # median across the n iterations we ran that for to reduce error
+    old_vals = [statistics.mean(run[name]) for run in old_runs]
+    new_vals = [statistics.mean(run[name]) for run in new_runs]
+    old_values[name] = statistics.median(old_vals)
+    new_values[name] = statistics.median(new_vals)
+
+# name : (absolute difference, times difference)
+diffs = {}
+for name in names:
+    old = old_values[name]
+    new = new_values[name]
+    diff = old - new
+    diff_times = (old - new) / old
+    if 0 < diff_times < 1:
+        diff_times = (1 / (1 - diff_times)) - 1
+    diffs[name] = (diff, diff_times)
+
+    print(f"{name} {int(diff)} ({int(old)} -> {int(new)}, {round(diff_times, 1)}✕)")
+
+diffs = dict(sorted(diffs.items(), key=lambda kv: kv[1][0]))
+diffs_value = [v[0] for v in diffs.values()]
+diffs_percentage = [v[1] for v in diffs.values()]
+
+print(
+    f"mean: {int(statistics.mean(diffs_value))}, median: {int(statistics.median(diffs_value))}"
+)
+
+
+# https://stackoverflow.com/a/65824524
+def align_axes(ax1, ax2):
+    ax1_ylims = ax1.axes.get_ylim()
+    ax1_yratio = ax1_ylims[0] / ax1_ylims[1]
+
+    ax2_ylims = ax2.axes.get_ylim()
+    ax2_yratio = ax2_ylims[0] / ax2_ylims[1]
+
+    if ax1_yratio < ax2_yratio:
+        ax2.set_ylim(bottom=ax2_ylims[1] * ax1_yratio)
+    else:
+        ax1.set_ylim(bottom=ax1_ylims[1] * ax2_yratio)
+
+
+ax1 = sns.barplot(diffs_value, color="b", alpha=0.7, label="shrink call change")
+ax2 = plt.twinx()
+sns.barplot(diffs_percentage, color="r", alpha=0.7, label=r"n✕ change", ax=ax2)
+
+ax1.set_title("old shrinks - new shrinks (aka shrinks saved, higher is better)")
+ax1.set_xticks([])
+align_axes(ax1, ax2)
+legend = ax1.legend(labels=["shrink call change", "n✕ change"])
+legend.legend_handles[0].set_color("b")
+legend.legend_handles[1].set_color("r")
+
+plt.show()
@@ -779,10 +779,6 @@ def end(self, i: int) -> int:
         """Equivalent to self[i].end."""
         return self.endpoints[i]
 
-    def bounds(self, i: int) -> Tuple[int, int]:
-        """Equivalent to self[i].bounds."""
-        return (self.start(i), self.end(i))
-
     def all_bounds(self) -> Iterable[Tuple[int, int]]:
         """Equivalent to [(b.start, b.end) for b in self]."""
         prev = 0
@@ -970,7 +966,12 @@ class IRNode:
     was_forced: bool = attr.ib()
     index: Optional[int] = attr.ib(default=None)
 
-    def copy(self, *, with_value: IRType) -> "IRNode":
+    def copy(
+        self,
+        *,
+        with_value: Optional[IRType] = None,
+        with_kwargs: Optional[IRKWargsType] = None,
+    ) -> "IRNode":
         # we may want to allow this combination in the future, but for now it's
         # a footgun.
         assert not self.was_forced, "modifying a forced node doesn't make sense"
@@ -979,8 +980,8 @@ def copy(self, *, with_value: IRType) -> "IRNode":
         # after copying.
         return IRNode(
             ir_type=self.ir_type,
-            value=with_value,
-            kwargs=self.kwargs,
+            value=self.value if with_value is None else with_value,
+            kwargs=self.kwargs if with_kwargs is None else with_kwargs,
             was_forced=self.was_forced,
         )
 
@@ -1071,9 +1072,17 @@ def __repr__(self):
 
 def ir_value_permitted(value, ir_type, kwargs):
     if ir_type == "integer":
-        if kwargs["min_value"] is not None and value < kwargs["min_value"]:
+        min_value = kwargs["min_value"]
+        max_value = kwargs["max_value"]
+        shrink_towards = kwargs["shrink_towards"]
+        if min_value is not None and value < min_value:
             return False
-        if kwargs["max_value"] is not None and value > kwargs["max_value"]:
+        if max_value is not None and value > max_value:
+            return False
+
+        if (max_value is None or min_value is None) and (
+            value - shrink_towards
+        ).bit_length() >= 128:
             return False
 
         return True
@@ -1144,14 +1153,22 @@ class ConjectureResult:
     status: Status = attr.ib()
     interesting_origin: Optional[InterestingOrigin] = attr.ib()
     buffer: bytes = attr.ib()
-    blocks: Blocks = attr.ib()
+    # some ConjectureDatas pass through the ir and some pass through buffers.
+    # the ir does not drive its result through the buffer, which means blocks/examples
+    # may differ (I think for forced values?) even when the buffer is the same.
+    # I don't *think* anything was relying on anything but .buffer for result equality,
+    # though that assumption may be leaning on flakiness detection invariants.
+    #
+    # If we consider blocks or examples in equality checks, multiple semantically equal
+    # results get stored in e.g. the pareto front.
+    blocks: Blocks = attr.ib(eq=False)
     output: str = attr.ib()
     extra_information: Optional[ExtraInformation] = attr.ib()
     has_discards: bool = attr.ib()
     target_observations: TargetObservations = attr.ib()
     tags: FrozenSet[StructuralCoverageTag] = attr.ib()
     forced_indices: FrozenSet[int] = attr.ib(repr=False)
-    examples: Examples = attr.ib(repr=False)
+    examples: Examples = attr.ib(repr=False, eq=False)
     arg_slices: Set[Tuple[int, int]] = attr.ib(repr=False)
     slice_comments: Dict[Tuple[int, int], str] = attr.ib(repr=False)
     invalid_at: Optional[InvalidAt] = attr.ib(repr=False)

@@ -342,7 +342,7 @@ def _cache_key_ir(
             for node in nodes + extension
         )
 
-    def _cache(self, data: Union[ConjectureData, ConjectureResult]) -> None:
+    def _cache(self, data: ConjectureData) -> None:
         result = data.as_result()
         # when we shrink, we try out of bounds things, which can lead to the same
         # data.buffer having multiple outcomes. eg data.buffer=b'' is Status.OVERRUN
@@ -357,8 +357,25 @@ def _cache(self, data: Union[ConjectureData, ConjectureResult]) -> None:
         # write to the buffer cache here as we move more things to the ir cache.
         if data.invalid_at is None:
             self.__data_cache[data.buffer] = result
-        key = self._cache_key_ir(data=data)
-        self.__data_cache_ir[key] = result
+
+        # interesting buffer-based data can mislead the shrinker if we cache them.
+        #
+        #   @given(st.integers())
+        #   def f(n):
+        #     assert n < 100
+        #
+        # may generate two counterexamples, n=101 and n=m > 101, in that order,
+        # where the buffer corresponding to n is large due to eg failed probes.
+        # We shrink m and eventually try n=101, but it is cached to a large buffer
+        # and so the best we can do is n=102, a non-ideal shrink.
+        #
+        # We can cache ir-based buffers fine, which always correspond to the
+        # smallest buffer via forced=. The overhead here is small because almost
+        # all interesting data are ir-based via the shrinker (and that overhead
+        # will tend towards zero as we move generation to the ir).
+        if data.ir_tree_nodes is not None or data.status < Status.INTERESTING:
+            key = self._cache_key_ir(data=data)
+            self.__data_cache_ir[key] = result
 
     def cached_test_function_ir(
         self, nodes: List[IRNode]
@@ -1218,7 +1235,7 @@ def shrink_interesting_examples(self) -> None:
             self.interesting_examples.values(), key=lambda d: sort_key(d.buffer)
         ):
             assert prev_data.status == Status.INTERESTING
-            data = self.new_conjecture_data_for_buffer(prev_data.buffer)
+            data = self.new_conjecture_data_ir(prev_data.examples.ir_tree_nodes)
             self.test_function(data)
             if data.status != Status.INTERESTING:
                 self.exit_with(ExitReason.flaky)