Merge pull request #2859 from Zac-HD/scrutineer-poc

Scrutineer: integrating opportunistic fault-localisation with PBT
HypothesisWorks · Mar 7, 2021 · e2622c8 · e2622c8
2 parents f73c337 + c923932
commit e2622c8
Show file tree

Hide file tree

Showing 10 changed files with 275 additions and 12 deletions.
diff --git a/hypothesis-python/.coveragerc b/hypothesis-python/.coveragerc
@@ -5,6 +5,7 @@ include =
     **/.tox/*/lib/*/site-packages/hypothesis/**/*.py
 omit =
     **/pytestplugin.py
+    **/scrutineer.py
     **/strategytests.py
     **/compat*.py
     **/extra/__init__.py

diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,7 @@
+RELEASE_TYPE: minor
+
+This release adds :ref:`the explain phase <phases>`, in which Hypothesis
+attempts to explain *why* your test failed by pointing to suspicious lines
+of code (i.e. those which were always, and only, run on failing inputs).
+We plan to include "generalising" failing examples in this phase in a
+future release (:issue:`2192`).
diff --git a/hypothesis-python/docs/details.rst b/hypothesis-python/docs/details.rst
@@ -687,6 +687,8 @@ providing extra information and convenient access to config options.
   :ref:`override the current verbosity level <verbose-output>`.
 - ``pytest --hypothesis-seed=<an int>`` can be used to
   :ref:`reproduce a failure with a particular seed <reproducing-with-seed>`.
+- ``pytest --hypothesis-explain`` can be used to
+  :ref:`temporarily enable the explain phase <phases>`.
 
 Finally, all tests that are defined with Hypothesis automatically have
 ``@pytest.mark.hypothesis`` applied to them.  See :ref:`here for information

diff --git a/hypothesis-python/docs/settings.rst b/hypothesis-python/docs/settings.rst
@@ -52,14 +52,19 @@ Available settings
 Controlling what runs
 ~~~~~~~~~~~~~~~~~~~~~
 
-Hypothesis divides tests into five logically distinct phases:
+Hypothesis divides tests into logically distinct phases:
 
 1. Running explicit examples :ref:`provided with the @example decorator <providing-explicit-examples>`.
 2. Rerunning a selection of previously failing examples to reproduce a previously seen error
 3. Generating new examples.
 4. Mutating examples for :ref:`targeted property-based testing <targeted-search>`.
 5. Attempting to shrink an example found in previous phases (other than phase 1 - explicit examples cannot be shrunk).
    This turns potentially large and complicated examples which may be hard to read into smaller and simpler ones.
+6. Attempting to explain the cause of the failure, by identifying suspicious lines of code
+   (e.g. the earliest lines which are never run on passing inputs, and always run on failures).
+   This relies on :func:`python:sys.settrace`, and is therefore automatically disabled on
+   PyPy or if you are using :pypi:`coverage` or a debugger.  If there are no clearly
+   suspicious lines of code, :pep:`we refuse the temptation to guess <20>`.
 
 The phases setting provides you with fine grained control over which of these run,
 with each phase corresponding to a value on the :class:`~hypothesis.Phase` enum:
@@ -71,7 +76,7 @@ with each phase corresponding to a value on the :class:`~hypothesis.Phase` enum:
 3. ``Phase.generate`` controls whether new examples will be generated.
 4. ``Phase.target`` controls whether examples will be mutated for targeting.
 5. ``Phase.shrink`` controls whether examples will be shrunk.
-
+6. ``Phase.explain`` controls whether Hypothesis attempts to explain test failures.
 
 The phases argument accepts a collection with any subset of these. e.g.
 ``settings(phases=[Phase.generate, Phase.shrink])`` will generate new examples

diff --git a/hypothesis-python/src/hypothesis/_settings.py b/hypothesis-python/src/hypothesis/_settings.py
@@ -432,6 +432,7 @@ class Phase(IntEnum):
     generate = 2
     target = 3
     shrink = 4
+    explain = 5
 
     def __repr__(self):
         return f"Phase.{self.name}"
@@ -523,7 +524,9 @@ def _validate_phases(phases):
 
 settings._define_setting(
     "phases",
-    default=tuple(Phase),
+    # We leave the `explain` phase disabled by default, for speed and brevity
+    # TODO: consider default-enabling this in CI?
+    default=_validate_phases(set(Phase) - {Phase.explain}),
     description=(
         "Control which phases should be run. "
         "See :ref:`the full documentation for more details <phases>`"

diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py
@@ -27,6 +27,7 @@
 import types
 import warnings
 import zlib
+from collections import defaultdict
 from inspect import getfullargspec
 from io import StringIO
 from random import Random
@@ -61,12 +62,13 @@
 )
 from hypothesis.executors import new_style_executor
 from hypothesis.internal.compat import (
+    PYPY,
     bad_django_TestCase,
     get_type_hints,
     int_from_bytes,
     qualname,
 )
-from hypothesis.internal.conjecture.data import ConjectureData
+from hypothesis.internal.conjecture.data import ConjectureData, Status
 from hypothesis.internal.conjecture.engine import ConjectureRunner
 from hypothesis.internal.conjecture.shrinker import sort_key
 from hypothesis.internal.entropy import deterministic_PRNG
@@ -86,6 +88,7 @@
     is_mock,
     proxies,
 )
+from hypothesis.internal.scrutineer import Tracer, explanatory_lines
 from hypothesis.reporting import (
     current_verbosity,
     report,
@@ -523,6 +526,8 @@ def __init__(
         self.files_to_propagate = set()
         self.failed_normally = False
 
+        self.explain_traces = defaultdict(set)
+
     def execute_once(
         self, data, print_example=False, is_final=False, expected_failure=None
     ):
@@ -675,7 +680,26 @@ def _execute_once_for_engine(self, data):
         ``StopTest`` must be a fatal error, and should stop the entire engine.
         """
         try:
-            result = self.execute_once(data)
+            trace = frozenset()
+            if (
+                self.failed_normally
+                and Phase.explain in self.settings.phases
+                and sys.gettrace() is None
+                and not PYPY
+            ):  # pragma: no cover
+                # This is in fact covered by our *non-coverage* tests, but due to the
+                # settrace() contention *not* by our coverage tests.  Ah well.
+                tracer = Tracer()
+                try:
+                    sys.settrace(tracer.trace)
+                    result = self.execute_once(data)
+                    if data.status == Status.VALID:
+                        self.explain_traces[None].add(frozenset(tracer.branches))
+                finally:
+                    sys.settrace(None)
+                    trace = frozenset(tracer.branches)
+            else:
+                result = self.execute_once(data)
             if result is not None:
                 fail_health_check(
                     self.settings,
@@ -721,7 +745,13 @@ def _execute_once_for_engine(self, data):
                 info.__expected_exception = e
                 verbose_report(info.__expected_traceback)
 
-                data.mark_interesting(get_interesting_origin(e))
+                self.failed_normally = True
+
+                interesting_origin = get_interesting_origin(e)
+                if trace:  # pragma: no cover
+                    # Trace collection is explicitly disabled under coverage.
+                    self.explain_traces[interesting_origin].add(trace)
+                data.mark_interesting(interesting_origin)
 
     def run_engine(self):
         """Run the test function many times, on database input and generated
@@ -772,15 +802,14 @@ def run_engine(self):
         # The engine found one or more failures, so we need to reproduce and
         # report them.
 
-        self.failed_normally = True
-
         flaky = 0
 
         if runner.best_observed_targets:
             for line in describe_targets(runner.best_observed_targets):
                 report(line)
             report("")
 
+        explanations = explanatory_lines(self.explain_traces, self.settings)
         for falsifying_example in self.falsifying_examples:
             info = falsifying_example.extra_information
 
@@ -804,6 +833,10 @@ def run_engine(self):
                     "assumptions on the first run now fails it."
                 )
             except BaseException as e:
+                # If we have anything for explain-mode, this is the time to report.
+                for line in explanations[falsifying_example.interesting_origin]:
+                    report(line)
+
                 if len(self.falsifying_examples) <= 1:
                     # There is only one failure, so we can report it by raising
                     # it directly.

diff --git a/hypothesis-python/src/hypothesis/extra/pytestplugin.py b/hypothesis-python/src/hypothesis/extra/pytestplugin.py
@@ -18,7 +18,7 @@
 
 import pytest
 
-from hypothesis import HealthCheck, Verbosity, core, settings
+from hypothesis import HealthCheck, Phase, Verbosity, core, settings
 from hypothesis.errors import InvalidArgument
 from hypothesis.internal.detection import is_hypothesis_test
 from hypothesis.internal.healthcheck import fail_health_check
@@ -29,6 +29,7 @@
 VERBOSITY_OPTION = "--hypothesis-verbosity"
 PRINT_STATISTICS_OPTION = "--hypothesis-show-statistics"
 SEED_OPTION = "--hypothesis-seed"
+EXPLAIN_OPTION = "--hypothesis-explain"
 
 
 class StoringReporter:
@@ -85,6 +86,12 @@ def pytest_addoption(parser):
             action="store",
             help="Set a seed to use for all Hypothesis tests",
         )
+        group.addoption(
+            EXPLAIN_OPTION,
+            action="store_true",
+            help="Enable the `explain` phase for failing Hypothesis tests",
+            default=False,
+        )
 
     def pytest_report_header(config):
         if config.option.verbose < 1 and settings.default.verbosity < Verbosity.verbose:
@@ -103,13 +110,22 @@ def pytest_configure(config):
         if profile:
             settings.load_profile(profile)
         verbosity_name = config.getoption(VERBOSITY_OPTION)
-        if verbosity_name:
+        if verbosity_name and verbosity_name != settings.default.verbosity.name:
             verbosity_value = Verbosity[verbosity_name]
             name = f"{settings._current_profile}-with-{verbosity_name}-verbosity"
             # register_profile creates a new profile, exactly like the current one,
             # with the extra values given (in this case 'verbosity')
             settings.register_profile(name, verbosity=verbosity_value)
             settings.load_profile(name)
+        if (
+            config.getoption(EXPLAIN_OPTION)
+            and Phase.explain not in settings.default.phases
+        ):
+            name = f"{settings._current_profile}-with-explain-phase"
+            phases = settings.default.phases + (Phase.explain,)
+            settings.register_profile(name, phases=phases)
+            settings.load_profile(name)
+
         seed = config.getoption(SEED_OPTION)
         if seed is not None:
             try:

diff --git a/hypothesis-python/src/hypothesis/internal/scrutineer.py b/hypothesis-python/src/hypothesis/internal/scrutineer.py
@@ -0,0 +1,122 @@
+# This file is part of Hypothesis, which may be found at
+# https://github.com/HypothesisWorks/hypothesis/
+#
+# Most of this work is copyright (C) 2013-2021 David R. MacIver
+# (david@drmaciver.com), but it contains contributions by others. See
+# CONTRIBUTING.rst for a full list of people who may hold copyright, and
+# consult the git log if you need to determine who owns an individual
+# contribution.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at https://mozilla.org/MPL/2.0/.
+#
+# END HEADER
+
+import sys
+from collections import defaultdict
+from functools import lru_cache, reduce
+from itertools import groupby
+from pathlib import Path
+
+from hypothesis._settings import Phase, Verbosity
+from hypothesis.internal.escalation import is_hypothesis_file
+
+
+@lru_cache(maxsize=None)
+def should_trace_file(fname):
+    # fname.startswith("<") indicates runtime code-generation via compile,
+    # e.g. compile("def ...", "<string>", "exec") in e.g. attrs methods.
+    return not (is_hypothesis_file(fname) or fname.startswith("<"))
+
+
+class Tracer:
+    """A super-simple line coverage tracer."""
+
+    def __init__(self):
+        self.branches = set()
+        self.prev = None
+
+    def trace(self, frame, event, arg):
+        if event == "call":
+            return self.trace
+        elif event == "line":
+            fname = frame.f_code.co_filename
+            if should_trace_file(fname):
+                self.branches.add((self.prev, (fname, frame.f_lineno)))
+                self.prev = (fname, frame.f_lineno)
+
+
+def get_explaining_locations(traces):
+    # Traces is a dict[interesting_origin | None, set[frozenset[tuple[str, int]]]]
+    # Each trace in the set might later become a Counter instead of frozenset.
+    if not traces:
+        return {}
+
+    unions = {origin: set().union(*values) for origin, values in traces.items()}
+    seen_passing = {None}.union(*unions.pop(None, set()))
+
+    always_failing_never_passing = {
+        origin: reduce(set.intersection, [set().union(*v) for v in values])
+        - seen_passing
+        for origin, values in traces.items()
+        if origin is not None
+    }
+
+    # Build the observed parts of the control-flow graph for each origin
+    cf_graphs = {origin: defaultdict(set) for origin in unions}
+    for origin, seen_arcs in unions.items():
+        for src, dst in seen_arcs:
+            cf_graphs[origin][src].add(dst)
+        assert cf_graphs[origin][None], "Expected start node with >=1 successor"
+
+    # For each origin, our explanation is the always_failing_never_passing lines
+    # which are reachable from the start node (None) without passing through another
+    # AFNP line.  So here's a whatever-first search with early stopping:
+    explanations = defaultdict(set)
+    for origin in unions:
+        queue = {None}
+        seen = set()
+        while queue:
+            assert queue.isdisjoint(seen), f"Intersection: {queue & seen}"
+            src = queue.pop()
+            seen.add(src)
+            if src in always_failing_never_passing[origin]:
+                explanations[origin].add(src)
+            else:
+                queue.update(cf_graphs[origin][src] - seen)
+
+    return explanations
+
+
+LIB_DIR = str(Path(sys.executable).parent / "lib")
+EXPLANATION_STUB = (
+    "Explanation:",
+    "    These lines were always and only run by failing examples:",
+)
+
+
+def make_report(explanations, cap_lines_at=5):
+    report = defaultdict(list)
+    for origin, locations in explanations.items():
+        assert locations  # or else we wouldn't have stored the key, above.
+        report_lines = [
+            "        {}:{}".format(k, ", ".join(map(str, sorted(l for _, l in v))))
+            for k, v in groupby(locations, lambda kv: kv[0])
+        ]
+        report_lines.sort(key=lambda line: (line.startswith(LIB_DIR), line))
+        if len(report_lines) > cap_lines_at + 1:
+            msg = "        (and {} more with settings.verbosity >= verbose)"
+            report_lines[cap_lines_at:] = [msg.format(len(report_lines[cap_lines_at:]))]
+        report[origin] = list(EXPLANATION_STUB) + report_lines
+    return report
+
+
+def explanatory_lines(traces, settings):
+    if Phase.explain in settings.phases and sys.gettrace() and not traces:
+        msg = "    We didn't try to explain this, because sys.gettrace()="
+        return defaultdict(lambda: [EXPLANATION_STUB[0], msg + repr(sys.gettrace())])
+    # Return human-readable report lines summarising the traces
+    explanations = get_explaining_locations(traces)
+    max_lines = 5 if settings.verbosity <= Verbosity.normal else 100
+    return make_report(explanations, cap_lines_at=max_lines)
diff --git a/hypothesis-python/tests/cover/test_phases.py b/hypothesis-python/tests/cover/test_phases.py
@@ -51,8 +51,8 @@ def test_sorts_and_dedupes_phases(arg, expected):
     assert settings(phases=arg).phases == expected
 
 
-def test_phases_default_to_all():
-    assert settings().phases == tuple(Phase)
+def test_phases_default_to_all_except_explain():
+    assert settings().phases + (Phase.explain,) == tuple(Phase)
 
 
 def test_does_not_reuse_saved_examples_if_reuse_not_in_phases():