Merge pull request #2073 from Zac-HD/lark-python

Explicit terminal strategies for Lark
HypothesisWorks · Sep 4, 2019 · 2a68481 · 2a68481
2 parents ea41eb5 + 2b7ddef
commit 2a68481
Show file tree

Hide file tree

Showing 3 changed files with 109 additions and 28 deletions.
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,8 @@
+RELEASE_TYPE: minor
+
+This release improves the :func:`~hypothesis.extra.lark.from_lark` strategy,
+tightening argument validation and adding the ``explicit`` argument to allow use
+with terminals that use ``%declare`` instead of a string or regular expression.
+
+This feature is required to handle features such as indent and dedent tokens
+in Python code, which can be generated with the :pypi:`hypothesmith` package.
diff --git a/hypothesis-python/src/hypothesis/extra/lark.py b/hypothesis-python/src/hypothesis/extra/lark.py
@@ -44,13 +44,13 @@
 
 import hypothesis._strategies as st
 from hypothesis.errors import InvalidArgument
-from hypothesis.internal.compat import getfullargspec
+from hypothesis.internal.compat import getfullargspec, string_types
 from hypothesis.internal.conjecture.utils import calc_label_from_name
 from hypothesis.internal.validation import check_type
 from hypothesis.searchstrategy import SearchStrategy
 
 if False:
-    from typing import Text  # noqa
+    from typing import Dict, Text  # noqa
 
 __all__ = ["from_lark"]
 
@@ -69,14 +69,28 @@ class DrawState(object):
     result = attr.ib(default=attr.Factory(list))
 
 
+def get_terminal_names(terminals, rules, ignore_names):
+    """Get names of all terminals in the grammar.
+
+    The arguments are the results of calling ``Lark.grammar.compile()``,
+    so you would think that the ``terminals`` and ``ignore_names`` would
+    have it all... but they omit terminals created with ``%declare``,
+    which appear only in the expansion(s) of nonterminals.
+    """
+    names = {t.name for t in terminals} | set(ignore_names)
+    for rule in rules:
+        names |= {t.name for t in rule.expansion if isinstance(t, Terminal)}
+    return names
+
+
 class LarkStrategy(SearchStrategy):
     """Low-level strategy implementation wrapping a Lark grammar.
 
     See ``from_lark`` for details.
     """
 
-    def __init__(self, grammar, start=None):
-        check_type(lark.lark.Lark, grammar, "grammar")
+    def __init__(self, grammar, start, explicit):
+        assert isinstance(grammar, lark.lark.Lark)
         if start is None:
             start = grammar.options.start
         if not isinstance(start, list):
@@ -110,6 +124,16 @@ def __init__(self, grammar, start=None):
             t.name: st.from_regex(t.pattern.to_regexp(), fullmatch=True)
             for t in terminals
         }
+        unknown_explicit = set(explicit) - get_terminal_names(
+            terminals, rules, ignore_names
+        )
+        if unknown_explicit:
+            raise InvalidArgument(
+                "The following arguments were passed as explicit_strategies, "
+                "but there is no such terminal production in this grammar: %r"
+                % (sorted(unknown_explicit),)
+            )
+        self.terminal_strategies.update(explicit)
 
         nonterminals = {}
 
@@ -145,8 +169,10 @@ def draw_symbol(self, data, symbol, draw_state):
                 strategy = self.terminal_strategies[symbol.name]
             except KeyError:
                 raise InvalidArgument(
-                    "Undefined terminal %r. Generation does not currently support use of %%declare."
-                    % (symbol.name,)
+                    "Undefined terminal %r. Generation does not currently support "
+                    "use of %%declare unless you pass `explicit`, a dict of "
+                    'names-to-strategies, such as `{%r: st.just("")}`'
+                    % (symbol.name, symbol.name)
                 )
             draw_state.result.append(data.draw(strategy))
         else:
@@ -169,10 +195,22 @@ def calc_has_reusable_values(self, recur):
         return True
 
 
+def check_explicit(name):
+    def inner(value):
+        check_type(string_types, value, "value drawn from " + name)
+        return value
+
+    return inner
+
+
 @st.cacheable
 @st.defines_strategy_with_reusable_values
-def from_lark(grammar, start=None):
-    # type: (lark.lark.Lark, Text) -> st.SearchStrategy[Text]
+def from_lark(
+    grammar,  # type: lark.lark.Lark
+    start=None,  # type: Text
+    explicit=None,  # type: Dict[Text, st.SearchStrategy[Text]]
+):
+    # type: (...) -> st.SearchStrategy[Text]
     """A strategy for strings accepted by the given context-free grammar.
 
     ``grammar`` must be a ``Lark`` object, which wraps an EBNF specification.
@@ -183,12 +221,26 @@ def from_lark(grammar, start=None):
     nonterminal ``start`` symbol in the grammar, which was supplied as an
     argument to the Lark class.  To generate strings matching a different
     symbol, including terminals, you can override this by passing the
-    ``start`` argument to ``from_lark``.
+    ``start`` argument to ``from_lark``.  Note that Lark may remove unreachable
+    productions when the grammar is compiled, so you should probably pass the
+    same value for ``start`` to both.
 
     Currently ``from_lark`` does not support grammars that need custom lexing.
     Any lexers will be ignored, and any undefined terminals from the use of
-    ``%declare`` will result in generation errors. We hope to support more of
-    these features in future.
-    """
+    ``%declare`` will result in generation errors.  To define strategies for
+    such terminals, pass a dictionary mapping their name to a corresponding
+    strategy as the ``explicit`` argument.
 
-    return LarkStrategy(grammar, start)
+    The :pypi:`hypothesmith` project includes a strategy for Python source,
+    based on a grammar and careful post-processing.
+    """
+    check_type(lark.lark.Lark, grammar, "grammar")
+    if explicit is None:
+        explicit = {}
+    else:
+        check_type(dict, explicit, "explicit")
+        explicit = {
+            k: v.map(check_explicit("explicit[%r]=%r" % (k, v)))
+            for k, v in explicit.items()
+        }
+    return LarkStrategy(grammar, start, explicit)
diff --git a/hypothesis-python/tests/lark/test_grammar.py b/hypothesis-python/tests/lark/test_grammar.py
@@ -26,7 +26,7 @@
 from hypothesis.errors import InvalidArgument
 from hypothesis.extra.lark import from_lark
 from hypothesis.internal.compat import integer_types, text_type
-from hypothesis.strategies import data
+from hypothesis.strategies import data, just
 from tests.common.debug import find_any
 
 # Adapted from the official Lark tutorial, with modifications to ensure
@@ -51,6 +51,11 @@
     %ignore WS
 """
 
+LIST_GRAMMAR = r"""
+list : "[" [NUMBER ("," NUMBER)*] "]"
+NUMBER: /[0-9]+/
+"""
+
 
 @given(from_lark(Lark(EBNF_GRAMMAR, start="value")))
 def test_generates_valid_json(string):
@@ -90,16 +95,7 @@ def test_can_generate_ignored_tokens():
 
 
 def test_generation_without_whitespace():
-    list_grammar = r"""
-    list : "[" [NUMBER ("," NUMBER)*] "]"
-    NUMBER: /[0-9]+/
-    """
-
-    @given(from_lark(Lark(list_grammar, start="list")))
-    def test(g):
-        assert " " not in g
-
-    test()
+    find_any(from_lark(Lark(LIST_GRAMMAR, start="list")), lambda g: " " not in g)
 
 
 def test_cannot_convert_EBNF_to_strategy_directly():
@@ -109,13 +105,38 @@ def test_cannot_convert_EBNF_to_strategy_directly():
     with pytest.raises(TypeError):
         # Not even the right number of arguments
         from_lark(EBNF_GRAMMAR, start="value").example()
+    with pytest.raises(InvalidArgument):
+        # Wrong type for explicit_strategies
+        from_lark(Lark(LIST_GRAMMAR, start="list"), explicit=[]).example()
 
 
-def test_can_not_use_undefined_terminals_yet():
-    grammar = r"""
-    list : "[" ELEMENT ("," ELEMENT)* "]"
+def test_undefined_terminals_require_explicit_strategies():
+    elem_grammar = r"""
+    list : "[" [ELEMENT ("," ELEMENT)*] "]"
     %declare ELEMENT
     """
+    with pytest.raises(InvalidArgument):
+        from_lark(Lark(elem_grammar, start="list")).example()
+    strategy = {"ELEMENT": just("200")}
+    from_lark(Lark(elem_grammar, start="list"), explicit=strategy).example()
 
+
+def test_cannot_use_explicit_strategies_for_unknown_terminals():
+    with pytest.raises(InvalidArgument):
+        from_lark(
+            Lark(LIST_GRAMMAR, start="list"), explicit={"unused_name": just("")}
+        ).example()
+
+
+def test_non_string_explicit_strategies_are_invalid():
     with pytest.raises(InvalidArgument):
-        from_lark(Lark(grammar, start="list")).example()
+        from_lark(
+            Lark(LIST_GRAMMAR, start="list"), explicit={"NUMBER": just(0)}
+        ).example()
+
+
+@given(
+    string=from_lark(Lark(LIST_GRAMMAR, start="list"), explicit={"NUMBER": just("0")})
+)
+def test_can_override_defined_terminal(string):
+    assert sum(json.loads(string)) == 0