diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bc4051a..c64b910 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8, 3.9, 3.10.0-rc.1] + python-version: [3.7, 3.8, 3.9, 3.10.0-rc.1] env: PIP_DISABLE_PIP_VERSION_CHECK: 1 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..9939ca0 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include LICENSE* README.rst +include parsing/py.typed diff --git a/parsing/__init__.py b/parsing/__init__.py index e487e13..8e949a9 100644 --- a/parsing/__init__.py +++ b/parsing/__init__.py @@ -118,9 +118,12 @@ * Precedence * Token * Nonterm +""" + +from __future__ import annotations +from typing import Iterator, List -""" __all__ = [ "SpecError", "UnexpectedToken", @@ -161,22 +164,19 @@ Exception = AnyException SyntaxError = UnexpectedToken -try: - next -except NameError: - # Python <= 2.5 - def next(obj): - return obj.next() - -class Lr(object): +class Lr: """ LR(1) parser. The Lr class uses a Spec instance in order to parse input that is fed to it via the token() method, and terminated via the eoi() method. """ - def __init__(self, spec): + _spec: Spec + _start: list[Symbol] | None + _stack: list[tuple[Symbol, int]] + + def __init__(self, spec: Spec) -> None: if __debug__: if type(self) == Lr: assert spec.pureLR @@ -185,23 +185,16 @@ def __init__(self, spec): self.reset() self._verbose = False - def __getSpec(self): + def __getSpec(self) -> Spec: return self._spec - def __setSpec(self, spec): - raise AttributeError - - spec = property(__getSpec, __setSpec) + spec = property(__getSpec) - def __getStart(self): + def __getStart(self) -> list[Symbol] | None: return self._start - def __setStart(self, start): - raise AttributeError - start = property( __getStart, - __setStart, doc=""" A list of parsing results. 
For LR parsing, there is only ever one result, but for compatibility with the Glr interface, start is a @@ -209,25 +202,25 @@ def __setStart(self, start): """, ) - def __getVerbose(self): + def __getVerbose(self) -> bool: return self._verbose - def __setVerbose(self, verbose): + def __setVerbose(self, verbose: bool) -> None: assert type(verbose) == bool self._verbose = verbose verbose = property(__getVerbose, __setVerbose) - def reset(self): + def reset(self) -> None: self._start = None self._stack = [(Epsilon(self), 0)] - def token(self, token): + def token(self, token: Token) -> None: """Feed a token to the parser.""" tokenSpec = self._spec._sym2spec[type(token)] - self._act(token, tokenSpec) + self._act(token, tokenSpec) # type: ignore - def eoi(self): + def eoi(self) -> None: """Signal end-of-input to the parser.""" token = EndOfInput(self) self.token(token) @@ -241,7 +234,7 @@ def eoi(self): self._start = [self._stack[1][0]] assert self._start[0].symSpec == self._spec._userStartSym - def _act(self, sym, symSpec): + def _act(self, sym: Token, symSpec: TokenSpec) -> None: if self._verbose: self._printStack() print("INPUT: %r" % sym) @@ -267,7 +260,7 @@ def _act(self, sym, symSpec): if self._verbose: self._printStack() - def _printStack(self): + def _printStack(self) -> None: print("STACK:", end=" ") for node in self._stack: print("%r" % node[0], end=" ") @@ -284,7 +277,7 @@ def _printStack(self): ) print() - def _reduce(self, production): + def _reduce(self, production: Production) -> None: nRhs = len(production.rhs) rhs = [] for i in range(len(self._stack) - nRhs, len(self._stack)): @@ -298,7 +291,9 @@ def _reduce(self, production): top = self._stack[-1] self._stack.append((r, self._spec._goto[top[1]][production.lhs])) - def _production(self, production, rhs): + def _production( + self, production: Production, rhs: list[Symbol] + ) -> Nonterm: sym = production.lhs.nontermType(self) nRhs = len(rhs) assert nRhs == len(production.rhs) @@ -317,60 +312,35 @@ def _production(self, production, rhs): # -class Gss(list): - """Graph-structured stack.""" - - def __init__(self, glr): - list.__init__(self) - - self._glr = glr +class GssPathStep: + pass -class Gsse(object): - """Graph-structured stack edge.""" - - def __init__(self, below, above, value): - self.node = below - above._edges.append(self) - self.value = value - - def __repr__(self): - return "{%r}" % self.value - - def __eq__(self, other): - if self.node != other.node or self.value != other.value: - return False - return True - - -class Gssn(object): +class Gssn(GssPathStep): """Graph-structured stack node.""" - def __init__(self, below, value, nextState): - assert isinstance(below, Gssn) or below is None - - self._edges = [] - if below is not None: + def __init__( + self, below: Gssn | None, value: Symbol | None, nextState: int + ) -> None: + self._edges: list[Gsse] = [] + if below is not None and value is not None: Gsse(below, self, value) self.nextState = nextState - def __repr__(self): + def __repr__(self) -> str: return "[%d]" % self.nextState - def __getEdge(self): + def __getEdge(self) -> Gsse: assert len(self._edges) == 1 return self._edges[0] - def __setEdge(self): - raise AttributeError + edge = property(__getEdge) - edge = property(__getEdge, __setEdge) - - def edges(self): + def edges(self) -> Iterator[Gsse]: for edge in self._edges: yield edge - def nodes(self): + def nodes(self) -> Iterator[Gssn]: for edge in self._edges: yield edge.node @@ -383,18 +353,22 @@ def nodes(self): # # -grammars can cause cycles, which requires 
that we avoid infinite # recursion. - def paths(self, pathLen=None): + def paths( + self, pathLen: int | None = None + ) -> Iterator[tuple[GssPathStep, ...]]: assert pathLen is None or isinstance(pathLen, int) and pathLen >= 0 for path in self._pathsRecurse(pathLen, []): yield path - def _pathsRecurse(self, pathLen, path): + def _pathsRecurse( + self, pathLen: int | None, path: list[GssPathStep] + ) -> Iterator[tuple[GssPathStep, ...]]: path.insert(0, self) if pathLen is None and len(self._edges) == 0: - yield path[:] + yield tuple(path[:]) elif pathLen is not None and len(path) - 1 == pathLen * 2: - yield path[:] + yield tuple(path[:]) else: for edge in self.edges(): # Avoid infinite recursion due to -production cycles. @@ -406,6 +380,32 @@ def _pathsRecurse(self, pathLen, path): path.pop(0) +class Gsse(GssPathStep): + """Graph-structured stack edge.""" + + def __init__(self, below: Gssn, above: Gssn, value: Symbol) -> None: + self.node = below + above._edges.append(self) + self.value = value + + def __repr__(self) -> str: + return "{%r}" % self.value + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Gsse): + return NotImplemented + else: + return self.node == other.node and self.value == other.value + + +class Gss(List[Gssn]): + """Graph-structured stack.""" + + def __init__(self, glr: Glr): + list.__init__(self) + self._glr = glr + + # # End graph-structured stack (GSS) classes. # ======================================================================== @@ -417,10 +417,7 @@ class Glr(Lr): that is fed to it via the token() method, and terminated via the eoi() method.""" - def __init__(self, spec): - Lr.__init__(self, spec) - - def reset(self): + def reset(self) -> None: self._start = None # Initialize with a stack that is in the start state. @@ -428,20 +425,18 @@ def reset(self): top = Gssn(None, None, 0) self._gss.append(top) - self._paths = [] - - def token(self, token): + def token(self, token: Token) -> None: """ Feed a token to the parser.""" if self._verbose: print("%s" % ("-" * 80)) print("INPUT: %r" % token) tokenSpec = self._spec._sym2spec[type(token)] - self._act(token, tokenSpec) + self._act(token, tokenSpec) # type: ignore if len(self._gss) == 0: raise UnexpectedToken("Unexpected token: %r" % token) - def eoi(self): + def eoi(self) -> None: """ Signal end-of-input to the parser.""" token = EndOfInput(self) @@ -455,6 +450,7 @@ def eoi(self): if self._verbose: print(" --> accept %r" % path) edge = path[1] + assert isinstance(edge, Gsse) assert isinstance(edge.value, Nonterm) assert edge.value.symSpec == self._spec._userStartSym self._start.append(edge.value) @@ -466,11 +462,11 @@ def eoi(self): print("Start: %r" % self._start) print("%s" % ("-" * 80)) - def _act(self, sym, symSpec): + def _act(self, sym: Token, symSpec: TokenSpec) -> None: self._reductions(sym, symSpec) self._shifts(sym, symSpec) - def _reductions(self, sym, symSpec): + def _reductions(self, sym: Token, symSpec: TokenSpec) -> None: # epsilons is a dictionary that maps production-->[tops]. The purpose # is to avoid repeating the same epsilon production on a particular # stack top. Ordinary productions do not require this care because we @@ -481,7 +477,7 @@ def _reductions(self, sym, symSpec): nReduces = 0 # Enqueue work. 
- workQ = [] + workQ: list[tuple[tuple[GssPathStep, ...], Production]] = [] i = 0 while i < len(self._gss): top = self._gss[i] @@ -540,23 +536,31 @@ def _reductions(self, sym, symSpec): print(" %r" % path) nReduces += 1 - self._reduce(workQ, epsilons, path, production, symSpec) + self._glr_reduce(workQ, epsilons, path, production, symSpec) if self._verbose: if nReduces > 0: self._printStack() - def _reduce(self, workQ, epsilons, path, production, symSpec): + def _glr_reduce( + self, + workQ: list[tuple[tuple[GssPathStep, ...], Production]], + epsilons: dict[Production, list[Gssn]], + path: tuple[GssPathStep, ...], + production: Production, + symSpec: SymbolSpec, + ) -> None: assert len(path[1::2]) == len(production.rhs) # Build the list of RHS semantic values to pass to the reduction # action. - rhs = [edge.value for edge in path[1::2]] + rhs = [edge.value for edge in path[1::2]] # type: ignore # Call the user reduction method. r = self._production(production, rhs) below = path[0] + assert isinstance(below, Gssn) done = False for top in self._gss: if ( @@ -567,22 +571,24 @@ def _reduce(self, workQ, epsilons, path, production, symSpec): # the set of stack tops. for edge in top.edges(): if edge.node == below: + nonterm = edge.value + assert isinstance(nonterm, Nonterm) # There is already a below<--top link, so merge # competing interpretations. if self._verbose: - print(" --> merge %r <--> %r" % (edge.value, r)) - value = production.lhs.nontermType.merge(edge.value, r) + print(" --> merge %r <--> %r" % (nonterm, r)) + value = production.lhs.nontermType.merge(nonterm, r) if self._verbose: if value == edge.value: print( " %s" - % ("-" * len("%r" % edge.value)) + % ("-" * len("%r" % nonterm)) ) else: print( " %s %s" % ( - (" " * len("%r" % edge.value)), + (" " * len("%r" % nonterm)), "-" * len("%r" % r), ) ) @@ -616,7 +622,13 @@ def _reduce(self, workQ, epsilons, path, production, symSpec): self._enqueueLimitedReductions(workQ, epsilons, top.edge, symSpec) # Enqueue paths that incorporate edge. - def _enqueueLimitedReductions(self, workQ, epsilons, edge, symSpec): + def _enqueueLimitedReductions( + self, + workQ: list[tuple[tuple[GssPathStep, ...], Production]], + epsilons: dict[Production, list[Gssn]], + edge: Gsse, + symSpec: SymbolSpec, + ) -> None: gotos = self._spec._goto for top in self._gss: @@ -633,17 +645,17 @@ def _enqueueLimitedReductions(self, workQ, epsilons, edge, symSpec): # twice. pass elif action.production not in epsilons: - path = [top] + p = (top,) epsilons[action.production] = [top] - workQ.append((path, action.production)) + workQ.append((p, action.production)) if self._verbose: print( " --> enqueue(d) %r" % action.production ) - print(" %r" % path) + print(" %r" % p) elif top not in epsilons[action.production]: - path = [top] + path = (top,) epsilons[action.production].append(top) workQ.append((path, action.production)) if self._verbose: @@ -655,17 +667,17 @@ def _enqueueLimitedReductions(self, workQ, epsilons, edge, symSpec): else: # Iterate over all reduction paths through stack # and enqueue them if they incorporate edge. 
- for path in top.paths(len(action.production.rhs)): - if edge in path[1::2]: - workQ.append((path, action.production)) + for rp in top.paths(len(action.production.rhs)): + if edge in rp[1::2]: + workQ.append((rp, action.production)) if self._verbose: print( " --> enqueue(f) %r" % action.production ) - print(" %r" % path) + print(" %r" % rp) - def _shifts(self, sym, symSpec): + def _shifts(self, sym: Token, symSpec: TokenSpec) -> None: prevGss = self._gss self._gss = Gss(self) @@ -692,7 +704,7 @@ def _shifts(self, sym, symSpec): if nShifts > 0: self._printStack() - def _printStack(self): + def _printStack(self) -> None: i = 0 for top in self._gss: for path in top.paths(): diff --git a/parsing/ast.py b/parsing/ast.py index 7ad242a..26f5b44 100644 --- a/parsing/ast.py +++ b/parsing/ast.py @@ -5,34 +5,30 @@ constructed in the process. """ -from parsing.interfaces import is_parser, is_symspec +from __future__ import annotations +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from parsing.interfaces import Parser, SymbolSpec -class Symbol(object): - def __init__(self, symSpec, parser): - assert is_symspec(symSpec) - assert is_parser(parser) + +class Symbol: + def __init__(self, symSpec: SymbolSpec, parser: Parser): self.__symSpec = symSpec self.__parser = parser - def __repr__(self): - return "%r" % self.symSpec + def __repr__(self) -> str: + return repr(self.symSpec) - def __getSymSpec(self): + def __getSymSpec(self) -> SymbolSpec: return self.__symSpec - def __setSymSpec(self): - raise AttributeError - - symSpec = property(__getSymSpec, __setSymSpec) + symSpec = property(__getSymSpec) - def __getParser(self): + def __getParser(self) -> Parser: return self.__parser - def __setParser(self): - raise AttributeError - - parser = property(__getParser, __setParser) + parser = property(__getParser) class Nonterm(Symbol): @@ -79,11 +75,10 @@ def reduceB(self, id): "%reduce id" """ - def __init__(self, parser): - assert is_parser(parser) + def __init__(self, parser: Parser) -> None: Symbol.__init__(self, parser._spec._sym2spec[type(self)], parser) - def merge(self, other): + def merge(self, other: Nonterm) -> Nonterm: """ Merging happens when there is an ambiguity in the input that allows non-terminals to be part of multiple overlapping series of @@ -139,7 +134,5 @@ class rparen(Token): class id(Token): "%token" """ - def __init__(self, parser): - assert is_parser(parser) + def __init__(self, parser: Parser) -> None: Symbol.__init__(self, parser._spec._sym2spec[type(self)], parser) - self.__parser = parser diff --git a/parsing/automaton.py b/parsing/automaton.py index 934e3ac..f97ec5e 100644 --- a/parsing/automaton.py +++ b/parsing/automaton.py @@ -2,16 +2,31 @@ The classes in this module are used to compute the LR(1) automaton using Pager's (1977) Practical General Method. 
""" +from __future__ import annotations +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Type, +) + import types import pickle import sys from parsing.errors import SpecError -from parsing.interfaces import is_spec_source from parsing import introspection from parsing import module_spec +from parsing.ast import Symbol from parsing.grammar import ( Precedence, + PrecedenceRef, Production, TokenSpec, NontermSpec, @@ -26,104 +41,91 @@ ReduceAction, ) +if TYPE_CHECKING: + from typing_extensions import Literal + from parsing.interfaces import SpecSource -class String(list): - def __init__(self, args=[]): - list.__init__(self, args) - - self.hash = self._hash() - - def __cmp__(self, other): - assert isinstance(other, String) - minLen = min(len(self), len(other)) - for i in range(minLen): - if self[i] < other[i]: - return -1 - elif self[i] > other[i]: - return 1 - - # Prefixes are identical. Handle trailing characters, if any. - if len(self) < len(other): - return -1 - elif len(self) == len(other): - return 0 - else: - assert len(self) > len(other) - return 1 - - def __eq__(self, other): - if len(self) == len(other): - for i in range(len(self)): - if self[i] != other[i]: - return False - return True - else: - return False + SpecCompatibility = Literal[ + "compatible", "incompatible", "itemsets", "repickle" + ] - def _hash(self): - ret = 5381 - for sym in self: - ret = ((ret << 5) + ret) + sym.seq - ret &= 0xFFFFFFFFFFFFFFFF - return ret + PickleMode = Literal["r", "w", "rw"] - def __hash__(self): - return self.hash + ConflictResolution = Literal[ + "neither", # Discard both. + "old", # Keep old. + "both", # Keep both. + "new", # Keep new. + "err", # Unresolvable conflict. + ] + ActionState = Dict[SymbolSpec, List[Action]] + GotoState = Dict[SymbolSpec, int] -class StringSpec(object): - cache = {} - def __init__(self, s): - assert isinstance(s, String) - for sym in s: - assert isinstance(sym, SymbolSpec) +class StringSpec: + cache: ClassVar[dict[Tuple[SymbolSpec, ...], set[SymbolSpec]]] = {} - self.s = s + def __init__(self, ss: Iterable[SymbolSpec]) -> None: + self.s = s = tuple(ss) if s in StringSpec.cache: self.firstSet = StringSpec.cache[s] else: # Calculate the first set for the string encoded by the s vector. - self.firstSet = [] # Set. + self.firstSet = set() mergeEpsilon = True for sym in self.s: hasEpsilon = False for elm in sym.firstSet: if elm == epsilon: hasEpsilon = True - elif sym not in self.firstSet: - self.firstSet.append(elm) + else: + self.firstSet.add(elm) if not hasEpsilon: mergeEpsilon = False break # Merge epsilon if it was in the first set of every symbol. if mergeEpsilon: - self.firstSet.append(epsilon) + self.firstSet.add(epsilon) # Cache the result. 
StringSpec.cache[s] = self.firstSet -class Item(int): - def __new__(cls, production, dotPos, lookahead): - assert isinstance(production, Production) - assert type(dotPos) == int - assert dotPos >= 0 - assert dotPos <= len(production.rhs) - assert type(lookahead) == list - if __debug__: - for elm in lookahead: - assert isinstance(elm, SymbolSpec) - - hash = (dotPos * Production.seq) + production.seq - result = int.__new__(cls, hash) - result.hash = hash - result.production = production - result.dotPos = dotPos - result.lookahead = dict(list(zip(lookahead, lookahead))) - return result - - def __repr__(self): +class Item: + hash: int + production: Production + dotPos: int + lookahead: Dict[SymbolSpec, SymbolSpec] + + def __init__( + self, + production: Production, + dotPos: int, + lookahead: Iterable[SymbolSpec], + ) -> None: + assert 0 <= dotPos <= len(production.rhs) + self.hash = (dotPos * Production.seq) + production.seq + self.production = production + self.dotPos = dotPos + self.lookahead = dict(zip(lookahead, lookahead)) + + def __hash__(self) -> int: + return self.hash + + def __eq__(self, other: Any) -> bool: + if type(other) == Item: + return self.hash == other.hash + else: + return NotImplemented + + def __lt__(self, other: Any) -> bool: + if type(other) == Item: + return self.hash < other.hash + else: + return NotImplemented + + def __repr__(self) -> str: strs = [] strs.append("[%r ::=" % self.production.lhs) assert self.dotPos <= len(self.production.rhs) @@ -147,7 +149,7 @@ def __repr__(self): return "".join(strs) - def lr0__repr__(self): + def lr0__repr__(self) -> str: strs = [] strs.append("%r ::=" % self.production.lhs) assert self.dotPos <= len(self.production.rhs) @@ -163,11 +165,10 @@ def lr0__repr__(self): return "".join(strs) - def lookaheadInsert(self, sym): - assert isinstance(sym, SymbolSpec) + def lookaheadInsert(self, sym: SymbolSpec) -> None: self.lookahead[sym] = sym - def lookaheadDisjoint(self, other): + def lookaheadDisjoint(self, other: Item) -> bool: sLookahead = self.lookahead oLookahead = other.lookahead @@ -181,64 +182,60 @@ def lookaheadDisjoint(self, other): return True - def __getnewargs__(self): - return (self.production, self.dotPos, list(self.lookahead.keys())) - -class ItemSet(dict): - def __init__(self, args=[]): - dict.__init__(self, args) - self._added = {} +class ItemSet: + def __init__(self) -> None: + self._kernel: Dict[Item, Item] = {} + self._added: Dict[Item, Item] = {} - def __repr__(self): - kernel = [item for item in self.keys()] - kernel.sort() - added = [item for item in self._added.keys()] - added.sort() + def __repr__(self) -> str: return "ItemSet(kernel: %s, added: %r)" % ( - ", ".join(["%r" % item for item in kernel]), - ", ".join(["%r" % item for item in added]), + ", ".join(repr(item) for item in sorted(self._kernel)), + ", ".join(repr(item) for item in sorted(self._added)), ) - def __hash__(self): + def __len__(self) -> int: + return len(self._kernel) + + def __hash__(self) -> int: # This works because integers never overflow, and addition is # transitive. 
ret = 0 - for item in self.keys(): + for item in self._kernel: ret += item.hash return ret - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if len(self) != len(other): return False - for sItem in self.keys(): + for sItem in self._kernel: if sItem not in other: return False return True - def __iter__(self): - for item in self.keys(): + def __iter__(self) -> Iterator[Item]: + for item in self._kernel: assert item.production.lhs.name == "" or item.dotPos != 0 yield item - for item in self._added.keys(): + for item in self._added: assert item.dotPos == 0 assert item.production.lhs.name != "" yield item # Merge a kernel item. - def append(self, item): + def append(self, item: Item) -> None: assert item.production.lhs.name == "" or item.dotPos != 0 if item in self: - self[item].lookahead.update(item.lookahead) + self._kernel[item].lookahead.update(item.lookahead) else: tItem = Item( item.production, item.dotPos, list(item.lookahead.keys()) ) - self[tItem] = tItem + self._kernel[tItem] = tItem # Merge an added item. - def addedAppend(self, item): + def addedAppend(self, item: Item) -> bool: assert item.dotPos == 0 assert item.production.lhs.name != "" @@ -253,45 +250,43 @@ def addedAppend(self, item): # Given a list of items, compute their closure and merge the results into # the set of added items. - def _closeItems(self, items): + def _closeItems(self, items: List[Item]) -> None: # Iterate over the items until no more can be added to the closure. i = 0 while i < len(items): item = items[i] rhs = item.production.rhs dotPos = item.dotPos - if dotPos < len(rhs) and isinstance(rhs[dotPos], NontermSpec): - for lookahead in list(item.lookahead.keys()): - string = StringSpec( - String(rhs[dotPos + 1 :] + [lookahead]) - ) - lhs = rhs[dotPos] - for prod in lhs.productions: - tItem = Item(prod, 0, string.firstSet) - if self.addedAppend(tItem): - items.append(tItem) + if dotPos < len(rhs): + lhs = rhs[dotPos] + if isinstance(lhs, NontermSpec): + for lookahead in tuple(item.lookahead.keys()): + string = StringSpec(rhs[dotPos + 1 :] + [lookahead]) + for prod in lhs.productions: + tItem = Item(prod, 0, string.firstSet) + if self.addedAppend(tItem): + items.append(tItem) i += 1 # Calculate and merge the kernel's transitive closure. - def closure(self): + def closure(self) -> None: items = [] - for item in self.keys(): + for item in self._kernel: rhs = item.production.rhs dotPos = item.dotPos - if dotPos < len(rhs) and isinstance(rhs[dotPos], NontermSpec): - for lookahead in item.lookahead.keys(): - string = StringSpec( - String(rhs[dotPos + 1 :] + [lookahead]) - ) - lhs = rhs[dotPos] - for prod in lhs.productions: - tItem = Item(prod, 0, string.firstSet) - if self.addedAppend(tItem): - items.append(tItem) + if dotPos < len(rhs): + lhs = rhs[dotPos] + if isinstance(lhs, NontermSpec): + for lookahead in item.lookahead.keys(): + string = StringSpec(rhs[dotPos + 1 :] + [lookahead]) + for prod in lhs.productions: + tItem = Item(prod, 0, string.firstSet) + if self.addedAppend(tItem): + items.append(tItem) self._closeItems(items) # Calculate the kernel of the goto set, given a particular symbol. - def goto(self, sym): + def goto(self, sym: SymbolSpec) -> ItemSet: ret = ItemSet() for item in self: rhs = item.production.rhs @@ -306,11 +301,11 @@ def goto(self, sym): # Merge the kernel of other into this ItemSet, then update the closure. # It is not sufficient to copy other's added items, since other has not # computed its closure. 
- def merge(self, other): + def merge(self, other: ItemSet) -> bool: items = [] - for item in other.keys(): + for item in other._kernel: if item in self: - lookahead = self[item].lookahead + lookahead = self._kernel[item].lookahead tLookahead = [] for sym in item.lookahead.keys(): if sym not in lookahead: @@ -323,7 +318,7 @@ def merge(self, other): tItem = Item( item.production, item.dotPos, list(item.lookahead.keys()) ) - self[tItem] = tItem + self._kernel[tItem] = tItem items.append(tItem) if len(items) > 0: @@ -334,16 +329,16 @@ def merge(self, other): # Determine if self and other are weakly compatible, as defined by the # Pager(1977) algorithm. - def weakCompat(self, other): + def weakCompat(self, other: ItemSet) -> bool: # Check for identical kernel LR(0) items, # and pair items, for later use. if len(self) != len(other): return False pairs = [] - for sItem in self.keys(): + for sItem in self._kernel: if sItem not in other: return False - oItem = other[sItem] + oItem = other._kernel[sItem] pairs.append((sItem, oItem)) # Check for lookahead compatibility. @@ -369,22 +364,24 @@ def weakCompat(self, other): return True -class Spec(object): +class Spec: """ The Spec class contains the read-only data structures that the Parser class needs in order to parse input. Parser generation results in a Spec instance, which can then be shared by multiple Parser instances.""" + _sym2spec: dict[type[Symbol], SymbolSpec] + def __init__( self, - modules, - pickleFile=None, - pickleMode="rw", - skinny=True, - logFile=None, - graphFile=None, - verbose=False, - ): + source: types.ModuleType | List[types.ModuleType] | SpecSource, + pickleFile: Optional[str] = None, + pickleMode: PickleMode = "rw", + skinny: bool = True, + logFile: Optional[str] = None, + graphFile: Optional[str] = None, + verbose: bool = False, + ) -> None: """ modules : Either a single module, or a list of modules, wherein to look for parser generator directives in docstrings. @@ -408,13 +405,6 @@ def __init__( verbose : If true, print progress information while generating the parsing tables.""" - assert pickleFile is None or type(pickleFile) == str - assert pickleMode in ["rw", "r", "w"] - assert type(skinny) == bool - assert logFile is None or type(logFile) == str - assert graphFile is None or type(graphFile) == str - assert type(verbose) == bool - self._skinny = skinny self._verbose = verbose @@ -428,54 +418,140 @@ def __init__( self._none.name: self._none, self._split.name: self._split, } - self._nonterms = {} - self._tokens = {eoi.name: eoi, epsilon.name: epsilon} - self._sym2spec = {EndOfInput: eoi, Epsilon: epsilon} - self._productions = [] + self._nonterms: Dict[str, NontermSpec] = {} + self._tokens: Dict[str, TokenSpec] = { + eoi.name: eoi, + epsilon.name: epsilon, + } + self._sym2spec: Dict[Type[Symbol], SymbolSpec] = { + EndOfInput: eoi, + Epsilon: epsilon, + } + self._productions: list[Production] = [] + + sources: list[types.ModuleType] | SpecSource + if isinstance(source, types.ModuleType): + sources = [source] + else: + sources = source + + if not isinstance(sources, list): + spec_source = sources + else: + spec_source = module_spec.ModuleSpecSource(sources) + + if self._verbose: + print( + ( + "Parsing.Spec: Introspecting to acquire formal " + "grammar specification..." + ) + ) + + # =========================================================== + # Precedence. 
+ # + for prec in spec_source.get_precedences(): + name = prec.name + if name in self._precedences: + raise SpecError("Duplicate precedence name: %s" % (name,)) + if name in self._tokens: + raise SpecError( + "Identical token/precedence names: %s" % (name,) + ) + if name in self._nonterms: + raise SpecError( + "Identical nonterm/precedence names: %s" % (name,) + ) + self._precedences[name] = prec + + # =========================================================== + # Token. + # + for token in spec_source.get_tokens(): + name = token.name + tt = token.tokenType + if name in self._precedences: + raise SpecError( + "Identical precedence/token names: %s" % tt.__doc__ + ) + if name in self._tokens: + raise SpecError("Duplicate token name: %s" % tt.__doc__) + if name in self._nonterms: + raise SpecError( + "Identical nonterm/token names: %s" % tt.__doc__ + ) + self._tokens[name] = token + self._sym2spec[tt] = token - self._userStartSym = None - self._startSym = None - self._startProd = None + # =========================================================== + # Nonterm. + # + nonterms, userStart = spec_source.get_nonterminals() + for nonterm in nonterms: + name = nonterm.name + nt = nonterm.nontermType + if name in self._precedences: + raise SpecError( + "Identical precedence/nonterm names: %s" % nt.__doc__ + ) + if name in self._tokens: + raise SpecError( + "Identical token/nonterm names: %s" % nt.__doc__ + ) + if name in self._nonterms: + raise SpecError("Duplicate nonterm name: %s" % nt.__doc__) + self._nonterms[name] = nonterm + self._sym2spec[nt] = nonterm + + self._userStartSym = userStart + if not isinstance(self._userStartSym, NontermSpec): + raise SpecError("No start symbol specified") + + self._startSym = NontermSpec( + NontermStart, "", f"{__name__}.NontermStart", self._none + ) + + self._startProd = Production( + NontermStart.reduce, + f"{__name__}.NontermStart.reduce", + self._none, + self._startSym, + [self._userStartSym, eoi], + ) # Everything below this point is computed from the above (once # introspection is complete). - self._itemSets = ( - [] - ) # Each element corresponds to an element in _action. - self._itemSetsHash = None + # Each element corresponds to an element in _action. + self._itemSets: list[ItemSet] = [] + self._itemSetsHash: dict[ItemSet, list[int]] | None = None # LR parsing tables. The tables conceptually contain one state per # row, where each row contains one element per symbol. The table is # conceptually in row-major form, but each row is actually a # dictionary. If no entry for a symbol exists for a particular state, # then input of that symbol is an error for that state. - self._action = [] - self._goto = [] - self._startState = None + self._action: list[ActionState] = [] + self._goto: list[GotoState] = [] + self._startState: int | None = None self._nActions = 0 self._nConflicts = 0 self._nImpure = 0 # Number of LR impurities (does not affect GLR). - # Introspect modules and generate parse tables. - self._prepare(modules, pickleFile, pickleMode, logFile, graphFile) + # Generate parse tables. 
+ self._prepare(pickleFile, pickleMode, logFile, graphFile) - def __getPureLR(self): + def __getPureLR(self) -> int: return self._nConflicts + self._nImpure == 0 - def __setPureLR(self): - raise AttributeError + pureLR = property(__getPureLR) - pureLR = property(__getPureLR, __setPureLR) - - def __getConflicts(self): + def __getConflicts(self) -> int: return self._nConflicts - def __setConflicts(self): - raise AttributeError - - conflicts = property(__getConflicts, __setConflicts) + conflicts = property(__getConflicts) - def __repr__(self): + def __repr__(self) -> str: if self._skinny: # Print a very reduced summary, since most info has been discarded. return "Parsing.Spec: %d states, %d actions (%d split)" % ( @@ -495,22 +571,21 @@ def __repr__(self): lines.append(" %r" % prec) lines.append("Tokens:") - syms = [sym for sym in self._tokens.values()] - syms.sort() - for token in syms: + sym: SymbolSpec + tokens = [sym for sym in self._tokens.values()] + for token in sorted(tokens): lines.append(" %r %r" % (token, token.prec)) lines.append(" First set: %r" % token.firstSet) lines.append(" Follow set: %r" % token.followSet) lines.append("Non-terminals:") - syms = [sym for sym in self._nonterms.values()] - syms.sort() - for sym in syms: + nonterms = [sym for sym in self._nonterms.values()] + for sym in sorted(nonterms): lines.append(" %r %r" % (sym, sym.prec)) lines.append(" First set: %r" % sym.firstSet) lines.append(" Follow set: %r" % sym.followSet) lines.append(" Productions:") - prods = sym.productions[:] + prods = list(sym.productions) prods.sort() for prod in prods: lines.append(" %r" % prod) @@ -562,8 +637,7 @@ def __repr__(self): ) lines.append(" Goto:") syms = [sym for sym in self._goto[i]] - syms.sort() - for sym in syms: + for sym in sorted(syms): lines.append(" %15r : %r" % (sym, self._goto[i][sym])) lines.append(" Action:") syms = [sym for sym in self._action[i]] @@ -599,38 +673,20 @@ def __repr__(self): ret = "\n".join(lines) return ret - def _prepare(self, adapter, pickleFile, pickleMode, logFile, graphFile): + def _prepare( + self, + pickleFile: Optional[str], + pickleMode: PickleMode, + logFile: Optional[str], + graphFile: Optional[str], + ) -> None: """ Compile the specification into data structures that can be used by the Parser class for parsing.""" - # Get the grammar specification. - if isinstance(adapter, types.ModuleType) or ( - isinstance(adapter, list) - and isinstance(adapter[0], types.ModuleType) - ): - adapter = module_spec.ModuleSpecSource(adapter) - elif not is_spec_source(adapter): - raise ValueError( - "%r should be a specification source" % (adapter,) - ) - self._introspect(adapter) - # Augment grammar with a special start symbol and production: # # ::= S <$>. - assert self._startSym is None - assert isinstance(self._userStartSym, NontermSpec) - self._startSym = NontermSpec( - NontermStart, "", "%s.NontermStart" % __name__, self._none - ) - self._startProd = Production( - NontermStart.reduce, - "%s.NontermStart.reduce" % __name__, - self._none, - self._startSym, - [self._userStartSym, eoi], - ) - self._startSym.productions.append(self._startProd) + self._startSym.productions.add(self._startProd) self._nonterms[""] = self._startSym self._productions.append(self._startProd) @@ -685,7 +741,7 @@ def _prepare(self, adapter, pickleFile, pickleMode, logFile, graphFile): # Introspect modules and find special parser declarations. 
In order to be # a special class, the class must both 1) be subclassed from Token or # Nonterm, and 2) contain the appropriate %foo docstring. - def _introspect(self, adapter): + def _introspect(self, spec_source: SpecSource) -> None: if self._verbose: print( ( @@ -700,7 +756,7 @@ def _introspect(self, adapter): # =========================================================== # Precedence. # - for prec in adapter.get_precedences(): + for prec in spec_source.get_precedences(): name = prec.name if name in self._precedences: raise SpecError("Duplicate precedence name: %s" % (name,)) @@ -717,60 +773,62 @@ def _introspect(self, adapter): # =========================================================== # Token. # - for token in adapter.get_tokens(): + for token in spec_source.get_tokens(): name = token.name - v = token.tokenType + tt = token.tokenType if name in self._precedences: raise SpecError( - "Identical precedence/token names: %s" % v.__doc__ + "Identical precedence/token names: %s" % tt.__doc__ ) if name in self._tokens: - raise SpecError("Duplicate token name: %s" % v.__doc__) + raise SpecError("Duplicate token name: %s" % tt.__doc__) if name in self._nonterms: raise SpecError( - "Identical nonterm/token names: %s" % v.__doc__ + "Identical nonterm/token names: %s" % tt.__doc__ ) self._tokens[name] = token - self._sym2spec[v] = token + self._sym2spec[tt] = token # =========================================================== # Nonterm. # - nonterms, userStart = adapter.get_nonterminals() + nonterms, userStart = spec_source.get_nonterminals() for nonterm in nonterms: name = nonterm.name - v = nonterm.nontermType + nt = nonterm.nontermType if name in self._precedences: raise SpecError( - "Identical precedence/nonterm names: %s" % v.__doc__ + "Identical precedence/nonterm names: %s" % nt.__doc__ ) if name in self._tokens: raise SpecError( - "Identical token/nonterm names: %s" % v.__doc__ + "Identical token/nonterm names: %s" % nt.__doc__ ) if name in self._nonterms: - raise SpecError("Duplicate nonterm name: %s" % v.__doc__) + raise SpecError("Duplicate nonterm name: %s" % nt.__doc__) self._nonterms[name] = nonterm - self._sym2spec[v] = nonterm + self._sym2spec[nt] = nonterm self._userStartSym = userStart if not isinstance(self._userStartSym, NontermSpec): raise SpecError("No start symbol specified") # Resolve all symbolic (named) references. - def _references(self, logFile, graphFile): + def _references( + self, logFile: Optional[str], graphFile: Optional[str] + ) -> None: # Build the graph of Precedence relationships. self._resolvePrec(graphFile) # Resolve Token-->Precedence references. for token in self._tokens.values(): - if type(token.prec) == str: - token.prec = self._precedences[token.prec] + if isinstance(token.prec, PrecedenceRef): + token.prec = self._precedences[token.prec.name] # Resolve Nonterm-->Precedence references. for nonterm in self._nonterms.values(): - if type(nonterm.prec) == str: - nonterm.prec = self._precedences[nonterm.prec] + if isinstance(nonterm.prec, PrecedenceRef): + nonterm.prec = self._precedences[nonterm.prec.name] # Resolve Nonterm-->{Nonterm,Token,Precedence} references. 
for nonterm in self._nonterms.values(): @@ -782,7 +840,7 @@ def _references(self, logFile, graphFile): ): dirtoks = introspection.parse_docstring(v.__doc__) if dirtoks[0] == "%reduce": - rhs = [] + rhs: List[SymbolSpec] = [] rhs_terms = [] prec = None for i in range(1, len(dirtoks)): @@ -835,7 +893,7 @@ def _references(self, logFile, graphFile): rhs, ) assert prod not in nonterm.productions - nonterm.productions.append(prod) + nonterm.productions.add(prod) self._productions.append(prod) if self._verbose: ntokens = len(self._tokens) - 1 @@ -855,7 +913,7 @@ def _references(self, logFile, graphFile): ) # Build the graph of Precedence relationships. - def _resolvePrec(self, graphFile): + def _resolvePrec(self, graphFile: Optional[str]) -> None: # Resolve symbolic references and populate equiv/dominators. for precA in self._precedences.values(): for precBName in precA.relationships: @@ -962,7 +1020,7 @@ def _resolvePrec(self, graphFile): raise SpecError("\n".join(cycles)) # Store state to a pickle file, if requested. - def _pickle(self, file, mode): + def _pickle(self, file: Optional[str], mode: PickleMode) -> None: if self._skinny: # Discard data that don't need to be pickled. del self._startSym @@ -983,7 +1041,9 @@ def _pickle(self, file, mode): # Restore state from a pickle file, if a compatible one is provided. This # method uses the same set of return values as does _compatible(). - def _unpickle(self, file, mode): + def _unpickle( + self, file: Optional[str], mode: PickleMode + ) -> SpecCompatibility: if file is not None and "r" in mode: if self._verbose: print( @@ -995,7 +1055,7 @@ def _unpickle(self, file, mode): # Any exception at all in unpickling can be assumed to be # due to an incompatible pickle. try: - spec = pickle.load(f) + spec: Spec = pickle.load(f) except Exception: if self._verbose: error = sys.exc_info() @@ -1100,8 +1160,8 @@ def _unpickle(self, file, mode): # not. # # "incompatible" : No useful compatibility. - def _compatible(self, other): - ret = "compatible" + def _compatible(self, other: Spec) -> SpecCompatibility: + ret: SpecCompatibility = "compatible" if (not self._skinny) and other._skinny: return "incompatible" @@ -1277,7 +1337,7 @@ def _compatible(self, other): # Check for unused prececence/token/nonterm/reduce specifications, then # throw a SpecError if any ambiguities exist in the grammar. - def _validate(self, logFile): + def _validate(self, logFile: Optional[str]) -> None: if self._verbose: print("Parsing.Spec: Validating grammar...") @@ -1291,18 +1351,19 @@ def _validate(self, logFile): # Previous code guarantees that all precedence/token/nonterm names are # unique. Therefore, we can build a single dictionary here that keys # on names. - used = {} + used: Dict[str, Any] = {} productions = [] for itemSet in self._itemSets: for item in itemSet: productions.append(item.production) used[item.production.prec.name] = item.production.prec - for sym in [item.production.lhs] + item.production.rhs: + lhs: List[SymbolSpec] = [item.production.lhs] + for sym in lhs + item.production.rhs: used[sym.name] = sym used[sym.prec.name] = sym.prec - for token in item.lookahead.keys(): - used[token.prec.name] = token.prec + for token_spec in item.lookahead.keys(): + used[token_spec.prec.name] = token_spec.prec nUnused = 0 @@ -1382,7 +1443,7 @@ def _validate(self, logFile): sys.stdout.write("%s\n" % "\n".join(lines)) # Compute the first sets for all symbols. - def _firstSets(self): + def _firstSets(self) -> None: # Terminals. # first(X) is X for terminals. 
for sym in self._tokens.values(): @@ -1396,11 +1457,11 @@ def _firstSets(self): while not done: done = True for name in self._nonterms: - sym = self._nonterms[name] - for prod in sym.productions: + nonterm = self._nonterms[name] + for prod in nonterm.productions: # Merge epsilon if there is an empty production. if len(prod.rhs) == 0: - if not sym.firstSetMerge(epsilon): + if not nonterm.firstSetMerge(epsilon): done = False # Iterate through the RHS and merge the first sets into @@ -1409,7 +1470,7 @@ def _firstSets(self): for elm in prod.rhs: containsEpsilon = False for elmSym in elm.firstSet: - if not sym.firstSetMerge(elmSym): + if not nonterm.firstSetMerge(elmSym): done = False if elmSym == epsilon: containsEpsilon = True @@ -1417,8 +1478,8 @@ def _firstSets(self): break # Compute the follow sets for all symbols. - def _followSets(self): - self._startSym.followSet = [epsilon] + def _followSets(self) -> None: + self._startSym.followSet = {epsilon} # Repeat the following loop until no more symbols can be added to any # follow set. @@ -1447,7 +1508,7 @@ def _followSets(self): break # Compute the collection of sets of LR(1) items. - def _items(self): + def _items(self) -> None: # Add {[S' ::= * S $., ]} to _itemSets. tItemSet = ItemSet() tItem = Item(self._startProd, 0, [epsilon]) @@ -1470,7 +1531,8 @@ def _items(self): # indices; these itemsets are the ones referred to the key itemset. itemSetsHash = {tItemSet: [0]} - syms = list(self._tokens.values()) + list(self._nonterms.values()) + syms: List[SymbolSpec] = list(self._tokens.values()) + syms += list(self._nonterms.values()) while len(worklist) > 0: if self._verbose: if abs(len(worklist) - nwork) >= 10: @@ -1520,7 +1582,7 @@ def _items(self): self._itemSetsHash = itemSetsHash # Compute LR parsing tables. - def _lr(self): + def _lr(self) -> None: # The collection of sets of LR(1) items already exists. assert len(self._itemSets) > 0 assert len(self._action) == 0 @@ -1538,6 +1600,7 @@ def _lr(self): sys.stdout.flush() itemSetsHash = self._itemSetsHash + assert itemSetsHash is not None for itemSet in self._itemSets: if self._verbose: @@ -1545,7 +1608,7 @@ def _lr(self): sys.stdout.flush() # ============================================================== # _action. - state = {} + state: ActionState = {} self._action.append(state) for item in itemSet: # X ::= a*Ab @@ -1577,16 +1640,16 @@ def _lr(self): assert False # ============================================================= # _goto. - state = {} - self._goto.append(state) + gstate: GotoState = {} + self._goto.append(gstate) for nonterm in self._nonterms.values(): itemSetB = itemSet.goto(nonterm) if itemSetB in itemSetsHash: for i in itemSetsHash[itemSetB]: itemSetC = self._itemSets[i] if itemSetC.weakCompat(itemSetB): - assert nonterm not in state - state[nonterm] = i + assert nonterm not in gstate + gstate[nonterm] = i break if self._verbose: @@ -1594,11 +1657,12 @@ def _lr(self): sys.stdout.flush() # Add a symbol action to state, if the action doesn't already exist. - def _actionAppend(self, state, sym, action): - assert type(state) == dict - assert isinstance(sym, SymbolSpec) - assert isinstance(action, Action) - + def _actionAppend( + self, + state: dict[SymbolSpec, list[Action]], + sym: SymbolSpec, + action: Action, + ) -> None: if sym not in state: state[sym] = [action] else: @@ -1607,7 +1671,7 @@ def _actionAppend(self, state, sym, action): state[sym].append(action) # Look for action ambiguities and resolve them if possible. 
- def _disambiguate(self): + def _disambiguate(self) -> None: assert self._nActions == 0 assert self._nConflicts == 0 assert self._nImpure == 0 @@ -1703,7 +1767,11 @@ def _disambiguate(self): # "both" : Keep both. # "new" : Keep new. # "err" : Unresolvable conflict. - def _resolve(self, sym, oldAct, newAct): + def _resolve( + self, sym: SymbolSpec, oldAct: Action, newAct: Action + ) -> ConflictResolution: + ret: ConflictResolution + if type(oldAct) == ShiftAction: oldPrec = sym.prec elif type(oldAct) == ReduceAction: diff --git a/parsing/grammar.py b/parsing/grammar.py index d05cad8..e8739dc 100644 --- a/parsing/grammar.py +++ b/parsing/grammar.py @@ -23,14 +23,30 @@ This module contains classes that are used in the specification of grammars. """ +from __future__ import annotations +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Iterable, + Mapping, + List, + Optional, + Tuple, + Type, +) + import re import sys from parsing.ast import Token, Nonterm from parsing.errors import SpecError from parsing import introspection +if TYPE_CHECKING: + import types + -class Precedence(object): +class Precedence: """ Precedences can be associated with tokens, non-terminals, and productions. Precedence isn't as important for GLR parsers as for LR @@ -103,20 +119,24 @@ class P4(Parsing.Precedence): assoc_tok_re = re.compile(r"([<>=])([A-Za-z]\w*)") - def __init__(self, name, assoc, relationships): + def __init__( + self, + name: str, + assoc: str, + relationships: Mapping[str, str], + ) -> None: assert assoc in ["fail", "nonassoc", "left", "right", "split"] - assert type(relationships) == dict self.name = name self.assoc = assoc self.relationships = relationships # Raw relationships specification. # Precedences that have equivalent precedence. - self.equiv = set((self,)) + self.equiv: set[Precedence] = set((self,)) # Precedences that have higher precedence. - self.dominators = set() + self.dominators: set[Precedence] = set() - def __repr__(self): + def __repr__(self) -> str: equiv = [prec.name for prec in self.equiv] equiv.sort() domin = [prec.name for prec in self.dominators] @@ -129,40 +149,54 @@ def __repr__(self): ) -class SymbolSpec(int): - seq = 0 +class PrecedenceRef(Precedence): + def __init__(self, name: str) -> None: + super().__init__(name, "fail", {}) - def __new__(cls, *args, **kwargs): - result = int.__new__(cls, SymbolSpec.seq) - result.seq = SymbolSpec.seq - SymbolSpec.seq += 1 - return result - def __init__(self, name, prec): - assert type(name) == str +class SymbolSpec: + seq = 0 + def __init__(self, name: str, prec: Precedence) -> None: self.name = name self.prec = prec - self.firstSet = [] # Set. - self.followSet = [] # Set. 
+ self.firstSet: set[SymbolSpec] = set() + self.followSet: set[SymbolSpec] = set() + self.seq = SymbolSpec.seq + SymbolSpec.seq += 1 + + def __hash__(self) -> int: + return self.seq - def __repr__(self): - return "%s" % self.name + def __eq__(self, other: Any) -> bool: + if isinstance(other, SymbolSpec): + return self.seq == other.seq + else: + return NotImplemented + + def __lt__(self, other: Any) -> bool: + if isinstance(other, SymbolSpec): + return self.seq < other.seq + else: + return NotImplemented + + def __repr__(self) -> str: + return self.name __str__ = __repr__ - def firstSetMerge(self, sym): + def firstSetMerge(self, sym: SymbolSpec) -> bool: if sym not in self.firstSet: - self.firstSet.append(sym) + self.firstSet.add(sym) return False else: return True - def followSetMerge(self, set): + def followSetMerge(self, set: Iterable[SymbolSpec]) -> bool: ret = True for sym in set: if sym != epsilon and sym not in self.followSet: - self.followSet.append(sym) + self.followSet.add(sym) ret = False return ret @@ -171,30 +205,36 @@ class NontermSpec(SymbolSpec): token_re = re.compile(r"([A-Za-z]\w*)") precedence_tok_re = re.compile(r"\[([A-Za-z]\w*)\]") - def __init__(self, nontermType, name, qualified, prec): - assert issubclass(nontermType, Nonterm) # Add forward decl for Lyken. - - SymbolSpec.__init__(self, name, prec) - + def __init__( + self, + nontermType: Type[Nonterm], + name: str, + qualified: str, + prec: Precedence, + ) -> None: + super().__init__(name, prec) self.qualified = qualified self.nontermType = nontermType - self.productions = [] # Set. + self.productions: set[Production] = set() @classmethod - def from_class(cls, nt_subclass, name=None, module=None): + def from_class( + cls, + nt_subclass: type, + name: Optional[str] = None, + module: Optional[types.ModuleType] = None, + ) -> Tuple[NontermSpec, bool]: if name is None: name = nt_subclass.__name__ if module is None: module_name = nt_subclass.__module__ else: module_name = module.__name__ - if nt_subclass.__doc__ is None: - dirtoks = ["%nonterm", name] - else: + if nt_subclass.__doc__ is not None: dirtoks = introspection.parse_docstring(nt_subclass.__doc__) - is_start = dirtoks[0] == "%start" - # if dirtoks[0] in SHORTHAND: - # dirtoks = ['%nonterm', name] + else: + dirtoks = ("%nonterm", name) + is_start = dirtoks[0] == r"%start" symbol_name = None prec = None i = 1 @@ -207,7 +247,7 @@ def from_class(cls, nt_subclass, name=None, module=None): "Precedence must come last in " "non-terminal specification: %s" % nt_subclass.__doc__ ) - prec = m.group(1) + prec = PrecedenceRef(m.group(1)) else: m = NontermSpec.token_re.match(tok) if m: @@ -221,58 +261,76 @@ def from_class(cls, nt_subclass, name=None, module=None): if symbol_name is None: symbol_name = name if prec is None: - prec = "none" + prec = PrecedenceRef("none") - # nonterm = NontermSpec(symbol_name, nt_subclass, - # "%s.%s" % (module_name, name), prec) nonterm = NontermSpec( - nt_subclass, symbol_name, "%s.%s" % (module_name, name), prec + nt_subclass, symbol_name, f"{module_name}.{name}", prec ) return nonterm, is_start # AKA terminal symbol. 
class TokenSpec(SymbolSpec): - def __init__(self, tokenType, name, prec): - assert issubclass(tokenType, Token) - assert type(name) == str - assert isinstance(prec, Precedence) or type(prec) == str - + def __init__( + self, + tokenType: Type[Token], + name: str, + prec: Precedence, + ) -> None: SymbolSpec.__init__(self, name, prec) self.tokenType = tokenType -class Production(int): +class Production: seq = 0 - def __new__(cls, *args, **kwargs): - result = int.__new__(cls, Production.seq) - result.seq = Production.seq - Production.seq += 1 - return result - - def __init__(self, method, qualified, prec, lhs, rhs): - assert isinstance(prec, Precedence) - assert isinstance(lhs, NontermSpec) - if __debug__: - for elm in rhs: - assert isinstance(elm, SymbolSpec) - + def __init__( + self, + method: Callable[..., Nonterm | None], + qualified: str, + prec: Precedence, + lhs: NontermSpec, + rhs: List[SymbolSpec], + ) -> None: self.method = method self.qualified = qualified self.prec = prec self.lhs = lhs self.rhs = rhs + self.seq = Production.seq + Production.seq += 1 - def __getstate__(self): + def __hash__(self) -> int: + return self.seq + + def __eq__(self, other: Any) -> bool: + if type(other) == Production: + return self.seq == other.seq + else: + return NotImplemented + + def __lt__(self, other: Any) -> bool: + if type(other) == Production: + return self.seq < other.seq + else: + return NotImplemented + + def __getstate__( + self, + ) -> Tuple[str, Precedence, NontermSpec, List[SymbolSpec], int]: return (self.qualified, self.prec, self.lhs, self.rhs, self.seq) - def __setstate__(self, data): + def __setstate__( + self, + data: Tuple[str, Precedence, NontermSpec, List[SymbolSpec], int], + ) -> None: # Convert qualified name to a function reference. (qualified, prec, lhs, rhs, seq) = data elms = qualified.split(".") - method = sys.modules[elms[0]] - for elm in elms[1:]: + assert len(elms) > 1 + module = sys.modules[elms[0]] + method = module.__dict__[elms[1]] + for elm in elms[2:]: method = method.__dict__[elm] # Set state. @@ -283,7 +341,7 @@ def __setstate__(self, data): self.rhs = rhs self.seq = seq - def __repr__(self): + def __repr__(self) -> str: return "%r ::= %s. [%s]" % ( self.lhs, " ".join(["%r" % elm for elm in self.rhs]), @@ -293,7 +351,7 @@ def __repr__(self): # Optional callback method. # # Called when a production is reduced. 
- def reduce(self, lhs, *rhs): + def reduce(self, lhs: NontermSpec, *rhs: SymbolSpec) -> None: pass @@ -303,8 +361,8 @@ class EndOfInput(Token): class EndOfInputSpec(TokenSpec): - def __init__(self): - TokenSpec.__init__(self, EndOfInput, "<$>", "none") + def __init__(self) -> None: + TokenSpec.__init__(self, EndOfInput, "<$>", PrecedenceRef("none")) eoi = EndOfInputSpec() @@ -316,28 +374,23 @@ class Epsilon(Token): class EpsilonSpec(TokenSpec): - def __init__(self): - TokenSpec.__init__(self, Epsilon, "", "none") + def __init__(self) -> None: + TokenSpec.__init__(self, Epsilon, "", PrecedenceRef("none")) epsilon = EpsilonSpec() class NontermStart(Nonterm): - def reduce(self, userStartSym, eoi): + def reduce(self, userStartSym: SymbolSpec, eoi: EndOfInputSpec) -> None: pass -class Start(Production): - def __init__(self, startSym, userStartSym): - Production.__init__(self, None, startSym, userStartSym) - - -class Action(object): +class Action: """ Abstract base class, subclassed by {Shift,Reduce}Action.""" - def __init__(self): + def __init__(self) -> None: pass @@ -345,14 +398,14 @@ class ShiftAction(Action): """ Shift action, with assocated nextState.""" - def __init__(self, nextState): + def __init__(self, nextState: int) -> None: Action.__init__(self) self.nextState = nextState - def __repr__(self): + def __repr__(self) -> str: return "[shift %r]" % self.nextState - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if not isinstance(other, ShiftAction): return False if self.nextState != other.nextState: @@ -364,14 +417,14 @@ class ReduceAction(Action): """ Reduce action, with associated production.""" - def __init__(self, production): + def __init__(self, production: Production) -> None: Action.__init__(self) self.production = production - def __repr__(self): + def __repr__(self) -> str: return "[reduce %r]" % self.production - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if not isinstance(other, ReduceAction): return False if self.production != other.production: diff --git a/parsing/interfaces.py b/parsing/interfaces.py index 3bf2735..dfac436 100644 --- a/parsing/interfaces.py +++ b/parsing/interfaces.py @@ -3,45 +3,37 @@ that objects or classes can implement to be used in the library """ -__all__ = ["is_parser", "is_symspec", "is_spec_source"] - - -def has_methods(obj, methods): - for method in methods: - if not hasattr(obj, method): - return False - if not callable(getattr(obj, method)): - return False - return True - - -def is_parser(parser): - """ - returns True if `parser` fits the structural interface for a parser. - """ - if not has_methods(parser, ["token", "eoi"]): - return False - if not hasattr(parser, "_spec"): - return False - return True - - -def is_symspec(symspec): - """ - returns True if `symspec` can be used as a symbol specification. - """ - if not hasattr(symspec, "name"): - return False - if not hasattr(symspec, "prec"): - return False - return True - - -def is_spec_source(source): - """ - returns True if `source` can be used to define a grammar. - """ - if not has_methods( - source, ["get_precedences", "get_tokens", "get_nonterminals"] - ): - return False +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import Protocol + from parsing.ast import Token + from parsing.automaton import Spec + from parsing.grammar import NontermSpec, Precedence, TokenSpec + + class Parser(Protocol): + _spec: Spec + + def token(self, token: Token) -> None: + ... 
+ + def eoi( + self, + ) -> None: + ... + + class SymbolSpec(Protocol): + name: str + prec: str + + class SpecSource(Protocol): + def get_precedences(self) -> list[Precedence]: + ... + + def get_tokens(self) -> list[TokenSpec]: + ... + + def get_nonterminals(self) -> tuple[list[NontermSpec], NontermSpec]: + ... diff --git a/parsing/introspection.py b/parsing/introspection.py index 244781d..77a19a9 100644 --- a/parsing/introspection.py +++ b/parsing/introspection.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import re -def parse_docstring(s): - return list(filter(None, re.split(r"\s+", s.replace("\n", " ")))) +def parse_docstring(s: str) -> tuple[str, ...]: + return tuple(filter(None, re.split(r"\s+", s.replace("\n", " ")))) diff --git a/parsing/module_spec.py b/parsing/module_spec.py index 270f18f..c899fc4 100644 --- a/parsing/module_spec.py +++ b/parsing/module_spec.py @@ -2,19 +2,24 @@ This module contains functionality for extracting a grammar from classes in a module. """ +from __future__ import annotations + import types -from parsing.grammar import Precedence, TokenSpec, NontermSpec, SpecError +from parsing.grammar import Precedence, PrecedenceRef, TokenSpec, NontermSpec from parsing.ast import Token, Nonterm from parsing import introspection +from parsing.errors import SpecError -class ModuleSpecSource(object): +class ModuleSpecSource: """ ModuleSpecSource scans one or several modules for subclasses of relevant classes (Precedence, Token, Nonterm) with specific docstrings. """ - def __init__(self, modules): + def __init__( + self, modules: types.ModuleType | list[types.ModuleType] + ) -> None: if isinstance(modules, types.ModuleType): # Wrap single module in a list. modules = [modules] @@ -26,11 +31,13 @@ def __init__(self, modules): dirtoks = introspection.parse_docstring(v.__doc__) items.append((module, k, v, dirtoks)) self.named_objs = items - self._cache_precedences = None - self._cache_tokens = None - self._cache_nonterminals = None + self._cache_precedences: list[Precedence] | None = None + self._cache_tokens: list[TokenSpec] | None = None + self._cache_nonterminals: tuple[ + list[NontermSpec], NontermSpec + ] | None = None - def get_precedences(self): + def get_precedences(self) -> list[Precedence]: if self._cache_precedences is not None: return self._cache_precedences result = [] @@ -77,7 +84,7 @@ def get_precedences(self): self._cache_precedences = result return result - def get_tokens(self): + def get_tokens(self) -> list[TokenSpec]: if self._cache_tokens is not None: return self._cache_tokens result = [] @@ -95,7 +102,7 @@ def get_tokens(self): "Precedence must come last in token " "specification: %s" % v.__doc__ ) - prec = m.group(1) + prec = PrecedenceRef(m.group(1)) else: m = NontermSpec.token_re.match(tok) if m: @@ -106,14 +113,13 @@ def get_tokens(self): ) i += 1 if prec is None: - prec = "none" - # token = TokenSpec(name, v, prec) + prec = PrecedenceRef("none") token = TokenSpec(v, name, prec) result.append(token) self._cache_tokens = result return result - def get_nonterminals(self): + def get_nonterminals(self) -> tuple[list[NontermSpec], NontermSpec]: if self._cache_nonterminals is not None: return self._cache_nonterminals result = [] @@ -131,5 +137,6 @@ def get_nonterminals(self): % v.__doc__ ) startSym = nonterm + assert startSym is not None self._cache_nonterminals = (result, startSym) return result, startSym diff --git a/parsing/py.typed b/parsing/py.typed new file mode 100644 index 0000000..1242d43 --- /dev/null +++ b/parsing/py.typed @@ -0,0 +1 
@@ +# Marker file for PEP 561. diff --git a/parsing/tests/test_codestyle.py b/parsing/tests/test_codestyle.py index 8c75b92..a7dacc3 100644 --- a/parsing/tests/test_codestyle.py +++ b/parsing/tests/test_codestyle.py @@ -10,7 +10,7 @@ def find_root(): ) -class TestFlake8(unittest.TestCase): +class TestCodeQuality(unittest.TestCase): def test_flake8(self): rootpath = find_root() @@ -30,6 +30,40 @@ def test_flake8(self): "flake8 validation failed:\n{}".format(output) ) + def test_mypy(self): + rootpath = find_root() + config_path = os.path.join(rootpath, "pyproject.toml") + if not os.path.exists(config_path): + raise RuntimeError("could not locate pyproject.toml file") + + try: + import mypy # NoQA + except ImportError: + raise unittest.SkipTest("mypy module is missing") + + try: + subprocess.run( + [ + sys.executable, + "-m", + "mypy", + "--config-file", + config_path, + "parsing", + ], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=rootpath, + ) + except subprocess.CalledProcessError as ex: + output = ex.stdout.decode() + if ex.stderr: + output += "\n\n" + ex.stderr.decode() + raise AssertionError( + f"mypy validation failed:\n{output}" + ) from None + if __name__ == "__main__": unittest.main() diff --git a/pyproject.toml b/pyproject.toml index 54d9127..de60a72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -requires-python = ">=3.6" +requires-python = ">=3.7" [build-system] requires = ["setuptools>=42", "wheel"] @@ -7,4 +7,30 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 79 -target-version = ["py36"] +target-version = ["py37"] + +[tool.mypy] +python_version = "3.7" +follow_imports = "normal" +ignore_missing_imports = true +warn_redundant_casts = true +warn_unused_configs = true +show_column_numbers = true +disallow_subclassing_any = true +disallow_any_generics = true +disallow_untyped_calls = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +no_implicit_optional = true +warn_unused_ignores = true +warn_return_any = true +no_implicit_reexport = true +strict_equality = true + +[[tool.mypy.overrides]] +module = [ + "parsing.tests.*", +] +ignore_errors = true diff --git a/setup.py b/setup.py index 67bce0b..e3d7efa 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,8 @@ setup( name="parsing", - version="1.6.1", - python_requires=">=3.6.0", + version="2.0.0.dev0", + python_requires=">=3.7.0", url="http://www.canonware.com/Parsing/", license="MIT", author="Jason Evans", @@ -30,9 +30,11 @@ "Topic :: Text Processing :: General", ], packages=["parsing", "parsing.tests", "parsing.tests.specs"], + package_data={"parsing": ["py.typed"]}, extras_require={ "test": [ "flake8", + MYPY_DEPENDENCY, ] }, **extra,
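
A minimal sketch (not part of the patch) of driving the typed public API that the annotations above describe. The toy expression grammar, the single-module layout, and the names id/plus/E are illustrative assumptions mirroring the docstring examples in parsing/ast.py; the entry points (parsing.Spec, parsing.Lr, parsing.Token, parsing.Nonterm) are the ones listed in the parsing/__init__.py module docstring.

    # Toy grammar: E ::= id | E plus id, driven through Spec/Lr.
    import sys
    import parsing


    class id(parsing.Token):
        "%token"


    class plus(parsing.Token):
        "%token"


    class E(parsing.Nonterm):
        "%start"

        def reduceId(self, i: id) -> None:
            "%reduce id"

        def reduceAdd(self, left: "E", op: plus, i: id) -> None:
            "%reduce E plus id"


    # Introspect this module for %token/%start/%reduce docstrings and
    # build the LR(1) tables, then feed tokens for the input "id plus id".
    spec = parsing.Spec(sys.modules[__name__], skinny=False)
    parser = parsing.Lr(spec)
    for tok in (id(parser), plus(parser), id(parser)):
        parser.token(tok)
    parser.eoi()
    result = parser.start[0]  # an E instance for the parsed input

With the py.typed marker and the new [tool.mypy] section in pyproject.toml, such downstream code is type-checked the same way the new test_mypy gate checks the package itself (python -m mypy --config-file pyproject.toml parsing).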