
Document the tokeniser #11

Merged 5 commits on Feb 6, 2021

Changes from 1 commit
mathics_scanner/errors.py (8 changes: 6 additions & 2 deletions)

@@ -3,17 +3,21 @@


 class TranslateError(Exception):
-    def __init__(self):
-        pass
+    """A generic class of tokenizing errors"""
+    pass


 class ScanError(TranslateError):
+    """A generic scanning error"""
     pass


 class InvalidSyntaxError(TranslateError):
+    """Invalid syntax"""
     pass


 class IncompleteSyntaxError(TranslateError):
+    """More characters were expected to form a valid token"""
     pass
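
Because every scanner error derives from ``TranslateError``, callers can handle
the whole family with a single ``except`` clause. A minimal sketch of a driver
loop (the ``drain`` helper is ours, not the library's, and it assumes a fully
constructed ``Tokeniser``)::

    from mathics_scanner.errors import IncompleteSyntaxError, TranslateError

    def drain(tokeniser):
        """Collect tokens until END; return None on a fatal scan error."""
        tokens = []
        while True:
            try:
                token = tokeniser.next()
            except IncompleteSyntaxError:
                tokeniser.incomplete()  # ask the prescanner for more input
                continue
            except TranslateError:
                return None  # any other tokenizing error is fatal here
            if token.tag == "END":
                return tokens
            tokens.append(token)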

mathics_scanner/tokeniser.py (62 changes: 54 additions & 8 deletions)

@@ -305,11 +305,22 @@ def compile_tokens(token_list):


 def is_symbol_name(text):
+    """
+    Returns ``True`` if ``text`` is a valid identifier. Otherwise returns
+    ``False``.
+    """
+    # Can't we just call match here?
     return full_symbol_pattern.sub("", text) == ""
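
On the ``match`` question above: plain ``match`` only anchors at the start of
the string, so the drop-in replacement would be ``fullmatch`` (or ``match``
with a trailing ``\Z``). The two formulations agree for typical symbol
patterns, although ``sub``-to-empty and ``fullmatch`` are not equivalent for
every regex. A sketch, with a simplified stand-in for ``full_symbol_pattern``::

    import re

    # Simplified stand-in for full_symbol_pattern (assumption: the real
    # pattern matches one complete, possibly context-qualified, symbol).
    symbol = re.compile(r"[A-Za-z$][A-Za-z0-9$]*(`[A-Za-z$][A-Za-z0-9$]*)*")

    def is_symbol_name_sub(text):
        return symbol.sub("", text) == ""

    def is_symbol_name_fullmatch(text):
        return symbol.fullmatch(text) is not None

    for text in ("System`Plus", "x2", "2x", "a b"):
        assert is_symbol_name_sub(text) == is_symbol_name_fullmatch(text)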


 class Token(object):
+    "A representation of a Wolfram Language token"
     def __init__(self, tag, text, pos):
+        """
+        @param: tag A string that indicates which type of token this is.
+        @param: text The actual contents of the token.
+        @param: pos The position of the token in the input feed.
+        """
         self.tag = tag
         self.text = text
         self.pos = pos
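
For illustration, the three fields for a number token scanned at the start of
the input (values made up, assuming ``Token`` is imported from
``mathics_scanner.tokeniser``)::

    token = Token("Number", "42", 0)
    assert (token.tag, token.text, token.pos) == ("Number", "42", 0)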
@@ -326,28 +337,54 @@ def __repr__(self):


 class Tokeniser(object):
+    """
+    A tokenizer for the Wolfram Language.
+
+    When subclassing ``Tokeniser``, custom tokenisation rules can be defined by
+    declaring methods whose names are preceded by ``t_``, such as in the
+    following example: ::
+
+        class MyTokeniser(Tokeniser):
+            def t_MyWeirdRule(self, match):
+                # Your logic goes here...
+                pass
+
+    In this example, ``t_MyWeirdRule`` is supposed to update the internal state
+    of the tokeniser and return a ``Token`` with an appropriate tag. ``match``
+    is expected to be an instance of ``re.Match``.
+    """

GarkGarcia (Contributor, Author) commented on Feb 2, 2021:

    I don't think this mechanism should be exposed in the public API (and
    therefore it shouldn't be documented in the docstring). If you think about
    it, all consumers of this library want is a functioning WL tokeniser that
    they can use as a black box (that's what I think, at least).

    This definitely should be documented somewhere, though. @rocky I'd
    appreciate it if we could merge #8 before this, so that I can move this
    information to implementation.rst. I also plan to convert
    implementation.rst and the rest of the documentation to a proper Sphinx
    document before we release the library (which should be pretty easy to do,
    so it's not gonna take too much time).
     modes = {
         "expr": (tokens, token_indices),
         "filename": (filename_tokens, {}),
     }

     def __init__(self, feeder):
+        """
+        @param: feeder An instance of ``LineFeeder`` which will feed characters
+        to the tokenizer.
+        """
         self.pos = 0
         self.feeder = feeder
         self.prescanner = Prescanner(feeder)
         self.code = self.prescanner.scan()
         self.change_mode("expr")

+    # TODO: Turn this into a setter in the future?
     def change_mode(self, mode):
+        """
+        Set the mode of the tokenizer
+        """
         self.mode = mode
         self.tokens, self.token_indices = self.modes[mode]
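
On the setter ``TODO``: a property can keep the token tables in sync with the
mode automatically. A standalone sketch with stand-in token tables (names are
ours, not the library's)::

    class ModeExample:
        modes = {"expr": ("expr-tokens", {}), "filename": ("file-tokens", {})}

        def __init__(self):
            self.mode = "expr"  # runs through the setter below

        @property
        def mode(self):
            return self._mode

        @mode.setter
        def mode(self, mode):
            # Changing the mode atomically swaps in the matching token
            # tables, so the two can never drift out of sync.
            self._mode = mode
            self.tokens, self.token_indices = self.modes[mode]

    m = ModeExample()
    m.mode = "filename"
    assert m.tokens == "file-tokens"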

+    # TODO: Rename this to something that remotely makes sense?
     def incomplete(self):
-        "get more code from the prescanner and continue"
+        "Get more code from the prescanner and continue"
         self.prescanner.incomplete()
         self.code += self.prescanner.scan()

     def sntx_message(self, pos=None):
+        """Send a message to the feeder."""
         if pos is None:
             pos = self.pos
         pre, post = self.code[:pos], self.code[pos:].rstrip("\n")
@@ -356,8 +393,9 @@ def sntx_message(self, pos=None):
         else:
             self.feeder.message("Syntax", "sntxf", pre, post)

+    # TODO: Convert this to __next__ in the future?
     def next(self):
-        "return next token"
+        "Returns the next token"
         self.skip_blank()
         if self.pos >= len(self.code):
             return Token("END", "", len(self.code))
@@ -391,7 +429,7 @@ def next(self):
         return Token(tag, text, match.start(0))
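
The ``__next__`` ``TODO`` would let callers iterate over tokens directly. A
standalone sketch of the wrapping, using the ``END`` sentinel convention from
``next`` above (the class is ours)::

    class IterTokens:
        def __init__(self, tokens):
            self._queue = list(tokens)

        def next(self):
            # Stand-in for Tokeniser.next(): ("END", "", 0) means exhausted.
            return self._queue.pop(0) if self._queue else ("END", "", 0)

        def __iter__(self):
            return self

        def __next__(self):
            token = self.next()
            if token[0] == "END":
                raise StopIteration
            return token

    assert list(IterTokens([("Number", "1", 0)])) == [("Number", "1", 0)]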

     def skip_blank(self):
-        "skip whitespace and comments"
+        "Skip whitespace and comments"
         comment = []  # start positions of comments
         while True:
             if self.pos >= len(self.code):
@@ -417,6 +455,7 @@ def skip_blank(self):
                 break

     def t_String(self, match):
+        "``String`` tokenizer"
         start, end = self.pos, None
         self.pos += 1  # skip opening '"'
         newlines = []
@@ -444,6 +483,7 @@ def t_String(self, match):
         return Token("String", result, start)

     def t_Number(self, match):
+        "Number tag"
         text = match.group(0)
         pos = match.end(0)
         if self.code[pos - 1 : pos + 1] == "..":
@@ -454,21 +494,27 @@
         self.pos = pos
         return Token("Number", text, match.start(0))

-    def token_mode(self, match, tag, mode):
+    # This isn't used outside of here, so it's considered internal
+    def _token_mode(self, match, tag, mode):
         "consume a token and switch mode"
         text = match.group(0)
         self.pos = match.end(0)
         self.change_mode(mode)
         return Token(tag, text, match.start(0))

     def t_Get(self, match):
-        return self.token_mode(match, "Get", "filename")
+        "Get tag"
+        return self._token_mode(match, "Get", "filename")

     def t_Put(self, match):
-        return self.token_mode(match, "Put", "filename")
+        "Put tag"
+        return self._token_mode(match, "Put", "filename")

     def t_PutAppend(self, match):
-        return self.token_mode(match, "PutAppend", "filename")
+        "PutAppend tag"
+        return self._token_mode(match, "PutAppend", "filename")

     def t_Filename(self, match):
-        return self.token_mode(match, "Filename", "expr")
+        "Filename tag"
+        return self._token_mode(match, "Filename", "expr")