Skip to content

Commit

Permalink
Added cythonized version of bmrblex.
Browse files Browse the repository at this point in the history
  • Loading branch information
smelandr committed Jan 4, 2017
1 parent 8fa1db8 commit 98e73fd
Showing 1 changed file with 182 additions and 0 deletions.
182 changes: 182 additions & 0 deletions nmrstarlib/cbmrblex.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
nmrstarlib.bmrblex
~~~~~~~~~~~~~~~~~~
This module provides the :func:`~nmrstarlib.bmrblex.bmrblex` lexical analyzer for
BMRB NMR-STAR format syntax. It is implemented as a Python generator-based
state machine which yields one token at a time when
:py:func:`next()` is invoked on a :func:`~nmrstarlib.bmrblex.bmrblex` instance.
Simplified description of parsing rules:
----------------------------------------
* Each word or number separated by whitespace characters is a separate BMRB token.
* Each single quoted (') string is a separate BMRB token, it should start with a single quote (')
and end with a single quote *always* followed by whitespace character(s).
* Each double quoted (") string is a separate BMRB token, it should start with a double quote (")
and end with a double quote *always* followed by whitespace character(s).
* Single quoted and double quoted strings have to be processed separately.
* Single quoted and double quoted strings are processed one character at a time.
* Multiline strings start with a semicolon *always* followed by a new line character and
end with a semicolon *always* followed by whitespace character(s).
* Multiline strings are processed one line at a time.
.. note::
* For a full description of NMR-STAR file format, see official documentation:
http://www.bmrb.wisc.edu/dictionary/
* For a concise description of the NMR-STAR file format grammar see:
https://github.com/mattfenwick/NMRPyStar#nmr-star-grammar
"""

from collections import deque


def transform_text(input_txt):
    """Transform text into a :py:class:`~collections.deque` of lexer items.

    The text is split on new line characters and processed line by line:

    * A line starting with a semicolon opens a multiline string.  Every line
      up to the next line starting with a semicolon is collected, and the
      whole block (opening line, inner lines separated by new line
      characters, closing semicolon) is stored as a single item.  Characters
      following the closing semicolon are stored one at a time.
    * A line whose first non-blank character is ``#`` is a comment and is
      dropped entirely.
    * Any other line is stored one character at a time.

    A new line character is stored after every line that is not a comment.

    :param str or bytes input_txt: Input text.
    :return: Double-ended queue of single characters and multiline strings.
    :rtype: :py:class:`~collections.deque`
    :raises TypeError: If `input_txt` is neither ``str`` nor ``bytes``.
    :raises IndexError: If a multiline string is opened but the input ends
        before a closing line starting with a semicolon is found.
    """
    if isinstance(input_txt, str):
        text = u"{}".format(input_txt)
    elif isinstance(input_txt, bytes):
        text = input_txt.decode("utf-8")
    else:
        raise TypeError("Expecting <class 'str'> or <class 'bytes'>, but {} was passed".format(type(input_txt)))

    inputq = deque(text.split(u"\n"))
    outputq = deque()

    cdef unicode line
    cdef list block

    while inputq:
        line = inputq.popleft()

        if line.startswith(u";"):
            # Keep the opening line as-is so that text following the opening
            # semicolon is not lost, and re-insert the new line characters
            # that text.split() removed so that inner lines stay separated.
            block = [line, u"\n"]
            line = inputq.popleft()

            while not line.startswith(u";"):
                block.append(line)
                block.append(u"\n")
                line = inputq.popleft()

            # Only the closing semicolon belongs to the multiline string;
            # the remainder of the closing line is regular input.
            block.append(line[:1])
            outputq.append(u"".join(block))
            outputq.extend(line[1:])

        elif line.lstrip().startswith(u"#"):
            continue
        else:
            outputq.extend(line)

        outputq.append(u"\n")

    return outputq


def bmrblex(text):
    """A lexical analyzer for the BMRB NMR-STAR format syntax.

    This is a generator: the input is pre-processed with
    :func:`transform_text` and then consumed one item at a time with one
    item of lookahead.  The state machine uses four states:

    * ``u" "`` -- between tokens (skipping whitespace),
    * ``u"a"`` -- inside a regular (unquoted) token,
    * ``u"'"`` or ``u'"'`` -- inside a single- or double-quoted token,
    * ``None`` -- the input has been exhausted.

    When the input is exhausted while between tokens, an empty string is
    yielded as the final token.

    :param text: Input text.
    :type text: :py:class:`str` or :py:class:`bytes`
    :return: Current token.
    :rtype: :py:class:`str`
    """
    stream = transform_text(text)

    # The backslash is written as an explicit escape (``\\``); the character
    # set is the same one the unescaped ``\{`` spelling produced.
    cdef unicode wordchars = (u"abcdefghijklmnopqrstuvwxyz"
                              u"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
                              u"ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
                              u"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ"
                              u"!@$%^&*()+:;?/>.<,~`|\\{[}]-=")
    cdef unicode whitespace = u" \t\r\n"
    cdef unicode state = u" "
    cdef unicode token = u""
    cdef unicode nextchar
    cdef unicode nextnextchar

    while len(stream) > 0:
        nextnextchar = stream.popleft()

        while True:
            # Advance by one item, keeping one item of lookahead; an empty
            # string marks the end of the input.
            nextchar = nextnextchar

            if len(stream) > 0:
                nextnextchar = stream.popleft()
            else:
                nextnextchar = u""

            # Process multiline string: transform_text() stores it as a
            # single item longer than one character.
            if len(nextchar) > 1:
                state = u" "
                token = nextchar
                break  # emit current token

            if state is None:
                token = u""  # past end of file
                break

            elif state == u" ":
                if not nextchar:
                    state = None
                    break

                elif nextchar in whitespace:
                    if token:
                        state = u" "
                        break  # emit current token

                elif nextchar in wordchars:
                    token = nextchar
                    state = u"a"

                elif nextchar == u"'" or nextchar == u'"':
                    token = nextchar
                    state = nextchar

                else:
                    # Any other character is emitted as a token of its own.
                    token = nextchar
                    if token:
                        state = u" "
                        break  # emit current token

            # Process single-quoted or double-quoted token
            elif state == u"'" or state == u'"':
                if not nextchar:
                    # End of input inside an unterminated quoted token:
                    # emit what has been collected instead of looping
                    # forever on the empty end-of-input marker.
                    state = None
                    break

                token += nextchar
                # A quote only closes the token when followed by whitespace
                # (or by the end of the input).
                if nextchar == state and nextnextchar in whitespace:
                    state = u" "
                    break

            # Process regular (unquoted) token
            elif state == u"a":
                if not nextchar:
                    state = None
                    break
                elif nextchar in whitespace:
                    state = u" "
                    if token:
                        break  # emit current token
                else:
                    token += nextchar

        # Push the unused lookahead item back so the next token starts on it.
        if nextnextchar:
            stream.appendleft(nextnextchar)

        yield token
        token = u""

0 comments on commit 98e73fd

Please sign in to comment.