update-error-constants.py

#!/usr/bin/env python

import sys, os, os.path, re, codecs

# Files whose generated sections this script rewrites.  The paths are
# relative, so they are resolved against the current working directory.
BUILD_SOURCE_FILE = os.path.join("src", "lxml", "xmlerror.pxi")
BUILD_DEF_FILE    = os.path.join("src", "lxml", "includes", "xmlerror.pxd")

# Print a usage message when the documentation directory argument is missing
# or when help was requested explicitly.
if len(sys.argv) < 2 or sys.argv[1].lower() in ('-h', '--help'):
    print("This script generates the constants in file %s" % BUILD_SOURCE_FILE)
    print("Call as")
    print(sys.argv[0], "/path/to/libxml2-doc-dir")
    # An explicit help request is a success (status 0); a missing argument
    # is a usage error (status 1).
    sys.exit(0 if len(sys.argv) > 1 else 1)

# The HTML files are read from the 'html' subdirectory of the given directory.
HTML_DIR = os.path.join(sys.argv[1], 'html')
os.stat(HTML_DIR) # raise an error if we can't find it

# Put the 'src' directory (relative to the current working directory) first
# on the module search path, so that 'lxml' is looked up there before any
# other location.
sys.path.insert(0, 'src')
from lxml import etree

# Maps each libxml2 enum name to a pair: the name of the variable that is
# generated for it in the .pxi file, and the prefix that is stripped from
# the enum's constant names when they are written to that variable.
ENUM_MAP = {
    'xmlErrorLevel': ('__ERROR_LEVELS', 'XML_ERR_'),
    'xmlErrorDomain': ('__ERROR_DOMAINS', 'XML_FROM_'),
    'xmlParserErrors': ('__PARSER_ERROR_TYPES', 'XML_'),
    # currently disabled:
    # 'xmlXPathError': ('__XPATH_ERROR_TYPES', ''),
    # 'xmlSchemaValidError': ('__XMLSCHEMA_ERROR_TYPES', 'XML_'),
    'xmlRelaxNGValidErr': ('__RELAXNG_ERROR_TYPES', 'XML_'),
}

# Order in which the enums are written to the generated sections.
ENUM_ORDER = (
    'xmlErrorLevel',
    'xmlErrorDomain',
    'xmlParserErrors',
    # currently disabled:
    # 'xmlXPathError',
    # 'xmlSchemaValidError',
    'xmlRelaxNGValidErr',
)

# Header written at the start of every generated section; it names this
# script by the basename it was invoked with.
COMMENT = (
    "\n"
    "# This section is generated by the script '%s'.\n"
    "\n"
) % os.path.basename(sys.argv[0])

def split(lines):
    """Split the lines of a file around its generated-constants section.

    The section starts at the first line that begins with '#' and contains
    "BEGIN: GENERATED CONSTANTS", and ends at the next line that begins with
    '#' and contains "END: GENERATED CONSTANTS".  The lines between the two
    markers are dropped.

    :param lines: iterable of lines (line endings included), e.g. an open
        text file.
    :returns: a tuple ``(pre, post)`` of lists.  ``pre`` holds every line up
        to and including the BEGIN marker, followed by an empty string.
        ``post`` holds an empty string, the END marker line, all lines after
        it, and a final empty string.
    :raises ValueError: if the BEGIN or the END marker line is missing.
        Without this check the last line read would be reused as the END
        marker, and a caller writing the result back would corrupt the file.
    """
    lines = iter(lines)

    pre = []
    for line in lines:
        pre.append(line)
        if line.startswith('#') and "BEGIN: GENERATED CONSTANTS" in line:
            break
    else:
        # input exhausted (or empty) without seeing the start marker
        raise ValueError("no 'BEGIN: GENERATED CONSTANTS' marker line found")
    pre.append('')

    # Skip the old generated content; the same iterator continues right
    # after the BEGIN marker.
    for line in lines:
        if line.startswith('#') and "END: GENERATED CONSTANTS" in line:
            break
    else:
        raise ValueError("no 'END: GENERATED CONSTANTS' marker line found")

    post = ['', line]
    post.extend(lines)
    post.append('')
    return pre, post

def regenerate_file(filename, result):
    """Replace the generated section of *filename* with new content.

    The file is read as UTF-8 and divided by ``split()`` into the part
    before and the part after its generated section.  It is then rewritten
    as: the leading part, the ``COMMENT`` header, the lines of *result*
    joined with newlines, and the trailing part.

    :param filename: path of the file to rewrite in place.
    :param result: list of strings (without line endings) that form the new
        generated section.
    """
    # Read everything first: reopening the file for writing truncates it.
    # The 'with' blocks close the file even if split() or a write fails.
    with codecs.open(filename, 'r', encoding="utf-8") as f:
        pre, post = split(f)

    with codecs.open(filename, 'w', encoding="utf-8") as f:
        f.write(''.join(pre))
        f.write(COMMENT)
        f.write('\n'.join(result))
        f.write(''.join(post))

# Pre-compiled XPath expressions used when parsing the documentation pages.

# Evaluates to the string value of the node it is applied to.
collect_text = etree.XPath("string()")

# Selects the XHTML <pre class="programlisting"> elements whose text
# contains 'Enum'.
find_enums = etree.XPath(
    "//html:pre[@class = 'programlisting' and contains(text(), 'Enum')]",
    namespaces={'html': 'http://www.w3.org/1999/xhtml'},
)

def parse_enums(html_dir, html_filename, enum_dict):
    """Collect enum definitions from one libxml2 HTML documentation page.

    Every element selected by ``find_enums`` whose text starts with
    ``enum <name> {`` and whose name is a key of ``ENUM_MAP`` is parsed.
    Each child element of it is taken as one constant: the child's text is
    the constant name, and the text following the child must look like
    ``= <digits>``, optionally followed by ``: <description>``.

    If any field of an enum cannot be parsed, a message is printed and that
    enum is not stored at all.

    :param html_dir: directory that contains the HTML file.
    :param html_filename: name of the HTML file inside *html_dir*.
    :param enum_dict: dict that is updated in place; maps the enum name to a
        list of ``(name, value, description)`` tuples, where ``value`` is an
        int and ``description`` is None when the page gives none.
    :returns: *enum_dict*.
    """
    # Raw strings: '\s' and '\w' are regex escapes, not string escapes.
    match_enum_name = re.compile(r'\s*enum\s+(\w+)\s*{', re.I).match
    match_enum_value = re.compile(r'\s*=\s+([0-9]+)\s*(?::\s*(.*))?').match

    tree = etree.parse(os.path.join(html_dir, html_filename))
    for enum in find_enums(tree):
        name_match = match_enum_name(collect_text(enum))
        if not name_match:
            continue
        enum_name = name_match.group(1)
        if enum_name not in ENUM_MAP:
            continue
        print("Found enum", enum_name)

        entries = []
        for child in enum:
            name = child.text
            # A child without trailing text has no value to parse; treat it
            # like any other unparsable field instead of raising TypeError.
            match = match_enum_value(child.tail or '')
            if not match:
                print("Ignoring enum %s (failed to parse field '%s')" % (
                        enum_name, name))
                break
            value, descr = match.groups()
            entries.append((name, int(value), descr))
        else:
            # only reached when every field was parsed successfully
            enum_dict[enum_name] = entries
    return enum_dict

# Collect the enum definitions from the documentation pages that declare them.
enum_dict = {}
for html_filename in (
        'libxml-xmlerror.html',
        # currently disabled:
        # 'libxml-xpath.html',
        # 'libxml-xmlschemas.html',
        'libxml-relaxng.html',
        ):
    parse_enums(HTML_DIR, html_filename, enum_dict)

# Output buffers for the regenerated sections: one list of lines per file,
# each with a shortcut to its append method.

# .pxd section: opens with the extern block header.
pxd_result = ['cdef extern from "libxml/xmlerror.h":']
append_pxd = pxd_result.append

# .pxi section: opens with an explanatory comment about the data layout.
pxi_result = [
    "# Constants are stored in tuples of strings, for which Cython generates very\n"
    "# efficient setup code.  To parse them, iterate over the tuples and parse each\n"
    "# line in each string independently.  Tuples of strings (instead of a plain\n"
    "# string) are required as some C-compilers of a certain well-known OS vendor\n"
    "# cannot handle strings that are a few thousand bytes in length.\n"
]
append_pxi = pxi_result.append

# Indentation of the 'ctypedef enum' lines and of the constants below them.
ctypedef_indent = 4 * ' '
constant_indent = 2 * ctypedef_indent

# For every enum, in ENUM_ORDER, emit a 'ctypedef enum' declaration into the
# .pxd lines and a tuple of multi-line strings into the .pxi lines.
for enum_name in ENUM_ORDER:
    constants = enum_dict[enum_name]
    pxi_name, prefix = ENUM_MAP[enum_name]

    append_pxd('%sctypedef enum %s:' % (ctypedef_indent, enum_name))
    append_pxi('cdef object %s = (u"""\\' % pxi_name)

    length = 2 # each string ends with '\n\0'
    for name, val, descr in constants:
        # .pxd: aligned declaration; the description (if it says more than
        # the value itself) becomes a trailing comment.
        pxd_line = '%-50s = %7d' % (name, val)
        if descr and descr != str(val):
            pxd_line += ' # %s' % descr
        append_pxd(constant_indent + pxd_line)

        # .pxi: compact NAME=VALUE line with the enum's prefix removed,
        # unless that would leave an empty name.
        if name.startswith(prefix) and len(name) > len(prefix):
            name = name[len(prefix):]
        pxi_line = '%s=%d' % (name, val)
        if length + len(pxi_line) >= 2040: # max string length in MSVC is 2048
            # close the current string and start the next one in the tuple
            append_pxi('""",')
            append_pxi('u"""\\')
            length = 2 # each string ends with '\n\0'
        append_pxi(pxi_line)
        length += len(pxi_line) + 2 # + '\n\0'

    append_pxd('')
    append_pxi('""",)')
    append_pxi('')

# Write the regenerated sections back, the .pxi file first and then the .pxd.
for target_file, generated_lines in (
        (BUILD_SOURCE_FILE, pxi_result),
        (BUILD_DEF_FILE, pxd_result),
        ):
    print("Updating file %s" % target_file)
    regenerate_file(target_file, generated_lines)

print("Done")