An automated script that inserts the detailed documentation from Isaac Gym's official HTML docs into the `.pyi` file generated by stubgen.

The generated `.pyi` file is accepted by the original isaacgym-stubs repo [here](https://github.com/yzqin/isaacgym-stubs).

NOTE: The script is written for Isaac Gym Preview 4. It may not work with earlier or later versions without further modification.

In [9]:
## Utilities for parsing and cleaning the HTML documentation

import re
import sys

def clean_unit(unit):
    ''' Turn one raw HTML fragment into plain text.

    All tags are removed, a handful of HTML entities and non-ASCII
    punctuation marks are replaced by ASCII equivalents, and surrounding
    whitespace is stripped. The substitutions run in the listed order, so
    `&amp;` is decoded last and is never decoded twice.
    '''
    substitutions = (
        (r'<.*?>', ''),        # remove all tags
        (r'&#x2192;', '->'),   # right-arrow entity used in signatures
        # NOTE(review): this pattern is empty as written, which makes the
        # substitution a no-op; it may originally have held a non-printable
        # glyph that was lost - confirm against the notebook.
        (r'', ''),
        (r'&gt;', '>'),
        (r'&lt;', '<'),
        (r'–', '-'),           # replace non-ascii symbols
        (r'’', "'"),
        (r'&amp;', "&"),
    )
    for pattern, replacement in substitutions:
        unit = re.sub(pattern, replacement, unit)

    # clean leading and trailing spaces
    return unit.strip()

def clean_html(input_text):
    ''' Strip every HTML tag from `input_text` except `<dl ...>` and `</dl>`.

    Only opening `<dl ...>` tags that carry attributes made of word
    characters, whitespace, `=`, `"` and `-` are kept, together with the
    closing `</dl>`; every other tag is replaced by the empty string.
    '''
    # Tags that survive the cleaning.
    kept_tag = re.compile(r'<dl [=\w\s\"\-]+>|</dl>')

    def _filter(match):
        # The whole tag must match the keep pattern, not just a prefix of it.
        tag = match.group(0)
        return tag if kept_tag.fullmatch(tag) else ''

    # Visit every tag in the text and let the filter decide its fate.
    return re.sub(r'<[^>]+>', _filter, input_text)


def open_close_tag_match(text):
    ''' Check that every `</dl>` in `text` closes a previously opened `<dl`.

    Returns True when the opening and closing tags are balanced, and False
    as soon as a closing tag appears with nothing left to close or when
    some opening tags stay unclosed at the end.
    '''
    nesting = 0
    for token in re.finditer(r'</dl>|<dl', text):
        if token.group() == '<dl':
            nesting += 1
        elif nesting == 0:
            # a closing tag with no matching opening tag
            return False
        else:
            nesting -= 1
    return nesting == 0

class Node:
    ''' One `<dl>` element of the parsed documentation tree.

    Attributes:
        type_: value of the `class` attribute of the `<dl>` tag, or 'root'.
        doc: cleaned text of the element (empty until the parser fills it).
        parent: enclosing node, None for the root.
        children: nested nodes in document order.
        start_idx: offset in the source text right after the opening tag.
    '''

    def __init__(self, type_):
        self.type_ = type_
        self.doc = ''
        self.parent = None
        self.children = []
        self.start_idx = None

    def __repr__(self):
        doc_compact = self.doc.replace('\n', ' ')
        try:
            name = self.name
        except AttributeError:
            # The doc text does not have the shape `name` expects; keep the
            # node printable so the tree can still be inspected.
            name = '<unparsed>'
        return f"[Type: {self.type_}, Name: {name}, Doc: \"{doc_compact}\"]"

    def __str__(self):
        return self.__repr__()

    @property
    def name(self):
        ''' Identifier documented by this node, extracted from `doc`.

        Returns None for node types that carry no name (e.g. 'root').
        Raises AttributeError when the doc of a class, method or property
        node does not start with the expected signature text.
        '''
        if self.type_ == 'py class':
            # Dots are escaped so only the literal module path matches.
            return re.match(r'class isaacgym\.gymapi\.(\w+)', self.doc).group(1)

        if self.type_ == 'py method':
            signature = self.doc.replace('static ', '')
            return re.match(r'(\w+)\(', signature).group(1)

        if self.type_ == 'py attribute':
            # First word of the doc, without the module prefix and without
            # any parenthesised remark glued to the identifier.
            first_word = self.doc.split(' ')[0].replace('gymapi.', '')
            return first_word.split('(')[0]

        if self.type_ == 'py property':
            return re.match(r'property (\w+)', self.doc).group(1)

        return None
            

def generate(texts):
    ''' Build the documentation tree from cleaned HTML text.

    `texts` is expected to contain only `<dl class="...">` / `</dl>` tags
    (see `clean_html`). Every opening tag creates a child node typed by its
    class attribute; the doc of a node is the cleaned text between its
    opening tag and the first nested opening tag or its own closing tag,
    whichever comes first.

    Returns the root node. Prints 'Parse finished!' once no closing tag is
    left. Raises ValueError when a closing tag has no matching opening tag
    and more tags follow it.
    '''
    # Compiled once and searched from an offset, so the remaining text is
    # not copied on every iteration.
    open_tag = re.compile(r'<dl class="([\w\s\-]+)">')
    close_tag = re.compile(r'</dl>')

    root = Node('root')
    current = root
    idx = 0
    while idx < len(texts):

        # find the next opening and closing tags at or after idx
        match_open = open_tag.search(texts, idx)
        match_close = close_tag.search(texts, idx)

        if match_close is None:
            print('Parse finished!')
            break

        if current is None:
            # An earlier closing tag already walked above the root.
            raise ValueError('Unbalanced </dl> tag: nothing left to close '
                             f'at offset {idx}.')

        if match_open and match_open.start() < match_close.start():
            # opening tag first
            child = Node(match_open.group(1))
            child.parent = current
            child.start_idx = match_open.end()
            current.children.append(child)
            # The doc of the enclosing node ends where its first child begins.
            if current.start_idx is not None and current.doc == '':
                current.doc = clean_unit(texts[current.start_idx:match_open.start()])
            current = child
            idx = match_open.end()

        else:
            # closing tag first
            if current.doc == '':
                current.doc = clean_unit(texts[current.start_idx:match_close.start()])
            current = current.parent
            idx = match_close.end()

    return root

def compact_docs(node):
    ''' Fold the text of field-list nodes into the doc of their parent.

    Nodes typed `field-list simple` or `simple` have their doc appended to
    the parent's doc on a new line. Such nodes are not descended into and
    they stay in the tree. All other nodes are visited depth-first in
    document order.
    '''
    pending = [node]
    while pending:
        current = pending.pop()
        if current.type_ in ('field-list simple', 'simple'):
            owner = current.parent
            owner.doc = owner.doc + '\n' + current.doc
            continue
        # Reversed so that popping from the end visits children in order.
        pending.extend(reversed(current.children))

def print_tree(node, depth=0):
    ''' Print `node` and its descendants, indented by two spaces per level.

    Nodes that sit exactly at depth 3 are neither printed nor descended
    into, so a call starting at depth 0 shows the top three levels.
    '''
    pending = [(node, depth)]
    while pending:
        item, level = pending.pop()
        if level == 3:
            continue
        print('  ' * level, item)
        # Reversed so that popping from the end keeps document order.
        pending.extend((child, level + 1) for child in reversed(item.children))
    


In [10]:
## Main function to parse an HTML file

def parse_file(file_path):
    ''' Parse one HTML documentation page into a node tree.

    The file is read as UTF-8, reduced to its `<dl>` structure, checked for
    balanced tags, converted into a tree and finally compacted.

    Returns the root node, or None (after printing an error) when the
    opening and closing tags do not match.
    '''
    with open(file_path, 'r', encoding='utf-8') as handle:
        html = clean_html(handle.read())

    if open_close_tag_match(html):
        tree = generate(html)
        compact_docs(tree)
        return tree

    print('Error: open and close tag mismatch')
    return None

In [11]:
## Usage examples

# Parse one HTML page and print the top levels of the resulting tree
# (print_tree stops at depth 3). Swap the active line with one of the
# commented alternatives to inspect another page.
# root = parse_file('gym_py.html')
# root = parse_file('struct_py.html')
root = parse_file('const_py.html')
print_tree(root)

Parse finished!
 [Type: root, Name: None, Doc: ""]
   [Type: py attribute, Name: INVALID_HANDLE, Doc: "gymapi.INVALID_HANDLE = -1"]
   [Type: py attribute, Name: DEFAULT_VIEWER_WIDTH, Doc: "gymapi.DEFAULT_VIEWER_WIDTH = 1600"]
   [Type: py attribute, Name: DEFAULT_VIEWER_HEIGHT, Doc: "gymapi.DEFAULT_VIEWER_HEIGHT = 900"]
   [Type: py attribute, Name: STATE_NONE, Doc: "gymapi.STATE_NONE = 0"]
   [Type: py attribute, Name: STATE_POS, Doc: "gymapi.STATE_POS = 1"]
   [Type: py attribute, Name: STATE_VEL, Doc: "gymapi.STATE_VEL = 2"]
   [Type: py attribute, Name: STATE_ALL, Doc: "gymapi.STATE_ALL = 3"]
   [Type: py attribute, Name: RIGID_BODY_NONE, Doc: "gymapi.RIGID_BODY_NONE = 0"]
   [Type: py attribute, Name: RIGID_BODY_DISABLE_GRAVITY, Doc: "gymapi.RIGID_BODY_DISABLE_GRAVITY = 1"]
   [Type: py attribute, Name: RIGID_BODY_DISABLE_SIMULATION, Doc: "gymapi.RIGID_BODY_DISABLE_SIMULATION(PhysX only) = 2"]
   [Type: py attribute, Name: AXIS_NONE, Doc: "gymapi.AXIS_NONE = 0"]
   [Type: py attr

In [12]:
class LoggingLevel:
    ''' Verbosity thresholds, from silent to most verbose.

    SHUTUP = -1, ERROR = 0, WARN = 1, INFO = 2, DEBUG = 3. A message is
    emitted when the configured level is at least the level of the message.
    '''
    SHUTUP, ERROR, WARN, INFO, DEBUG = range(-1, 4)


class LoggingClass:
    ''' Minimal level-filtered logger meant to be used as a base class.

    Errors and warnings go to stderr, info and debug messages to stdout.
    '''

    def __init__(self, log_level):
        self.__log_level = log_level

    def __enabled(self, threshold):
        ''' Whether messages of severity `threshold` should be printed.'''
        return self.__log_level >= threshold

    def error(self, message):
        if not self.__enabled(LoggingLevel.ERROR):
            return
        print('[ERROR] :', message, file=sys.stderr)

    def warn(self, message):
        if not self.__enabled(LoggingLevel.WARN):
            return
        print('[WARN] :', message, file=sys.stderr)

    def info(self, message):
        if not self.__enabled(LoggingLevel.INFO):
            return
        print('[INFO] :', message)

    def debug(self, message):
        if not self.__enabled(LoggingLevel.DEBUG):
            return
        print('[DEBUG]', message)


class UniversalInserter(LoggingClass):
    ''' Insert method, attribute, class description into the original pyi file.

    The stub file is kept in memory as `self.pyi_lines` (its content split
    on newlines). Each documentation node is located with a textual search,
    limited to the class currently being processed, and its doc is spliced
    into the list. Call `save` to write the result to disk.
    '''

    def __init__(self, pyi_file_path, log_level):
        ''' Load the stub file at `pyi_file_path`; `log_level` sets verbosity.'''
        super().__init__(log_level)
        self.pyi_file_path = pyi_file_path
        # Name of the class whose members are being inserted; None means the
        # whole file is searched. Set here so that a tree without any class
        # node (e.g. the constants page) can be inserted first.
        self._ongoing_class_name = None
        self._read(pyi_file_path)


    def reset(self):
        ''' Discard all insertions by reloading the original stub file.'''
        self._ongoing_class_name = None
        self._read(self.pyi_file_path)


    def _read(self, pyi_file_path):
        with open(pyi_file_path, 'r', encoding='utf-8') as f:
            self.pyi_lines = f.read()
        self.pyi_lines = self.pyi_lines.split('\n')
        self.info(f'Reading from {pyi_file_path} with {len(self.pyi_lines)} lines')


    def save(self, output_file_path):
        ''' Write the (possibly annotated) stub to `output_file_path`.'''
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.pyi_lines))
        self.info(f'saving to {output_file_path}!')


    def insert(self, root_node):
        ''' Traverse the tree, insert the doc into the pyi file.'''
        if root_node is None:
            # parse_file returns None when the page could not be parsed.
            self.warn('insert() received no tree; nothing inserted')
            return

        if root_node.type_ == 'py class':
            # record the class name
            self._ongoing_class_name = root_node.name

        self._insert_node(root_node)
        for node in root_node.children:
            self.insert(node)

        # leave the class range, reset the class name
        if root_node.type_ == 'py class':
            self._ongoing_class_name = None


    def _insert_node(self, node: Node):
        ''' Insert the doc of the node into the pyi file.

        Node types other than class, method, property and attribute
        (including the root) are ignored.
        '''
        if node.type_ == 'py class':
            class_start_idx, _ = self._locate_class(node.name)
            self.add_doc(node.doc, class_start_idx + 1)

        elif node.type_ == 'py method':
            _, method_end_idx = self._locate_method(node.name)
            self.add_doc(node.doc, method_end_idx)

        elif node.type_ == 'py property':
            property_start_idx, property_end_idx, property_type = self._locate_property(node.name)
            self.add_property_doc(node.doc, property_start_idx, property_end_idx, property_type)

        elif node.type_ == 'py attribute':
            attribute_idx, attr_type = self._locate_attribute(node.name)
            self.add_attr_doc(node.doc, attribute_idx, attr_type)

    def add_attr_doc(self, doc, line_idx, type_):
        ''' Annotate the attribute declared on line `line_idx`.

        For type 'int' the integer found after `= ` in `doc` is appended as
        the value of the declaration. For type 'normal', or when no integer
        can be found, `doc` is appended as a trailing comment.
        '''
        if type_ == 'int':
            const_value = re.search(r'= (-?[\d]+)', doc)
            if const_value is not None:
                self.pyi_lines[line_idx] += " = " + const_value.group(1)
                return
            self.warn(f'No integer value found in "{doc}"; adding it as a comment')
            type_ = 'normal'

        if type_ == 'normal':
            # simply append
            doc = doc.replace('\n', ' ')
            self.pyi_lines[line_idx] += f'  # {doc}'

    def add_property_doc(self, doc, start_line_idx, end_line_idx, type_):
        ''' Annotate a property.

        A 'normal' property (annotated field) gets `doc` as a trailing
        comment on its own line; a 'method' property gets a docstring
        inserted at `end_line_idx`.
        '''
        if type_ == 'normal':
            # simply append
            doc = doc.replace('\n', ' ')
            self.pyi_lines[start_line_idx] += f'  # {doc}'
        elif type_ == 'method':
            self.add_doc(doc, end_line_idx)


    def add_doc(self, doc, line_idx):
        ''' Insert `doc` as an indented triple-quoted block before `line_idx`.

        The block is stored as a single list entry that contains newlines.
        '''
        doc = doc.replace('\n', '\n    ')
        self.pyi_lines.insert(line_idx, "    ''' " + doc + "\n    '''\n")


    @staticmethod
    def _mentions(line, name, suffix):
        ''' Whether `line` contains `name` directly followed by `suffix`.

        Unlike a plain substring test, `name` must not be the tail of a
        longer identifier, so `x:` does not match `max_x:`.
        '''
        return re.search(r'(?<!\w)' + re.escape(name + suffix), line) is not None


    def _ongoing_class_range(self):
        ''' Line indices to search for members of the current class.'''
        start, end = self._locate_class(self._ongoing_class_name)
        return range(start, end)


    def _definition_end(self, start):
        ''' Index of the first empty line after `start`, or the line count
        when the file ends before an empty line is found.'''
        i = start + 1
        while i < len(self.pyi_lines) and self.pyi_lines[i]:
            i += 1
        return i


    def _locate_class(self, class_name):
        ''' Locate the class in the pyi file.

        Returns `(start, end)`: `start` is the index of the `class` line and
        `end` is the index of the first following line that is not empty
        and not indented (exclusive bound), or the line count. When
        `class_name` is None the whole file is returned.

        Raises ValueError when the class is not found.
        '''
        if class_name is None:
            return 0, len(self.pyi_lines)

        # Accept both `class Foo:` and `class Foo(Base):`.
        headers = (f'class {class_name}:', f'class {class_name}(')
        for i, line in enumerate(self.pyi_lines):
            if line.startswith(headers):
                start = i
                break
        else:
            raise ValueError(f'Class {class_name} not found in the pyi file.')

        for i in range(start + 1, len(self.pyi_lines)):
            line = self.pyi_lines[i]
            # a non-empty line that does not start with spaces/tabs means
            # the class definition has ended.
            if line and not line.startswith((' ', '\t')):
                return start, i
        return start, len(self.pyi_lines)


    def _locate_method(self, method_name):
        ''' Locate the method in the pyi file.

        Returns `(start, end)`: the index of the `def` line and the index of
        the first empty line after it. Raises ValueError when not found.
        '''
        needle = f'def {method_name}('
        for i in self._ongoing_class_range():
            if needle in self.pyi_lines[i]:
                return i, self._definition_end(i)
        raise ValueError(f'Method {method_name} not found in the pyi file.')

    def _locate_property(self, property_name):
        ''' Locate the property in the pyi file.

        Returns `(start, end, type_)` where `type_` is 'normal' for an
        annotated field (`name: ...`) and 'method' for a function that
        follows an `@property` line. Raises ValueError when not found.
        '''
        for i in self._ongoing_class_range():
            line = self.pyi_lines[i]
            if self._mentions(line, property_name, ':'):
                return i, self._definition_end(i), 'normal'
            if i > 0 and self.pyi_lines[i - 1] == '    @property' and property_name in line:
                return i, self._definition_end(i), 'method'
        raise ValueError(f'Property {property_name} not found in the pyi file.')

    def _locate_attribute(self, attribute):
        ''' Locate the attribute in the pyi file.

        Returns `(line_idx, type_)` where `type_` is 'int' for a declaration
        of the form `name: int` and 'normal' for any other `name:`
        declaration. Raises ValueError when not found.
        '''
        for i in self._ongoing_class_range():
            line = self.pyi_lines[i]
            if self._mentions(line, attribute, ': int'):
                return i, 'int'
            if self._mentions(line, attribute, ':'):
                return i, 'normal'
        raise ValueError(f'Attribute {attribute} not found in the pyi file.')


## NOTE: Set the HTML_PATH to the path of the HTML documentation, 'isaacgym/docs/api/python'
# The page names below are appended directly to HTML_PATH, so keep the
# trailing slash.
HTML_PATH = 'htmls/'

# Load the stub file that will receive the documentation.
inserter = UniversalInserter('isaacgym-stubs/gymapi.pyi', LoggingLevel.DEBUG)

# Parse each HTML page into a node tree and insert its docs into the stub.
# NOTE(review): parse_file returns None when the <dl> tags of a page are
# unbalanced, and that value is passed to insert() without a check.
gym_nodes = parse_file(HTML_PATH + 'gym_py.html')
inserter.insert(gym_nodes)

struct_nodes = parse_file(HTML_PATH + 'struct_py.html')
inserter.insert(struct_nodes)

enum_nodes = parse_file(HTML_PATH + 'enum_py.html')
inserter.insert(enum_nodes)

const_nodes = parse_file(HTML_PATH + 'const_py.html')
inserter.insert(const_nodes)

## Set this output filename to a proper place
inserter.save('gymapi-enhanced.pyi')

[INFO] : Reading from isaacgym-stubs/gymapi.pyi with 2406 lines
Parse finished!
Parse finished!
Parse finished!
Parse finished!
[INFO] : saving to gymapi-enhanced.pyi!
