Defines types and **functions for reading RST trees**.

See examples of the use of some of those functions at the end of the notebook.

In [39]:
from collections import namedtuple
from dataclasses import dataclass
import re
from typing import List, Mapping, Optional, Tuple, Union

In [63]:
# A Segment is a span of the reconstructed text. It corresponds either to
# one part of an RST relation or to a whole (nested) RST relation.
#
# Fields:
#   type  -- "N" (nucleus) or "S" (satellite)
#   start -- index of the first character of the span
#   end   -- index one past the last character of the span
Segment = namedtuple("Segment", "type start end")


@dataclass
class Relation:
    """A binary RST relation with optional nested relations on each side."""

    #: the name of the relation, e.g. Explanation, Joint, etc.
    relation_type: str
    #: the segment covered by the left part of the relation
    left: Segment
    #: the segment covered by the right part of the relation
    right: Segment
    #: the relation nested in the left part, or None if the left part is flat
    left_child: Optional["Relation"]
    #: the relation nested in the right part, or None if the right part is flat
    right_child: Optional["Relation"]

    def get_first_nucleus(self) -> Tuple[Optional["Relation"], Segment]:
        """Return the nested relation and the segment of the nucleus.

        When both parts are nuclei, the left one is returned. The right
        part is returned whenever the left part is not marked "N". The
        returned relation is None if the chosen part is flat (i.e. it has
        no nested RST structure).
        """
        left_is_nucleus = self.left.type == "N"
        nested = self.left_child if left_is_nucleus else self.right_child
        segment = self.left if left_is_nucleus else self.right
        return nested, segment

    def get_satellite(
        self,
    ) -> Union[Tuple[Optional["Relation"], Segment], Tuple[None, None]]:
        """Return the nested relation and the segment of the satellite.

        The left part is checked first, then the right part. If neither
        part is marked "S" (e.g. the relation consists of two nuclei), a
        pair of Nones is returned. The returned relation is None if the
        satellite is flat (i.e. it has no nested RST structure).
        """
        candidates = (
            (self.left_child, self.left),
            (self.right_child, self.right),
        )
        for nested, segment in candidates:
            if segment.type == "S":
                return nested, segment
        return None, None

In [64]:
def skip_whitespace(text: str, pointer: int) -> int:
    """Count the consecutive whitespace characters in text starting
    at pointer.

    Note that the result is a count (an offset relative to pointer),
    not an absolute position. Callers advance with
    ``pointer += skip_whitespace(text, pointer)``.

    :param text: the string to scan
    :param pointer: the start position for the scanning process
    :return: the number of whitespace characters between pointer and the
             first non-whitespace character at or right of pointer (or
             the end of the string if only whitespace remains); 0 if
             text[pointer] is not whitespace or pointer is past the end
    """
    scan = pointer
    length = len(text)
    while scan < length and text[scan].isspace():
        scan += 1
    return scan - pointer

In [65]:
def check_symbol(text: str, pointer: int, c: str) -> bool:
    """Tell whether the character at pointer equals c, without
    reading past the end of text.

    :param text: the string to look at
    :param pointer: the position to look at
    :param c: the character to compare against
    :return: True if text[pointer] == c; False if the characters differ
             or if pointer is at or past the end of text
    """
    # The bounds test short-circuits, so text[pointer] is only evaluated
    # when pointer is below len(text).
    return pointer < len(text) and text[pointer] == c

In [66]:
def is_segment_start(tree_text: str, pointer: int) -> bool:
    """Tell whether the segment-opening marker ``_!`` begins at pointer.

    :param tree_text: the text representation of an RST tree
    :param pointer: the current position (character index) in tree_text
    :return: True if tree_text has "_" at pointer followed by "!",
             False otherwise (including when the marker would run past
             the end of tree_text).
    """
    if not check_symbol(tree_text, pointer, "_"):
        return False
    return check_symbol(tree_text, pointer + 1, "!")


def is_segment_end(tree_text: str, pointer: int) -> bool:
    """Tell whether the segment-closing marker ``!_`` begins at pointer.

    :param tree_text: the text representation of an RST tree
    :param pointer: the current position (character index) in tree_text
    :return: True if tree_text has "!" at pointer followed by "_",
             False otherwise (including when the marker would run past
             the end of tree_text).
    """
    if not check_symbol(tree_text, pointer, "!"):
        return False
    return check_symbol(tree_text, pointer + 1, "_")

In [82]:
# Matches the head of a relation (without the opening parenthesis), e.g.
# "Explanation[N][S]". Captures the relation type and the types of the
# left and the right parts.
head_re = re.compile(r"([^[]*)\[(N|S)\]\[(N|S)\]")


def read_head(tree_text: str, pointer: int) -> Tuple[str, str, str, int]:
    """Read the head of an RST relation from the text representation
    of an RST tree.

    For example, an RST relation head can look like
    ```
        (Explanation[N][S]
    ```
    Leading whitespace is skipped. The head is taken to extend from the
    character after "(" up to the next whitespace character (or the end
    of tree_text).

    :param tree_text: the text representation of an RST tree
    :param pointer: the starting position
    :return: the type of the relation (e.g. 'Explanation'),
             the type of the left part ("N" (nucleus) or "S" (satellite)),
             the type of the right part ("N" (nucleus) or "S" (satellite)),
             a new pointer (the position past the head)
    :raises AssertionError: if there is no "(" at the starting position
                            (after whitespace) or if the head does not
                            have the ``Type[N|S][N|S]`` form
    """
    pointer += skip_whitespace(tree_text, pointer)
    assert check_symbol(
        tree_text, pointer, "("
    ), f"expected '(' at position {pointer}"
    pointer += 1

    head_start = pointer
    while pointer < len(tree_text) and not tree_text[pointer].isspace():
        pointer += 1
    head = tree_text[head_start:pointer]

    # A malformed head used to surface as an AttributeError on None;
    # report it the same way as the other parse failures in this file.
    match = head_re.match(head)
    assert (
        match is not None
    ), f"malformed relation head {head!r} at position {head_start}"
    relation_type, left_segment_type, right_segment_type = match.groups()

    return (relation_type, left_segment_type, right_segment_type, pointer)

In [83]:
def relation_to_segment(relation: Relation, segment_type: str) -> Segment:
    """Build the Segment that spans a whole relation.

    The span runs from the start of the relation's left part to the end
    of its right part.

    :param relation: the relation to cover
    :param segment_type: the type of the segment ("N" (nucleus) or
                         "S" (satellite)) as read from the head of
                         the parent relation
    :return: an object representing the segment that corresponds to
             the relation
    """
    span_start = relation.left.start
    span_end = relation.right.end
    return Segment(type=segment_type, start=span_start, end=span_end)

In [84]:
def read_segment(
    tree_text: str, pointer: int, text: List[str], segment_type: str
) -> Tuple[Segment, int]:
    """Read a flat segment (``_!...!_``) from the text representation
    of an RST tree.

    The characters between the markers are appended one by one to the
    list passed as the text parameter, followed by a single space. The
    returned segment's start and end are positions in that list; the
    trailing space is inside the segment.

    :param tree_text: the text representation of an RST tree
    :param pointer: the position to start reading from
    :param text: a list for collecting pieces of the original text
    :param segment_type: the type of the segment ("N" (nucleus) or
                         "S" (satellite)) as read from the head of the
                         relation that the segment belongs to
    :return: an object representing the segment,
             a new pointer (the position past the closing marker)
    """
    pointer += skip_whitespace(tree_text, pointer)
    assert is_segment_start(tree_text, pointer)
    cursor = pointer + 2  # step over the opening "_!"

    first_index = len(text)
    tree_length = len(tree_text)
    # Copy characters until the closing "!_" (or the end of the input).
    while cursor < tree_length and not is_segment_end(tree_text, cursor):
        text.append(tree_text[cursor])
        cursor += 1
    assert is_segment_end(tree_text, cursor)

    text.append(" ")  # separates this segment from the next one
    segment = Segment(segment_type, first_index, len(text))

    # step over the closing "!_"
    return (segment, cursor + 2)

In [85]:
def read_relation_or_segment(
    tree_text: str,
    pointer: int,
    text: List[str],
    segment_type: str,
    relations: Mapping[str, List[Relation]],
) -> Tuple[Segment, int, Optional[Relation]]:
    """Read whichever comes next: a nested RST relation or a flat segment.

    After skipping whitespace, a "(" means a relation follows; anything
    else is read as a flat segment. When a relation is read, a Segment
    spanning the whole relation is created for it, so a Segment is
    returned in either case.

    :param tree_text: the text representation of an RST tree
    :param pointer: the position to start reading from
    :param text: a list for collecting pieces of the original text
    :param segment_type: the type of the segment ("N" (nucleus) or
                         "S" (satellite)) as read from the head of the
                         relation
    :param relations: a mapping between relation types and extracted
                      relations
    :return: an object representing the segment,
             a new pointer, and
             an object representing the relation (None if a flat segment
             has been read)
    """
    pointer += skip_whitespace(tree_text, pointer)

    if not check_symbol(tree_text, pointer, "("):
        # flat segment: there is no nested relation to report
        flat_segment, pointer = read_segment(
            tree_text, pointer, text, segment_type
        )
        return flat_segment, pointer, None

    nested, pointer = read_relation(tree_text, pointer, text, relations)
    return relation_to_segment(nested, segment_type), pointer, nested

In [86]:
def read_relation(
    tree_text: str,
    pointer: int,
    text: List[str],
    relations: Mapping[str, List[Relation]],
) -> Tuple[Relation, int]:
    """Read one RST relation from a text representation of an RST tree.

    The relation is parsed as head, left part, right part and a closing
    ")". Nested relations are read recursively, and each one is added to
    relations before the relation that contains it. The original text
    is reconstructed along the way (see the text parameter).

    :param tree_text: the text representation of an RST tree
    :param pointer: the position to start reading from
    :param text: a list for collecting pieces of the original text
    :param relations: a mapping between relation types and extracted
                      relations; it is updated in place
    :return: the read relation and a new pointer. The new pointer is
             the index of the first position past the text of the relation
             in tree_text.
    """
    # head: relation type plus the N/S type of each part
    relation_type, left_type, right_type, pointer = read_head(
        tree_text, pointer
    )

    # left part, then right part (either may be a relation on its own)
    left_segment, pointer, left_child = read_relation_or_segment(
        tree_text, pointer, text, left_type, relations
    )
    right_segment, pointer, right_child = read_relation_or_segment(
        tree_text, pointer, text, right_type, relations
    )

    # the relation must be closed by ")"
    pointer += skip_whitespace(tree_text, pointer)
    assert check_symbol(tree_text, pointer, ")")

    relation = Relation(
        relation_type=relation_type,
        left=left_segment,
        right=right_segment,
        left_child=left_child,
        right_child=right_child,
    )
    # group the new relation under its type, creating the group if needed
    relations.setdefault(relation_type, []).append(relation)

    return relation, pointer + 1

In [87]:
def read_relations(tree_text: str) -> Tuple[str, Mapping[str, List[Relation]]]:
    """Collect all RST relations found in the text representation of
    an RST tree.

    Parsing starts at the root relation at the beginning of tree_text
    and descends recursively. The original text is rebuilt from the
    flat segments along the way.

    :param tree_text: the text representation of an RST tree
    :return: the reconstructed text,
             the collected relations (grouped by the relation type)
    """
    pieces = []
    by_type = {}
    read_relation(tree_text, 0, pieces, by_type)
    reconstructed = "".join(pieces)
    return reconstructed, by_type


def extract_relations(
    file_path: str,
) -> Tuple[str, Mapping[str, List[Relation]]]:
    """Collect all RST relations from a file containing an RST tree.

    Every occurrence of ``<s>`` and ``<P>`` is removed from the file
    contents before parsing. Relations are then read recursively and
    the original text is rebuilt along the way.

    :param file_path: a path to a file with an RST tree
    :return: the reconstructed text,
             the collected relations (grouped by the relation type)
    """
    with open(file_path, "rt") as tree_file:
        cleaned = tree_file.read()

    for tag in ("<s>", "<P>"):
        cleaned = cleaned.replace(tag, "")

    return read_relations(cleaned)

In [88]:
def read_relation_tree(tree_text: str) -> Relation:
    """Build an in-memory representation of an RST tree from its
    text representation.

    The result is a nested data structure: each relation refers to its
    subrelations through left_child and right_child. The reconstructed
    text and the per-type grouping built during parsing are discarded.

    :param tree_text: the text representation of an RST tree
    :return: an object representing the root relation
    """
    # Throwaway accumulators: only the root relation is of interest here.
    discarded_text = []
    discarded_relations = {}
    parsed = read_relation(tree_text, 0, discarded_text, discarded_relations)
    return parsed[0]


def extract_relation_tree(file_path: str) -> Relation:
    """Create an in-memory representation of an RST tree from a file.

    Creates a nested data structure representing the tree. Every
    occurrence of ``<s>`` and ``<P>`` is removed from the file contents
    before parsing, exactly as extract_relations and load_relations do.
    The segment offsets in the returned tree therefore refer to the same
    reconstructed text that those functions return for the same file.

    :param file_path: a path to a file with an RST tree
    :return: an object representing the root relation
    """
    with open(file_path, "rt") as f:
        tree_text = f.read()

    # Without this cleanup the markers would be counted as segment text
    # and shift every offset relative to extract_relations.
    cleaned_tree_text = tree_text.replace("<s>", "").replace("<P>", "")

    return read_relation_tree(cleaned_tree_text)


def load_relations(file_path):
    """Collect all RST relations from a file and also return the
    cleaned tree text.

    Every occurrence of ``<s>`` and ``<P>`` is removed from the file
    contents before parsing. Relations are then read recursively and
    the original text is rebuilt along the way.

    :param file_path: a path to a file with an RST tree
    :return: the reconstructed text,
             the collected relations (grouped by the relation type),
             the cleaned text representation of the RST tree
    """
    with open(file_path, "rt") as tree_file:
        cleaned_tree_text = tree_file.read()

    for tag in ("<s>", "<P>"):
        cleaned_tree_text = cleaned_tree_text.replace(tag, "")

    reconstructed, by_type = read_relations(cleaned_tree_text)
    return reconstructed, by_type, cleaned_tree_text

In [89]:
# Example: parse a small hand-written tree and print the reconstructed
# text together with the relations grouped by type.
# The guard only lets this run when __file__ is not defined (presumably
# the case inside the notebook, but not in an exported script).
if __name__ == "__main__" and "__file__" not in globals():
    # Three nested relations; tree_text is reused by the next example.
    tree_text = """
        (Elaboration[N][S]
           _!ha-ha !_
           (Elaboration[N][S] _!this is a  segment !_
           (Join[N][N] _!a .!_ _!b .!_)))
    """
    print(read_relations(tree_text))

('ha-ha  this is a  segment  a . b . ', {'Join': [Relation(relation_type='Join', left=Segment(type='N', start=27, end=31), right=Segment(type='N', start=31, end=35), left_child=None, right_child=None)], 'Elaboration': [Relation(relation_type='Elaboration', left=Segment(type='N', start=7, end=27), right=Segment(type='S', start=27, end=35), left_child=None, right_child=Relation(relation_type='Join', left=Segment(type='N', start=27, end=31), right=Segment(type='N', start=31, end=35), left_child=None, right_child=None)), Relation(relation_type='Elaboration', left=Segment(type='N', start=0, end=7), right=Segment(type='S', start=7, end=35), left_child=None, right_child=Relation(relation_type='Elaboration', left=Segment(type='N', start=7, end=27), right=Segment(type='S', start=27, end=35), left_child=None, right_child=Relation(relation_type='Join', left=Segment(type='N', start=27, end=31), right=Segment(type='N', start=31, end=35), left_child=None, right_child=None)))]})


In [90]:
# Example: read the same tree through read_relation_or_segment, treating
# the root as a satellite ("S"). Reuses tree_text from the previous example.
if __name__ == "__main__" and "__file__" not in globals():
    text = []  # receives the characters of the reconstructed text
    relations = {}  # receives the relations, grouped by relation type
    print(read_relation_or_segment(tree_text, 0, text, "S", relations))

(Segment(type='S', start=0, end=35), 145, Relation(relation_type='Elaboration', left=Segment(type='N', start=0, end=7), right=Segment(type='S', start=7, end=35), left_child=None, right_child=Relation(relation_type='Elaboration', left=Segment(type='N', start=7, end=27), right=Segment(type='S', start=27, end=35), left_child=None, right_child=Relation(relation_type='Join', left=Segment(type='N', start=27, end=31), right=Segment(type='N', start=31, end=35), left_child=None, right_child=None))))


In [91]:
# Example: read a tree from a file and print the reconstructed text
# followed by the collected relations.
if __name__ == "__main__" and "__file__" not in globals():
    # NOTE(review): relative path; presumably resolved against the
    # notebook's working directory. Confirm the file exists there.
    text, relations = extract_relations(
        "../../parsed/race/train/high/10324.txt.tree"
    )
    print(text)
    print()  # blank line between the text and the relations
    print(relations)

It 's no secret that doing good makes others happy - but did you know it can make you happy as well ?  According to a study , people participating in meaningful activities were happier and felt that their lives had more purpose than people who only engaged in pleasure-seeking behaviors .  Try giving these four things to others to start your journey to a happier and healthier lifestyle . 1 .  Your Time With a busy life , it can be hard to find any time to give away .  However , volunteering your time has great benefits , including making new friends and connections , learning new skills and even advancing your career .  According to a paper about the link between health and volunteering , volunteering is connected with lower instances of depression and reduces the risk of dying by 22 percent . 2 .  Your Attention Most of us think we 're good listeners , but according to psychologist Paul Donoghue , most people are aware that others do n't listen as well as they could .  In addition , th