**Generates True statements** from a part of RACE by applying a list of rules. The rules are applied to RST-trees of the respective texts preliminary obtained in other notebooks. The resulting statements are saved inside `OUTPUT_DIR` (see below).

In [1]:
RACE_PART = "test/middle"
OUTPUT_DIR = "statements_7"

### Imports

In [2]:
import json
import os
from typing import List

In [3]:
import import_ipynb  # noqa: F401, needed to make the import below work
import aux.relation_extraction

importing Jupyter notebook from /Users/YK/mt/project/aux/relation_extraction.ipynb


In [4]:
%run explanation_01.ipynb
%run explanation_02.ipynb
%run explanation_03.ipynb
%run explanation_04.ipynb
%run explanation_05.ipynb
# %run explanation_06.ipynb
%run explanation_07.ipynb
# %run explanation_08.ipynb
# %run topic_comment_01.ipynb

importing Jupyter notebook from /Users/YK/mt/project/aux/utils.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/nlp.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/defs.ipynb
importing Jupyter notebook from preparation.ipynb
importing Jupyter notebook from rule_base.ipynb


### Definitions

In [5]:
def create_directories(
    statements_directory: str,
    dataset_subdirectory: str,
    rules: List[rule_base.Rule]  # noqa: F821, rule_base is imported
    # in a child notebook
) -> List[str]:
    """Create subdirectories for saving the output statements.

    The structure of created subdirectories is
    {statement_directory}/{rule.name}/{dataset_subdirectory},
    for example,
    generated_statements/explanation_01/test/middle. The path
    to be created can fully or partially exist. The function will
    not delete any files in the (sub)directories that already exist.

    :param statements_directory: a parent directory (may exist or not exist)
    :param dataset_subdirectory: a (possibly nested) subdirectory,
        e.g. "train/high"
    :param rules: a list of rule_base.Rule objects that is going to be used for
        generating statements
    :return: a list of created subdirectories
    """
    output_directories = {}
    for rule in rules:
        rule_output_directory = os.path.join(
            statements_directory, rule.name, dataset_subdirectory
        )
        os.makedirs(rule_output_directory, exist_ok=True)
        output_directories[rule.name] = rule_output_directory
    return output_directories


def generate_statements(
    dataset_directory: str,
    dataset_subdirectory: str,
    statements_directory: str,
    rules: List[rule_base.Rule],  # noqa: F821
) -> None:
    """Generate True statements by applying a list of rules.

    Generates True statements by applying a list of rules to texts in
    dataset_directory. More precisely, the function expects RST-trees
    instead of the texts themselves, i.e. they need to be parsed beforehand.
    Each RST-tree should be stored in a separate file within
    {dataset_directory}/{dataset_subdirectory}.
    The generated statements are saved in subdirectories inside
    statements_directory. The subdirectory structure is
    {statement_directory}/{rule.name}/{dataset_subdirectory},
    for example,
    generated_statements/explanation_01/test/middle.
    For instance, true statements generated from a file
    {dataset_directory}/test/middle/foo.txt.tree
    by applying a rule named "explanation_01" will be saved as
    {statements_directory}/explanation_01/test/middle/foo.txt.tree.

    :param dataset_directory: a directory containing the RST-trees of texts
        from which the True statments should be generated
    :param dataset_subdirectory: a subdirectory within dataset_directory in
        which the RST-trees are stored (e.g. train/high or test/middle)
    :param statements_directory: the parent directory of the location where
        the generated True statements will be stored
    :param rules: a list of rule_base.Rule objects to be used for generating
        statements
    """
    output_directories = create_directories(
        statements_directory, dataset_subdirectory, rules
    )

    input_directory = os.path.join(dataset_directory, dataset_subdirectory)
    file_names = os.listdir(input_directory)
    for i, file_name in enumerate(file_names):
        print(f"[{i + 1:4d}/{len(file_names)}, {file_name}]")

        (
            text,
            relations,
            cleaned_tree_text,
        ) = aux.relation_extraction.load_relations(
            os.path.join(input_directory, file_name)
        )

        if "Topic-Comment" in relations:
            root = aux.relation_extraction.read_relation_tree(
                cleaned_tree_text
            )
            assert root is not None
            _, main_nucleus_segment = root.get_first_nucleus()
            main_nucleus_text = text[
                main_nucleus_segment.start : main_nucleus_segment.end  # noqa: E203
            ]
        else:
            main_nucleus_text = None

        for rule in rules:
            print(f"  -- {rule.name}")
            if rule.relation_type in relations:
                statements = []
                for relation in relations[rule.relation_type]:
                    statement = rule.generate_statement(
                        text, relation, main_nucleus_text=main_nucleus_text
                    )
                    if statement is not None:
                        statements.append(statement._asdict())
                if len(statements) > 0:
                    with open(
                        os.path.join(output_directories[rule.name], file_name),
                        "wt",
                    ) as f:
                        f.write(json.dumps(statements, indent=2))

### True Statement Generation

In [6]:
rules = [  # populating a list of rules
    RuleExplanation01(),  # noqa: F821
    RuleExplanation02(),  # noqa: F821
    RuleExplanation03(),  # noqa: F821
    RuleExplanation04(),  # noqa: F821
    RuleExplanation05(),  # noqa: F821
    RuleExplanation07(),  # noqa: F821
]

In [6]:
generate_statements(
    dataset_directory="../parsed/race/",
    dataset_subdirectory=RACE_PART,
    statements_directory=OUTPUT_DIR,
    rules=rules,
)

[   1/362, 2550.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[   2/362, 7907.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[   3/362, 5490.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[   4/362, 7159.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[   5/362, 340.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[   6/362, 8212.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[   7/362, 3524.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05

  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[  60/362, 3340.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[  61/362, 357.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[  62/362, 1575.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[  63/362, 3614.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[  64/362, 7775.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[  65/362, 867.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[  66/362, 6853.txt.tree]


  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 116/362, 1470.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 117/362, 1125.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 118/362, 1124.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 119/362, 5912.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 120/362, 5853.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 121/362, 3305.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 122/362, 288.txt.tree]
  -- explanation_01
  -- explanation_02

  -- explanation_07
[ 172/362, 2036.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 173/362, 2869.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 174/362, 1737.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 175/362, 3878.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 176/362, 2447.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 177/362, 2375.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 178/362, 1029.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_0

  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 230/362, 3702.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 231/362, 241.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 232/362, 5689.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 233/362, 1049.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 234/362, 2387.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 235/362, 3329.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 236/362, 2640.txt.tree]

  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 287/362, 5495.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 288/362, 1609.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 289/362, 719.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 290/362, 1964.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 291/362, 6435.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 292/362, 1314.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 293/362, 7549.txt.tree]
  -- explanation_01

  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 344/362, 2084.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 345/362, 1972.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 346/362, 8066.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 347/362, 3610.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 348/362, 7914.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 349/362, 6366.txt.tree]
  -- explanation_01
  -- explanation_02
  -- explanation_03
  -- explanation_04
  -- explanation_05
  -- explanation_07
[ 350/362, 2936.txt.tree