In [1]:
import re

In [2]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [4]:
def print_if_verbose(text, verbose):
    """Print ``text`` to standard output only when ``verbose`` is truthy.

    Args:
        text: The object handed to ``print``.
        verbose: Flag controlling whether anything is printed.

    Returns:
        None.
    """
    if not verbose:
        return
    print(text)

In [5]:
def lowercase_and_tokenize(s):
    """Tokenize ``s`` with NLTK's ``word_tokenize`` and lowercase every token.

    Args:
        s: The text to tokenize.

    Returns:
        A list with one lowercased string per token, in token order.
    """
    lowered_tokens = []
    for token in word_tokenize(s):
        lowered_tokens.append(token.lower())
    return lowered_tokens


def get_first_token(s):
    """Return the first token of ``s`` in lowercase.

    Args:
        s: The text to tokenize with NLTK's ``word_tokenize``.

    Returns:
        The lowercased first token, or ``None`` when tokenization yields
        no tokens.
    """
    tokens = word_tokenize(s)
    if not tokens:
        return None
    return tokens[0].lower()

In [6]:
def find_nested_relation(
    relation_type, relation, cur_depth=1, max_depth=3
):
    """Search a relation tree for the first node of a given type.

    The search is depth-first: the node itself is checked, then its
    ``left_child`` subtree, then its ``right_child`` subtree.

    Args:
        relation_type: Value compared against each node's ``type``.
        relation: Root of the (sub)tree to search; may be ``None``.
        cur_depth: Depth assigned to ``relation`` (the root is 1 by default).
        max_depth: Deepest level that is still inspected. Nodes whose depth
            exceeds this value are ignored.

    Returns:
        A ``(node, depth)`` tuple for the first match, or ``(None, None)``
        when no node of that type exists within ``max_depth``.
    """
    if relation is None or cur_depth > max_depth:
        return None, None
    if relation_type == relation.type:
        return relation, cur_depth

    # max_depth must be forwarded explicitly; otherwise every recursive call
    # silently falls back to the default and the caller's limit is lost.
    left_result, depth = find_nested_relation(
        relation_type, relation.left_child, cur_depth + 1, max_depth
    )
    if left_result is not None:
        return left_result, depth
    return find_nested_relation(
        relation_type, relation.right_child, cur_depth + 1, max_depth
    )

In [7]:
# Testing

# %run ./relation_extraction.ipynb

# relation = Relation(
#     "Explanation", 
#     1, 
#     None,
#     Relation(
#         "Background",
#         2,
#         None,
#         None,
#         None
#     ),
#     None
# )

# print(find_nested_relation("Background", None))

# print(find_nested_relation("Explanation", relation))

# print(find_nested_relation("Background", relation))

# relation = Relation(
#     "Explanation", 
#     1, 
#     None,
#     Relation(
#         "Background",
#         2,
#         None,
#         None,
#         None
#     ),
#     Relation(
#         "Background",
#         3,
#         None,
#         Relation(
#             "Elaboration",
#             4,
#             None,
#             None,
#             None
#         ),
#         None
#     )
# )

# print(find_nested_relation("Background", relation))

# print(find_nested_relation("Elaboration", relation))

In [None]:
# Load the connectives, one per line, into a set of whitespace-stripped
# strings. The path is relative to the current working directory.
with open("aux/connectives.txt", "rt") as f:
    connectives = {line.strip() for line in f}

In [None]:
# Single-character punctuation marks recognised by the string helpers below.
punctuation = {".", ",", "!", "?", ";"}

In [None]:
def remove_trailing_punctuation(string):
    """Drop the last character of ``string`` if it is a punctuation mark.

    Only a single trailing character is examined; it is removed when it is
    a member of the module-level ``punctuation`` set.

    Args:
        string: The text to trim.

    Returns:
        ``string`` without its final punctuation mark, or ``string``
        unchanged when it is empty or does not end in punctuation.
    """
    if string and string[-1] in punctuation:
        return string[:-1]
    return string

In [None]:
def fix_spacing(string):
    """Remove whitespace between the text and its final punctuation mark.

    When ``string`` ends in a character from the module-level
    ``punctuation`` set, everything before that character is stripped of
    surrounding whitespace (leading as well as trailing) and the mark is
    re-attached.

    Args:
        string: The text to tidy.

    Returns:
        The re-spaced text, or ``string`` unchanged when it is empty or
        does not end in punctuation.
    """
    if not string or string[-1] not in punctuation:
        return string
    body, mark = string[:-1], string[-1]
    return body.strip() + mark

In [None]:
def remove_leading_punctuation(string):
    """Drop a single leading punctuation mark and tidy the remainder.

    Args:
        string: The text to trim.

    Returns:
        When the first character is in the module-level ``punctuation``
        set: the rest of the text with surrounding whitespace stripped.
        Otherwise ``string`` exactly as given (no stripping is applied).
    """
    starts_with_mark = len(string) > 0 and string[0] in punctuation
    return string[1:].strip() if starts_with_mark else string

In [6]:
def uppercase_first_letter(string):
    """Return ``string`` with its first character uppercased.

    The rest of the text is left untouched, and an empty string is
    returned as an empty string.
    """
    head, tail = string[:1], string[1:]
    return head.upper() + tail

In [None]:
def trim_connective(string):
    """Strip a leading connective from ``string``.

    The comparison is case-insensitive. After the connective is cut off,
    the remainder is passed through ``remove_leading_punctuation``.

    The connective ``"last"`` is only trimmed when it is immediately
    followed by a comma; otherwise the text is returned unchanged.

    Args:
        string: The text that may start with a connective.

    Returns:
        The text without its leading connective, or ``string`` unchanged
        when no connective from the module-level ``connectives`` set
        matches.
    """
    lower = string.lower()
    # Iterating the set directly gives an order that can change between
    # interpreter runs (string hashing is randomized), so overlapping
    # connectives would be trimmed inconsistently. Trying the longest
    # candidates first, with an alphabetical tie-break, makes the result
    # deterministic and prefers the most specific match.
    for c in sorted(connectives, key=lambda item: (-len(item), item)):
        # An empty entry (e.g. from a blank line in the source file) would
        # match every string, so it is never treated as a connective.
        if not c or not lower.startswith(c):
            continue
        if c == "last" and not lower.startswith(",", len(c)):
            return string
        return remove_leading_punctuation(string[len(c):])
    return string

In [None]:
def get_relation_type(relation):
    """Return the ``type`` of ``relation``, or ``"-"`` when it is ``None``."""
    return "-" if relation is None else relation.type

In [None]:
def remove_extra_space(s):
    """Collapse every run of whitespace in ``s`` into a single space.

    Leading and trailing whitespace is removed as well.
    """
    words = s.split()
    return " ".join(words)


def contains_any_of(s, s_array):
    """Report whether ``s`` contains at least one item of ``s_array``.

    Args:
        s: The container that is searched with the ``in`` operator.
        s_array: Iterable of candidates to look for.

    Returns:
        ``True`` as soon as one candidate is found in ``s``; ``False``
        when none is found or ``s_array`` is empty.
    """
    return any(candidate in s for candidate in s_array)

In [None]:
def is_nn(relation):
    """Report whether both sides of ``relation`` have type ``"N"``.

    Args:
        relation: An object exposing ``left`` and ``right`` attributes,
            each with a ``type``; may be ``None``.

    Returns:
        ``False`` for ``None``; otherwise whether ``relation.left.type``
        and ``relation.right.type`` both equal ``"N"``.
    """
    # NOTE(review): this reads ``left``/``right`` while other helpers in
    # this file read ``left_child``/``right_child`` -- presumably these are
    # different attributes of the relation object; confirm against its
    # definition.
    if relation is None:
        return False
    return relation.left.type == "N" and relation.right.type == "N"

In [10]:
def is_background(relation):
    """Report whether ``relation`` exists and its type is ``"Background"``."""
    if relation is None:
        return False
    return relation.type == "Background"


def has_nested_background(relation):
    """Report whether any descendant of ``relation`` is a Background relation.

    The type of ``relation`` itself is not examined, only the nodes
    reachable through ``left_child`` / ``right_child``, at any depth.

    Args:
        relation: Root of the tree to inspect; may be ``None``.

    Returns:
        ``True`` when a descendant satisfies ``is_background``; ``False``
        otherwise, including when ``relation`` is ``None``.
    """
    if relation is None:
        return False

    # Direct children are checked first, then each child's own subtree.
    if (
        is_background(relation.left_child)
        or is_background(relation.right_child)
    ):
        return True
    return (
        has_nested_background(relation.left_child)
        or has_nested_background(relation.right_child)
    )


if __name__ == "__main__" and "__file__" not in globals():
    # Self-test cell: runs only when ``__name__`` is "__main__" and no
    # ``__file__`` global exists -- presumably the interactive-notebook case.

    import import_ipynb
    import relation_extraction

    def test__has_nested_background():
        """Check ``has_nested_background`` on small three-level trees."""

        def make(rel_type, left=None, right=None):
            # The second and third constructor arguments are not relevant
            # to these checks and are always passed as None.
            return relation_extraction.Relation(
                rel_type, None, None, left, right
            )

        def build_tree(r010_type, r01_type):
            # Layout: r0 -> (r00 -> (r000, r001), r01 -> (r010, r011)).
            # Only the types of r010 and r01 vary; every other node is "-".
            r010 = make(r010_type)
            r011 = make("-")
            r000 = make("-")
            r001 = make("-")
            r01 = make(r01_type, r010, r011)
            r00 = make("-", r000, r001)
            return make("-", r00, r01)

        # Background on a grandchild of the root.
        r0 = build_tree("Background", "-")
        assert has_nested_background(r0)

        # No Background anywhere in the tree.
        r0 = build_tree("-", "-")
        assert has_nested_background(r0) is False

        # Background on a direct child of the root.
        r0 = build_tree("-", "Background")
        assert has_nested_background(r0)

        assert has_nested_background(None) is False


    test__has_nested_background()

importing Jupyter notebook from relation_extraction.ipynb


In [10]:
def fix_quotes(sent):
    """Wrap ``sent`` in double quotes after removing quote marks at its ends.

    Any run of single quotes, double quotes or backticks at either end of
    ``sent`` is stripped, and the result is always enclosed in one pair of
    double quotes -- whether or not the input was quoted to begin with.
    Surrounding whitespace is not removed.

    Args:
        sent: The text to re-quote; must not be ``None``.

    Returns:
        The re-quoted text.
    """
    assert sent is not None
    quote_marks = "'\"`"
    unquoted = sent.strip(quote_marks)
    return '"' + unquoted + '"'