In [127]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

In [128]:
%run ./utils.ipynb
%run ./nlp.ipynb
%run ./relation_extraction.ipynb

In [11]:
text, relations = extract_relations(
    "parsed/race/train/middle/2549.txt.tree"
)

In [4]:
rel = relations["Explanation"][2]

In [12]:
prepare_extended_info(rel)

<__main__.ExtendedRelationInfo at 0x10b809128>

In [3]:
def do_subjects_differ(text, rel, n_prev_sent=5):
    new_st, left_segment_sent_no = move_st(text, rel.left.start, n_prev_sent)
    if len({".", "!", "?"}.intersection(
        set(word_tokenize(text[rel.left.start:rel.right.end])))) > 0:
        right_segment_sent_no = left_segment_sent_no + 1
    else:
        right_segment_sent_no = left_segment_sent_no

    if left_segment_sent_no == right_segment_sent_no:
        return True
    else:
        resolved_subjects = get_resolved_subjects(text[new_st:rel.right.end])
        if len(resolved_subjects) < right_segment_sent_no:
            return True
        else:
            return (
                resolved_subjects[left_segment_sent_no] 
                != resolved_subjects[right_segment_sent_no]
            )

        
def test__do_subjects_differ():
    text = """Nika lives in Berlin. She goes to a Kita. Her favourite color is yellow."""
    assert do_subjects_differ(
        text, 
        Relation("", Segment("N", 22, 42), Segment("S", 42, len(text)), None, None)
    )
    text = """Nika lives in Berlin. She goes to a Kita. She likes yellow things."""
    assert not do_subjects_differ(
        text, 
        Relation("", Segment("N", 22, 42), Segment("S", 42, len(text)), None, None)
    )    

    
test__do_subjects_differ()

In [18]:
def safe_access(tokens, i):
    if i < len(tokens):
        return tokens[i]
    else:
        return ""

In [145]:
ItMakesParseResult = collections.namedtuple(
    "ItMakesParseResult", ["success", "subject", "verb", "rest"]
)


def safe_access(tokens, i):
    if i < len(tokens):
        return tokens[i]
    else:
        return ""
    

def normalise_and_identify_person(subject):
    subject = subject.lower()
    if subject == "me":
        return "I", 1
    elif subject == "us":
        return "we", 1
    elif subject == "you":
        return "you", 2
    elif subject == "him":
        return "he", 3
    elif subject == "her":
        return "she", 3
    elif subject == "them":
        return "they", 3
    else:
        return subject, 3
    
    
def parse_it_makes(tokens, verbose=False):
    if safe_access(tokens, 0).lower() == "it":
        make_pos = -1
        
        if normalise_verb(safe_access(tokens, 1).lower()) == "make":
            make_pos = 1
        elif normalise_verb(safe_access(tokens, 2).lower()) == "make":
            make_pos = 2
        if make_pos < 0:
            print_if_verbose("Didn't find 'make'.", verbose)
            return ItMakesParseResult(False, None, None, None)
        else:
            first_verb_position = find_first_verb(" ".join(tokens[make_pos + 1:]))
            if first_verb_position is None:
                print_if_verbose("Didn't find the verb. Will look for an adjective.", verbose)
                first_adj_position = find_first_adjective(" ".join(tokens[make_pos + 1:]))
                if first_adj_position is None:
                    print_if_verbose("Didn't find an adjective either.")
                    return ItMakesParseResult(False, None, None, None)
                else:
                    verb = "be"
                    subject = " ".join(
                        tokens[make_pos + 1:(make_pos + 1 + first_adj_position)]
                    )
                    rest = " ".join(tokens[(make_pos + 1 + first_adj_position):])
            else:
                verb = tokens[make_pos + 1 + first_verb_position]
                subject = " ".join(
                    tokens[make_pos + 1:(make_pos + 1 + first_verb_position)]
                )
                rest = " ".join(tokens[(make_pos + 1 + first_verb_position + 1):])
            
            if tokens[make_pos] in {"make", "makes", "making"}:
                tense = Tense.PRESENT
            elif tokens[make_pos] == "made":
                tense = Tense.PAST
            else:
                assert False
            normalised_subject, person = normalise_and_identify_person(subject)
            if is_plural(normalised_subject):
                number = Number.PLURAL
            else:
                number = Number.SINGULAR
            return ItMakesParseResult(
                True, 
                normalised_subject, 
                conjugate(verb, tense, person, number),
                rest
            )
    else:
        return ItMakesParseResult(False, None, None, None)

        
def check_parse_result(parse_result, true_subject, true_verb):
    return parse_result.subject == true_subject and parse_result.verb == true_verb


def test__parse_it_makes():
    parse_result = parse_it_makes(tokenize("It's making him feel happy."))
    check_parse_result(parse_result, "he", "feels")

    parse_result = parse_it_makes(tokenize("It makes them feel happy."))
    check_parse_result(parse_result, "they", "feel")

    parse_result = parse_it_makes(tokenize("It makes the cats happy."))
    check_parse_result(parse_result, "the cats", "are")

    parse_result = parse_it_makes(tokenize("It makes the cat happy."))
    check_parse_result(parse_result, "the cat", "is")

    parse_result = parse_it_makes(tokenize("It has made the cat happy."))
    check_parse_result(parse_result, "the cat", "was")

    parse_result = parse_it_makes(tokenize("It has made the cats happy."))
    check_parse_result(parse_result, "the cat", "were")

    parse_result = parse_it_makes(tokenize("It has made the cats go home."))
    check_parse_result(parse_result, "the cats", "went")
    
    parse_result = parse_it_makes(tokenize("It has made Mr. Smith go home."))
    check_parse_result(parse_result, "Mr. Smith", "went")

test__parse_it_makes()

In [146]:
def rule_explanation_01(text, relation, verbose=False):
    assert(relation is not None and relation.type == "Explanation")
    info = prepare_extended_info(relation, verbose)
    if info is None:
        print_if_verbose("Extended info preparation wasn't successful.", verbose)
        return None, None

    nucleus_tokens = tokenize(
        info.nucleus_text
    )
    assert len(nucleus_tokens) > 0
    print_if_verbose(f"Satellite's nucleus text: \n{info.sn_text}", verbose)
    sn_tokens = tokenize(
        info.sn_text
    )
    assert len(sn_tokens) > 0
    
    condition_holds = False
    augmented_sn_text = info.sn_text
    
    if nucleus_tokens[0] == "but":
        print_if_verbose("Nucleus starts with 'but'.", verbose)
        condition_holds = True
    elif (
        nucleus_tokens[0] == "if"
            and (
                info.nucleus_info.relation is None
                    or get_relation_type(info.nucleus_info.relation.left_child) != "Condition"
            )
    ):
        print_if_verbose(
            "Nucleus starts with 'if' "
            "and its left subrelation is not 'Condition'.", 
            verbose
        )
        condition_holds = True        
    elif "because" in nucleus_tokens:
        print_if_verbose("Nucleus contains 'because'.", verbose)
        condition_holds = True
    elif sn_tokens[0] == 'but':
        print_if_verbose("Satellite's nucleus starts with 'but'.", verbose)
        condition_holds = True
    elif "because" in set(sn_tokens):
        print_if_verbose("Satellite's nucleus contains 'because'.", verbose)
        condition_holds = True
    else:
        satellite_text = text[
            info.satellite_info.segment.start:info.satellite_info.segment.end
        ]
        print_if_verbose(f"Satellite:\n{satellite_text}", verbose)
        it_make_parse_result = parse_it_makes(word_tokenize(satellite_text))
        print_if_verbose(it_make_parse_result, verbose)
        if it_make_parse_result.success:
            print_if_verbose("Satellite's nucleus starts with 'It makes/made'.", verbose)
            condition_holds = True
            assert it_make_parse_result.subject is not None
            assert it_make_parse_result.verb is not None
            assert it_make_parse_result.rest is not None
            augmented_sn_text = clean(
                    " ".join(
                    [
                        it_make_parse_result.subject, 
                        it_make_parse_result.verb, 
                        it_make_parse_result.rest
                    ]
                )
            )

    if condition_holds:
        processed_sn_text = remove_leading_words(augmented_sn_text, verbose)
        statement = "{nucleus} because {satellite_nucleus}".format(
            nucleus=remove_trailing_punctuation(
                uppercase_first_letter(info.nucleus_text)
            ),
            satellite_nucleus=lowercase_first_letter(
                processed_sn_text if processed_sn_text is not None 
                    else info.sn_text
            )
        )
        return statement, info.nucleus_proximity
    else:
        print_if_verbose("None of the conditions were met.", verbose)
        return None, None

In [147]:
with open("parsed/race/train/middle/8123.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", "").replace("<P>", ""))

In [148]:
expl = relations["Explanation"][1]
text[expl.left.start:expl.right.end]

"I 'm often there for three hours .  It makes me feel great to do something for the environment .  "

In [149]:
rule_explanation_01(text, expl, verbose=True)

Nucleus is on the left.
Satellite's nucleus is on the right.
Nuclei proximity is far
Satellite doesn't have nested relations or its depth is too small.
Satellite doesn't contain a wh-word or 'how'.
Satellite's nucleus text: 
me feel great to do something for the environment.
Satellite:
It makes me feel great to do something for the environment .  
ItMakesParseResult(success=True, subject='I', verb='feel', rest='great to do something for the environment .')
Satellite's nucleus starts with 'It makes/made'.
Removed tokens before the first NP: 
I feel great to do something for the environment. 
---> 
I feel great to do something for the environment.


("I'm often there for three hours because I feel great to do something for the environment.",
 'far')

In [150]:
with open("parsed/race/train/middle/276.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", "").replace("<P>", ""))

In [151]:
expl = relations["Explanation"][1]
text[expl.left.start:expl.right.end]

'Because there is n\'t much oxygen there , trains will have oxygen masks for those who need _ .  It makes passengers feel more comfortable when they have enough oxygen on the famous " roof of the world " .  '

In [152]:
rule_explanation_01(text, expl, verbose=True)

Nucleus is on the left.
Satellite's nucleus is on the right.
Nuclei proximity is far
Satellite doesn't have nested relations or its depth is too small.
Satellite's nucleus contains a wh-word or 'how' in the middle and will be cut at its position.
Satellite's nucleus text: 
passengers feel more comfortable.
Nucleus contains 'because'.
Removed tokens before the first NP: 
passengers feel more comfortable. 
---> 
passengers feel more comfortable.


("Because there is n't much oxygen there, trains will have oxygen masks for those who need _ because passengers feel more comfortable.",
 'far')

In [161]:
with open("parsed/race/train/middle/2459.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", "").replace("<P>", ""))

In [165]:
expl = relations["Explanation"][2]
text[expl.left.start:expl.right.end]

"We helped him go through many medical examinations .  After all kinds of tests , we were told that the final diagnosis was appendicitis .  Luckily , Sam did n't need an operation because we brought him to the hospital in time .  He got timely treatment .  After Sam felt much better , we rushed back to prepare ourselves for the class .  We were a little tired , but we had certainly done something good , something right .  "

In [166]:
rule_explanation_01(text, expl, verbose=True)

Nucleus is on the left.
Satellite's nucleus is on the left.
Nuclei proximity is near
Satellite doesn't have nested relations or its depth is too small.
Satellite doesn't contain a wh-word or 'how'.
Satellite's nucleus text: 
After all kinds of tests, we were told that the final diagnosis was appendicitis.  Luckily, Sam did n't need an operation because we brought him to the hospital in time.  He got timely treatment.
Satellite's nucleus contains 'because'.
Removed tokens before the first NP: 
After all kinds of tests, we were told that the final diagnosis was appendicitis.  Luckily, Sam did n't need an operation because we brought him to the hospital in time.  He got timely treatment. 
---> 
all kinds of tests, we were told that the final diagnosis was appendicitis. Luckily, Sam did n't need an operation because we brought him to the hospital in time. He got timely treatment.


("We helped him go through many medical examinations because all kinds of tests, we were told that the final diagnosis was appendicitis. Luckily, Sam did n't need an operation because we brought him to the hospital in time. He got timely treatment.",
 'near')

In [131]:
OUTPUT_DIR = "statements/explanation_rule_01"

In [133]:
!mkdir -p {os.path.join(OUTPUT_DIR, "train")}

In [153]:
DIRECTORY = "parsed/race/train/middle"

statements_near, statements_far = [], []
for file_name in os.listdir(DIRECTORY):
    path = os.path.join(DIRECTORY, file_name)
    text, relations = extract_relations(path)
    
    if "Explanation" in relations:
        for relation in relations["Explanation"]:
            statement, nucleus_proximity = rule_explanation_01(
                text, 
                relation
            )
            if statement is not None:
                statement_str = f"[{path}]\n{statement}"
                if nucleus_proximity == "near":
                    statements_near.append(statement_str)
                else:
                    statements_far.append(statement_str)

In [154]:
with open(
    os.path.join(OUTPUT_DIR, "train/middle_near.txt"),  
    "wt"
) as f:
    f.write("\n".join(statements_near))

In [155]:
with open(
    os.path.join(OUTPUT_DIR, "train/middle_near.txt"),  
    "wt"
) as f:
    f.write("\n".join(statements_far))

In [156]:
for s in statements_near[:10]:
    print(s)
    print("\n")

[parsed/race/train/middle/2458.txt.tree]
I felt a little angry because Swift is my favourite star because i could n't understand.


[parsed/race/train/middle/2459.txt.tree]
We helped him go through many medical examinations because all kinds of tests, we were told that the final diagnosis was appendicitis. Luckily, Sam did n't need an operation because we brought him to the hospital in time. He got timely treatment.


[parsed/race/train/middle/2459.txt.tree]
I really wanted to go on sleeping, as I had insomnia and had just fallen asleep, but I could n't because Sam groaned louder and louder, showing he was seriously ill because that time, Robbie, a student from China, also got up to help.


[parsed/race/train/middle/4750.txt.tree]
But he is going to have English and Chinese lessons because he thinks hard and finds a way.


[parsed/race/train/middle/1358.txt.tree]
Because he drank too much, he was drunk and fell to the ground because the rich man came back, he could n't find his food an