In [125]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

In [340]:
%run ./utils.ipynb
%run ./nlp.ipynb
%run ./relation_extraction.ipynb

In [185]:
# text, relations = extract_relations(
#     "parsed/race/train/middle/3260.txt.tree"
# )

In [6]:
def is_attribution(relation):
    return relation is not None and relation.type == "Attribution"


def has_nested_attribution(relation):
    return (
        (
            relation.left == "N" and is_attribution(relation.left_child)
        )
        or (
            relation.right == "N" and is_attribution(relation.right_child)
        )
    )

In [7]:
class SpeechVerbFinder:
    def __init__(self):
        with open("speech_verbs.txt", "rt") as f:
            self.speech_verbs = {line.strip().lower() for line in f.readlines()}
    
    def find_speech_verb(self, tokens):
        found_verbs = self.speech_verbs.intersection({normalise_verb(t) for t in tokens})
        if len(found_verbs) == 0:
            return None
        else:
            return list(found_verbs)[0]

In [342]:
speech_verb_finder = SpeechVerbFinder()

def rule_explanation_02(text, relation, verbose=False):
    assert(relation is not None and relation.type == "Explanation")
    info = prepare_extended_info(relation, verbose)
    if info is None:
        print_if_verbose("Extended info preparation wasn't successful.", verbose)
        return None, None
    
    _, ss_segment = info.satellite_info.relation.get_satellite()
    if ss_segment is not None:
        if info.nucleus_text[:1] == '"' or info.nucleus_text[:2] == "``":
            ss_tokens = tokenize(text[ss_segment.start:ss_segment.end])
            satellite_speech_verb = speech_verb_finder.find_speech_verb(ss_tokens)
            if satellite_speech_verb is not None:
                new_st, sent_no = move_st(text, ss_segment.start, 5)
                resolved_subjects = get_resolved_subjects(text[new_st:ss_segment.end])
                if len(resolved_subjects) > 0:
                    resolved_subject = resolved_subjects[-1]
                    if resolved_subject is not None:
                        statement = (
                            "{nucleus}. That is why "
                            "{subject} {verb} {satellite_nucleus}".format(
                                nucleus=remove_trailing_punctuation(
                                    uppercase_first_letter(info.nucleus_text)
                                ),
                                subject=resolved_subject,
                                verb=conjugate(satellite_speech_verb),
                                satellite_nucleus=lowercase_first_letter(info.sn_text)
                            )
                        )
                        return statement, info.nucleus_proximity
                    else:
                        print_if_verbose(
                            "Failed to find the subject of the satellite's satellite.", verbose
                        ) 
                else:
                    print_if_verbose(
                        "Failed to find the subject of the satellite's satellite.", verbose
                    )
            else:
                 print_if_verbose(
                     "Didn't find any speech verbs in the satellite's satellite.", verbose
                 )  
        else:
            print_if_verbose("Nucleus doesn't start with quotes.", verbose)
    else:
        print_if_verbose("Satellite doesn't have a satellite.", verbose)
    
    condition_holds = False
    
    if get_first_token(
            text[info.satellite_info.segment.start:info.satellite_info.segment.end]
        ) == "so":
        print_if_verbose("Satellite starts with 'so'.", verbose)
        condition_holds = True
    else:
        print_if_verbose(
            f"Nucleus relation type: '{get_relation_type(info.nucleus_info.relation)}'.",
            verbose
        )
        print_if_verbose(
            f"Satellite relation type: '{get_relation_type(info.satellite_info.relation)}'.",
            verbose
        )
        if (
            info.nucleus_info.relation is None 
            and info.satellite_info.relation is not None
            and (
                    info.satellite_info.relation.type in {
                        "Joint", "Elaboration", "Attribution"
                    }
                    or (
                        info.satellite_info.relation.type == "Explanation" 
                        and has_nested_attribution(info.satellite_info.relation)
                    )
            )
        ):
            print_if_verbose(
                "First condition was met.", verbose
            )
            condition_holds = True
        else:
            if (
                info.nucleus_info.relation is not None
                and info.satellite_info.relation is not None
                and (
                    (
                        info.nucleus_info.relation.type == "Elaboration"
                        and info.satellite_info.relation.type == "Attribution"
                    )
                    or (
                        info.nucleus_info.relation.type == "Background"
                        and info.satellite_info.relation.type == "Explanation"
                    )
                    or (
                        info.nucleus_info.relation.type == "Joint"
                        and info.satellite_info.relation.type in {
                            "Elaboration", "Attribution", "Explanation"
                        }
                    )
                )
            ):
                print_if_verbose(
                    "Second condition was met.", verbose
                )
                condition_holds = True
            else:
                print_if_verbose(
                    "None of conditions was met.", verbose
                )
    if condition_holds:
        processed_sn_text = remove_leading_words(info.sn_text, verbose)
        statement = "{nucleus}. That is why {satellite_nucleus}".format(
            nucleus=remove_trailing_punctuation(
                uppercase_first_letter(info.nucleus_text)
            ),
            satellite_nucleus=lowercase_first_letter(
                processed_sn_text if processed_sn_text is not None 
                    else info.sn_text
            )
        )
        return statement, info.nucleus_proximity
    else:
        return None, None

In [343]:
with open("parsed/race/train/middle/1796.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", "").replace("<P>", ""))

In [344]:
expl = relations["Explanation"][2]
text[expl.left.start:expl.right.end]

'They are so beautiful .  Nancy : I love spring .  I love going to the beach and spending a whole day there .  It is so relaxing to lie on the beach .  I also like surfing in the sea .  James : I am twelve years old .  My favorite spring activity is swimming .  I started to learn swimming when I was seven years old .  I always have a good time with my friends in the river .  '

In [345]:
 rule_explanation_02(text, expl, verbose=True)

Nucleus is on the left.
Satellite's nucleus is on the left.
Nuclei proximity is near
Satellite doesn't have nested relations or its depth is too small.
Satellite doesn't contain a wh-word or 'how'.
Nucleus doesn't start with quotes.
Nucleus relation type: '-'.
Satellite relation type: 'Elaboration'.
First condition was met.
Removed tokens before the first NP: 
Nancy : I love spring. 
---> 
Nancy: I love spring.


('They are so beautiful. That is why nancy: I love spring.', 'near')

In [352]:
with open("parsed/race/train/middle/8045.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", "").replace("<P>", ""))

In [353]:
expl = relations["Explanation"][0]
text[expl.left.start:expl.right.end]

"David sent a picture of himself : he was a tall , good-looking young man with big , happy smile .  As time went by , they became good friends and often sent cards and small things to each other .  When Jean 's father told her that he was going on a business trip to San Francisco , she asked him to let her go with him , so that she could give David a surprise for his birthday .  She would take him the latest DVD of the rock singer they liked most .  But when Jean knocked in David 's door in San Francisco , she found that the special friend she had written to was a twelve-year-old boy named Jim !  "

In [354]:
 rule_explanation_02(text, expl, verbose=True)

Nucleus is on the left.
Satellite's nucleus is on the left.
Nuclei proximity is near
Satellite doesn't have nested relations or its depth is too small.
Satellite doesn't contain a wh-word or 'how'.
Nucleus doesn't start with quotes.
Nucleus relation type: 'Attribution'.
Satellite relation type: 'Elaboration'.
None of conditions was met.


(None, None)

In [355]:
with open("parsed/race/train/middle/7236.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", "").replace("<P>", ""))

In [356]:
expl = relations["Explanation"][2]
text[expl.left.start:expl.right.end]

'Your body needs to take in oxygen from the air and give out the unwanted carbon dioxide in the body .  When you breathe , air comes through your nose and mouth into your lungs where oxygen flows to different parts of your body .  What makes me sneeze ?  When dust or a strong smell gets into your nose , you sneeze .  This is the way your body cleans up the dust in your nose .  Your lungs push out the air and try to clear your nose .  What does my heart do ?  Your heart is a very special muscle .  It is slightly bigger than your fist and grows bigger as you do .  If you place your hand on your heart , you will feel your heart beating which forces blood to move around your body .  '

In [357]:
rule_explanation_02(text, expl, verbose=True)

Nucleus is on the left.
Satellite's nucleus is on the left.
Nuclei proximity is near
Satellite doesn't have nested relations or its depth is too small.
Satellite's nucleus contains a wh-word or 'how' in the middle and will be cut at its position.
Nucleus doesn't start with quotes.
Nucleus relation type: 'Joint'.
Satellite relation type: 'Elaboration'.
Second condition was met.
Removed tokens before the first NP: 
when you breathe , air comes through your nose and mouth into your lungs 
---> 
you breathe, air comes through your nose and mouth into your lungs


('Your body needs to take in oxygen from the air and give out the unwanted carbon dioxide in the body. That is why you breathe, air comes through your nose and mouth into your lungs',
 'near')

In [319]:
DIRECTORY = "parsed/race/train/middle"

statements_near, statements_far = [], []
for file_name in os.listdir(DIRECTORY):
    path = os.path.join(DIRECTORY, file_name)
    text, relations = extract_relations(path)
    
    if "Explanation" in relations:
        for relation in relations["Explanation"]:
            statement, nucleus_proximity = rule_explanation_02(
                text, 
                relation
            )
            if statement is not None:
                statement_str = f"[{path}]\n{statement}"
                if nucleus_proximity == "near":
                    statements_near.append(statement_str)
                else:
                    statements_far.append(statement_str)

In [320]:
OUTPUT_DIR = "statements/explanation_rule_02"

In [44]:
!mkdir {os.path.join(OUTPUT_DIR, "train")}

In [321]:
with open(
    os.path.join(OUTPUT_DIR, "train/middle_near.txt"), 
    "wt"
) as f:
    f.write("\n".join(statements_near))

In [322]:
with open(
    os.path.join(OUTPUT_DIR, "train/middle_far.txt"), 
    "wt"
) as f:
    f.write("\n".join(statements_far))

In [323]:
for s in statements_near[11:30]:
    print(s)
    print("\n")

[parsed/race/train/middle/7236.txt.tree]
Your body needs to take in oxygen from the air and give out the unwanted carbon dioxide in the body. That is why you breathe, air comes through your nose and mouth into your lungs


[parsed/race/train/middle/7236.txt.tree]
As a result, your body temperature drops. That is why i move?


[parsed/race/train/middle/7237.txt.tree]
" And he waited quietly for his turn, instead of pushing the others aside ; showing that he was modest ( ). That is why <P> called ``


[parsed/race/train/middle/7051.txt.tree]
Red is the main color. That is why they do n't draw eyebrows.


[parsed/race/train/middle/7051.txt.tree]
When actors perform, they will paint " monkey faces " with colors. That is why red is the main color.


[parsed/race/train/middle/7051.txt.tree]
It shows the happy reunion of families. That is why the monkey stamps were for sale on Jan. 5th.


[parsed/race/train/middle/6299.txt.tree]
Finally, jog the whole way. That is why it will be easy for you 