In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.stem.snowball import SnowballStemmer

import pattern.en as en

In [46]:
# Run the helper notebooks. Names used further down but not defined in this
# notebook (e.g. clean, uppercase_first_letter, remove_trailing_punctuation,
# move_st, get_resolved_subjects, read_relations) presumably come from these
# -- TODO confirm which notebook provides which name.
%run ./utils.ipynb
%run ./nlp.ipynb
%run ./relation_extraction.ipynb

In [61]:
# text, relations = extract_relations(
#     "parsed/race/train/middle/3260.txt.tree"
# )

In [8]:
def search_segment(text, relation, direction):
    """Return the text of the innermost segment on one side of a relation.

    Starting from ``relation``, keeps descending into ``left_child`` (when
    ``direction == "left"``) or ``right_child`` (for any other value) until
    that child is falsy. The segment on the same side of the last relation
    visited is then used to slice ``text`` via its ``start``/``end`` offsets.
    """
    node = relation
    while True:
        if direction == "left":
            nested, span = node.left_child, node.left
        else:
            nested, span = node.right_child, node.right
        if not nested:
            # No deeper relation on this side: this segment is the answer.
            return text[span.start:span.end]
        node = nested
    
    
def get_depth(relation):
    """Return the height of the relation tree rooted at ``relation``.

    A falsy ``relation`` (e.g. ``None``) has depth 0; otherwise the depth is
    one more than the deeper of the ``left_child`` / ``right_child`` subtrees.
    """
    if not relation:
        return 0
    left_depth = get_depth(relation.left_child)
    right_depth = get_depth(relation.right_child)
    return 1 + max(left_depth, right_depth)

In [4]:
def is_attribution(relation):
    """Return True when ``relation`` exists and its ``type`` is "Attribution"."""
    if relation is None:
        return False
    return relation.type == "Attribution"


def has_nested_attribution(relation):
    """Return True when a nucleus side of ``relation`` is itself an Attribution.

    A side counts when its segment is marked as a nucleus (``type == "N"``)
    and the relation nested under that side is an Attribution.

    The nuclearity marker lives on the segment's ``type`` attribute, as used
    by ``extract_nuclei`` and ``rule_explanation_02``. Comparing the segment
    object itself to ``"N"`` could never match, so this check was always False.
    """
    return (
        (
            relation.left.type == "N" and is_attribution(relation.left_child)
        )
        or (
            relation.right.type == "N" and is_attribution(relation.right_child)
        )
    )

In [36]:
def tokenize(s):
    """Split ``s`` with nltk's ``word_tokenize`` and lowercase every token."""
    lowered = []
    for token in word_tokenize(s):
        lowered.append(token.lower())
    return lowered


def get_first_token(s):
    """Return the first ``word_tokenize`` token of ``s`` in lowercase.

    Returns ``None`` when tokenization yields no tokens.
    """
    tokens = word_tokenize(s)
    if not tokens:
        return None
    return tokens[0].lower()
    

def normalise_verb(verb):
    """Conjugate ``verb`` into one canonical form with ``pattern.en``.

    The target form is present tense, first person, singular, indicative and
    not negated, so that differently inflected tokens can be compared against
    a fixed verb list.
    """
    target_form = dict(
        tense=en.PRESENT,      # other options: INFINITIVE, PAST, FUTURE
        person=1,              # 1, 2, 3 or None
        number=en.SINGULAR,    # SG or PL
        mood=en.INDICATIVE,    # or IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        negated=False,
        parse=True,
    )
    return en.conjugate(verb, **target_form)


class SpeechVerbFinder:
    """Detects speech verbs in token lists using the list in ``speech_verbs.txt``."""

    def __init__(self):
        """Load the verb list: each non-blank line, stripped and lowercased."""
        with open("speech_verbs.txt", "rt") as f:
            self.speech_verbs = {
                verb
                for verb in (line.strip().lower() for line in f)
                if verb  # blank lines would otherwise add "" to the set
            }

    def find_speech_verb(self, tokens):
        """Return the normalised form of the first speech verb in ``tokens``.

        Tokens are normalised with ``normalise_verb`` and checked in order, so
        the result is deterministic when several speech verbs are present.
        Picking an arbitrary element of a set intersection would depend on
        string hashing. Returns ``None`` when no token matches.
        """
        for token in tokens:
            verb = normalise_verb(token)
            if verb in self.speech_verbs:
                return verb
        return None

In [41]:
def is_plural(word):
    """Return True when ``en.pluralize`` leaves ``word`` unchanged.

    NOTE(review): this is a heuristic that relies on pattern's pluralizer not
    altering words that are already plural -- confirm for the inputs used.
    """
    return word == en.pluralize(word)

In [44]:
def conjugate(verb):
    """Return ``verb`` in the past tense, as produced by ``en.conjugate``."""
    past_form = en.conjugate(verb, tense=en.PAST)
    return past_form

In [57]:
# Shared finder instance used by rule_explanation_02; constructing it reads
# speech_verbs.txt from the current working directory.
speech_verb_finder = SpeechVerbFinder()


# Relations nested deeper than this are replaced by their innermost segment
# on the side facing the other discourse unit.
_MAX_NESTING_DEPTH = 3


def _segment_text(text, relation, segment, direction):
    """Return cleaned text for ``segment``, narrowed when ``relation`` is too deep."""
    if relation and get_depth(relation) > _MAX_NESTING_DEPTH:
        return clean(search_segment(text, relation, direction))
    return clean(text[segment.start:segment.end])


def _has_supported_structure(nucleus_relation, satellite_relation):
    """Check whether the nucleus/satellite relation types form a supported pair."""
    satellite_type = satellite_relation.type
    if nucleus_relation is None:
        return (
            satellite_type in {"Joint", "Elaboration", "Attribution"}
            or (
                satellite_type == "Explanation"
                and has_nested_attribution(satellite_relation)
            )
        )
    nucleus_type = nucleus_relation.type
    return (
        (nucleus_type == "Elaboration" and satellite_type == "Attribution")
        or (nucleus_type == "Background" and satellite_type == "Explanation")
        or (
            nucleus_type == "Joint"
            and satellite_type in {"Elaboration", "Attribution", "Explanation"}
        )
    )


def extract_nuclei(
    nucleus_relation,
    nucleus_segment,
    satellite_relation,
    satellite_nucleus_relation,
    satellite_nucleus_segment,
    nucleus_direction,
    text=None
):
    """Extract the nucleus text and the satellite's own nucleus text.

    Parameters
    ----------
    nucleus_relation : relation nested under the nucleus, or None.
    nucleus_segment : segment (``start``/``end``) of the nucleus.
    satellite_relation : relation nested under the satellite; must not be None.
    satellite_nucleus_relation, satellite_nucleus_segment :
        Accepted for interface compatibility only. Both are recomputed from
        ``satellite_relation`` and the passed values are not used.
    nucleus_direction : "right" when the nucleus is the left-hand unit (its
        right edge faces the satellite), "left" otherwise.
    text : source text the segment offsets refer to. When omitted, the
        notebook-level ``text`` variable is used, which keeps older
        six-argument calls working.

    Returns
    -------
    (nucleus_text, satellite_nucleus_text, nucleus_proximity), where
    ``nucleus_proximity`` is "near" when the satellite's nucleus is on the
    side of the satellite adjacent to the nucleus, and "far" otherwise.
    """
    if text is None:
        # Legacy behaviour: fall back to the notebook-global ``text``.
        text = globals()["text"]

    # Nucleus: take it as is, or its nested segment closest to the satellite.
    nucleus_text = _segment_text(
        text, nucleus_relation, nucleus_segment, nucleus_direction
    )

    # Locate the nucleus inside the satellite (left side wins if marked "N").
    satellite_nucleus_on_left = satellite_relation.left.type == "N"
    if satellite_nucleus_on_left:
        satellite_nucleus_relation = satellite_relation.left_child
        satellite_nucleus_segment = satellite_relation.left
    else:
        satellite_nucleus_relation = satellite_relation.right_child
        satellite_nucleus_segment = satellite_relation.right

    nucleus_on_left = nucleus_direction == "right"
    if satellite_nucleus_on_left == nucleus_on_left:
        nucleus_proximity = "near"
    else:
        nucleus_proximity = "far"

    # Inside the satellite, the side facing the nucleus is the opposite of
    # the side the nucleus uses to face the satellite.
    satellite_direction = "left" if nucleus_on_left else "right"

    # Satellite nucleus: as is, or its nested segment closest to the nucleus.
    satellite_nucleus_text = _segment_text(
        text,
        satellite_nucleus_relation,
        satellite_nucleus_segment,
        satellite_direction
    )

    return nucleus_text, satellite_nucleus_text, nucleus_proximity


def rule_explanation_02(text, relation):
    """Turn an Explanation relation into a "<nucleus>. That is why ..." statement.

    Two patterns are tried in order:

    1. If the satellite's own satellite contains a speech verb and a subject
       can be resolved for it, produce
       "<nucleus>. That is why <subject> <past-tense verb> <satellite nucleus>".
    2. Otherwise, if the satellite starts with "so" or the nested relation
       types match one of the supported combinations, produce
       "<nucleus>. That is why <satellite nucleus>".

    Parameters
    ----------
    text : source text the relation's segment offsets refer to.
    relation : relation whose ``type`` is "Explanation" (asserted).

    Returns
    -------
    (statement, nucleus_proximity), or (None, None) when the satellite has no
    nested relation, no nucleus segment is found, or no pattern applies.
    """
    assert(relation is not None and relation.type == "Explanation")
    if relation.left.type == "N":
        nucleus_direction = "right"
        nucleus_relation = relation.left_child
        nucleus_segment = relation.left
        satellite_relation = relation.right_child
        satellite_segment = relation.right
    else:
        nucleus_direction = "left"
        nucleus_relation = relation.right_child
        nucleus_segment = relation.right
        satellite_relation = relation.left_child
        satellite_segment = relation.left

    if satellite_relation is None:
        return None, None

    sn_relation, sn_segment = satellite_relation.get_first_nucleus()

    if sn_segment is None:
        return None, None

    nucleus_text, satellite_nucleus_text, nucleus_proximity = extract_nuclei(
        nucleus_relation,
        nucleus_segment,
        satellite_relation,
        sn_relation,
        sn_segment,
        nucleus_direction,
        text=text  # pass explicitly instead of relying on a notebook global
    )

    # Pattern 1: reported speech in the satellite's satellite.
    _, ss_segment = satellite_relation.get_satellite()
    if ss_segment is not None:
        ss_tokens = tokenize(text[ss_segment.start:ss_segment.end])
        satellite_speech_verb = speech_verb_finder.find_speech_verb(ss_tokens)
        if satellite_speech_verb is not None:
            # NOTE(review): the 5 presumably widens the context before the
            # segment for subject resolution -- confirm against move_st.
            new_st, _ = move_st(text, ss_segment.start, 5)
            resolved_subjects = get_resolved_subjects(text[new_st:ss_segment.end])
            if len(resolved_subjects) > 0:
                resolved_subject = resolved_subjects[-1]
                if resolved_subject is not None:
                    statement = "{nucleus}. That is why {subject} {verb} {satellite_nucleus}".format(
                        nucleus=remove_trailing_punctuation(
                            uppercase_first_letter(nucleus_text)
                        ),
                        subject=resolved_subject,
                        verb=conjugate(satellite_speech_verb),
                        satellite_nucleus=satellite_nucleus_text
                    )
                    return statement, nucleus_proximity

    # Pattern 2: explicit "so" marker or a supported relation-type pairing.
    # satellite_relation is known to be non-None here (early return above).
    if (
        get_first_token(text[satellite_segment.start:satellite_segment.end]) == "so"
        or _has_supported_structure(nucleus_relation, satellite_relation)
    ):
        statement = "{nucleus}. That is why {satellite_nucleus}".format(
            nucleus=remove_trailing_punctuation(
                uppercase_first_letter(nucleus_text)
            ),
            satellite_nucleus=satellite_nucleus_text
        )
        return statement, nucleus_proximity
    else:
        return None, None

In [None]:
# Load one parsed discourse tree and extract its text and relations.
# "<s>" markers are stripped before the tree text is handed to read_relations.
# `text` and `relations` are notebook-level names that later cells rely on.
with open("parsed/race/train/middle/3260.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", ""))

In [71]:
# Pick the first "Explanation" relation and display the text it spans,
# from the start of its left segment to the end of its right segment.
expl = relations["Explanation"][0]
text[expl.left.start:expl.right.end]

'After the game , Wu Nai , head of the boys \' team , was very unhappy .  " We all thought this would be an easy game , " he said .  '

In [72]:
 # Apply the rule to the selected relation; displays (statement, proximity).
 rule_explanation_02(text, expl)

('After the game, Wu Nai, head of the boys\' team, was very unhappy. That is why he said " We all thought this would be an easy game, "',
 'near')

In [78]:
# Load a second parsed tree; this rebinds the notebook-level `text` and
# `relations`, so earlier `expl` values no longer match them.
with open("parsed/race/train/middle/5646.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", ""))

In [79]:
# First "Explanation" relation of this document and the text it spans.
expl = relations["Explanation"][0]
text[expl.left.start:expl.right.end]

'I landed on a piece of glass that cut my eye badly . from then on , my injured , sightless , cloudy gray eye lived on with me .  Sometimes people asked me embarrassing questions .  Whenever kids played games , I was the " monster " . I was always imagining that everyone looked down on me .  Yet mum would say to me , " Hold your head up high and face the world . "  '

In [80]:
 # Apply the rule; in the recorded output the satellite nucleus chosen is
 # "Sometimes people asked me embarrassing questions." rather than the quote.
 rule_explanation_02(text, expl)

('I landed on a piece of glass that cut my eye badly. from then on, my injured, sightless, cloudy gray eye lived on with me. That is why mum said Sometimes people asked me embarrassing questions.',
 'near')

In [82]:
# Third "Explanation" relation of the same document and the text it spans.
expl = relations["Explanation"][2]
text[expl.left.start:expl.right.end]

'I say it to my children .  The gift my mum gave me will live on . <P> '

In [83]:
# Apply the rule; the recorded output ("That is why me will live on.") shows
# the satellite nucleus being cut to a fragment of the sentence.
rule_explanation_02(text, expl)

('I say it to my children. That is why me will live on.', 'far')

In [90]:
# Load a third parsed tree. Unlike the earlier load cells, "<P>" markers are
# stripped here in addition to "<s>".
with open("parsed/race/train/middle/293.txt.tree", "rt") as f:
    tree_text = f.read()

text, relations = read_relations(tree_text.replace("<s>", "").replace("<P>", ""))

In [91]:
# Second "Explanation" relation of this document and the text it spans.
expl = relations["Explanation"][1]
text[expl.left.start:expl.right.end]

'Matt asked .  " Friends and patience . " he answered .  '

In [92]:
# Apply the rule; in the recorded output the pronoun "he" of the source text
# appears as the resolved subject "Matt".
rule_explanation_02(text, expl)

('Matt asked. That is why Matt answered " Friends and patience. "', 'near')

In [11]:
# DIRECTORY = "parsed/race/train/middle"

# statements_near, statements_far = [], []
# for file_name in os.listdir(DIRECTORY):
#     path = os.path.join(DIRECTORY, file_name)
#     text, relations = extract_relations(path)
    
#     if "Explanation" in relations:
#         for relation in relations["Explanation"]:
#             statement, nucleus_proximity = rule_explanation_elaboration(
#                 text, 
#                 relation
#             )
#             if statement is not None:
#                 statement_str = f"[{path}]\n{statement}"
#                 if nucleus_proximity == "near":
#                     statements_near.append(statement_str)
#                 else:
#                     statements_far.append(statement_str)

In [12]:
# with open(
#     "statements/explanation_elaboration/train/middle_near.txt", 
#     "wt"
# ) as f:
#     f.write("\n".join(statements_near))

In [13]:
# with open(
#     "statements/explanation_elaboration/train/middle_far.txt", 
#     "wt"
# ) as f:
#     f.write("\n".join(statements_far))