In [1]:
import json

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

In [3]:
import import_ipynb
from aux import utils
from aux import nlp
from aux import relation_extraction
from aux import defs
import preparation

importing Jupyter notebook from /Users/YK/mt/project/aux/utils.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/nlp.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/relation_extraction.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/defs.ipynb
importing Jupyter notebook from preparation.ipynb


In [4]:
def is_attribution(relation):
    """Tell whether ``relation`` is an "Attribution" relation.

    Args:
        relation: An object exposing a ``type`` attribute, or ``None``.

    Returns:
        ``False`` when ``relation`` is ``None``; otherwise the result of
        comparing ``relation.type`` with ``"Attribution"``.
    """
    if relation is None:
        return False
    return relation.type == "Attribution"


def has_nested_attribution(relation):
    """Check whether a nucleus side of ``relation`` is itself an Attribution.

    A side counts only when it is marked ``"N"`` and its child relation
    (``left_child`` / ``right_child``) passes ``is_attribution``.

    Args:
        relation: An object exposing ``left``, ``right``, ``left_child`` and
            ``right_child``.

    Returns:
        ``True`` if the left or the right side qualifies, ``False`` otherwise.
    """
    # NOTE(review): elsewhere in this notebook ``relation.left`` is
    # dereferenced as ``relation.left.start``; here it is compared with the
    # string "N". Presumably the segment type supports that comparison --
    # TODO confirm against the relation/segment definitions.
    if relation.left == "N" and is_attribution(relation.left_child):
        return True
    return relation.right == "N" and is_attribution(relation.right_child)

In [5]:
class SpeechVerbFinder:
    """Looks up speech verbs in a token list using a lexicon read from disk.

    The lexicon is a plain-text file with one verb per line; entries are
    stripped and lower-cased when loaded.
    """

    def __init__(self, path="aux/speech_verbs.txt"):
        """Load the speech-verb lexicon.

        Args:
            path: Location of the lexicon file. Defaults to the path that was
                previously hard-coded, so ``SpeechVerbFinder()`` keeps working.
        """
        with open(path, "rt") as f:
            # Iterate the file directly (no intermediate list) and drop blank
            # lines so the empty string never becomes a lexicon entry.
            self.speech_verbs = {
                line.strip().lower() for line in f if line.strip()
            }

    def find_speech_verb(self, tokens):
        """Return the first speech verb that occurs in ``tokens``.

        Every token is passed through ``nlp.normalise_verb`` and the result is
        looked up in the lexicon. Tokens are scanned in their given order, so
        when several speech verbs are present the earliest one is returned;
        picking an arbitrary element of a set would make the result depend on
        string hashing and differ between runs.

        Args:
            tokens: Iterable of token strings.

        Returns:
            The normalised form of the first matching token, or ``None`` when
            no token normalises to a lexicon entry.
        """
        for token in tokens:
            normalised = nlp.normalise_verb(token)
            if normalised in self.speech_verbs:
                return normalised
        return None

In [6]:
class RuleExplanation02(defs.Rule):
    """Rule that rewrites an "Explanation" relation as a "That is why" statement.

    ``generate_statement`` first tries the speech pattern (quoted nucleus plus
    a speech verb in the satellite's satellite). If that does not apply it
    falls back to the "so" / common-pattern checks. Each successful path is
    tagged with one of the entries of ``reasons``.
    """

    name = "explanation_02"
    relation_type = "Explanation"
    reasons = {
        "SPEECH":
            defs.Reason(
                1,
                # The trailing space matters: adjacent string literals are
                # concatenated without any separator.
                "Nucleus is surrounded by quotation marks "
                "and satellite's satellite contains a speech verb."),
        "SATELLITE_STARTS_WITH_SO":
            defs.Reason(
                2,
                "Satellite starts with 'so'."
            ),
        "COMMON_PATTERN_SIMPLE_NUCLEUS":
            defs.Reason(
                3,
                "Common pattern; nucleus without nested relations."
            ),
        "COMMON_PATTERN_COMPLEX_NUCLEUS":
            defs.Reason(
                4,
                "Common pattern; nucleus has nested relations."
            )
    }

    # Satellite relation types accepted when the nucleus has no nested relation.
    _SIMPLE_NUCLEUS_SATELLITE_TYPES = frozenset(
        {"Joint", "Elaboration", "Attribution"}
    )
    # Nucleus relation type -> satellite relation types accepted with it.
    _COMPLEX_NUCLEUS_PATTERNS = {
        "Elaboration": frozenset({"Attribution"}),
        "Background": frozenset({"Explanation"}),
        "Joint": frozenset({"Elaboration", "Attribution", "Explanation"}),
    }

    def __init__(self):
        """Create the rule together with its speech-verb lexicon."""
        self.speech_verb_finder = SpeechVerbFinder()

    def generate_statement(self, text, relation, verbose=False):
        """Build a "That is why" statement for an Explanation relation.

        Args:
            text: Full document text that the relation's segments index into.
            relation: Relation whose ``type`` is ``"Explanation"``.
            verbose: When true, progress messages are emitted through
                ``utils.print_if_verbose``.

        Returns:
            A ``defs.Statement`` when one of the rule's reasons applies,
            otherwise ``None`` (also when preprocessing fails or the satellite
            has no nested relation).

        Raises:
            AssertionError: If ``relation`` is ``None`` or not an Explanation.
        """
        assert(relation is not None and relation.type == "Explanation")
        info = preparation.Preprocessor.prepare_extended_info(text, relation, verbose)
        if info is None:
            utils.print_if_verbose("Extended info preparation wasn't successful.", verbose)
            return None

        # Every pattern below needs a nested relation inside the satellite.
        if info.satellite_info.relation is None:
            return None

        speech_statement = self._try_speech_statement(text, relation, info, verbose)
        if speech_statement is not None:
            return speech_statement

        reason = self._find_common_reason(text, info, verbose)
        if reason is None:
            return None

        # The explanation is reported before the satellite text is trimmed, so
        # that any verbose output of the trimming step follows it.
        utils.print_if_verbose(reason.explanation, verbose)

        prepared_nucleus_text = utils.remove_trailing_punctuation(
            utils.uppercase_first_letter(
                info.nucleus_preparation_result.prepared_text
            )
        )
        satellite_text = info.satellite_preparation_result.prepared_text
        processed_sn_text = nlp.remove_leading_words(satellite_text, verbose)
        # Fall back to the untrimmed text when trimming yields nothing.
        prepared_sn_text = utils.lowercase_first_letter(
            processed_sn_text if processed_sn_text is not None else satellite_text
        )
        statement_text = f"{prepared_nucleus_text}. That is why {prepared_sn_text}"
        return self._build_statement(
            relation, info, reason,
            statement_text, prepared_nucleus_text, prepared_sn_text
        )

    def _try_speech_statement(self, text, relation, info, verbose):
        """Apply the SPEECH pattern; return a statement or ``None`` if it fails.

        Expects ``info.satellite_info.relation`` to be set (the caller checks).
        """
        _, ss_segment = info.satellite_info.relation.get_satellite()
        if ss_segment is None:
            utils.print_if_verbose("Satellite doesn't have a satellite.", verbose)
            return None

        nucleus_text = info.nucleus_preparation_result.prepared_text
        if not nucleus_text.startswith(('"', "``")):
            utils.print_if_verbose("Nucleus doesn't start with quotes.", verbose)
            return None

        ss_tokens = utils.tokenize(text[ss_segment.start:ss_segment.end])
        speech_verb = self.speech_verb_finder.find_speech_verb(ss_tokens)
        if speech_verb is None:
            utils.print_if_verbose(
                "Didn't find any speech verbs in the satellite's satellite.",
                verbose
            )
            return None

        # Widen the window to the left before resolving the subject.
        # NOTE(review): the meaning of the constant 5 is defined by
        # nlp.move_st -- presumably a number of sentences; confirm there.
        new_st, _ = nlp.move_st(text, ss_segment.start, 5)
        resolved_subjects = nlp.get_resolved_subjects(text[new_st:ss_segment.end])
        resolved_subject = (
            resolved_subjects[-1] if len(resolved_subjects) > 0 else None
        )
        if resolved_subject is None:
            utils.print_if_verbose(
                "Failed to find the subject of the satellite's satellite.",
                verbose
            )
            return None

        reason = self.reasons["SPEECH"]
        prepared_nucleus_text = utils.remove_trailing_punctuation(
            utils.uppercase_first_letter(nucleus_text)
        )
        prepared_sn_text = utils.lowercase_first_letter(
            info.satellite_preparation_result.prepared_text
        )
        verb = nlp.conjugate(speech_verb)
        statement_text = (
            f"{prepared_nucleus_text}. That is why "
            f"{resolved_subject} {verb} {prepared_sn_text}"
        )
        utils.print_if_verbose(reason.explanation, verbose)
        return self._build_statement(
            relation, info, reason,
            statement_text, prepared_nucleus_text, prepared_sn_text
        )

    def _find_common_reason(self, text, info, verbose):
        """Pick the "so" / common-pattern reason that applies, or ``None``.

        Expects ``info.satellite_info.relation`` to be set (the caller checks).
        """
        satellite_segment = info.satellite_info.segment
        satellite_source = text[satellite_segment.start:satellite_segment.end]
        if utils.get_first_token(satellite_source) == "so":
            utils.print_if_verbose("Satellite starts with 'so'.", verbose)
            return self.reasons["SATELLITE_STARTS_WITH_SO"]

        nucleus_relation = info.nucleus_info.relation
        satellite_relation = info.satellite_info.relation
        utils.print_if_verbose(
            "Nucleus relation type: "
            f"'{utils.get_relation_type(nucleus_relation)}'.",
            verbose
        )
        utils.print_if_verbose(
            "Satellite relation type: "
            f"'{utils.get_relation_type(satellite_relation)}'.",
            verbose
        )

        if nucleus_relation is None:
            if (
                satellite_relation.type in self._SIMPLE_NUCLEUS_SATELLITE_TYPES
                or (
                    satellite_relation.type == "Explanation"
                    and has_nested_attribution(satellite_relation)
                )
            ):
                return self.reasons["COMMON_PATTERN_SIMPLE_NUCLEUS"]
        elif satellite_relation.type in self._COMPLEX_NUCLEUS_PATTERNS.get(
            nucleus_relation.type, ()
        ):
            return self.reasons["COMMON_PATTERN_COMPLEX_NUCLEUS"]

        utils.print_if_verbose("None of conditions was met.", verbose)
        return None

    def _build_statement(self, relation, info, reason,
                         statement_text, nucleus_text, satellite_nucleus_text):
        """Assemble the ``defs.Statement`` shared by all successful paths."""
        return defs.Statement(
            statement_text=statement_text,
            nucleus=nucleus_text,
            satellite_nucleus=satellite_nucleus_text,
            left_boundary=relation.left.start,
            right_boundary=relation.right.end,
            nucleus_proximity=info.nucleus_proximity.value,
            rule=self.name,
            reason=reason
        )

In [7]:
# Demo cell: runs the rule on the first Explanation relation of one parsed file.
# The guard requires the module to run as "__main__" with no "__file__" global
# defined -- presumably true only inside an interactive notebook kernel, so the
# demo is skipped when this notebook is imported; TODO confirm.
if __name__ == "__main__" and "__file__" not in globals():
    rule = RuleExplanation02()

    # Path is relative to the current working directory.
    with open("../parsed/race/train/middle/3972.txt.tree", "rt") as f:
        tree_text = f.read()

    # Strip the "<s>" and "<P>" markers before the tree is read.
    text, relations = relation_extraction.read_relations(
        tree_text.replace("<s>", "").replace("<P>", "")
    )

    # Takes the first Explanation relation; raises if the file has none.
    expl = relations["Explanation"][0]
    # Show the source span covered by the relation.
    print(text[expl.left.start:expl.right.end])

    statement = rule.generate_statement(text, expl, verbose=True)
    print("\nRESULT:")
    # Nothing is printed after the header when the rule produced no statement.
    if statement is not None:
        print(json.dumps(statement._asdict(), indent=2))

( 4 ) Walk with confidence .  It does n't matter how cool your uniform is ( or is not ) .  If you are not confident , nobody will notice how great you look !  
Nucleus is on the left.
Nucleus's depth <= 3.
Nucleus is flat.
Will use the whole segment.
Satellite's nucleus is on the left.
Nuclei proximity is NucleusProximity.NEAR
Satellite's (left) nucleus contains '.', '!', '?', or ';'.
Will use the whole segment.
Text extracted from the satellite:
It does n't matter how cool your uniform is ( or is not ).  If you are not confident, nobody will notice how great you look!
Nucleus doesn't start with quotes.
Nucleus relation type: '-'.
Satellite relation type: 'Elaboration'.
Common pattern; nucleus without nested relations.
Removing tokens before the first NP:
-- syntactic parsing result
 (ROOT
  (S
    (NP (PRP It))
    (VP (VBZ does) (RB n't)
      (VP (VB matter)
        (SBAR
          (WHADVP (WRB how)
            (ADJP (JJ cool)))
          (S
            (NP (PRP$ your) (NN uniform))