In [1]:
import json
import collections

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

In [3]:
import import_ipynb
from aux import utils
from aux import nlp
from aux import relation_extraction
from aux import defs
import preparation
import rule_base

importing Jupyter notebook from /Users/YK/mt/project/aux/utils.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/nlp.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/relation_extraction.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/defs.ipynb
importing Jupyter notebook from preparation.ipynb
importing Jupyter notebook from rule_base.ipynb


In [4]:
def do_subjects_differ(text, rel, n_prev_sent=5):
    """Tell whether the two segments of ``rel`` appear to have different subjects.

    Args:
        text: The full document text that ``rel``'s offsets refer to.
        rel: Relation whose ``left.start`` and ``right.end`` character offsets
            delimit the span under inspection.
        n_prev_sent: Passed through to ``nlp.move_st`` (presumably the number
            of preceding sentences to include as context -- confirm there).

    Returns:
        ``True`` when
          * the relation span contains no ".", "!" or "?" token (both segments
            are then treated as lying in the same sentence), or
          * fewer subjects were resolved than are needed to look up the right
            segment's sentence, or
          * the subjects resolved for the two sentences differ;
        ``False`` only when both sentences have the same resolved subject.
    """
    new_st, left_segment_sent_no = nlp.move_st(
        text, rel.left.start, n_prev_sent
    )

    span_tokens = set(word_tokenize(text[rel.left.start:rel.right.end]))
    crosses_sentence_end = not span_tokens.isdisjoint({".", "!", "?"})
    if not crosses_sentence_end:
        # Both segments share one sentence number; the original code answers
        # True in that case and this is kept as is.
        return True

    # The right segment is assumed to sit in the sentence right after the left.
    right_segment_sent_no = left_segment_sent_no + 1

    resolved_subjects = nlp.get_resolved_subjects(text[new_st:rel.right.end])
    # ``<=`` (not ``<``): index ``right_segment_sent_no`` only exists when
    # there are at least ``right_segment_sent_no + 1`` resolved subjects.
    if len(resolved_subjects) <= right_segment_sent_no:
        return True

    return (
        resolved_subjects[left_segment_sent_no]
        != resolved_subjects[right_segment_sent_no]
    )

        
def test__do_subjects_differ():
    text = """Nika lives in Berlin. She goes to a Kita. Her favourite color is yellow."""
    assert do_subjects_differ(
        text, 
        relation_extraction.Relation(
            "", 
            relation_extraction.Segment("N", 22, 42), 
            relation_extraction.Segment("S", 42, len(text)), 
            None, 
            None
        )
    )
    text = """Nika lives in Berlin. She goes to a Kita. She likes yellow things."""
    assert not do_subjects_differ(
        text, 
        relation_extraction.Relation(
            "", 
            relation_extraction.Segment("N", 22, 42), 
            relation_extraction.Segment("S", 42, len(text)), 
            None, 
            None
        )
    )    

    
# Run the self-check as soon as this cell is executed.
test__do_subjects_differ()

In [5]:
# Result record returned by parse_it_makes:
#   success -- True when the "it makes ..." pattern was parsed, else False;
#   subject, verb, rest -- the parsed parts (all None when success is False).
ItMakesParseResult = collections.namedtuple(
    "ItMakesParseResult", "success subject verb rest"
)


def safe_access(tokens, i):
    """Return ``tokens[i]``, or ``""`` when ``i`` is outside the sequence.

    Args:
        tokens: An indexable sequence (a list of token strings in this file).
        i: Integer index; negative values count from the end as usual.

    Returns:
        The element at ``i`` if it exists, otherwise the empty string. Unlike
        plain indexing this never raises ``IndexError`` -- including for
        negative indices that reach past the start of the sequence.
    """
    if -len(tokens) <= i < len(tokens):
        return tokens[i]
    return ""
    

def normalise_and_identify_person(subject):
    """Map an object pronoun to its subject form and grammatical person.

    The input is lower-cased first. Known object pronouns are translated
    ("me" -> "I", "us" -> "we", "him" -> "he", "her" -> "she",
    "them" -> "they", "you" stays "you"); anything else is returned
    lower-cased and treated as third person.

    Returns:
        A ``(normalised_subject, person)`` tuple where ``person`` is 1, 2 or 3.
    """
    pronoun_table = {
        "me": ("I", 1),
        "us": ("we", 1),
        "you": ("you", 2),
        "him": ("he", 3),
        "her": ("she", 3),
        "them": ("they", 3),
    }
    lowered = subject.lower()
    # Fallback: keep the (lower-cased) phrase and assume third person.
    return pronoun_table.get(lowered, (lowered, 3))
    
    
def parse_it_makes(tokens, verbose=False):
    """Rewrite an "It makes <object> <verb|adjective> ..." clause.

    The clause is turned into the parts of "<subject> <conjugated verb> <rest>",
    e.g. "it makes him feel happy" -> subject "he", verb built from "feel".

    Args:
        tokens: Token strings of the clause. The first token must be "it"
            (any case); a form of "make" must be the second or third token
            ("it makes ...", "it is making ...", "it has made ...").
        verbose: Forwarded to ``utils.print_if_verbose`` for diagnostics.

    Returns:
        ``ItMakesParseResult``. On success ``subject`` is the normalised
        phrase between "make" and the first verb/adjective, ``verb`` is the
        output of ``nlp.conjugate`` and ``rest`` is the remaining text. When
        only an adjective is found, the verb is "be" and ``rest`` starts at
        the adjective. On any failure the result is
        ``ItMakesParseResult(False, None, None, None)``.
    """
    failure = ItMakesParseResult(False, None, None, None)

    if safe_access(tokens, 0).lower() != "it":
        return failure

    if nlp.normalise_verb(safe_access(tokens, 1).lower()) == "make":  # it makes ...
        make_pos = 1
    elif nlp.normalise_verb(safe_access(tokens, 2).lower()) == "make":  # it is making ...
        make_pos = 2
    else:
        utils.print_if_verbose("Didn't find 'make'.", verbose)
        return failure

    # Everything after "make": <object> followed by a verb or an adjective.
    tail_start = make_pos + 1
    tail_text = " ".join(tokens[tail_start:])

    first_verb_position = nlp.find_first_verb(tail_text)
    if first_verb_position is None:
        utils.print_if_verbose(
            "Didn't find the verb. Will look for an adjective.", verbose
        )  # It makes me sad.
        first_adj_position = nlp.find_first_adjective(tail_text)
        if first_adj_position is None:
            # The original call omitted ``verbose`` here.
            utils.print_if_verbose("Didn't find an adjective either.", verbose)
            return failure
        verb = "be"  # I am sad
        subject_end = tail_start + first_adj_position
        rest_start = subject_end  # the adjective itself stays in ``rest``
    else:
        subject_end = tail_start + first_verb_position
        verb = tokens[subject_end]
        rest_start = subject_end + 1  # the verb is re-emitted in conjugated form

    subject = " ".join(tokens[tail_start:subject_end])
    rest = " ".join(tokens[rest_start:])

    # Detection above is case-insensitive, so the tense lookup must be too;
    # otherwise tokens such as "Makes" or "Made" were rejected.
    make_form = tokens[make_pos].lower()
    if make_form in {"make", "makes", "making"}:
        tense = nlp.Tense.PRESENT
    elif make_form == "made":
        tense = nlp.Tense.PAST
    else:
        # Previously ``assert False``; report an unsuccessful parse instead so
        # the outcome does not depend on assertions being enabled.
        utils.print_if_verbose(
            f"Unsupported form of 'make': '{tokens[make_pos]}'.", verbose
        )
        return failure

    normalised_subject, person = normalise_and_identify_person(subject)
    if nlp.is_plural(normalised_subject):
        number = nlp.Number.PLURAL
    else:
        number = nlp.Number.SINGULAR

    return ItMakesParseResult(
        True,
        normalised_subject,
        nlp.conjugate(verb, tense, person, number),
        rest
    )

        
def check_parse_result(parse_result, true_subject, true_verb):
    """Report whether a parse result carries the expected subject and verb.

    Args:
        parse_result: Object exposing ``subject`` and ``verb`` attributes.
        true_subject: Expected value of ``parse_result.subject``.
        true_verb: Expected value of ``parse_result.verb``.

    Returns:
        The outcome of comparing both fields; the verb is only compared when
        the subject already matches.
    """
    subject_matches = parse_result.subject == true_subject
    if not subject_matches:
        return subject_matches
    return parse_result.verb == true_verb


def test__parse_it_makes():
    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It's making him feel happy.")
    )
    check_parse_result(parse_result, "he", "feels")

    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It makes them feel happy.")
    )
    check_parse_result(parse_result, "they", "feel")

    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It makes the cats happy.")
    )
    check_parse_result(parse_result, "the cats", "are")

    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It makes the cat happy.")
    )
    check_parse_result(parse_result, "the cat", "is")

    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It has made the cat happy.")
    )
    check_parse_result(parse_result, "the cat", "was")

    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It has made the cats happy.")
    )
    check_parse_result(parse_result, "the cat", "were")

    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It has made the cats go home.")
    )
    check_parse_result(parse_result, "the cats", "went")
    
    parse_result = parse_it_makes(
        utils.lowercase_and_tokenize("It has made Mr. Smith go home.")
    )
    check_parse_result(parse_result, "Mr. Smith", "went")

    
# Run the self-check as soon as this cell is executed.
test__parse_it_makes()

In [6]:
class RuleExplanation01(rule_base.Rule):
    """Statement-generation rule for "Explanation" relations.

    ``generate_statement`` joins the prepared nucleus text and the (possibly
    rewritten) satellite text with " because " when one of the conditions
    listed in ``reasons`` holds; otherwise it produces nothing.
    """

    name = "explanation_01"
    relation_type = "Explanation"
    # Conditions under which the rule fires, keyed by a symbolic name. Each
    # value is a defs.Reason built from a numeric id and a human-readable
    # explanation (the explanation is printed in verbose mode).
    reasons = {
        "N_STARTS_WITH_BUT":
            defs.Reason(1, "Nucleus starts with 'but'."),
        "N_STARTS_WITH_IF":
            defs.Reason(
                2,
                "Nucleus starts with 'if' and its left subrelation is not 'Condition'."
            ),
        "N_CONTAINS_BECAUSE":
            defs.Reason(
                3,
                "Nucleus contains 'because'."
            ),
        "SN_STARTS_WITH_BUT":
            defs.Reason(4, "Satellite's nucleus starts with 'but'."),
        "SN_CONTAINS_BECAUSE":
            defs.Reason(
                5,
                "Satellite's nucleus contains 'because'."
            ),
        "SN_STARTS_WITH_IT_MAKES":
            defs.Reason(
                6,
                "Satellite's nucleus starts with 'It makes/made'."
            )
    }

    def generate_statement(self, text, relation, verbose=False, **kwargs):
        """Build a "<nucleus> because <satellite>" statement for ``relation``.

        Args:
            text: Full document text that the relation's offsets refer to.
            relation: The relation to process; must be of type "Explanation".
            verbose: When true, diagnostics are emitted through
                ``utils.print_if_verbose`` (also inside ``parse_it_makes``).
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            Whatever ``self._generate_statement`` returns when one of the
            conditions in ``reasons`` is met, otherwise ``None``. ``None`` is
            also returned when the extended info cannot be prepared or the
            satellite has no nested relation.

        Raises:
            AssertionError: if ``relation`` is ``None`` or not an
                "Explanation", or if either prepared text yields no tokens.
        """
        assert(relation is not None and relation.type == "Explanation")
        info = preparation.Preprocessor.prepare_extended_info(
            text, relation, verbose
        ) # finding N,S and the nested ones
        if info is None:
            utils.print_if_verbose("Extended info preparation wasn't successful.", verbose)
            return None

        if info.satellite_info.relation is None:
            utils.print_if_verbose("Satellite has no nested relation.", verbose)
            return None

        nucleus_tokens = utils.lowercase_and_tokenize(
            info.nucleus_preparation_result.prepared_text
        )
        assert len(nucleus_tokens) > 0
        utils.print_if_verbose(
            "Satellite's nucleus text: \n"
            f"{info.satellite_preparation_result.prepared_text}",
            verbose
        )
        sn_tokens = utils.lowercase_and_tokenize(
            info.satellite_preparation_result.prepared_text
        )
        assert len(sn_tokens) > 0

        reason = None
        # Right-hand side of the statement; replaced only in the
        # "It makes ..." case below.
        augmented_sn_text = info.satellite_preparation_result.prepared_text

        # The conditions are checked in order; the first match wins.
        if nucleus_tokens[0] == "but":
            reason = RuleExplanation01.reasons["N_STARTS_WITH_BUT"]
        elif (
            nucleus_tokens[0] == "if"
                and (
                    info.nucleus_info.relation is None
                        or utils.get_relation_type(
                            info.nucleus_info.relation.left_child
                        ) != "Condition"
                )
        ):
            reason = RuleExplanation01.reasons["N_STARTS_WITH_IF"]
        elif "because" in nucleus_tokens:
            reason = RuleExplanation01.reasons["N_CONTAINS_BECAUSE"]
        elif sn_tokens[0] == 'but':
            reason = RuleExplanation01.reasons["SN_STARTS_WITH_BUT"]
        elif "because" in sn_tokens:
            reason = RuleExplanation01.reasons["SN_CONTAINS_BECAUSE"]
        else:
            # Last resort: try to rewrite "It makes X <verb/adj> ..." found at
            # the start of the cleaned satellite text into "X <verb> ...".
            satellite_text = preparation.clean(
                text[
                    info.satellite_info.segment.start:info.satellite_info.segment.end
                ]
            )
            utils.print_if_verbose(f"Satellite:\n{satellite_text}", verbose)
            # ``verbose`` is forwarded so the parser's diagnostics are shown.
            it_make_parse_result = parse_it_makes(
                word_tokenize(satellite_text), verbose
            )
            utils.print_if_verbose(it_make_parse_result, verbose)
            if it_make_parse_result.success:
                reason = RuleExplanation01.reasons["SN_STARTS_WITH_IT_MAKES"]
                assert it_make_parse_result.subject is not None
                assert it_make_parse_result.verb is not None
                assert it_make_parse_result.rest is not None
                augmented_sn_text = preparation.clean(
                    " ".join(
                        [
                            it_make_parse_result.subject,
                            it_make_parse_result.verb,
                            it_make_parse_result.rest
                        ]
                    )
                )

        if reason is None:
            utils.print_if_verbose("None of the conditions were met.", verbose)
            return None

        utils.print_if_verbose(reason.explanation, verbose)
        # _finalise_statement_parts and _generate_statement are not defined in
        # this class -- presumably inherited from rule_base.Rule.
        final_nucleus_text, final_sn_text = self._finalise_statement_parts(
            info.nucleus_preparation_result.prepared_text,
            augmented_sn_text,
            verbose
        )
        return self._generate_statement(
            final_nucleus_text,
            " because ",
            final_sn_text,
            relation,
            info.nucleus_proximity,
            self.name,
            reason,
            verbose
        )

In [7]:
# Demo: generate one statement from a parsed sample file. The guard skips this
# cell unless the code runs as ``__main__`` without a ``__file__`` global
# (presumably: interactively in the notebook, not when imported -- confirm).
if __name__ == "__main__" and "__file__" not in globals():
    rule = RuleExplanation01()

    with open("../parsed/race/test/middle/1.txt.tree", "rt") as f:
        tree_text = f.read()

    text, relations = relation_extraction.read_relations(
        tree_text.replace("<s>", "").replace("<P>", "")
    )

    expl = relations["Explanation"][1] # for generating just 1 statement
    print(text[expl.left.start:expl.right.end])

    statement = rule.generate_statement(text, expl, verbose=True)
    print("\nRESULT:")
    if statement is None:
        # generate_statement returns None when no condition of the rule is met.
        print("No statement was generated.")
    else:
        print(json.dumps(statement._asdict(), indent=2))

But Wang has got used to it and can see the benefits now .  " I used to speak too little .  But being a team leader means you have to talk a lot .  You could even call me an excellent speaker today . "  
Nucleus is on the left.
Nucleus's depth <= 100.
Parsing result:
(ROOT
  (S ('' '')
    (NP (CC But))
    (ADVP (NNP Wang))
    (VP
      (VP (VBZ has)
        (VP (VBN got)
          (S
            (VP (VBN used)
              (PP (TO to)
                (NP (PRP it)))))))
      (CC and)
      (VP (MD can)
        (VP (VB see)
          (NP (DT the) (NNS benefits))
          (ADVP (RB now)))))
    (. .) ('' '')))

Constituencies:
    type  start  end  depth
0     ''      0    1      2
1     CC      1    2      3
2     NP      1    2      2
3    NNP      2    3      3
4   ADVP      2    3      2
5    VBZ      3    4      4
6    VBN      4    5      5
7    VBN      5    6      7
8     TO      6    7      8
9    PRP      7    8      9
10    NP      7    8      8
11    PP      6    8      

In [8]:
# Ad-hoc check: run nlp.take_first_sentence_and_remove_leading_words on the
# quoted three-sentence passage from the demo output and inspect the result.
nlp.take_first_sentence_and_remove_leading_words("\" I used to speak too little.  But being a team leader means you have to talk a lot.  You could even call me an excellent speaker today. \"")

"''I used to speak little.''"

In [9]:
# Ad-hoc check: see what nlp.fix_quotes returns for a sentence that has an
# opening '' but no closing one.
nlp.fix_quotes("''I used to speak little.")

"''I used to speak little.''"