In [1]:
import json

In [6]:
import nltk.tokenize
from nltk.stem.snowball import SnowballStemmer

In [3]:
import import_ipynb
from aux import utils
from aux import nlp
from aux import relation_extraction
from aux import defs
import preparation

importing Jupyter notebook from /Users/YK/mt/project/aux/utils.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/nlp.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/relation_extraction.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/defs.ipynb
importing Jupyter notebook from preparation.ipynb


In [7]:
class RuleTopicComment01(defs.Rule):
    """Rule that derives a question statement from a Topic-Comment relation.

    The statement text is the cleaned satellite when the satellite itself
    starts with a wh-word and ends with '?'; otherwise it is the first
    sentence of the main nucleus text that contains both a wh-word and '?'.
    """

    name = "topic_comment_01"
    relation_type = "Topic-Comment"
    reasons = {
        "S_HAS_WH_WORD_AND_?": 
            defs.Reason(
                1,
                "The satellite starts with a wh-word and ends with '?'."
            ),
        "QUESTION_IN_MAIN_N": 
            defs.Reason(
                2,
                "There is a question in the main nucleus"
            )
    }

    @staticmethod
    def get_first_question(text):
        """Return the first wh-question found in ``text``, or ``None``.

        The text is word-tokenized; scanning starts at each token whose
        lowercase form is in ``preparation.wh_words`` and runs forward until a
        terminator is met. A '?' ends the question: the tokens from the
        wh-word up to (not including) the '?' are joined with single spaces
        and '?' is appended. A '.' or '!' discards the candidate and scanning
        resumes after that terminator.

        Args:
            text: Raw text to search. ``None`` or an empty string yields
                ``None``.

        Returns:
            The reconstructed question string, or ``None`` if there is none.
        """
        if not text:
            return None

        tokens = nltk.word_tokenize(text)
        st = 0
        while st < len(tokens):
            if tokens[st].lower() in preparation.wh_words:
                end = st + 1
                while end < len(tokens):
                    if tokens[end] == '?':
                        return " ".join(tokens[st:end]) + "?"
                    elif tokens[end] in {".", "!"}:
                        # Candidate ended without '?': resume after the terminator.
                        st = end
                        break
                    end += 1
                else:
                    # Reached the end of the tokens without meeting '?', '.' or
                    # '!': no '?' is left, so no later wh-word can succeed.
                    return None
            st += 1
        return None
    
    @staticmethod
    def get_first_sentence_with_question(text):
        """Return the first sentence of ``text`` that looks like a wh-question.

        A sentence qualifies when its lowercased word tokens contain at least
        one entry of ``preparation.wh_words`` and the token '?'. The sentence
        is cut right after its first '?' and passed through
        ``utils.fix_spacing``.

        Args:
            text: Raw text to search. ``None`` or an empty string yields
                ``None``.

        Returns:
            The truncated, spacing-fixed sentence, or ``None`` if no sentence
            qualifies.
        """
        if not text:
            return None

        wh_words = set(preparation.wh_words)
        for sentence in nltk.sent_tokenize(text):
            sent_tokens = {t.lower() for t in nltk.word_tokenize(sentence)}
            # NOTE(review): the wh-word is looked up anywhere in the sentence,
            # so it may sit after the first '?' at which the result is cut.
            if "?" in sent_tokens and not wh_words.isdisjoint(sent_tokens):
                return utils.fix_spacing(sentence[:(sentence.index("?") + 1)])
        return None

    def generate_statement(self, text, relation, main_nucleus_text, verbose=False):
        """Build a ``defs.Statement`` for a Topic-Comment relation.

        Args:
            text: Full text that the relation's segments index into.
            relation: Relation to process; its ``type`` must equal
                ``self.relation_type``.
            main_nucleus_text: Text of the main nucleus, searched for a
                question when the satellite is not one itself.
            verbose: When true, progress messages are emitted through
                ``utils.print_if_verbose``.

        Returns:
            A ``defs.Statement`` carrying only ``statement_text``, ``rule`` and
            ``reason`` (all other fields are ``None``), or ``None`` when the
            extended info cannot be prepared or no question is found.

        Raises:
            AssertionError: If ``relation`` is ``None`` or has another type.
        """
        assert relation is not None and relation.type == self.relation_type, (
            "expected a %s relation" % self.relation_type
        )
        info = preparation.Preprocessor.prepare_extended_info(text, relation, verbose)
        if info is None:
            utils.print_if_verbose("Extended info preparation wasn't successful.", verbose)
            return None

        satellite_segment = info.satellite_info.segment
        cleaned_s = preparation.clean(
            text[satellite_segment.start:satellite_segment.end]
        )
        s_tokens = utils.lowercase_and_tokenize(cleaned_s)

        reason = None
        if (
            len(s_tokens) >= 2
                and s_tokens[0] in preparation.wh_words
                and s_tokens[-1] == '?'
        ):
            # The satellite itself is the question.
            reason = self.reasons["S_HAS_WH_WORD_AND_?"]
            statement_text = cleaned_s
        else:
            # Fall back to a question inside the main nucleus, if any.
            statement_text = self.get_first_sentence_with_question(
                main_nucleus_text
            )
            if statement_text is not None:
                reason = self.reasons["QUESTION_IN_MAIN_N"]

        if reason is None:
            return None

        utils.print_if_verbose(reason.explanation, verbose)
        return defs.Statement(
            statement_text=statement_text,
            nucleus=None,
            satellite_nucleus=None,
            left_boundary=None,
            right_boundary=None,
            nucleus_proximity=None,
            rule=self.name,
            reason=reason
        )
        
        
def test__RuleTopicComment01_get_first_question():
    """Check `RuleTopicComment01.get_first_question` on three sample texts."""
    expectations = [
        # '!' cancels the leading "Why"; the later question is returned.
        (
            "Why! Bla-bla-bla. What is your favourite colour? bla",
            "What is your favourite colour?",
        ),
        # A wh-word directly followed by '?' is already a question.
        (
            "Why ? Bla-bla-bla. What is your favourite colour? bla",
            "Why?",
        ),
        # No '?' anywhere: nothing to return.
        (
            "Why ! Bla-bla-bla. What is your favourite colour. bla",
            None,
        ),
    ]

    for sample, expected in expectations:
        found = RuleTopicComment01.get_first_question(sample)
        if expected is None:
            assert found is None
        else:
            assert found == expected
    
    
def test__RuleTopicComment01_get_first_sentence_with_question():
    """Check `RuleTopicComment01.get_first_sentence_with_question`.

    Covers a wh-word sentence without '?', a proper wh-question, and a
    sentence with neither a wh-word nor '?'.
    """
    # wh-word present but the sentence ends with '!', so it is not a question.
    assert (
        RuleTopicComment01.get_first_sentence_with_question(
            "Bla-bla-bla. Tell me what your favourite colour is! bla"
        ) is None
    )

    # wh-word and '?' in the same sentence: that sentence is returned.
    assert (
        RuleTopicComment01.get_first_sentence_with_question(
            "Bla-bla-bla. What is your favourite colour? bla"
        ) == "What is your favourite colour?"
    )

    # Neither a wh-word nor '?': nothing to return.
    assert (
        RuleTopicComment01.get_first_sentence_with_question(
            "Bla-bla-bla. Yellow is her favourite colour. bla"
        ) is None
    )
    
    
# Run both self-checks at module level, i.e. every time this cell is executed.
test__RuleTopicComment01_get_first_question()
test__RuleTopicComment01_get_first_sentence_with_question()

In [8]:
if __name__ == "__main__" and "__file__" not in globals():
    # Demo of the rule on one parsed tree. The guard requires `__name__` to be
    # "__main__" and no `__file__` global to exist (presumably: the notebook is
    # being run interactively rather than imported -- TODO confirm).
    rule = RuleTopicComment01()

    with open("../parsed/race/train/middle/6121.txt.tree", "rt") as f:
        tree_text = f.read()
    
    # Drop the "<s>" and "<P>" markers before handing the tree to the readers.
    cleaned_tree_text = tree_text.replace("<s>", "").replace("<P>", "")
    
    text, relations = relation_extraction.read_relations(
        cleaned_tree_text
    )
    
    root = relation_extraction.read_relation_tree(cleaned_tree_text)
    assert root is not None
    # Only the second item of the returned pair (the segment) is used here.
    _, main_nucleus_segment = root.get_first_nucleus()
    main_nucleus_text = text[
        main_nucleus_segment.start:main_nucleus_segment.end
    ]
    
    # Take the first Topic-Comment relation and print the text it spans.
    tc = relations["Topic-Comment"][0]
    print(text[tc.left.start:tc.right.end])
    
    statement =rule.generate_statement(text, tc, main_nucleus_text, verbose=True)
    print("\nRESULT:")
    # When no statement was produced, only the header above is printed.
    if statement is not None:
        print(json.dumps(statement._asdict(), indent=2))

Have you ever heard of e-waste ( electronic waste , ) , which can be produced every day ?  How do you deal with your computers , MP4 players and mobile phones when they 're broken or you want a new one ?  
Nucleus is on the left.
Nucleus's depth <= 3.
Parsing result:
(ROOT
  (SQ (VBP Have)
    (NP
      (NP (PRP you))
      (SBAR
        (S
          (VP
            (ADVP (RB ever))
            (VBD heard)
            (PP (IN of)
              (NP
                (NP
                  (NP (NN e-waste))
                  (PRN (-LRB- -LRB-)
                    (ADVP (JJ electronic))
                    (NP (NN waste))
                    (, ,))
                  (-RRB- -RRB-))
                (, ,)
                (SBAR
                  (WHNP (WDT which))
                  (S
                    (VP (MD can)
                      (VP (VB be)))))))))))
    (VP (VBN produced)
      (NP-TMP (DT every) (NN day)))
    (. ?)))

Constituencies:
      type  start  end  depth
0      VBP      0  

In [9]:
if __name__ == "__main__" and "__file__" not in globals():
    # Demo of the rule on a second parsed tree. The guard requires `__name__`
    # to be "__main__" and no `__file__` global to exist (presumably: the
    # notebook is being run interactively rather than imported -- TODO confirm).
    rule = RuleTopicComment01()

    with open("../parsed/race/train/middle/6814.txt.tree", "rt") as f:
        tree_text = f.read()
    
    # Drop the "<s>" and "<P>" markers before handing the tree to the readers.
    cleaned_tree_text = tree_text.replace("<s>", "").replace("<P>", "")
    
    text, relations = relation_extraction.read_relations(
        cleaned_tree_text
    )
    
    root = relation_extraction.read_relation_tree(cleaned_tree_text)
    assert root is not None
    # Only the second item of the returned pair (the segment) is used here.
    _, main_nucleus_segment = root.get_first_nucleus()
    main_nucleus_text = text[
        main_nucleus_segment.start:main_nucleus_segment.end
    ]
    
    # Take the first Topic-Comment relation and print the text it spans.
    tc = relations["Topic-Comment"][0]
    print(text[tc.left.start:tc.right.end])
    
    statement =rule.generate_statement(text, tc, main_nucleus_text, verbose=True)
    print("\nRESULT:")
    # When no statement was produced, only the header above is printed.
    if statement is not None:
        print(json.dumps(statement._asdict(), indent=2))

If you put together one group of sheep and another group of sheep , how many groups of sheep do you have ? "  " Why !  That 's an easy question .  
Nucleus is on the left.
Nucleus's depth <= 3.
Nucleus's left subsegment is not nucleus.
Will use the whole segment.
Satellite's nucleus is on the left.
Nuclei proximity is NucleusProximity.NEAR
Satellite's (left) nucleus contains '.', '!', '?', or ';'.
Will use the whole segment.
Text extracted from the satellite:
" Why!  That's an easy question.
There is a question in the main nucleus

RESULT:
{
  "statement_text": "Can you tell me why you are so clever?",
  "nucleus": null,
  "satellite_nucleus": null,
  "left_boundary": null,
  "right_boundary": null,
  "nucleus_proximity": null,
  "rule": "topic_comment_01",
  "reason": [
    2,
    "There is a question in the main nucleus"
  ]
}
