In [30]:
import collections
import enum
import re

In [31]:
from nltk.tokenize import word_tokenize

In [32]:
def print_if_verbose(text, verbose):
    if verbose:
        print(text)

In [33]:
clean_re = re.compile("\s+([.,?!'])")


def clean(string):
    cleaned = string.replace("<s>", "").replace("<P>", "")
    cleaned = clean_re.sub("\\1", cleaned)
    return cleaned.strip()

In [34]:
RelationPartInfo = collections.namedtuple(
    "RelationPartInfo", ["direction", "relation", "segment", "name"]
)


@enum.unique
class NucleusProximity(enum.Enum):
    NEAR = "near"
    FAR = "far"
    
    
class RelationInfo:
    def __init__(self, relation, verbose=False):
        assert relation is not None
        if relation.left.type == "N":
            print_if_verbose("Nucleus is on the left.", verbose)
            self.nucleus_info = RelationPartInfo(
                "right", relation.left_child, relation.left, "Nucleus"
            )
            self.satellite_info = RelationPartInfo(
                "left", relation.right_child, relation.right, "Satellite"
            )
        else:
            print_if_verbose("Nucleus is on the right.", verbose)
            self.nucleus_info = RelationPartInfo(
                "left", relation.right_child, relation.right, "Nucleus"
            )
            self.satellite_info = RelationPartInfo(
                "right", relation.left_child, relation.left, "Satellite"
            )

In [35]:
wh_words = {
    "what",
    "when",
    "where",
    "why",
    "which",
    "how"
}


class TextExtractor:
    Result = collections.namedtuple(
        "TextExtractor__Result", 
        ["case", "extracted_text"]
    )
    
    class Case(enum.Enum):
        NO_N = 0
        STOP_IN_N = 1
        NO_STOP_IN_N_WH_WORD = 2
        NO_STOP_IN_N_AND = 3
        NO_STOP_IN_N_OTHER = 4
        LEFT_SUBSEGMENT_NOT_N = 5
        
    messages = {
        Case.NO_N:
            "{0} is flat.",
        Case.STOP_IN_N: 
            "{0}'s (left) nucleus contains '.', '!', '?', or ';'.",
        Case.NO_STOP_IN_N_WH_WORD:
            "{0}'s (left) nucleus doesn't contain '.', '!', '?', or ';'."
            " {0}'s right subsegment starts with a wh-word or 'how'.",
        Case.NO_STOP_IN_N_AND:
            "{0}'s (left) nucleus doesn't contain '.', '!', '?', or ';'."
            " {0}'s right subsegment starts 'and' and its verb belongs to"
            " the same subject.",
        Case.NO_STOP_IN_N_OTHER:
            "{0}'s (left) nucleus doesn't contain '.', '!', '?', or ';'"
            " but none of the other two conditions is met.",
        Case.LEFT_SUBSEGMENT_NOT_N:
            "{0}'s left subsegment is not nucleus."
    }
    
    def extract_text(
        text, relation_part_info, verbose=False
    ):
        if relation_part_info.relation is None:
            case = TextExtractor.Case.NO_N
        else:
            left_subsegment = relation_part_info.relation.left
            if left_subsegment.type != "N":
                case = TextExtractor.Case.LEFT_SUBSEGMENT_NOT_N
            else:
                lss_tokens = nltk.tokenize.word_tokenize(
                    text[left_subsegment.start:left_subsegment.end]
                )
                if len({".", "!", "?", ";"}.intersection(set(lss_tokens))) == 0:
                    right_subsegment = relation_part_info.relation.right
                    rss_tokens = nltk.tokenize.word_tokenize(
                        text[right_subsegment.start:right_subsegment.end]
                    )
                    if rss_tokens[0].lower() in wh_words:
                        print_if_verbose(
                            "Satellite's right subsegment starts with a wh-word or 'how'.",
                            verbose
                        )
                        case = TextExtractor.Case.NO_STOP_IN_N_WH_WORD
                    else:
                        left_subsegment = relation_part_info.relation.left
                        if belong_to_one_vp(
                            text, 
                            left_subsegment.start, 
                            left_subsegment.end, 
                            right_subsegment.start, 
                            right_subsegment.end, 
                            verbose=verbose
                        ):
                            case = TextExtractor.Case.NO_STOP_IN_N_AND
                        else:
                            case = TextExtractor.Case.NO_STOP_IN_N_OTHER
                else:
                    case = TextExtractor.Case.STOP_IN_N

        print_if_verbose(TextExtractor.messages[case].format(relation_part_info.name), verbose)
        if case in {
    #             TextExtractor.Case.NO_STOP_IN_N_WH_WORD,
            TextExtractor.Case.NO_STOP_IN_N_AND  
        }:
            print_if_verbose("Will use only sub-nucleus.", verbose)
            extracted_text = text[
                left_subsegment.start:left_subsegment.end
            ]
        else:
            print_if_verbose("Will use the whole segment.", verbose)
            satellite_segment = relation_part_info.segment
            extracted_text = text[
                relation_part_info.segment.start:relation_part_info.segment.end
            ]

        return TextExtractor.Result(case, extracted_text)

In [42]:
PreparationResult = collections.namedtuple(
    "PreparationResult", ["used_search", "extraction_case", "prepared_text"]
)

class ExtendedRelationInfo(RelationInfo):
    def __init__(
        self, 
        rel_info, 
        nucleus_preparation_result,
        nucleus_proximity, 
        sn_relation, 
        sn_segment,
        satellite_preparation_result,
    ):
        RelationInfo.__init__(self, rel_info.nucleus_info, rel_info.satellite_info)
        self.nucleus_preparation_result = nucleus_preparation_result
        self.nucleus_proximity = nucleus_proximity
        self.sn_relation = sn_relation
        self.sn_segment = sn_segment
        self.satellite_preparation_result = satellite_preparation_result


class Preprocessor:
    SatelliteHandlingResult = collections.namedtuple(
        "SatelliteHandlingResult",
        ["preparation_result", "sn_relation", "sn_segment", "nucleus_proximity"]
    )
    
    MAX_DEPTH = 3
    
    def search_segment(text, relation, direction, verbose=False):
        assert relation is not None
        if direction == "left":
            child, segment = relation.left_child, relation.left
        else:
            child, segment = relation.right_child, relation.right
        print_if_verbose(text[segment.start:segment.end], verbose)
        if child:
            return search_segment(
                text, 
                child, 
                direction
            )
        else:
            return text[segment.start:segment.end]
    
    def get_depth(relation):
        if relation:
            return 1 + max(
                get_depth(relation.left_child), 
                get_depth(relation.right_child)
            )
        else:
            return 0
        
    def ends_with_a_punctuation_mark(text):
        return text[-1:] in {".", "!", "?", ";"}

    def is_ok(text, segment_type, verbose):
        if len(text) == 0:
            print_if_verbose(f"{segment_type} is empty.", verbose)
            return False
        elif not Preprocessor.ends_with_a_punctuation_mark(text):
            print_if_verbose(f"{segment_type} doesn't end with a punctuation mark.", verbose)
            return False
        return True
    
    def handle_nucleus(text, nucleus_info, verbose):
        if Preprocessor.get_depth(nucleus_info.relation) > Preprocessor.MAX_DEPTH:
            print_if_verbose(
                f"Nucleus's depth > {Preprocessor.MAX_DEPTH}.",
                verbose
            )
            used_search = True
            extraction_case = None
            n_text = Preprocessor.search_segment(
                        # take the closest nested segment relative to the satellite
                text, 
                nucleus_info.relation, 
                nucleus_info.direction,
                verbose
            )
        else:
            print_if_verbose(
                f"Nucleus's depth <= {Preprocessor.MAX_DEPTH}.",
                verbose
            )
            used_search = False
            n_text_rextraction_result = TextExtractor.extract_text(
                text, nucleus_info, verbose
            )
            extraction_case = n_text_rextraction_result.case
            n_text = n_text_rextraction_result.extracted_text
        
        cleaned_n_text = clean(n_text)       
        
        if not Preprocessor.is_ok(cleaned_n_text, "Nucleus", verbose):
            return None
        else:
            return PreparationResult(
                used_search=used_search,
                extraction_case=extraction_case,
                prepared_text=cleaned_n_text
            )
        
    def handle_satellite(text, satellite_info, nucleus_direction, verbose):
        if satellite_info.relation is None:
            print_if_verbose(
                f"Satellite doesn't have any nested relations."
            )
            return None
        else:
            sn_relation, sn_segment = satellite_info.relation.get_first_nucleus()
            
            # checking if the nucleus of the satellite is on the right/left
            if satellite_info.relation.left.type == "N":
                print_if_verbose("Satellite's nucleus is on the left.", verbose)
                if nucleus_direction == "right": # if the expl. nucleus is on the left
                    nucleus_proximity = NucleusProximity.NEAR
                else:
                    nucleus_proximity = NucleusProximity.FAR
            else:
                print_if_verbose("Satellite's nucleus is on the right.", verbose)
                if nucleus_direction == "right":
                    nucleus_proximity = NucleusProximity.FAR
                else:
                    nucleus_proximity = NucleusProximity.NEAR
            print_if_verbose(f"Nuclei proximity is {nucleus_proximity}", verbose)
    
    
            if Preprocessor.get_depth(sn_relation) > Preprocessor.MAX_DEPTH:
                print_if_verbose(
                    f"The depth of the satellite's nucleus > {Preprocessor.MAX_DEPTH}.",
                    verbose
                )
                used_search = True
                extraction_case = None
                s_text = Preprocessor.search_segment(
                            # take the closest nested segment relative to the satellite
                    text, 
                    sn_relation, 
                    satellite_info.direction,
                    verbose
                )
            else:
                print_if_verbose(
                    f"The depth of the satellite's nucleus <= {Preprocessor.MAX_DEPTH}.",
                    verbose
                )
                used_search = False
                s_text_rextraction_result = TextExtractor.extract_text(
                    text, satellite_info, verbose
                )
                extraction_case = s_text_rextraction_result.case
                s_text = s_text_rextraction_result.extracted_text
            
            cleaned_s_text = clean(s_text)       

            if not Preprocessor.is_ok(cleaned_s_text, "Satellite", verbose):
                return None
            else:
                return Preprocessor.SatelliteHandlingResult(
                    PreparationResult(
                        used_search=used_search,
                        extraction_case=extraction_case,
                        prepared_text=cleaned_s_text
                    ),
                    sn_relation,
                    sn_segment,
                    nucleus_proximity
                )
        
    def prepare_extended_info(text, relation, verbose=False):
        rel_info = RelationInfo(relation, verbose)

        nucleus_preparation_result = Preprocessor.handle_nucleus(
            text, rel_info.nucleus_info, verbose
        )
        if nucleus_preparation_result is None:
            return None
        else:
            satellite_handling_result = Preprocessor.handle_satellite(
                text, rel_info.satellite_info, rel_info.nucleus_info.direction, verbose
            )
            if satellite_handling_result is None:
                return None
            else:
                return ExtendedRelationInfo(
                    rel_info=rel_info,
                    nucleus_preparation_result=nucleus_preparation_result,
                    nucleus_proximity=satellite_handling_result.nucleus_proximity,
                    sn_relation=satellite_handling_result.sn_relation,
                    sn_segment=satellite_handling_result.sn_segment,
                    satellite_preparation_result=satellite_handling_result.preparation_result
                )

In [43]:
%run relation_extraction.ipynb
%run nlp.ipynb

text, relations = read_relations("""
    (Explanation[N][S]
            (Elaboration[N][S]
              _!I then came to China , a country!_ 
              _!I had always wanted to visit . <s>!_) 
            (Joint[N][N]
              _!I saw Beijing , of course ,!_
              _!and climbed up the Great Wall . <s>!_))
    """
)
extended_rel_info = Preprocessor.prepare_extended_info(
    text, relations["Explanation"][0], True
)
assert (
    extended_rel_info.s_text_extraction_result.case
        == TextExtractor.Case.NO_STOP_IN_N_AND
)

Nucleus is on the left.
Nucleus's depth <= 3.
Parsing result:
(ROOT
  (S
    (NP (PRP I))
    (ADVP (RB then))
    (VP (VBD came)
      (PP (TO to)
        (NP
          (NP (NNP China))
          (, ,)
          (NP
            (NP (DT a) (NN country))
            (SBAR
              (S
                (NP (PRP I))
                (VP (VBD had)
                  (ADVP (RB always))
                  (VP (VBN wanted)
                    (S
                      (VP (TO to)
                        (VP (VB visit))))))))))))
    (. .)))

Constituencies:
    type  start  end  depth
0    PRP      0    1      3
1     NP      0    1      2
2     RB      1    2      3
3   ADVP      1    2      2
4    VBD      2    3      3
5     TO      3    4      4
6    NNP      4    5      6
7     NP      4    5      5
8      ,      5    6      5
9     DT      6    7      7
10    NN      7    8      7
11    NP      6    8      6
12   PRP      8    9      9
13    NP      8    9      8
14   VBD      9   10    

AttributeError: 'NoneType' object has no attribute 's_text_extraction_result'

In [None]:
text, relations = read_relations("""
(Explanation[N][S]
      (Attribution[S][N] _!They knew!_ _!it would be easy . <s>!_)
      (Attribution[S][N]
        _!The professor had said!_
        (Elaboration[N][S]
          _!that they could bring any book or note!_
          _!they wanted . <s>!_)))
    """
)
extended_rel_info = prepare_extended_info(text, relations["Explanation"][0], True)
assert (
    extended_rel_info.s_text_extraction_result.case
        == TextExtractor.Case.LEFT_SUBSEGMENT_NOT_N
)