In [1]:
import re
import csv
import sys
import json
import math
import string
import spacy
import textacy
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from taxonerd import TaxoNERD
from fastcoref import spacy_component
from itertools import permutations, combinations
from spacy.matcher import Matcher, DependencyMatcher, PhraseMatcher
%run "./Main.ipynb"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global threshold for debug tracing. A trace line is printed only when the
# method was called with verbose=True AND VERBOSE_LEVEL is at least the level of
# that line (levels 1, 2 and 3 are used in this notebook; higher = more detail).
VERBOSE_LEVEL = 10

In [13]:
# Part-of-speech tags that are treated as noun-like throughout this notebook.
NOUNS = ["NOUN", "PROPN", "PRON"]

# Verb phrases for which the subject and object are never swapped, even when
# the surrounding tokens would otherwise trigger a swap.
DO_NOT_SWAP_PHRASES = ["resulted in", "results in"]

# Unit labels that a bound is allowed to be pushed onto (see push_bound).
PUSH_INCLUDE_LABELS = [Unit.LIST, Unit.ITEM, Unit.P_PHRASE]

In [4]:
def power_set(pool, current=None, result=None):
    """Collect every distinct subset of ``pool`` by depth-first expansion.

    Args:
        pool: Iterable of hashable items.
        current: Subset built so far; defaults to the empty set. Only meant to
            be supplied by the recursive calls.
        result: List that accumulates the subsets; defaults to a new list.
            When supplied it is extended in place and returned.

    Returns:
        The ``result`` list, holding each subset once, in discovery order
        (the empty set first).
    """
    # The signature defaults are None, so create the containers here; using
    # None directly made `current not in result` raise TypeError.
    if current is None:
        current = set()
    if result is None:
        result = []

    if current not in result:
        result.append(current)

    for item in pool:
        if item not in current:
            power_set(
                pool,
                current={*current, item},
                result=result
            )

    return result

def interleave(pool, current=None, used=None, skip=None, result=None):
    """Enumerate sequences that pick items from alternating groups of ``pool``.

    Each sequence is built one item at a time; two consecutive items never come
    from the same group, and a given (group, position) is used at most once per
    sequence.

    Args:
        pool: Sequence of groups (each group is a sequence of items).
        current: Sequence built so far; defaults to an empty list.
        used: ``(group_index, item_index)`` pairs already consumed by
            ``current``; defaults to an empty list.
        skip: Index of the group the previous item came from (it is skipped for
            the next pick); ``None`` means no group is skipped.
        result: List that accumulates the sequences; defaults to a new list.
            When supplied it is extended in place and returned.

    Returns:
        The ``result`` list with every distinct non-empty sequence, in
        discovery order.
    """
    # The signature defaults are None, so create the containers here; using
    # None directly made the membership tests / unpacking raise TypeError.
    if current is None:
        current = []
    if used is None:
        used = []
    if result is None:
        result = []

    if current and current not in result:
        result.append(current)

    for i, group in enumerate(pool):
        if i == skip:
            continue

        for j, item in enumerate(group):
            if (i, j) not in used:
                interleave(
                    pool,
                    current=[*current, item],
                    used=[*used, (i, j)],
                    skip=i,
                    result=result
                )

    return result

In [5]:
class Node:
    """A group of tokens forming one element of an event (e.g. subject, verb, object).

    Attributes:
        main: Shared pipeline object. This class reads ``trait.tokens``,
            ``species.tokens``, ``coref_map``, ``units.unit_map``, ``sp_doc``,
            ``entity_map`` and ``noun_chunk_map`` from it.
        tokens: Tokens that make up the node itself.
        is_action: Flag marking the node as the action (verb) of an event.
        other_tokens: Additional context tokens attached to the node.
    """

    def __init__(self, main, is_action=False, tokens=None):
        self.main = main
        self.tokens = tokens or []
        self.is_action = is_action
        self.other_tokens = []


    def start(self):
        """Return the smallest token index in ``tokens`` (``math.inf`` if empty)."""
        return min((token.i for token in self.tokens), default=math.inf)


    def end(self):
        """Return the largest token index in ``tokens`` (``-math.inf`` if empty)."""
        return max((token.i for token in self.tokens), default=-math.inf)


    def get_tokens(self):
        """Return the node's own tokens (the live list, not a copy)."""
        return self.tokens


    def get_all_tokens(self):
        """Return a new list with the node's tokens followed by its context tokens."""
        return [*self.tokens, *self.other_tokens]


    def get_traits(self):
        """Return the set of node/context tokens that are also in ``main.trait.tokens``."""
        tokens = set([*self.tokens, *self.other_tokens])
        return tokens & set(self.main.trait.tokens)


    def get_species(self):
        """Return the set of node/context tokens that are also in ``main.species.tokens``."""
        tokens = set([*self.tokens, *self.other_tokens])
        return tokens & set(self.main.species.tokens)


    def copy(self):
        """Return a new Node with copied token lists (the tokens themselves are shared)."""
        node = Node(self.main)
        node.tokens = [*self.tokens]
        node.is_action = self.is_action
        node.other_tokens = [*self.other_tokens]
        return node


    def __str__(self, nouns_only=False):
        """Render the distinct tokens, ordered by index, as comma-separated text.

        Args:
            nouns_only: When True, keep only tokens whose ``pos_`` is in NOUNS.
        """
        tokens = list(set([*self.tokens, *self.other_tokens]))
        if nouns_only:
            tokens = [t for t in tokens if t.pos_ in NOUNS]
        tokens = sorted(tokens, key=lambda token: token.i)
        return ",".join([t.text for t in tokens])


    @staticmethod
    def extend_token_ref(main, tokens):
        """Replace ``tokens`` by the coreference entry of its first pronoun.

        If a token with ``pos_ == "PRON"`` is found, the result is a new list
        built from ``main.coref_map[token]`` (or just that pronoun when it has
        no entry). Without a pronoun the input list is returned unchanged.
        """
        for token in tokens:
            if token.pos_ == "PRON":
                return [*main.coref_map.get(token, [token])]
        return tokens


    @staticmethod
    def extend_token_unit(main, tokens):
        """Replace ``tokens`` by the full span of an ITEM unit containing a token.

        For each position, the first unit in ``main.units.unit_map`` that both
        contains the token and carries the ``Unit.ITEM`` label replaces the
        whole token list with that unit's span from ``main.sp_doc``.
        """
        i = 0
        size = len(tokens)
        unit_map_items = main.units.unit_map.items()
        # `tokens` can be replaced by a span shorter than the original list, so
        # the index is bounded by the current length as well as the original
        # size; otherwise `tokens[i]` could raise IndexError.
        while i < size and i < len(tokens):
            token = tokens[i]
            for unit_bound, unit in unit_map_items:
                token_in_unit = unit_bound[0] <= token.i <= unit_bound[1]
                unit_is_valid = unit.label_has([Unit.ITEM])
                if token_in_unit and unit_is_valid:
                    tokens = [*main.sp_doc[unit_bound[0]:unit_bound[1]+1]]
                    break
            i += 1
        return tokens


    @staticmethod
    def extend_token_entity(main, tokens):
        """Append the ``main.entity_map`` entries of the given tokens.

        The list is extended in place and returned; only the tokens present on
        entry are looked up (appended tokens are not expanded again).
        """
        for token in list(tokens):
            if token in main.entity_map:
                tokens.extend([*main.entity_map[token]])
        return tokens


    @staticmethod
    def extend_token_noun_chunk(main, tokens):
        """Append the ``main.noun_chunk_map`` entries of the given tokens.

        The list is extended in place and returned; only the tokens present on
        entry are looked up (appended tokens are not expanded again).
        """
        for token in list(tokens):
            if token in main.noun_chunk_map:
                tokens.extend([*main.noun_chunk_map[token]])
        return tokens


    @staticmethod
    def extend_token(main, token):
        """Expand one token into its related tokens.

        Applies, in order, the coreference, unit, entity and noun-chunk
        expansions, then returns the distinct tokens sorted by index.
        """
        tokens = [token]
        tokens = Node.extend_token_ref(main, tokens)
        tokens = Node.extend_token_unit(main, tokens)
        tokens = Node.extend_token_entity(main, tokens)
        tokens = Node.extend_token_noun_chunk(main, tokens)
        tokens = list(set(tokens))
        tokens = sorted(tokens, key=lambda token: token.i)
        return tokens

In [20]:
class Event:
    """An ordered chain of nodes, rendered as ``(node)->(node)->...``.

    Attributes:
        main: Shared pipeline object; ``sp_doc.sents`` is read from it.
        order: The nodes of the event, in chain order.
    """

    def __init__(self, main):
        self.main = main
        self.order = []


    def start(self):
        """Return the smallest ``start()`` over all nodes (``math.inf`` if empty)."""
        return min((node.start() for node in self.order), default=math.inf)


    def end(self):
        """Return the largest ``end()`` over all nodes (``-math.inf`` if empty)."""
        return max((node.end() for node in self.order), default=-math.inf)


    def sent_i(self):
        """Return the position of the sentence containing ``start()``, or -1."""
        first = self.start()
        matches = (
            position
            for position, sent in enumerate(self.main.sp_doc.sents)
            if sent.start <= first < sent.end
        )
        return next(matches, -1)


    def get_tokens(self):
        """Return the nodes' own tokens, concatenated in chain order."""
        return [token for node in self.order for token in node.get_tokens()]


    def get_all_tokens(self):
        """Return the nodes' own and context tokens, concatenated in chain order."""
        return [token for node in self.order for token in node.get_all_tokens()]


    def get_traits(self):
        """Return the trait tokens of every node, concatenated in chain order."""
        return [token for node in self.order for token in node.get_traits()]


    def get_species(self):
        """Return the species tokens of every node, concatenated in chain order."""
        return [token for node in self.order for token in node.get_species()]


    def attach_event(self, event, del_end=False):
        """Append the nodes of ``event`` to this event and return ``self``.

        Args:
            event: Event whose nodes are appended (the node objects are shared,
                not copied).
            del_end: When True, this event's last node is removed first and the
                ADJ tokens among its context tokens are added to the tokens of
                the first node of ``event``.
        """
        if del_end:
            removed = self.order.pop()
            for token in removed.other_tokens:
                if token.pos_ == "ADJ":
                    event.order[0].tokens.append(token)
        self.order += event.order
        return self


    def copy(self):
        """Return a new Event whose nodes are copies of this event's nodes."""
        duplicate = Event(self.main)
        duplicate.order = [node.copy() for node in self.order]
        return duplicate


    def __str__(self):
        """Render the chain as parenthesised nodes joined by ``->``."""
        return "->".join(f"({node})" for node in self.order)

In [77]:
class EventManager:
    def __init__(self, main):
        self.main = main
        self.events = []
    
    
    
    def fix_triple_tokens(self, s_node, v_node, o_node, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"fix_triple_tokens")
        
        # Subject Tokens Transferred to Object
        s_transfer_tokens = []
        for token in s_node.tokens:
            if token.pos_ == "VERB":
                s_transfer_tokens.append(token)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Subject Tokens Transferred to Object: {s_transfer_tokens}")
        
        s_node.tokens = [t for t in s_node.tokens if t not in s_transfer_tokens]
        o_node.tokens.extend(s_transfer_tokens)
        
        # Object Tokens Transferred to Verb
        o_transfer_tokens = []
        for token in o_node.tokens:
            if token in main.cause.tokens or token in main.change.tokens:
                o_transfer_tokens.append(token)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Object Tokens Transferred to Verb: {s_transfer_tokens}")
        
        o_node.tokens = [t for t in o_node.tokens if t not in o_transfer_tokens]
        v_node.tokens.extend(o_transfer_tokens)

        return (s_node, v_node, o_node)
    
    
    
    def fix_triple_direction(self, s_node, v_node, o_node, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"fix_triple_direction")
        
        v_tokens = sorted(v_node.tokens, key=lambda t: t.i)
        
        # Do Not Swap Phrases
        # There are some phrases that would result in a swapping
        # of the subject and object node, but shouldn't. So,
        # we flag these phrases to prevent an incorrect swap.
        v_text = " ".join([token.lower_ for token in v_tokens])

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Verb Text: {v_text}")
        
        for phrase in DO_NOT_SWAP_PHRASES:
            if phrase in v_text:
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tVerb Text Contains: '{phrase}'")
                return (s_node, v_node, o_node)

        # Swap Subject and Object
        # If there is an AUX token before the verb ("was caused")
        # or an ADP token after the verb ("caused by"), we swap
        # the subject and object.
        v_tokens_l = v_tokens[+0]
        aux = v_tokens_l.nbor(-1) and v_tokens_l.nbor(-1).pos_ == "AUX"

        v_tokens_r = v_tokens[-1]
        adp = v_tokens_r.nbor(+1) and v_tokens_r.nbor(+1).pos_ == "ADP"

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"AUX: {aux}")
            print(f"ADP: {aux}")
            print(f"Swapped: {aux or adp}")
            
        if aux or adp:
            return (o_node, v_node, s_node)
        return (s_node, v_node, o_node)
            
    
    
    def fix_triple_other_tokens(self, s_node, v_node, o_node, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"fix_triple_other_tokens")
        
        s_node.other_tokens = flatten([Node.extend_token(main, t) for t in s_node.tokens])
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Subject Node Other Tokens: {s_node.other_tokens}")
        
        o_node.other_tokens = flatten([Node.extend_token(main, t) for t in o_node.tokens])
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Object Node Other Tokens: {o_node.other_tokens}")
        
        return (s_node, v_node, o_node)
    
    
    
    def push_bound_l(self, bound, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"push_bound_l")
        
        return self.push_bound(
            bound, 
            reverse=True,
            update_bound_fnc=lambda bound, unit: min(bound, unit.l),
            verbose=verbose
        )


    
    def push_bound_r(self, bound, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"push_bound_r")
        
        return self.push_bound(
            bound, 
            reverse=False,
            update_bound_fnc=lambda bound, unit: max(bound, unit.r),
            verbose=verbose
        )


    
    def push_bound(self, bound, reverse=False, update_bound_fnc=None, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"push_bound")
        
        units = self.main.units.units_at_i(bound)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Units: {units}")
        
        if not units:
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"No Units, Returned: {bound}")
            
            return bound

        start = False
        unit_map_values = list(main.units.unit_map.values())
        unit_map_values = sorted(unit_map_values, key=lambda u: (u.l, u.r), reverse=reverse)

        i = 0
        while i < len(unit_map_values):
            unit = unit_map_values[i]
            
            # Units must be in the same sent
            # as the bound (token position).
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"Checking Unit: {unit}")
                print(f"*Unit Start: {unit.sent_start()}")
                print(f"*Token Unit Start: {units[0].sent_start()}")
                
            if units[0].sent_start() != unit.sent_start():
                i += 1
                continue

            # The bound (token position) must
            # be in the unit.
            if unit not in units:
                i += 1
                continue

            if verbose and VERBOSE_LEVEL >= 1:
                print(f"Starting Unit: {unit}")
            
            bound = update_bound_fnc(bound, unit)
            
            j = i + 1
            while j < len(unit_map_values) and unit_map_values[j] not in units:
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tUnit: {unit_map_values[j]}")
                    print(f"\t*Unit Start: {unit_map_values[j].sent_start()}")
                    print(f"\t*Token Unit Start: {units[0].sent_start()}")

                if units[0].sent_start() != unit_map_values[j].sent_start():
                    j += 1
                    continue
                
                # If we see punctuation we break, we don't want to
                # overstep any boundaries.
                if unit_map_values[j].lower() in string.punctuation:
                    if verbose and VERBOSE_LEVEL >= 2:
                        print(f"\t\tBreak, Punctuation Found")
                    break

                if unit_map_values[j].label_has(PUSH_INCLUDE_LABELS):
                    bound = update_bound_fnc(bound, unit_map_values[j])
                    if verbose and VERBOSE_LEVEL >= 2:
                        print(f"\t\tBreak, Unit Found: {unit_map_values[j]}")
                    break
                
                j += 1
            
            break

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Returned: {bound}")
        
        return bound
    
    

    def find_verb_tokens(self, tokens, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"find_verb_tokens")
        
        tokens = sorted(tokens, key=lambda t: t.i)
        verb_tokens = []

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Tokens: {tokens}")
        
        i = 0
        while i < len(tokens):
            token = tokens[i]

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tToken ({token.pos_}): {token}")
                print(f"\tNext Token ({token.nbor() and token.nbor().pos_}): {token.nbor()}")

            if token.pos_ != "VERB":
                if token.pos_ != "AUX" or token.nbor().pos_ != "VERB":
                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tNot a Verb, Continue")
                    
                    i += 1
                    continue
            
            verb_tokens.append(token)
            
            i += 1
            while i < len(tokens) and tokens[i].pos_ in [
                "VERB", 
                "ADV", 
                "AUX", 
                "ADP"
            ]:
                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tAdd Verb Token: {tokens[i]}")
                
                verb_tokens.append(tokens[i])
                i += 1

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Returned: {verb_tokens}")
        
        return verb_tokens
        
    
    
    def split_tokens(self, tokens, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"split_tokens")
        
        # L Bound
        l = tokens[0].i

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"l: {l}")

        # M Bound
        # This is determined by the start and end verbs.
        verb_tokens = self.find_verb_tokens(tokens, verbose=verbose)
        v_l = verb_tokens[+0].i # Do Not Ask
        v_r = verb_tokens[-1].i

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"v_l: {v_l}")
            print(f"v_r: {v_r}")

        # R Bound
        # We look for the first noun after the last verb.
        i = tokens.index(verb_tokens[-1]) + 1
        while i < len(tokens) and tokens[i].pos_ not in NOUNS:
            i += 1

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"i: {i}")
        
        no_noun_r = i <= 0 or i >= len(tokens)
        r = v_r if no_noun_r else tokens[i].i
            
        no_noun_l = not set([token.pos_ for token in self.main.sp_doc[l:v_l]]) & set(NOUNS)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"No Noun R: {no_noun_r}")
            print(f"No Noun L: {no_noun_l}")

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Returned: {((l, v_l, v_r, v_r), (no_noun_l, no_noun_r))}")
        return ((l, v_l, v_r, v_r), (no_noun_l, no_noun_r))

                
        # if no_noun_r and no_noun_l:
        #     split = (-1, -1, -1, -1)
        # elif no_noun_r:
        #     split = (l, v_l, v_r, v_r)
        # else: 
        #     r = tokens[i].i
        #     if no_noun_l:
        #         split = (v_l, v_l, v_r, r)
        #     else:
        #         split = (l, v_l, v_r, r)

        # if verbose and VERBOSE_LEVEL >= 1:
        #     print(f"Returned: {split}")
        
        # return split
    
    
    
    def convert_tokens_no_verb(self, tokens, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"convert_tokens_no_verb")
        
        speech = [token.pos_ for token in tokens]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Speech: {speech}")
        
        # ADP and SCONJ tokens sometimes indicate
        # a cause-and-effect relationship, which is
        # important.
        if speech[0] in ["ADP", "SCONJ"]:
            event = Event(self.main)
            event.order = [Node(self.main, tokens=tokens)]

            if verbose and VERBOSE_LEVEL >= 1:
                print(f"Returned: {event}")
            
            return [event]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Returned: {None}")
        
        return []
    
    
    
    def convert_tokens_skewed_verb(self, tokens, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"convert_tokens_skewed_verb")
            print(f"Tokens: {tokens}")
        
        # Verb Node
        v_node = Node(self.main, is_action=True)
        v_node.tokens = self.find_verb_tokens(tokens)

        # Object Node
        o_node = Node(self.main)
        o_node.tokens = [token for token in tokens if token not in v_node.tokens]

        # Check for Species, Traits
        if not o_node.get_species() and not o_node.get_traits():
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"No Species or Traits, Returned []")
            return []

        event = Event(self.main)
        event.order = [v_node, o_node]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Returned: {event}")
        
        return [event]
        
    
    
    def convert_tokens_multiple_verbs(self, tokens, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"convert_tokens_multiple_verb")
        
        tokens = [token for token in tokens if token.text not in string.punctuation]

        # Number of Verbs
        speech = [token.pos_ for token in tokens]
        number_verbs = speech.count("VERB")
        
        token_units = self.main.units.units_at_i(tokens[0].i)

        # Example:
        # The X increased and the Y decreased.
        # We'd split the list into items and convert
        # "The X increased" and "the Y decreased".
        unit_lists = [unit for unit in token_units if unit.label_has([Unit.LIST])]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Unit Lists: {[l.__str__() for l in unit_lists]}")
        
        if len(unit_lists) == 1:
            unit_list = unit_lists[0]

            # Check if Equal Distribution
            # of Verbs
            number_verbs_in_items = []
            for child in unit_list.children:
                if not child.label_has([Unit.ITEM]):
                    continue
                item_speech = [token.pos_ for token in child.span()]
                number_verbs_in_item = item_speech.count("VERB")
                if number_verbs_in_item != 1:
                    number_verbs_in_items = []
                    break
                number_verbs_in_items.append(number_verbs_in_item)

            # Indirectly contains the number of 
            # items with 1 verb.
            number_verbs_in_items = [n for n in number_verbs_in_items if n]

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tNumber Verbs: {number_verbs}")
                print(f"\tNumber Verbs in Items: {number_verbs_in_items}")
                
            # Split by Item
            if len(number_verbs_in_items) == number_verbs:
                events = []
                for child in unit_list.children:
                    if not child.label_has([Unit.ITEM]):
                        continue

                    if verbose and VERBOSE_LEVEL >= 2:
                        print(f"\tItem: {child}")
                        print(f"\tItem Tokens: {list(child.span())}")
                        
                    item_events = self.convert_tokens(list(child.span()), verbose=verbose)
                    if item_events is not None:
                        events.extend(item_events)

                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Returned:")
                    for event in events:
                        print(f"\t{event}")
                
                return events
        # Skip
        # This is overfitting the problem, so I'll leave it here for now.
        # I may come back to it later, but I'll lump everything together
        # for now.
        # Example:
        # The X and Y increased and decreased, respectively.
        # We'd attach the X with increased and the Y with decreased.
        # I will work on this later, I need to fix the lists and
        # figure out other things.
        # elif len(unit_lists) == 2:
        #     list_a = unit_lists[0]
        #     list_b = unit_lists[1]

        #     list_a_items = [child for child in list_a.children if child.label_has([Unit.ITEM])]
        #     list_b_items = [child for child in list_b.children if child.label_has([Unit.ITEM])]
            
        #     if len(list_a_items) == len(list_b_items):
        #         list_a_tokens = list(list_a.span())
        #         list_a_speech = [token.pos_ for token in list_a_tokens]
        #         list_a_number_verbs = list_a_speech.count("VERB")
                
        #         list_b_tokens = list(list_b.span())
        #         list_b_speech = [token.pos_ for token in list_b_tokens]
        #         list_b_number_verbs = list_b_speech.count("VERB")

        #         noun_list = None
        #         verb_list = None
                
        #         if list_a_number_verbs == number_verbs and not list_b_number_verbs:
        #             noun_list = list_b
        #             verb_list = list_a
        #         else:
        #             noun_list = list_a
        #             verb_list = list_b

        #         list_a_items = sorted(list_a_items, key=lambda u: u.l)
        #         list_b_items = sorted(list_b_items, key=lambda u: u.l)

        #         events = []
        #         for i in range(len(list_a_items)):
        #             # Here, I'd have connect each item together,
        #             # but again, missing the point, focusing too
        #             # much on the details.
        #             pass

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Returned: {None}")
            
        return None
    
    
    
    def convert_tokens(self, tokens, split=None, verbose=False):
        if verbose and VERBOSE_LEVEL >=1:
            print(f"convert_tokens")

        tokens = sorted(tokens, key=lambda t: t.i)
        tokens = [token for token in tokens if token.text not in string.punctuation]
        
        if verbose and VERBOSE_LEVEL >=1:
            print(f"Tokens: {tokens}")
        
        if not tokens:
            if verbose and VERBOSE_LEVEL >=1:
                print(f"No Tokens, Returned []")
            return []
        
        # Check Verbs
        speech = [t.pos_ for t in tokens]
        number_verbs = speech.count("VERB")

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Speech: {speech}")
            print(f"Number Verbs: {number_verbs}")
        
        if number_verbs == 0:
            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tNo Verbs")
            
            events = self.convert_tokens_no_verb(tokens, verbose=verbose)
            if events is not None:
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Returned:")
                    for event in events:
                        print(f"\t{event}")
                
                return events
        
        elif number_verbs > 1:
            
            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tMultiple Verbs")
                
            events = self.convert_tokens_multiple_verbs(tokens, verbose=verbose)
            if events is not None:    
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Returned:")
                    for event in events:
                        print(f"\t{event}")
                
                return events
        
        if len(tokens) <= 2:
            return []

        # Check Split
        (l, v_l, v_r, r), (no_noun_l, no_noun_r) = split or self.split_tokens(tokens, verbose=verbose)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Split: ({l}, {v_l}, {v_r}, {r})")
        
        if no_noun_l and no_noun_r:
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"No Split, Returned []")
            
            return []
        elif no_noun_l:
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"Skewed Right")

            r = self.push_bound_r(r, verbose=verbose)
            events = self.convert_tokens_skewed_verb(self.main.sp_doc[l:r+1], verbose=verbose)
            
            if events is not None:
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Returned:")
                    for event in events:
                        print(f"\t{event}")
                
                return events
        elif no_noun_r:
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"Skewed Left")

            l = self.push_bound_l(l, verbose=verbose)
            events = self.convert_tokens_skewed_verb(self.main.sp_doc[l:r+1], verbose=verbose)
            
            if events is not None:
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Returned:")
                    for event in events:
                        print(f"\t{event}")
                
                return events

        # Main
        l = self.push_bound_l(l, verbose=verbose) # Do Not Ask
        r = self.push_bound_r(r, verbose=verbose)
        
        v_node = Node(self.main, is_action=True)
        v_node.tokens = [self.main.sp_doc[i] for i in range(v_l, v_r+1)]
        
        s_node = Node(self.main)
        s_node.tokens = [self.main.sp_doc[i] for i in range(l, v_l)]
            
        o_node = Node(self.main)
        o_node.tokens = [self.main.sp_doc[i] for i in range(v_r+1, r+1)]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Subject: {s_node}")
            print(f"Verb: {v_node}")
            print(f"Object: {o_node}")

        s_node, v_node, o_node = self.fix_triple_direction(s_node, v_node, o_node, verbose=verbose)
        s_node, v_node, o_node = self.fix_triple_tokens(s_node, v_node, o_node, verbose=verbose)
        s_node, v_node, o_node = self.fix_triple_other_tokens(s_node, v_node, o_node, verbose=verbose)

        s_node_valid = s_node.get_species() or s_node.get_traits()
        o_node_valid = o_node.get_species() or o_node.get_traits()
        if not s_node_valid and not o_node_valid:
            return []
        
        event = Event(self.main)
        event.order = [s_node, v_node, o_node]

        events = [event]
        events.extend(self.convert_tokens(s_node.tokens, verbose=verbose))
        events.extend(self.convert_tokens(o_node.tokens, verbose=verbose))
        events = [event for event in events if event]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Returned:")
            for event in events:
                print(f"\t{event}")
        
        return events


    
    def convert_unit(self, unit, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print("convert_unit")
        
        tokens = [*self.main.sp_doc[unit.l:unit.r+1]]
        
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Tokens: {tokens}")
        
        return self.convert_tokens(tokens)
    
    

    def convert_svo_triple(self, triple, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print("convert_svo_triple")
        
        tokens = [*triple.subject, *triple.object]
        tokens = sorted(tokens, key=lambda t: t.i)
    
        l = tokens[+0].i
        v_l = triple.verb[+0].i
        v_r = triple.verb[-1].i
        r = tokens[-1].i

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Split: ({l}, {v_l}, {v_r}, {r})")
        
        tokens = [*self.main.sp_doc[l:r+1]]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Tokens: {tokens}")
        
        return self.convert_tokens(tokens, split=(l, v_l, v_r, r))
    
    
    
    def distinct_events(self, events, verbose=False):
        if verbose and VERBOSE_LEVEL >= 1:
            print(f"distinct_events")
        
        event_bounds_mapped = {}
        for event in events:
            event_bound = (event.start(), event.end())
            if event_bound in event_bounds_mapped:
                if len(event.order) > len(event_bounds_mapped[event_bound].order):
                    event_bounds_mapped[event_bound] = event
            else:
                event_bounds_mapped[event_bound] = event
        
        event_bounds = event_bounds_mapped.keys()   

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Event Bounds: {event_bounds}")
        
        distinct_event_bounds = distinct_bounds(event_bounds, larger=True)
        distinct_events = [event_bounds_mapped[b] for b in distinct_event_bounds]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Distinct Event Bounds: {distinct_event_bounds}")
        
        return distinct_events
    

    
    def find_events(self, verbose=False):
        """
        Gather candidate events for every sentence of the document.

        Two sources are used: the token groups yielded by
        `self.main.units.aggregate_units()` (turned into events by
        `convert_tokens`) and the subject-verb-object triples textacy
        extracts from `self.main.sp_doc` (turned into events by
        `convert_svo_triple`). Events are filed under the `start` of the
        sentence they belong to, and each sentence's list is finally passed
        through `distinct_events`.

        Parameters:
            verbose: when truthy, progress is printed depending on
                VERBOSE_LEVEL; also forwarded to the converters.

        Returns:
            dict mapping each sentence `start` to the output of
            `distinct_events` for the events filed under it.
        """
        def report(found):
            # Shared level-2 dump of whatever a converter produced.
            if not (verbose and VERBOSE_LEVEL >= 2):
                return
            if found:
                print(f"\tEvents:")
                for found_event in found:
                    print(f"\t\tEvent: {found_event}")
            else:
                print(f"\tNo Events")

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"find_events")

        # One bucket per sentence, keyed by the sentence's start index.
        buckets = {}
        for sent in list(self.main.sp_doc.sents):
            buckets[sent.start] = []

        # Units
        for unit_tokens in self.main.units.aggregate_units():
            if verbose and VERBOSE_LEVEL >= 2:
                print(f"Finding Events in Tokens: {unit_tokens}")

            unit_events = self.convert_tokens(unit_tokens, verbose=verbose)
            report(unit_events)

            # The first token of the unit decides which sentence it is filed under.
            buckets[unit_tokens[0].sent.start].extend(unit_events)

        # Triples
        for triple in textacy.extract.subject_verb_object_triples(self.main.sp_doc):
            if verbose and VERBOSE_LEVEL >= 2:
                print(f"Finding Events in Triple: {triple}")

            triple_events = self.convert_svo_triple(triple, verbose=verbose)
            report(triple_events)

            # The first verb token decides which sentence the triple is filed under.
            buckets[triple.verb[0].sent.start].extend(triple_events)

        distinct = {}
        for sent_start, sent_events in buckets.items():
            distinct[sent_start] = self.distinct_events(sent_events)
        return distinct
    
    
    
    def overlap_in_species(self, x, y, verbose=False):
        """
        Tell whether `x` and `y` mention a common species.

        Every token from `x.get_species()` and `y.get_species()` is mapped
        through `self.main.species.span_at_token`; the result is True as
        soon as `self.main.species.find_same_species` reports a match for
        one of x's spans within y's spans.

        Parameters:
            x, y: objects exposing `get_species()`.
            verbose: when truthy and VERBOSE_LEVEL >= 1, trace output is
                printed; also forwarded to `find_same_species`.

        Returns:
            True if any span of `x` is matched in `y`, otherwise False.
        """
        chatty = verbose and VERBOSE_LEVEL >= 1
        if chatty:
            print(f"overlap_in_species")

        spans_x = [self.main.species.span_at_token(tok) for tok in x.get_species()]
        spans_y = [self.main.species.span_at_token(tok) for tok in y.get_species()]

        if chatty:
            print(f"sp_X: {spans_x}")
            print(f"sp_Y: {spans_y}")

        # any() stops at the first truthy lookup, like an early return would.
        shared = any(
            self.main.species.find_same_species(spans_y, span, verbose=verbose)
            for span in spans_x
        )

        if chatty:
            print(f"Returned {shared}")

        return shared
    
    

    def overlap_in_traits(self, x, y, verbose=False):
        """
        Tell whether `x` and `y` share at least one trait lemma.

        Traits are compared by lower-cased `lemma_`.

        Parameters:
            x, y: objects exposing `get_traits()`, whose items carry a
                `lemma_` string.
            verbose: when truthy and VERBOSE_LEVEL >= 1, the two lemma sets
                and the verdict are printed.

        Returns:
            True if the two lemma sets intersect, otherwise False.
        """
        chatty = verbose and VERBOSE_LEVEL >= 1
        if chatty:
            print(f"overlap_in_traits")

        tr_X = set([item.lemma_.lower() for item in x.get_traits()])
        tr_Y = set([item.lemma_.lower() for item in y.get_traits()])
        shared = not tr_X.isdisjoint(tr_Y)

        if chatty:
            print(f"tr_X: {tr_X}")
            print(f"tr_Y: {tr_Y}")
            print(f"Returned {shared}")

        return shared
        

    
    def overlap_in_tokens(self, x, y, verbose=False):
        """
        Tell whether the `other_tokens` of `x` and `y` share a noun string.

        Both token lists are reduced with `get_non_species_nouns` and the
        resulting strings are compared as sets.

        Parameters:
            x, y: objects exposing an `other_tokens` collection.
            verbose: when truthy and VERBOSE_LEVEL >= 1, the two sets and
                the verdict are printed.

        Returns:
            True if the two sets intersect, otherwise False.
        """
        chatty = verbose and VERBOSE_LEVEL >= 1
        if chatty:
            print(f"overlap_in_tokens")

        x_nouns = set(self.get_non_species_nouns(x.other_tokens))
        y_nouns = set(self.get_non_species_nouns(y.other_tokens))
        shared = not x_nouns.isdisjoint(y_nouns)

        if chatty:
            print(f"x_nouns: {x_nouns}")
            print(f"y_nouns: {y_nouns}")
            print(f"Returned {shared}")

        return shared


    
    def get_non_species_nouns(self, tokens):
        """
        List the surface and lemma strings of the non-species nouns in `tokens`.

        A token is kept when it is not contained in
        `self.main.species.tokens` and its `pos_` is "PROPN" or "NOUN". Each
        kept token contributes two strings, in this order: `lower_` and the
        lower-cased `lemma_`.

        Parameters:
            tokens: iterable of token-like objects with `pos_`, `lower_`
                and `lemma_`.

        Returns:
            list of str (duplicates are not removed).
        """
        collected = []

        for tok in tokens:
            # Species membership is tested first; the POS is only read for non-species tokens.
            if tok not in self.main.species.tokens and tok.pos_ in ["PROPN", "NOUN"]:
                collected += [tok.lower_, tok.lemma_.lower()]

        return collected
    
    
    
    def has_cause_phrase(self, x, y, verbose=False):
        """
        Tell whether `x` opens with a cause marker and starts before `y`.

        `x` counts as a cause phrase when the first of its tokens is tagged
        "ADP" or "SCONJ". An `x` without any tokens is never a cause phrase
        (previously this raised IndexError).

        Parameters:
            x: object exposing `get_tokens()` and `start()`.
            y: object exposing `start()`.
            verbose: when truthy and VERBOSE_LEVEL >= 1, the intermediate
                values are printed.

        Returns:
            True when both conditions hold, otherwise False.
        """
        if verbose and VERBOSE_LEVEL >= 1:
            print("has_cause_phrase")

        speech = [token.pos_ for token in x.get_tokens()]
        # Guard the empty case: there is no first token to inspect.
        cause = bool(speech) and speech[0] in ["ADP", "SCONJ"]
        order = x.start() < y.start()

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Speech: {speech}")
            print(f"Cause ({bool(cause)}): {cause}")
            print(f"Order ({bool(order)}): {order}")
            print(f"Returned: {cause and order}")

        return cause and order
    
    
    
    def has_effect_phrase(self, x, y, verbose=False):
        """
        Tell whether `y` reads as an effect following `x`.

        The answer is truthy when either
          * `y` contains an effect marker (a "PRON" or "SCONJ" tag, or one of
            the words "therefore", "cause", "result", "so") and `x` starts
            before `y`, or
          * at most one position separates the end of `x` from the start of
            `y` (i.e. `y.start() - x.end() - 1` is 0 or 1).

        Parameters:
            x: object exposing `start()` and `end()`.
            y: object exposing `get_tokens()` and `start()`.
            verbose: when truthy and VERBOSE_LEVEL >= 1, the intermediate
                values are printed.

        Returns:
            Result of `(cause and order) or 0 <= space <= 1`.
        """
        chatty = verbose and VERBOSE_LEVEL >= 1
        if chatty:
            print("has_effect_phrase")

        speech = [tok.pos_ for tok in y.get_tokens()]
        lowers = [tok.lower_ for tok in y.get_tokens()]

        cause = "PRON" in speech or "SCONJ" in speech
        if not cause:
            # Falls back to the (possibly empty) set of marker words found in y.
            cause = set(["therefore", "cause", "result", "so"]).intersection(lowers) # TODO: Create Long List

        order = x.start() < y.start()
        space = y.start() - x.end() - 1
        near = 0 <= space <= 1
        verdict = (cause and order) or near

        if chatty:
            print(f"Speech: {speech}")
            print(f"Cause ({bool(cause)}): {cause}")
            print(f"Order ({bool(order)}): {order}")
            print(f"Space ({bool(near)}): {space}")
            print(f"Returned: {verdict}")

        return verdict
    
    
    
    def has_cause_phrase_and_pron(self, n, verbose=False):
        """
        Tell whether `n` holds both a cause marker and a pronoun.

        The tags of `n.tokens` are inspected: at least one "ADP" or "SCONJ"
        tag and at least one "PRON" tag must be present.

        Parameters:
            n: object exposing a `tokens` collection of items with `pos_`.
            verbose: when truthy and VERBOSE_LEVEL >= 1, the tag set and the
                verdict are printed.

        Returns:
            True when both kinds of tag occur, otherwise False.
        """
        chatty = verbose and VERBOSE_LEVEL >= 1
        if chatty:
            print("has_cause_phrase_and_pron")

        speech = set([tok.pos_ for tok in n.tokens])
        has_marker = not speech.isdisjoint(("ADP", "SCONJ"))
        cause_and_pron = has_marker and "PRON" in speech

        if chatty:
            print(f"Speech: {speech}")
            print(f"Cause and Pronoun: {cause_and_pron}")
            print(f"Returned: {cause_and_pron}")

        return cause_and_pron
    
    
    
    def can_merge_intrasent(self, X, Y, verbose=False):
        """
        Decide whether event `Y` may be attached to event `X` (same sentence).

        The last node of `X` is compared with the first node of `Y`. The
        checks run in a fixed order and the first one that succeeds decides
        the answer.

        Parameters:
            X, Y: events exposing an `order` sequence of nodes.
            verbose: when truthy and VERBOSE_LEVEL >= 1, trace output is
                printed; also forwarded to every check.

        Returns:
            Tuple `(can_merge, del_end)`; callers forward `del_end` to
            `attach_event`.
        """
        chatty = verbose and VERBOSE_LEVEL >= 1
        if chatty:
            print("can_merge_intrasent")
            print(f"*X: {X}")
            print(f"*Y: {Y}")

        x = X.order[-1]
        y = Y.order[0]

        if chatty:
            print(f"x: {x}")
            print(f"y: {y}")

        # Shared species, or failing that shared non-species nouns.
        if self.overlap_in_species(x, y, verbose=verbose):
            return (True, True)
        if self.overlap_in_tokens(x, y, verbose=verbose):
            return (True, True)

        # Example:
        # "By the ..., X did Y."
        if len(X.order) == 1 and self.has_cause_phrase(x, y, verbose=verbose):
            return (True, False)

        # Example:
        # "..., therefore X did Y"
        if len(Y.order) == 3 and self.has_effect_phrase(x, y, verbose=verbose):
            return (True, True)

        # Example:
        # "..., therefore X increased"
        # NOTE(review): the whole event `Y` (not its first node `y`) is
        # passed here, unlike the branch above -- confirm this is intended.
        if len(Y.order) == 2 and self.has_effect_phrase(x, Y, verbose=verbose):
            return (True, False)

        return (False, False)
        
    
    
    def can_merge_intersent(self, X, Y, verbose=False):
        """
        Decide whether event `Y` may be attached to event `X` (across sentences).

        The last node of `X` is compared with the first node of `Y`. The
        checks run in a fixed order and the first one that succeeds decides
        the answer.

        Parameters:
            X, Y: events exposing an `order` sequence of nodes.
            verbose: when truthy and VERBOSE_LEVEL >= 1, trace output is
                printed; also forwarded to every check.

        Returns:
            Tuple `(can_merge, del_end)`; callers forward `del_end` to
            `attach_event`.
        """
        chatty = verbose and VERBOSE_LEVEL >= 1
        if chatty:
            print("can_merge_intersent")
            print(f"*X: {X}")
            print(f"*Y: {Y}")

        x = X.order[-1]
        y = Y.order[0]

        if chatty:
            print(f"x: {x}")
            print(f"y: {y}")

        # Shared species, or failing that shared non-species nouns.
        if self.overlap_in_species(x, y, verbose=verbose):
            return (True, True)
        if self.overlap_in_tokens(x, y, verbose=verbose):
            return (True, True)

        # Otherwise the first node of Y must carry a cause marker and a pronoun.
        if self.has_cause_phrase_and_pron(y, verbose=verbose):
            return (True, False)

        return (False, False)
    
    
    
    def intrasent_sequences(self, indices_1D):
        """
        Enumerate the orders in which the given indices can be merged.

        Every non-empty subset produced by `power_set` is expanded into all
        of its permutations.

        Parameters:
            indices_1D: list of indices, one per event of a sentence.

        Returns:
            list of tuples, each tuple being one merge order.
        """
        subsets = power_set(indices_1D, current=[], result=[])
        return [
            ordering
            for subset in subsets
            if subset  # the empty subset yields nothing to merge
            for ordering in permutations(subset)
        ]


    
    def intersent_sequences(self, indices_2D):
        """
        Enumerate the merge orders across sentences.

        Thin wrapper that delegates to `interleave`, starting from an empty
        current sequence, an empty used-list and an empty result list.

        Parameters:
            indices_2D: list of lists of indices, one inner list per sentence.

        Returns:
            The list produced by `interleave`.
        """
        return interleave(indices_2D, current=[], used=[], result=[])
    
    
    
    def merge_intrasent(self, sent_events, verbose=False):
        """
        Fold a sequence of same-sentence events into one event, left to right.

        The second event is repeatedly attached to the first (via
        `attach_event`) as long as `can_merge_intrasent` allows it. The list
        is consumed in place, so callers should pass a list (and events)
        they no longer need.

        Parameters:
            sent_events: list of events, in the order they should be merged.
            verbose: forwarded to `can_merge_intrasent`.

        Returns:
            The first event with all others attached, or None when the list
            is empty, when any adjacent pair cannot be merged, or when the
            result has two or fewer entries in `order`.
        """
        if not sent_events:
            return None

        while len(sent_events) > 1:
            X = sent_events[0]
            Y = sent_events[1]

            can_merge, del_end = self.can_merge_intrasent(X, Y, verbose=verbose)
            if not can_merge:
                return None

            X.attach_event(Y, del_end=del_end)
            # Drop the event that was just attached. A bare pop() would
            # discard the LAST event instead, so with three or more events
            # the second one got attached again and the tail was lost.
            sent_events.pop(1)

        if len(sent_events[0].order) <= 2:
            return None
        return sent_events[0]
    
    
    
    def merge_intersent(self, events, verbose=False):
        """
        Fold a sequence of events from different sentences into one event.

        The second event is repeatedly attached to the first (via
        `attach_event`) as long as `can_merge_intersent` allows it. The list
        is consumed in place, so callers should pass a list (and events)
        they no longer need.

        Parameters:
            events: list of events, in the order they should be merged.
            verbose: forwarded to `can_merge_intersent`.

        Returns:
            The first event with all others attached, or None when the list
            is empty or any adjacent pair cannot be merged.
        """
        if not events:
            return None

        while len(events) > 1:
            X = events[0]
            Y = events[1]

            can_merge, del_end = self.can_merge_intersent(X, Y, verbose=verbose)
            if not can_merge:
                return None

            X.attach_event(Y, del_end=del_end)
            # Drop the event that was just attached. A bare pop() would
            # discard the LAST event instead, so with three or more events
            # the second one got attached again and the tail was lost.
            events.pop(1)

        return events[0]
    
    
    
    def load_events(self, verbose=False):
        """
        Build the final list of merged events for the whole document.

        Stage 1 works sentence by sentence: every merge order proposed by
        `intrasent_sequences` is tried on copies of the sentence's events,
        and each order `merge_intrasent` accepts contributes one event.
        Stage 2 numbers the surviving events, asks `intersent_sequences` for
        the cross-sentence merge orders and keeps every order
        `merge_intersent` accepts.

        Parameters:
            verbose: when truthy, progress is printed depending on
                VERBOSE_LEVEL; also forwarded to the helpers.

        Returns:
            list of merged events.
        """
        if verbose and VERBOSE_LEVEL >= 1:
            print("load_events")

        found = self.find_events(verbose=verbose)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Events Found:")
            for group in found.values():
                for found_event in group:
                    print(f"*{found_event}")

        # ---- Stage 1: merge within each sentence --------------------------
        # The order of merging matters (E1 + E2 may differ from E2 + E1), so
        # every ordered, non-empty selection of a sentence's events is tried.
        # For three events that is 15 orders: E1, E2, E3, E1 + E2, E2 + E1,
        # and so on. Events are addressed by their position in the sentence's
        # list to keep the bookkeeping simple.
        per_sentence = []

        for candidates in found.values():
            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tMerging Sentence Events ({len(candidates)})")

            positions = list(range(len(candidates)))

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tEvent Indices (INTRA): {positions}")

            orderings = self.intrasent_sequences(positions)

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tMerge Sequences: {orderings}")

            kept = []
            per_sentence.append(kept)

            for ordering in orderings:
                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tSequence: {ordering}")

                # Work on copies so a failed attempt cannot disturb the originals.
                attempt = [candidates[pos].copy() for pos in ordering]
                outcome = self.merge_intrasent(attempt, verbose=verbose)

                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tMerged Event: {outcome}")

                if outcome:
                    kept.append(outcome)

        # ---- Stage 2: merge across sentences ------------------------------
        # Events of the same sentence must not be merged with each other
        # again, so the sequence generator needs to know which sentence each
        # event came from. Every surviving event therefore gets one running
        # number, and the numbers are grouped per sentence (a 2D list) while
        # a flat lookup maps each number back to its event.
        index_to_event = {}
        grouped_ids = []

        for kept in per_sentence:
            first_id = len(index_to_event)
            ids = list(range(first_id, first_id + len(kept)))
            grouped_ids.append(ids)
            index_to_event.update(zip(ids, kept))

        if verbose and VERBOSE_LEVEL >= 2:
            print(f"\tEvent Indices (INTER): {grouped_ids}")

        results = []
        orderings = self.intersent_sequences(grouped_ids)

        if verbose and VERBOSE_LEVEL >= 2:
            print(f"\tMerge Sequences: {orderings}")

        for ordering in orderings:
            if verbose and VERBOSE_LEVEL >= 3:
                print(f"\t\tSequence: {ordering}")

            attempt = [index_to_event[event_id].copy() for event_id in ordering]
            outcome = self.merge_intersent(attempt, verbose=verbose)

            if verbose and VERBOSE_LEVEL >= 3:
                print(f"\t\tMerged Event: {outcome}")

            if outcome:
                results.append(outcome)

        return results
    
    
    
    def update(self, verbose=False):
        """
        Recompute the events and store them on `self.events`.

        Parameters:
            verbose: forwarded to `load_events`.
        """
        loaded = self.load_events(verbose=verbose)
        self.events = loaded

In [78]:
# main = Main()

In [79]:
# text = "The dog caused the cat to cry."
# main.update_text(text)

In [80]:
# Build an event manager around `main` and run the full extraction with
# verbose tracing switched on.
# NOTE(review): `main` is not defined in this cell, and the two cells just
# above that would create it and set its text are commented out --
# presumably it comes from %run "./Main.ipynb" or from earlier kernel state;
# confirm this cell still works after Restart Kernel -> Run All.
event_manager = EventManager(main)
event_manager.update(verbose=True)

# Print every merged event that was found.
for event in event_manager.events:
    print(event)

load_events
find_events
Finding Events in Tokens: [The, dog, caused, the, cat]
convert_tokens
Tokens: [The, dog, caused, the, cat]
Speech: ['DET', 'NOUN', 'VERB', 'DET', 'NOUN']
Number Verbs: 1
split_tokens
l: 0
find_verb_tokens
Tokens: [The, dog, caused, the, cat]
	Token (DET): The
	Next Token (NOUN): dog
		Not a Verb, Continue
	Token (NOUN): dog
	Next Token (VERB): caused
		Not a Verb, Continue
	Token (VERB): caused
	Next Token (DET): the
	Token (DET): the
	Next Token (NOUN): cat
		Not a Verb, Continue
	Token (NOUN): cat
	Next Token (PART): to
		Not a Verb, Continue
Returned: [caused]
v_l: 2
v_r: 2
i: 4
No Noun R: False
No Noun L: False
Returned: ((0, 2, 2, 2), (False, False))
Split: (0, 2, 2, 2)
push_bound_l
push_bound
Units: [<__main__.Unit object at 0x0000025C30943640>]
Checking Unit: .
*Unit Start: 0
*Token Unit Start: 0
Checking Unit: to cry
*Unit Start: 0
*Token Unit Start: 0
Checking Unit: The dog caused the cat
*Unit Start: 0
*Token Unit Start: 0
Starting Unit: The dog caused