In [1]:
class Entity:
    """A labelled, inclusive token range ``[l, r]`` over ``doc``.

    ``doc`` is indexed and sliced directly, and a slice is expected to expose
    ``.text`` (presumably a spaCy ``Doc`` -- confirm against the caller).
    ``label`` is always stored as a list of the integer codes below, and
    ``children`` holds nested entities.
    """

    # Labels
    LIST = 1
    ITEM = 2
    QUOTE = 3
    BREAK = 4
    END = 5
    AND_OR_END = 6
    COLON = 7
    COLON_BREAK = 8
    I_CLAUSE = 9
    D_CLAUSE = 10
    P_PHRASE = 11
    BRACKETS = 12
    FRAGMENT = 13
    CONJ = 14

    # Display name for every label code, in the order label_() reports them.
    _LABEL_NAMES = (
        (LIST, "List"),
        (ITEM, "Item"),
        (QUOTE, "Quote"),
        (BREAK, "Break"),
        (END, "End"),
        (AND_OR_END, "And or End"),
        (COLON, "Colon"),
        (COLON_BREAK, "Colon Break"),
        (I_CLAUSE, "Independent Clause"),
        (D_CLAUSE, "Dependent Clause"),
        (P_PHRASE, "Prepositional Phrase"),
        (BRACKETS, "Brackets"),
        (FRAGMENT, "Fragment"),
        (CONJ, "Conjunction"),
    )

    def __init__(self, doc, label=None, l=None, r=None, children=None):
        self.doc = doc
        # Normalise the label argument: nothing -> [], list -> kept as the
        # same list object, single code -> one-element list.
        if not label:
            self.label = []
        elif isinstance(label, list):
            self.label = label
        else:
            self.label = [label]
        self.l = l
        self.r = r
        self.children = children if children else []

    def label_(self):
        """Return the entity's label names joined by ", ", or "None"."""
        names = [name for code, name in Entity._LABEL_NAMES if code in self.label]
        return ", ".join(names) if names else "None"

    def size(self):
        """Number of tokens covered by the inclusive range."""
        return self.r - self.l + 1

    def span(self):
        """Slice of ``doc`` covering the entity."""
        return self.doc[self.l:self.r + 1]

    def lower(self):
        """Lower-cased text of the entity's span."""
        return self.span().text.lower()

    def start(self):
        """First token of the entity."""
        return self.doc[self.l]

    def end(self):
        """Last token of the entity."""
        return self.doc[self.r]

    def label_has(self, labels):
        """Return the set of codes shared by this entity and ``labels``."""
        return set(self.label).intersection(labels)

    @staticmethod
    def tokens(*, ent=None, ents=None):
        """Return the tokens of ``ents`` (sorted by ``.i``) or of ``ent``.

        ``ents`` wins when it is non-empty; ``None`` is returned when neither
        argument is usable. ``flatten`` is a helper defined elsewhere.
        """
        if ents:
            merged = flatten([list(each.span()) for each in ents])
            return sorted(merged, key=lambda tok: tok.i)
        if ent:
            return list(ent.span())
        return None

    @staticmethod
    def is_conjunction(token):
        """True when the token's lower-cased text is "and" or "or"."""
        return token.lower_ in ("and", "or")

    @staticmethod
    def same_speech(speech_1, speech_2):
        """True when both tags are noun-like, or when they are equal."""
        nounish = ("NOUN", "PRON", "PROPN")
        both_nounish = speech_1 in nounish and speech_2 in nounish
        return both_nounish or speech_1 == speech_2

    @staticmethod
    def same_speech_list(speech_1, speech_2_list):
        """True when ``speech_1`` matches any tag in ``speech_2_list``."""
        return any(
            Entity.same_speech(speech_1, candidate)
            for candidate in speech_2_list
        )

    def __str__(self):
        return self.span().text

In [2]:
class Quotes:
    """Merges every run of entities enclosed in double quotes into one entity.

    The opening quote entity receives the ``Entity.QUOTE`` label and its
    right bound is extended over everything up to and including the closing
    quote; the absorbed entities are removed from the list.
    """

    def __init__(self, main, entities):
        self.main = main
        # Held by reference: identify() mutates the list that was passed in.
        self.entities = entities

    def is_quote(self, i):
        """True when ``i`` is in range and entity ``i`` is exactly ``"``."""
        return i < len(self.entities) and self.entities[i].lower() == "\""

    def identify(self):
        """Label and merge quoted runs in place; return the entity list.

        A quote that is never closed absorbs every remaining entity instead
        of indexing past the end of the list.
        """
        i = 0

        while i < len(self.entities):
            if not self.is_quote(i):
                i += 1
                continue

            opener = self.entities[i]
            opener.label.append(Entity.QUOTE)

            # Absorb followers one by one; stop after the closing quote or
            # when the list is exhausted (unclosed quote).
            while i + 1 < len(self.entities):
                closing = self.is_quote(i + 1)
                absorbed = self.entities.pop(i + 1)
                opener.r = absorbed.r
                if closing:
                    break

            # Always move on, so a lone trailing quote cannot be revisited.
            i += 1

        return self.entities

In [3]:
class Brackets:
    """Merges bracketed runs of entities into the outermost opening entity.

    An entity counts as an opener or closer by the first character of its
    lower-cased text. The em dash is both, so it is tested as a closer
    first whenever a bracket is already open.
    """

    MATCHES = {
        "[": "]", 
        "(": ")",
        "—": "—",
    }

    OPENING = MATCHES.keys()
    CLOSING = MATCHES.values()

    def __init__(self, main, entities):
        self.main = main
        # Indices (into self.entities) of the currently open brackets.
        self.stack = []
        self.entities = [*entities]

    def is_opening(self, i):
        """True when ``i`` is in range and entity ``i`` starts with an opener."""
        return i < len(self.entities) and self.entities[i].lower()[0] in Brackets.OPENING

    def is_closing(self, i):
        """True when ``i`` is in range and entity ``i`` starts with a closer."""
        return i < len(self.entities) and self.entities[i].lower()[0] in Brackets.CLOSING

    def closes(self, i):
        """True when entity ``i`` is the closer matching the innermost opener.

        Requires a non-empty stack.
        """
        opener = self.entities[self.stack[-1]].lower()[0]
        closer = self.entities[i].lower()[0]
        return Brackets.MATCHES[opener] == closer

    def identify(self):
        """Label and merge bracketed runs; return the entity list."""
        self.stack = []

        i = 0
        while i < len(self.entities):
            # Closing
            if self.is_closing(i) and self.stack:
                # A mismatched closer leaves the stack untouched.
                j = self.stack.pop() if self.closes(i) else None

                if j is not None and not self.stack:
                    # The outermost bracket just closed: extend the opener
                    # over the closer and drop the closer. The cursor stays,
                    # because the list shifted left.
                    self.entities[j].r = self.entities[i].r
                    self.entities.pop(i)
                else:
                    i += 1

            # Opening
            elif self.is_opening(i):
                # Only the outermost opener is labelled.
                if not self.stack:
                    self.entities[i].label.append(Entity.BRACKETS)
                self.stack.append(i)
                i += 1

            # Consuming
            elif self.stack:
                # If you're at the end of the possible entities,
                # and the list is unclosed, we must stop.
                if i + 1 >= len(self.entities):
                    break
                self.entities[self.stack[0]].r = self.entities[i+1].r
                self.entities.pop(i)

            else:
                i += 1

        return self.entities

In [4]:
class Separators:
    """Labels ``;`` / ``,`` entities as breaks or ends, and marks conjunctions."""

    def __init__(self, main, entities):
        self.main = main
        self.entities = list(entities)

    def is_break(self, i):
        """True when entity ``i`` is ``;`` or ``,`` with no conjunction after it.

        A separator directly followed by a one-token ``CCONJ`` entity is an
        end, not a break.
        """
        total = len(self.entities)
        if i >= total:
            return False

        if self.entities[i].lower() not in (";", ","):
            return False

        if i + 1 >= total:
            return True

        follower = self.entities[i + 1]
        return not (follower.size() == 1 and follower.span()[0].pos_ == "CCONJ")

    def is_end(self, i):
        """True when entity ``i`` is ``;`` or ``,`` and is not a break."""
        in_range = i < len(self.entities)
        if not in_range or self.entities[i].lower() not in (";", ","):
            return False
        return not self.is_break(i)

    def identify(self):
        """Apply BREAK / END / AND_OR_END / CONJ labels; return the entities.

        An end absorbs the conjunction that follows it.
        """
        idx = 0

        while idx < len(self.entities):
            current = self.entities[idx]

            if self.is_break(idx):
                current.label.append(Entity.BREAK)

            elif self.is_end(idx):
                word = self.entities[idx + 1].start().lower_
                tag = Entity.AND_OR_END if word in ("and", "or") else Entity.END
                current.label.append(tag)

                # Swallow the conjunction and look at the merged entity
                # again on the next pass (the cursor does not move).
                current.r += 1
                self.entities.pop(idx + 1)
                continue

            elif current.start().pos_ == "CCONJ":
                current.label.append(Entity.CONJ)

            idx += 1

        return self.entities

In [5]:
class Colons:
    """Splits the entity list at the first entity whose text ends in a colon."""

    def __init__(self, main, entities):
        self.main = main
        self.entities = list(entities)

    def identify(self):
        """Label the first colon entity and fold everything after it.

        The colon entity gets ``COLON_BREAK`` if it has no label yet. The
        entity right after it gets ``COLON``, is stretched to the right
        bound of the last entity, and every later entity is dropped. Only
        the first colon is handled.
        """
        hit = next(
            (pos for pos, ent in enumerate(self.entities) if ent.lower()[-1] == ":"),
            None,
        )
        if hit is None:
            return self.entities

        head = self.entities[hit]
        if not head.label:
            head.label.append(Entity.COLON_BREAK)

        if hit + 1 < len(self.entities):
            tail = self.entities[hit + 1]
            tail.label.append(Entity.COLON)
            tail.r = self.entities[-1].r
            self.entities = self.entities[:hit + 2]

        return self.entities

In [6]:
class Independent_Clauses:
    """Grows independent clauses out of entities carrying an allowed label."""

    def __init__(self, main, entities):
        self.main = main
        self.entities = [*entities]
        # Labels that may start (and therefore also terminate) a clause.
        self.allowed = []

    def end(self, i):
        """True when the clause being built must stop before entity ``i``."""
        if i >= len(self.entities):
            return True

        if self.entities[i].label_has(self.allowed):
            return True

        # Here, we check if the entity after
        # the supposed end is a clause. If it
        # is, then we can end at the current entity.
        return bool(
            i + 1 < len(self.entities) and 
            self.entities[i+1].label_has([
                Entity.COLON,
                Entity.COLON_BREAK,
                Entity.I_CLAUSE,
                Entity.D_CLAUSE,
                Entity.P_PHRASE
            ])
        )

    def identify(self, allowed):
        """Label and merge independent clauses; return the entity list.

        Every entity that has one of ``allowed`` and is not already a clause
        or phrase becomes an ``I_CLAUSE`` and absorbs the following entities
        until ``end`` says stop. Absorbed bracket and quote entities are
        kept as children.
        """
        self.allowed = allowed

        i = 0

        while i < len(self.entities):
            entity = self.entities[i]

            if not entity.label_has(self.allowed):
                i += 1
                continue

            # Skip Clause
            # The cursor indexes self.entities, so an existing clause is
            # skipped by stepping to the next entity. The original read
            # `entities[i].r + 1`: an undefined name, and a token index
            # rather than an entity index.
            if entity.label_has([
                Entity.I_CLAUSE, 
                Entity.D_CLAUSE, 
                Entity.P_PHRASE
            ]):
                i += 1
                continue

            # Create Clause
            entity.label.append(Entity.I_CLAUSE)
            while not self.end(i + 1):
                absorbed = self.entities.pop(i + 1)
                entity.r = absorbed.r

                # Add Child
                if absorbed.label_has([Entity.BRACKETS, Entity.QUOTE]):
                    entity.children.append(absorbed)

            i += 1

        return self.entities

In [7]:
class Dependent_Clauses:
    """Grows dependent clauses from relative words and subordinating conjunctions."""

    RELATIVE_NOUNS = [
        "who",
        "whom",
        "which",
        "what",
        "that",
        "whose",
        "whomever",
        "whoever",
        "whichever",
        "whatever"
    ]

    def __init__(self, main, entities):
        self.main = main
        # Held by reference: identify() mutates the list that was passed in.
        self.entities = entities
        self.separator = None

    def end(self, i):
        """True when the clause being built must stop before entity ``i``."""
        if i >= len(self.entities):
            return True

        # Here, we check if the entity after
        # is a clause. As we don't combine two
        # clauses, we must end here if that is
        # the case.
        if bool(
            i + 1 < len(self.entities) and 
            self.entities[i+1].label_has([
                Entity.COLON, 
                Entity.COLON_BREAK,
                Entity.I_CLAUSE,
                Entity.D_CLAUSE,
                Entity.P_PHRASE
            ])
        ):
            return True

        # Otherwise stop at the separator or at the start of another
        # dependent clause.
        return bool(
            self.entities[i].lower()[0] == self.separator or
            self.entities[i].lower() in Dependent_Clauses.RELATIVE_NOUNS or
            self.entities[i].start().pos_ in ["SCONJ"]
        )

    def identify(self, separator):
        """Label and merge dependent clauses; return the entity list.

        An entity starts a clause when its text is one of ``RELATIVE_NOUNS``
        or its first token is tagged ``SCONJ``. It then absorbs following
        entities until ``end`` says stop, keeping absorbed bracket and quote
        entities as children.
        """
        self.separator = separator

        i = 0

        while i < len(self.entities):
            entity = self.entities[i]

            # Skip
            # The cursor indexes self.entities, so an existing clause is
            # skipped by stepping to the next entity. The original jumped
            # to `.r + 1`, a token index, which could skip unrelated
            # entities or end the scan early.
            if entity.label_has([
                Entity.COLON,
                Entity.COLON_BREAK,
                Entity.I_CLAUSE, 
                Entity.D_CLAUSE, 
                Entity.P_PHRASE
            ]):
                i += 1
                continue

            # Indicators of Dependent Clause
            rel = entity.lower() in Dependent_Clauses.RELATIVE_NOUNS
            sub = entity.start().pos_ == "SCONJ"

            if not sub and not rel:
                i += 1
                continue

            # Create Clause
            entity.label.append(Entity.D_CLAUSE)
            while not self.end(i + 1):
                absorbed = self.entities.pop(i + 1)
                entity.r = absorbed.r

                # Add Child
                if absorbed.label_has([Entity.BRACKETS, Entity.QUOTE]):
                    entity.children.append(absorbed)

            i += 1

        return self.entities

In [8]:
class Prepositional_Phrases:
    """Grows prepositional phrases from one-token ``ADP`` entities."""

    def __init__(self, main, entities):
        self.main = main
        self.entities = list(entities)

    def last_noun(self, i):
        """True when entity ``i`` is the final noun of a noun run.

        Entity ``i`` must start with a NOUN / PROPN / PRON token. It is the
        last noun when nothing follows it, or when the next entity is a
        single token that is neither noun-like nor a ``PART``.
        """
        nounish = ("NOUN", "PROPN", "PRON")

        if i >= len(self.entities):
            return False
        if self.entities[i].start().pos_ not in nounish:
            return False

        if i + 1 >= len(self.entities):
            return True

        follower = self.entities[i + 1]
        return follower.size() == 1 and follower.start().pos_ not in (*nounish, "PART")

    def end(self, i):
        """True when the phrase being built must stop before entity ``i``."""
        # 1. End of List
        if i + 1 >= len(self.entities):
            return True

        # 2. Clause
        if self.entities[i + 1].label_has([
            Entity.COLON,
            Entity.COLON_BREAK,
            Entity.I_CLAUSE,
            Entity.D_CLAUSE,
            Entity.P_PHRASE
        ]):
            return True

        # 3. Noun
        return self.last_noun(i)

    def identify(self):
        """Label and merge prepositional phrases; return the entity list."""
        idx = 0

        while idx < len(self.entities):
            head = self.entities[idx]

            # Only a lone adposition can open a phrase.
            if head.size() == 1 and head.start().pos_ == "ADP":
                head.label.append(Entity.P_PHRASE)

                while not self.end(idx + 1):
                    taken = self.entities.pop(idx + 1)
                    head.r = taken.r

                    # Add Child
                    if taken.label_has([Entity.BRACKETS, Entity.QUOTE]):
                        head.children.append(taken)

                # The closing noun itself belongs to the phrase.
                if self.last_noun(idx + 1):
                    head.r = self.entities.pop(idx + 1).r

            idx += 1

        return self.entities

In [46]:
class Lists:
    NOUNS = ["NOUN", "PRON", "PROPN"]
    
    def __init__(self, main, entities, enclosures):
        self.main = main
        self.entities = [*entities]
        self.separator = None
        self.enclosures = enclosures

    def is_stop(self, entity):
        """Truthy when ``entity`` terminates the last item of a list.

        Returns ``True`` for a BREAK entity whose text starts with the
        current separator; otherwise returns the (possibly empty) set of
        clause / colon labels the entity carries.
        """
        if Entity.BREAK in entity.label and entity.lower()[0] == self.separator:
            return True

        return entity.label_has([
            Entity.I_CLAUSE, 
            Entity.D_CLAUSE, 
            Entity.P_PHRASE,
            Entity.COLON,
            Entity.COLON_BREAK
        ])

    def find_lists(self, sep):
        """Locate candidate lists and pairs among ``self.entities``.

        ``sep`` is stored as ``self.separator`` and is the separator text
        that delimits items. The result is a list of candidates; each
        candidate is a list of ``[l, r]`` pairs, where ``l`` and ``r`` are
        inclusive indices into ``self.entities``. A one-item candidate is a
        pair ("X and Y") found inside an item.

        Relies on the helpers ``find_index``, ``find`` and ``find_all``,
        which are defined outside this method.
        """
        self.separator = sep
        
        # The last element is always the list currently being built;
        # [[None, None]] marks "not opened yet".
        lists = [
            [
                [None, None]
            ]
        ]

        i = 0
        while i < len(self.entities):
            entity = self.entities[i]

            opened = lists[-1][0] != [None, None]
            remove_list = entity.label_has([Entity.COLON, Entity.COLON_BREAK])
            close_list = entity.label_has([Entity.AND_OR_END]) and entity.lower()[0] == sep
            close_item = entity.label_has([Entity.BREAK]) and entity.lower() == sep
        
            # Close List
            if opened and close_list:
                # Invalid List, Remove
                if len(lists[-1]) < 2:
                    lists[-1] = [[None, None]]
                    i += 1
                    continue
                    
                # Find the L Index of Last Item
                last_item_l = i + 1

                # Find the R Index of Last Item
                last_item_r = last_item_l
                
                # `length` is the offset of the first stop entity after the
                # closing separator; -1 is treated as "no stop found".
                length = find_index(self.entities[last_item_l:], lambda e: self.is_stop(e))
                if length > 0:
                    last_item_r += length - 1
                elif length == -1:
                    last_item_r = len(self.entities) - 1

                # Add Last Item
                lists[-1].append([last_item_l, last_item_r])
                lists.append([[None, None]])
                i += 1

            # Close Item
            elif opened and close_item:
                # The new item starts empty (r < l); "Continue Item" below
                # grows r as entities are consumed.
                lists[-1].append([i + 1, i])
                i += 1
                
            # Remove List
            elif opened and remove_list:
                lists[-1] = [[None, None]]
                i += 1
            
            # Continue Item
            else:
                if not opened:
                    lists[-1][0] = [i, i]
                else:
                    lists[-1][-1][1] += 1
                i += 1
        
        # If we reach the end of the list and the last
        # list is invalid (< 3 items), we remove it.
        # (`and` binds tighter than `or`: the trailing list is also dropped
        # when no AND_OR_END with this separator occurs from its start on.)
        if bool(
            lists and len(lists[-1]) < 3 or 
            (
                lists and
                not find(self.entities[lists[-1][0][0]:], lambda e: e.label_has([Entity.AND_OR_END]) and e.lower()[0] == sep)
            )
        ):
            lists.pop()
        
        # In each item, we look for pairs (e.g. X and Y).
        # We only handle one conjunction.
        # `lists` grows while it is being iterated; the num_lists guard
        # stops the loop before the freshly appended pairs are visited.
        num_lists = len(lists)
        for i, lst in enumerate(lists):
            if i >= num_lists:
                break
            
            for l, r in lst:
                tokens = Entity.tokens(ents=self.entities[l:r+1])
                conj = find_all(tokens, lambda t: Entity.is_conjunction(t))
                if len(conj) == 1:
                    lists.append([[l, r]])
        
        # If there's no lists at all, we can take advantage
        # of lax rules.
        # Every run of entities between separators becomes a one-item
        # candidate.
        if not lists:
            lst = [[None, None]]
            i = 0
            while i < len(self.entities):
                if self.entities[i].label_has([Entity.BREAK, Entity.AND_OR_END, Entity.END]):
                    if lst != [[None, None]]:
                        lists.append(lst)
                    lst = [[None, None]]
                else:
                    if lst == [[None, None]]:
                        lst = [[i, i]]
                    else:
                        lst[-1][1] = i
                
                i += 1
            
            if lst != [[None, None]]:
                lists.append(lst)
        
        # Here we remove duplicates, I'm not sure if duplicates still
        # occur, I observed them once, but this is here in case.
        # Note: I could do a cheeky list(set(...)), at least I think.
        # NOTE(review): list(set(...)) would fail here, since the elements
        # are (unhashable) lists. The loop keeps the last occurrence.
        i = 0
        while i < len(lists):
            if lists[i] in lists[i+1:]:
                lists.pop(i)
            else:
                i += 1
        
        # Remove Invalid Lists
        i = 0
        while i < len(lists):
            # The list contains one item and that item only contains one
            # token, or the list has two items.
            if bool(
                (
                    len(lists[i]) == 1 and 
                    lists[i][0][0] == lists[i][0][1]
                ) or
                len(lists[i]) == 2
            ):
                lists.pop(i)
            else:
                i += 1
        
        return lists

    def clean_lists(self, lists):
        overlaps = []

        i = 0
        while i + 1 < len(lists):
            a = lists[i]
            b = lists[i+1]
                  
            if a[-1] != b[0]:
                i += 1
                continue

            if len(a) <= 1 or len(b) <= 1:
                i += 1
                continue

            # No Way to Split
            if a[-1][1] - a[-1][0] <= 1:
                overlaps.extend([i, i + 1])
                i += 2
            else:
                a[-1][1] = a[-1][0]
                b[0][0] = b[0][1]
                i += 2
        
        lists = [l for i, l in enumerate(lists) if i not in overlaps]
        return lists

    def expand_noun(self, tokens, start, direction):
        for group in [*self.main.sp_doc.noun_chunks, *self.main.sp_doc.ents]:
            tokens_i = [t.i for t in group]
            if tokens[start].i in tokens_i:
                while start >= 0 and start < len(tokens) and tokens[start].i in tokens_i:
                    start += 1 * direction
                start += 1 * direction * -1
                break
        
        return start
        
    def char_bound_list(self, lst):
        """Bound a list candidate by the leading text of its items.

        ``lst`` is a list of ``[l, r]`` index pairs into ``self.entities``.
        The left bound is the lower-cased text of the base item's first
        token; the right bound is decided by part of speech. Returns a LIST
        entity whose children are the start item, the end item and then the
        middle items, or ``None`` when the candidate cannot be bounded.

        NOTE(review): the retries on a shorter candidate call ``bound_list``
        rather than ``char_bound_list`` -- confirm this is intended.
        """
        # We bound each item according to characters or a speech.
        # We find these bounds from the "base item", the second to last item.
        base_tokens = Entity.tokens(ents=self.entities[lst[-2][0]:lst[-2][1]+1])
        
        # As we're bounding by characters, primarily, the left bound is just
        # the characters of the first token
        l_bound = base_tokens[0].lower_

        # The right bound is the first tag, of the below set of tags, that we
        # encounter in the base tokens. If there's not such a token, we cannot
        # bound the items.
        # (The scan runs from the last base token backwards.)
        speech = ["NOUN", "PROPN", "PRON", "VERB", "NUM"]
        r_bound = None
        for i in range(len(base_tokens) - 1, -1, -1):
            if base_tokens[i].pos_ in speech:
                r_bound = base_tokens[i]
                break

        if not r_bound:
            return None

        # The inner items are already bounded on the left and right sides.
        # All we need to check is whether the start matches with the left bound.
        # (The base item itself is excluded: it defines the bound.)
        inner_items = lst[1:-2]

        for i, item in enumerate(inner_items):
            l = item[0]
            r = item[1]
            
            tokens = Entity.tokens(ents=self.entities[l:r+1])

            # If it doesn't match, we check if the next set of items can be
            # bounded. If not, we cannot bound the list.
            if tokens[0].lower_ != l_bound:
                if len(inner_items) - i - 1 >= 2:
                    return self.bound_list(lst[i+2:])
                return None
            
        # Check for L Bound in Starting Item
        # (Scanned right to left, so the match nearest the list wins.)
        start_tokens = Entity.tokens(ents=self.entities[lst[0][0]:lst[0][1]+1])
        start_l = len(start_tokens) - 1
        while start_l >= 0 and start_tokens[start_l].lower_ != l_bound:
            start_l -= 1

        # L Bound Not Found
        if start_l < 0:
            # If the list is greater than 4 items, we can
            # cut off the starting item, and try again.
            if len(inner_items) >= 2:
                return self.bound_list(lst[1:])
            return None

        # If the first of the start tokens is a noun, there may be more
        # to include.
        if start_tokens[start_l].pos_ in Lists.NOUNS:
            start_l = self.expand_noun(start_tokens, start_l, -1)
                    
        # Check for R Bound in Ending Item
        # (The first token whose tag is in `speech` closes the last item.)
        end_tokens = Entity.tokens(ents=self.entities[lst[-1][0]:lst[-1][1]+1])
        end_r = 0
        num_end_tokens = len(end_tokens)
        while end_r < num_end_tokens and end_tokens[end_r].pos_ not in speech:
            end_r += 1

        if end_r >= num_end_tokens:
            return None

        # If the last of the end tokens is a noun, there may be more
        # to include.
        if end_tokens[end_r].pos_ in Lists.NOUNS:
            end_r = self.expand_noun(end_tokens, end_r, 1)
        
        # Create List
        entity_start_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=start_tokens[start_l].i, r=start_tokens[-1].i)
        entity_end_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=end_tokens[0].i, r=end_tokens[end_r].i)
        
        entity_list = Entity(self.main.sp_doc, label=Entity.LIST, l=start_tokens[start_l].i, r=end_tokens[end_r].i)
        entity_list.children.extend([entity_start_item, entity_end_item])
        
        # The middle items keep their full extent.
        for item in lst[1:-1]:
            tokens = Entity.tokens(ents=self.entities[item[0]:item[1]+1])
            entity_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[0].i, r=tokens[-1].i)
            entity_list.children.append(entity_item)

        return entity_list

    def char_bound_pair(self, pair):
        """Bound a two-item "X and Y" pair around its conjunction.

        ``pair`` is a one-item candidate ``[[l, r]]`` indexing
        ``self.entities``. The left item starts at the nearest token before
        the conjunction whose text equals the token right after it; the
        right item ends at the first token after the conjunction whose part
        of speech matches the token right before it. Returns a LIST entity
        with two ITEM children, or ``None`` when either bound is missing or
        the conjunction has no token on one of its sides.
        """
        tokens = Entity.tokens(ents=self.entities[pair[0][0]:pair[0][1]+1])
        tokens = sorted(tokens, key=lambda t: t.i)
        num_tokens = len(tokens)

        # Position of the conjunction; find_index (defined elsewhere)
        # presumably returns -1 when there is none.
        m = find_index(tokens, Entity.is_conjunction)

        # Both bounds read the tokens adjacent to the conjunction, so it
        # needs a neighbour on each side.
        if m is None or m <= 0 or m + 1 >= num_tokens:
            return None

        # Bound L by R Token Characters
        i = m - 1
        while i >= 0 and tokens[i].lower_ != tokens[m + 1].lower_:
            i -= 1

        if i < 0:
            return None

        # Bound R by L Token Speech
        j = m + 1
        while j < num_tokens and not Entity.same_speech(tokens[m - 1].pos_, tokens[j].pos_):
            j += 1

        if j >= num_tokens:
            return None

        e_item_l = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[i].i, r=tokens[m - 1].i)
        e_item_r = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[m + 1].i, r=tokens[j].i)
        e_list = Entity(self.main.sp_doc, label=Entity.LIST, l=tokens[i].i, r=tokens[j].i, children=[e_item_l, e_item_r])
        return e_list
    
    def _speech_bound(self, tokens, start, step):
        """Scan ``tokens`` from ``start`` in direction ``step`` for a bound.

        Returns ``[tag]`` for the first noun / pronoun / verb met, or
        ``[adjective_tag, tag]`` when an adjective-like token comes first
        (the second tag is omitted if no noun / verb follows in the scan
        direction). Returns ``[]`` when nothing qualifies.
        """
        speech = ("NOUN", "PROPN", "PRON", "VERB")
        adjectives = ("ADJ", "ADV", "NUM")

        i = start
        while 0 <= i < len(tokens):
            pos = tokens[i].pos_
            if pos in speech:
                return [pos]
            if pos in adjectives:
                bound = [pos]
                j = i + step
                while 0 <= j < len(tokens):
                    if tokens[j].pos_ in speech:
                        bound.append(tokens[j].pos_)
                        break
                    j += step
                return bound
            i += step

        return []

    def bound_list(self, lst):
        """Bound a list candidate by part of speech.

        ``lst`` is a list of ``[l, r]`` index pairs into ``self.entities``.
        The bounds come from the second-to-last ("base") item, which is
        already delimited on both sides. Returns a LIST entity whose
        children are the start item, the end item and then the middle
        items, or ``None`` when the candidate cannot be bounded. When an
        early item does not fit, the method retries on a shorter candidate.
        """
        # Without a base item there is nothing to take bounds from.
        if len(lst) < 2:
            return None

        # Base Item (2nd to Last Item) Tokens
        base_tokens = Entity.tokens(ents=self.entities[lst[-2][0]:lst[-2][1]+1])

        # Find L Bound (scanning the base item left to right)
        l_bound = self._speech_bound(base_tokens, 0, 1)
        if not l_bound:
            return None

        # Find R Bound (scanning the base item right to left)
        r_bound = self._speech_bound(base_tokens, len(base_tokens) - 1, -1)
        if not r_bound:
            return None

        # Check Inner Items
        # The inner items must have the left bound,
        # the right bound isn't as important.
        inner_items = lst[1:-1]

        verb_seen = False
        for i, item in enumerate(inner_items):
            item_tokens = Entity.tokens(ents=self.entities[item[0]:item[1]+1])
            item_speech = [token.pos_ for token in item_tokens]

            # Must be Homogeneous
            if "VERB" not in item_speech and verb_seen:
                if len(inner_items) >= 2:
                    return self.bound_list(lst[1:])
                return None
            elif "VERB" in item_speech:
                verb_seen = True

            # Not Found
            if not set(l_bound).intersection(item_speech):
                # We check if the list starting at the next
                # item has a chance. If it does, that becomes
                # the list. The remainder must keep at least three
                # items; the original test (`- i + 1`) was always true,
                # so it could recurse on a one-item list and crash.
                if len(inner_items) - i - 1 >= 2:
                    return self.bound_list(lst[i+2:])
                return None

        # Check Starting Item
        start_tokens = Entity.tokens(ents=self.entities[lst[0][0]:lst[0][1]+1])
        start_l = len(start_tokens) - 1

        while start_l >= 0 and not Entity.same_speech_list(start_tokens[start_l].pos_, l_bound):
            start_l -= 1

        if start_l < 0:
            if len(inner_items) >= 2:
                return self.bound_list(lst[1:])
            return None

        # Adjust Starting Item
        if set(l_bound).intersection(Lists.NOUNS):
            start_l = self.expand_noun(start_tokens, start_l, -1)

        # Check Ending Item
        end_tokens = Entity.tokens(ents=self.entities[lst[-1][0]:lst[-1][1]+1])
        end_r = 0
        num_end_tokens = len(end_tokens)

        while end_r < num_end_tokens and not Entity.same_speech_list(end_tokens[end_r].pos_, r_bound):
            end_r += 1

        if end_r >= num_end_tokens:
            return None

        # Adjust Ending Item
        if set(r_bound).intersection(Lists.NOUNS):
            end_r = self.expand_noun(end_tokens, end_r, 1)

        # Create List

        # Adjusting Bounds for Start and End Entities
        l_i = start_tokens[start_l].i
        l_label = [Entity.ITEM]

        r_i = end_tokens[end_r].i
        r_label = [Entity.ITEM]

        enclosing = [Entity.BRACKETS, Entity.QUOTE]
        for ent in self.enclosures:
            if not ent.label_has(enclosing):
                continue

            l_overlap = ent.l <= start_tokens[start_l].i <= ent.r
            r_overlap = ent.l <= end_tokens[end_r].i <= ent.r

            # Left Item
            # An enclosure around only the left edge widens the left item.
            if l_overlap and not r_overlap:
                l_label.extend(list(set(ent.label) & set(enclosing)))
                l_i = min(ent.l, l_i)

            # Right Item
            # The enclosure's labels belong to the right item here; the
            # original extended l_label by mistake.
            if not l_overlap and r_overlap:
                r_label.extend(list(set(ent.label) & set(enclosing)))
                r_i = max(ent.r, r_i)

        entity_list = Entity(self.main.sp_doc, label=Entity.LIST, l=l_i, r=r_i)

        entity_start_item = Entity(self.main.sp_doc, label=l_label, l=l_i, r=start_tokens[-1].i)
        entity_end_item = Entity(self.main.sp_doc, label=r_label, l=end_tokens[0].i, r=r_i)
        entity_list.children.extend([entity_start_item, entity_end_item])

        # The middle items keep their full extent.
        for item in lst[1:-1]:
            tokens = Entity.tokens(ents=self.entities[item[0]:item[1]+1])
            entity_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[0].i, r=tokens[-1].i)
            entity_list.children.append(entity_item)

        return entity_list
    
    def bound_pair(self, pair):
        """Bound a two-item list ("X <conjunction> Y") around its conjunction.

        The entities ``self.entities[pair[0][0]:pair[0][1] + 1]`` are flattened
        to tokens, sorted by document index and split at the first token for
        which ``Entity.is_conjunction`` holds. The part-of-speech pattern found
        right of the conjunction is then searched for on its left to locate the
        left edge of the list, and vice versa for the right edge.

        Args:
            pair: Sequence whose first element holds an inclusive
                ``(start, end)`` index range into ``self.entities``.

        Returns:
            An ``Entity`` labelled ``LIST`` whose two children are the left and
            right items, or ``None`` when either side cannot be bounded.
        """
        tokens = sorted(
            Entity.tokens(ents=self.entities[pair[0][0]:pair[0][1]+1]),
            key=lambda t: t.i,
        )
        num_tokens = len(tokens)

        # Position of the conjunction inside `tokens` (m) and inside the
        # document (m_i).
        # NOTE(review): this assumes find_index always finds a conjunction;
        # what it returns when nothing matches is not visible here.
        m = find_index(tokens, Entity.is_conjunction)
        m_i = tokens[m].i

        # Speech for Bounding
        # We handle lists of the types below.
        speech = ("NOUN", "PROPN", "PRON", "VERB")
        adjectives = ("ADJ", "ADV", "NUM")

        def pos_pattern(indices):
            """Return the POS pattern of the first boundable token on `indices`."""
            for position, i in enumerate(indices):
                pos = tokens[i].pos_
                if pos in speech:
                    return [pos]
                # With adjectives, we can also add the next speech token as
                # a bound. This allows a list like "X and [ADJ] Y" to be
                # recognized.
                if pos in adjectives:
                    pattern = [pos]
                    for j in indices[position+1:]:
                        if tokens[j].pos_ in speech:
                            pattern.append(tokens[j].pos_)
                            break
                    return pattern
            return []

        # Pattern taken from the right of the conjunction; it bounds the
        # LEFT item.
        l_bound = pos_pattern(range(m + 1, num_tokens))
        if not l_bound:
            return None

        # Pattern taken from the left of the conjunction; it bounds the
        # RIGHT item.
        r_bound = pos_pattern(range(m - 1, -1, -1))
        if not r_bound:
            return None

        # Bound L Item: walk left until a token matches the right-hand pattern.
        left = m - 1
        while left >= 0 and not Entity.same_speech_list(tokens[left].pos_, l_bound):
            left -= 1
        if left < 0:
            return None

        # Adjust L if Noun
        # NOTE(review): l_bound is a list of POS strings, so this membership
        # test only succeeds if Lists.NOUNS itself contains lists -- confirm
        # against the definition of Lists.NOUNS.
        if l_bound in Lists.NOUNS:
            left = self.expand_noun(tokens, left, -1)

        # Bound R Item: walk right until a token matches the left-hand pattern.
        right = m + 1
        while right < num_tokens and not Entity.same_speech_list(tokens[right].pos_, r_bound):
            right += 1
        if right >= num_tokens:
            return None

        # Adjust R if Noun (same caveat as for the left item).
        if r_bound in Lists.NOUNS:
            right = self.expand_noun(tokens, right, 1)

        # Further Adjusting Bounds for Entities: an item that starts/ends
        # inside a bracket or quote is widened to the whole enclosure and
        # inherits the enclosure's label.
        enclosure_labels = {Entity.BRACKETS, Entity.QUOTE}
        left_anchor = tokens[left].i
        right_anchor = tokens[right].i

        l_i = left_anchor
        l_label = [Entity.ITEM]
        r_i = right_anchor
        r_label = [Entity.ITEM]

        for ent in self.enclosures:
            if not ent.label_has([Entity.BRACKETS, Entity.QUOTE]):
                continue

            l_overlap = ent.l <= left_anchor <= ent.r
            r_overlap = ent.l <= right_anchor <= ent.r

            # Left Item
            if l_overlap and not r_overlap:
                l_label.extend(set(ent.label) & enclosure_labels)
                l_i = min(ent.l, l_i)

            # Right Item -- the labels belong to the right item (they were
            # previously appended to l_label by mistake).
            if not l_overlap and r_overlap:
                r_label.extend(set(ent.label) & enclosure_labels)
                r_i = max(ent.r, r_i)

        # The items meet at the conjunction, which belongs to neither of them.
        e_item_l = Entity(self.main.sp_doc, label=l_label, l=l_i, r=m_i-1)
        e_item_r = Entity(self.main.sp_doc, label=r_label, l=m_i+1, r=r_i)
        e_list = Entity(self.main.sp_doc, label=Entity.LIST, l=l_i, r=r_i)
        e_list.children.extend([e_item_l, e_item_r])

        return e_list

    def bound_lists(self, lists):
        """Turn every candidate in ``lists`` into a bounded list entity.

        A candidate with exactly one element is handled as a pair, anything
        else as a longer list. In both cases the ``char_*`` strategy is tried
        first and the plain strategy is the fallback. Candidates for which
        neither strategy yields a result are dropped.

        Returns:
            The bounded results, in the order of their candidates.
        """
        results = []

        for candidate in lists:
            if len(candidate) == 1:
                entity = self.char_bound_pair(candidate) or self.bound_pair(candidate)
            else:
                entity = self.char_bound_list(candidate) or self.bound_list(candidate)

            if entity:
                results.append(entity)

        return results

    def merge_lists(self, bound_lists):
        """Integrate bounded list entities into ``self.entities``.

        Among overlapping lists only the widest one is kept. Each kept list
        then replaces the run of entities it covers, or -- when it starts
        inside a clause/phrase entity -- becomes a child of that entity.

        Args:
            bound_lists: Entities exposing ``l`` and ``r`` token bounds.

        Returns:
            ``self.entities`` after all kept lists have been integrated.
        """
        # Map (L, R) to Entity List; a later list with identical bounds wins.
        mapped_bounds = {(lst.l, lst.r): lst for lst in bound_lists}

        # Find Largest Coverage of Bounds
        max_coverage = []

        for bound in mapped_bounds:
            overlap = False
            for i, max_bound in enumerate(max_coverage):
                contains = max_bound[0] <= bound[0] <= max_bound[1] or max_bound[0] <= bound[1] <= max_bound[1]
                surround = bound[0] <= max_bound[0] <= bound[1] or bound[0] <= max_bound[1] <= bound[1]

                if contains or surround:
                    overlap = True

                    # Keep the wider of the two overlapping bounds.
                    if bound[1] - bound[0] > max_bound[1] - max_bound[0]:
                        max_coverage[i] = bound

            if not overlap:
                max_coverage.append(bound)

        # A bound that displaced several narrower ones now appears several
        # times; integrate each list only once (first occurrence wins, order
        # preserved) so it is not appended twice as a child.
        max_coverage = list(dict.fromkeys(max_coverage))

        # Integrate Lists
        for bound in max_coverage:
            lst = mapped_bounds[bound]

            l_overlap = None
            l_overlap_i = None

            r_overlap = None
            r_overlap_i = None

            for i, entity in enumerate(self.entities):
                # Overlap w/ Left (first entity covering the left bound)
                if l_overlap is None and entity.l <= bound[0] <= entity.r:
                    l_overlap = entity
                    l_overlap_i = i

                # Overlap w/ Right
                if entity.l <= bound[1] <= entity.r:
                    r_overlap = entity
                    r_overlap_i = i

                if l_overlap is not None and r_overlap is not None:
                    break

            # No entity covers one of the bounds: nothing to splice into, so
            # skip this list instead of failing on a None overlap.
            if l_overlap is None or r_overlap is None:
                continue

            if l_overlap.label_has([Entity.BRACKETS, Entity.QUOTE]):
                # The list replaces the covered entities and is widened to
                # include the whole enclosure it starts in.
                self.entities = self.entities[:l_overlap_i] + [lst] + self.entities[r_overlap_i+1:]

                lst.l = min(l_overlap.l, lst.l)
                lst.r = max(l_overlap.r, lst.r)

            elif l_overlap.label_has([Entity.I_CLAUSE, Entity.D_CLAUSE, Entity.P_PHRASE]):
                # Add Children: the clause/phrase absorbs the list and every
                # entity up to the list's right edge.
                l_overlap.r = max(l_overlap.r, lst.r)
                l_overlap.children.append(lst)
                self.entities = self.entities[:l_overlap_i+1] + self.entities[r_overlap_i+1:]

            else:
                self.entities = self.entities[:l_overlap_i] + [lst] + self.entities[r_overlap_i+1:]

        return self.entities
        
    def identify(self, sep):
        """Run the list pipeline for separator ``sep``.

        The stages run in order: ``find_lists`` -> ``clean_lists`` ->
        ``bound_lists`` -> ``merge_lists``. The result of the last stage
        is returned.
        """
        candidates = self.clean_lists(self.find_lists(sep))
        bounded = self.bound_lists(candidates)
        return self.merge_lists(bounded)

In [47]:
class Parts:
    """Breaks token sequences of ``main.sp_doc`` into labelled entities.

    After ``update()``, ``reg`` holds one registry per sentence. A registry
    maps ``(l, r)`` token bounds to the entity that covers them.
    """

    def __init__(self, main):
        self.main = main
        self.root = Entity(self.main.sp_doc)
        self.reg = []

    def load_registry(self, ent):
        """Flatten ``ent`` and its labelled descendants into ``{(l, r): entity}``.

        A child without a label is skipped, together with everything below it.
        """
        registry = {(ent.l, ent.r): ent}
        for child in ent.children:
            if child.label:
                registry.update(self.load_registry(child))
        return registry

    def update(self):
        """Rebuild ``self.reg`` with one registry per sentence of the document."""
        self.reg = [
            self.load_entities(list(sent))
            for sent in self.main.sp_doc.sents
        ]

    def load_entities(self, tokens, load_clauses=True):
        """Identify the entities in ``tokens`` and return them as a registry.

        Args:
            tokens: Sized iterable of tokens exposing ``.i`` (it is called
                here with a sentence's token list, a span and a whole doc).
            load_clauses: When false, the clause and prepositional-phrase
                passes are skipped; the recursive call on sub-spans uses this.

        Returns:
            Dict mapping ``(l, r)`` bounds to entities.
        """
        doc = self.main.sp_doc

        # Start with one unlabelled entity per token.
        entities = [Entity(doc, l=token.i, r=token.i) for token in tokens]

        # Enclosures
        entities = Quotes(self.main, entities).identify()
        entities = Brackets(self.main, entities).identify()

        # Enclosures get absorbed by other entities later on, but the list
        # identification still needs direct access to them.
        enclosures = [
            entity
            for entity in entities
            if entity.label_has([Entity.BRACKETS, Entity.QUOTE])
        ]

        # Find Separator: ";" as soon as any entity's text starts with one,
        # "," otherwise.
        starts_with_semicolon = any(entity.lower()[0] == ";" for entity in entities)
        sep = ";" if starts_with_semicolon else ","

        # Separators and Colons
        entities = Separators(self.main, entities).identify()
        entities = Colons(self.main, entities).identify()

        if load_clauses:
            entities = Dependent_Clauses(self.main, entities).identify(sep)
            entities = Independent_Clauses(self.main, entities).identify([Entity.END])
            entities = Prepositional_Phrases(self.main, entities).identify()

        entities = Lists(self.main, entities, enclosures).identify(sep)

        # Lists and independent clauses can both use ", [AND/OR]". Once the
        # lists are identified, every remaining ", [AND/OR]" is assumed to
        # belong to an independent clause.
        if load_clauses:
            entities = Independent_Clauses(self.main, entities).identify([Entity.AND_OR_END])

        # Merge Individual Entities: an unlabelled entity swallows the
        # unlabelled / conjunction entities that follow it and becomes a
        # fragment.
        idx = 0
        while idx < len(entities):
            head = entities[idx]
            if not head.label:
                follower_idx = idx + 1
                while follower_idx < len(entities):
                    follower = entities[follower_idx]
                    if follower.label and not follower.label_has([Entity.CONJ]):
                        break
                    del entities[follower_idx]
                    # NOTE(review): grows by one token per absorbed entity,
                    # i.e. assumes each absorbed entity is a single token.
                    head.r += 1
                head.label = [Entity.FRAGMENT]
            idx += 1

        # Return Registry: hang everything under a throw-away parent and
        # drop the parent's own (-1, -1) entry again.
        parent = Entity(doc, l=-1, r=-1, children=entities)
        registry = self.load_registry(parent)
        registry.pop((-1, -1), None)

        # Clauses and fragments that are longer than two tokens, but shorter
        # than the input, are analysed again on their own (without clause
        # detection) and their findings are merged in.
        for ent in entities:
            span = ent.span()
            is_subset = 2 < len(span) < len(tokens)
            has_content = ent.label_has([
                Entity.I_CLAUSE,
                Entity.D_CLAUSE,
                Entity.FRAGMENT,
            ])
            if not (is_subset and has_content):
                continue

            nested = self.load_entities(span, load_clauses=False)
            nested.pop((-1, -1), None)
            registry.update(nested)

        return registry

In [48]:
# import spacy
# %run "Helper.ipynb"

class Main:
    """Holds the spaCy pipeline (``sp_nlp``) and the parsed document (``sp_doc``).

    Args:
        text: Text to parse. Defaults to ``DEFAULT_TEXT`` so that ``Main()``
            behaves as before; pass e.g.
            ``"We saw a small dog, a big cat, and a bird."`` to try other input.
        model: Name of the spaCy pipeline handed to ``spacy.load``.
    """

    # Sample abstract parsed when no text is supplied.
    DEFAULT_TEXT = "In simple, linear food chains, top predators can have positive indirect effects on basal resources by causing changes in the traits (e.g. behaviour, feeding rates) of intermediate consumers. Although less is known about trait - mediated indirect interactions (TMIIs) in more complex food webs, it has been suggested that such complexity dampens trophic cascades. We examined TMIIs between a predatory crab (Carcinus maenas) and two ecologically important basal resources, fucoid algae (Ascophyllum nodosum) and barnacles (Semibalanus balanoides), which are consumed by herbivorous (Littorina littorea) and carnivorous (Nucella lapillus) snails, respectively. Because crab predation risk suppresses snail feeding rates, we hypothesized that crabs would also shape direct and indirect interactions among the multiple consumers and resources. We found that the magnitude of TMIIs between the crab and each resource depended on the suite of intermediate consumers present in the food web. Carnivorous snails (Nucella) transmitted TMIIs between crabs and barnacles. However, crab – algae TMIIs were transmitted by both herbivorous (Littorina) and carnivorous (Nucella) snails, and these TMIIs were additive. By causing Nucella to consume fewer barnacles, crab predation risk allowed fucoids that had settled on or between barnacles to remain in the community. Hence, positive interactions between barnacles and algae caused crab – algae TMIIs to be strongest when both consumers were present. Studies of TMIIs in more realistic, reticulate food webs will be necessary for a more complete understanding of how predation risk shapes community dynamics."

    def __init__(self, text=None, model="en_core_web_lg"):
        self.sp_nlp = spacy.load(model)
        self.sp_doc = self.sp_nlp(self.DEFAULT_TEXT if text is None else text)

main = Main()
# Shorter inputs for quick experiments:
# main.sp_doc = main.sp_nlp("fucoid algae (Ascophyllum nodosum) and barnacles (Semibalanus balanoides), which are consumed by herbivorous (Littorina littorea) and carnivorous (Nucella lapillus) snails, respectively")
# main.sp_doc = main.sp_nlp("We examined TMIIs between a predatory crab (Carcinus maenas)")

parts = Parts(main)

# Analyse the whole document at once and print every list entity that was
# found, followed by the text of its children.
res = parts.load_entities(main.sp_doc)
list_entities = (ent for ent in res.values() if ent.label_has([Entity.LIST]))
for ent in list_entities:
    items = [child.lower() for child in ent.children]
    print(f"({ent.label_()}) {ent.lower()}")
    print(items)

(List) (carcinus maenas) and two ecologically important basal resources
['(carcinus maenas)', 'two ecologically important basal resources']
(List) (ascophyllum nodosum) and barnacles
['(ascophyllum nodosum)', 'barnacles']
(List) direct and indirect
['direct', 'indirect']
(List) (littorina) and carnivorous (nucella)
['(littorina)', 'carnivorous (nucella)']
(List) that had settled on or between barnacles to remain
['that had settled on', 'between barnacles to remain']
(List) barnacles and algae
['barnacles', 'algae']
(List) (littorina littorea) and carnivorous (nucella lapillus)
['(littorina littorea)', 'carnivorous (nucella lapillus)']
(List) consumers and resources
['consumers', 'resources']
(List) crab and each resource
['crab', 'each resource']
(List) crabs and barnacles
['crabs', 'barnacles']
