In [None]:
import spacy

In [None]:
# Load the large English spaCy pipeline; the demo cells further down call it as `nlp(...)`.
nlp = spacy.load("en_core_web_lg")
# The two pipeline components below are left disabled.
# nlp.add_pipe("merge_entities")
# nlp.add_pipe("merge_noun_chunks")

In [None]:
def flatten(arr):
    """Return the non-list leaves of ``arr`` as one flat list, in order.

    Anything that is not a ``list`` counts as a single leaf, so
    ``flatten(x)`` returns ``[x]``; tuples and other iterables are not
    unpacked.
    """
    if not isinstance(arr, list):
        return [arr]
    return [leaf for item in arr for leaf in flatten(item)]

In [None]:
def find(l, f):
    """Return the first element of ``l`` for which ``f`` is truthy, or ``None``."""
    return next((candidate for candidate in l if f(candidate)), None)

def find_all(l, f):
    """Return a list of every element of ``l`` for which ``f`` is truthy, in order."""
    return [candidate for candidate in l if f(candidate)]

def find_separator(tokens):
    """Return ``";"`` when some token's text is a semicolon, otherwise ``","``."""
    semicolon = next((token for token in tokens if token.text == ";"), None)
    return ";" if semicolon else ","

In [None]:
def index(l, f):
    """Return the position of the first element of ``l`` satisfying ``f``, or ``-1``."""
    return next((position for position, item in enumerate(l) if f(item)), -1)

In [None]:
def is_conjunction(t):
    """Return True when the lower-cased text of ``t`` (``t.lower_``) is "and" or "or"."""
    word = t.lower_
    return word == "and" or word == "or"

In [None]:
def is_noun(tokens):
    """Report whether ``tokens`` read as a noun phrase, judged by part of speech.

    Each element may be a coarse part-of-speech tag (a string such as
    ``"NOUN"``) or any object exposing that tag as ``pos_``. Previously only
    tag strings worked: an object with ``pos_`` never matched a tag and the
    function always returned False for it.

    Returns True when at least one element is tagged PROPN, NOUN or PRON and
    every element carries one of those tags or PART, NUM, ADJ, CCONJ, DET.
    Returns False otherwise, including for an empty sequence.
    """
    accept = ("PROPN", "NOUN", "PRON")
    ignore = (*accept, "PART", "NUM", "ADJ", "CCONJ", "DET")

    found = False
    for token in tokens:
        # A plain string is already the tag; anything else is asked for `pos_`.
        tag = getattr(token, "pos_", token)
        if tag not in ignore:
            return False
        found = found or tag in accept

    return found

def is_verb(tokens):
    """Report whether ``tokens`` read as a verb phrase, judged by part of speech.

    Each element may be a coarse part-of-speech tag (a string such as
    ``"VERB"``) or any object exposing that tag as ``pos_``. Previously only
    tag strings worked: an object with ``pos_`` never matched a tag and the
    function always returned False for it.

    Returns True when at least one element is tagged VERB and every element
    is tagged VERB, AUX, ADV, ADP, NOUN, PRON, PROPN, CCONJ, PART or NUM.
    Returns False otherwise, including for an empty sequence.
    """
    accept = ("VERB",)
    ignore = (*accept, "AUX", "ADV", "ADP", "NOUN", "PRON", "PROPN", "CCONJ", "PART", "NUM")

    found = False
    for token in tokens:
        # A plain string is already the tag; anything else is asked for `pos_`.
        tag = getattr(token, "pos_", token)
        if tag not in ignore:
            return False
        found = found or tag in accept

    return found

In [None]:
def bound_by_noun(tokens, noun_chunks):
    """Locate noun-based bounds inside ``tokens``.

    ``tokens`` is an indexable sequence of objects with ``pos_`` and ``i``
    (document index); ``noun_chunks`` is an iterable of spans with ``start``
    and ``end`` (end exclusive) document indices.

    Returns ``None`` for an empty ``tokens``; otherwise a tuple ``(l, r)`` of
    positions within ``tokens``:

    * ``l`` - found by scanning from the right for the last PROPN/NOUN/PRON
      token (position 0 if there is none), then moved left to the first
      token of the noun chunk containing it.
    * ``r`` - found by scanning from the left for the first PROPN/NOUN/PRON
      token (the last position if there is none), then moved right to the
      last token of the noun chunk containing it.

    Both positions are clamped to the valid range of ``tokens``.
    """
    number_tokens = len(tokens)
    if not number_tokens:
        return None

    nouns = ("PROPN", "NOUN", "PRON")
    last = number_tokens - 1
    # The chunks are walked twice, so a one-shot iterator must be materialised.
    noun_chunks = list(noun_chunks)

    # Start on the last valid position (the old code started one past it,
    # which raised IndexError on the very first comparison).
    l = last
    while l > 0 and tokens[l].pos_ not in nouns:
        l -= 1

    for chunk in noun_chunks:
        if chunk.start <= tokens[l].i < chunk.end:
            while l > 0 and tokens[l].i != chunk.start:
                l -= 1
            break

    # Stop at the last position instead of running off the end when no noun
    # exists or when the chunk extends beyond `tokens`.
    r = 0
    while r < last and tokens[r].pos_ not in nouns:
        r += 1

    for chunk in noun_chunks:
        if chunk.start <= tokens[r].i < chunk.end:
            while r < last and tokens[r].i != chunk.end - 1:
                r += 1
            break

    return (l, r)

def bound_by_verb(tokens):
    """Locate verb-based bounds inside ``tokens``.

    ``tokens`` is an indexable sequence of objects with ``pos_``.

    Returns ``None`` for an empty ``tokens``; otherwise a tuple ``(l, r)`` of
    positions within ``tokens``: ``l`` is the position of the last VERB
    (0 when there is none) and ``r`` the position of the first VERB
    (``len(tokens)`` when there is none).
    """
    number_tokens = len(tokens)
    if not number_tokens:
        return None

    # Start on the last valid position (the old code started one past it,
    # which raised IndexError on the very first comparison).
    l = number_tokens - 1
    while l > 0 and tokens[l].pos_ not in ["VERB"]:
        l -= 1

    r = 0
    while r < number_tokens and tokens[r].pos_ not in ["VERB"]:
        r += 1

    return (l, r)

In [None]:
def same_speech(speech_1, speech_2):
    """Return True when two part-of-speech tags are equal or are both noun-like.

    NOUN, PRON and PROPN are treated as interchangeable.
    """
    noun_like = ("NOUN", "PRON", "PROPN")
    both_nouns = speech_1 in noun_like and speech_2 in noun_like
    return True if both_nouns else speech_1 == speech_2

In [None]:
class Entity:
    """A labelled span of ``doc`` covering indices ``l`` through ``r`` inclusive.

    ``label`` is one of the integer constants below (or ``None``) and
    ``children`` holds nested entities.
    """

    # Label constants.
    LIST, QUOTE, CLAUSE, BRACKET, BREAK = 1, 2, 3, 4, 5
    ITEM, AND_OR_END, COLON, END, COLON_BREAK = 6, 7, 8, 9, 10

    def __init__(self, doc, label=None, l=None, r=None, children=None):
        self.doc = doc
        self.label = label
        self.l = l
        self.r = r
        # A falsy `children` (None or empty) is replaced by a fresh list.
        self.children = children if children else []

    def length(self):
        """Number of positions in the span."""
        return 1 + self.r - self.l

    def tokens(self):
        """The slice of ``doc`` covered by the span."""
        return self.doc[self.l:self.r + 1]

    def text(self):
        """The ``text`` of the covered slice."""
        return self.tokens().text

    def lower(self):
        """The text of the covered slice, lower-cased."""
        return self.text().lower()

    def first(self):
        """The element of ``doc`` at the left edge."""
        return self.doc[self.l]

    def start(self):
        """Same element as :meth:`first`."""
        return self.first()

    def end(self):
        """The element of ``doc`` at the right edge."""
        return self.doc[self.r]

In [None]:
def to_tokens(*, ent=None, ents=None):
    """Collect the tokens of one entity or of several.

    A truthy ``ents`` wins: the tokens of all its entities are returned as
    one list ordered by ``token.i``. Otherwise a truthy ``ent`` yields the
    list of its own tokens. With neither, the result is ``None``.
    """
    if ents:
        per_entity = [list(member.tokens()) for member in ents]
        return sorted(flatten(per_entity), key=lambda token: token.i)
    return list(ent.tokens()) if ent else None

In [None]:
class Quotes:
    """Merge every double-quoted stretch of entities into one QUOTE entity."""

    def __init__(self, entities):
        # The caller's list is kept as-is (not copied) and edited in place
        # by identify().
        self.entities = entities

    def is_quote(self, i):
        """Return True when entity ``i`` exists and its text is exactly ``"``."""
        return i < len(self.entities) and self.entities[i].lower() == "\""

    def identify(self):
        """Fold each quotation into its opening quote mark and return the list.

        The opening quote entity is labelled ``Entity.QUOTE`` and its right
        edge is moved over every following entity up to and including the
        closing quote; the absorbed entities are removed. A quotation that is
        never closed runs to the end of the list (this used to raise
        IndexError).
        """
        i = 0

        while i < len(self.entities):
            if not self.is_quote(i):
                i += 1
                continue

            opening = self.entities[i]
            opening.label = Entity.QUOTE

            # Swallow neighbours until the closing quote has been taken in
            # or nothing is left to swallow.
            while i + 1 < len(self.entities):
                absorbed = self.entities.pop(i + 1)
                opening.r = absorbed.r
                if absorbed.lower() == "\"":
                    break

            i += 1

        return self.entities

In [None]:
class Brackets:
    """Merge bracketed stretches of entities into single BRACKET entities."""

    # Opening character -> the character that closes it. The em dash is its
    # own closer.
    MATCHES = {
        "[": "]",
        "(": ")",
        "—": "—",
    }

    OPENING = MATCHES.keys()
    CLOSING = MATCHES.values()

    def __init__(self, entities):
        # Indices (into self.entities) of the brackets that are currently open.
        self.stack = []
        self.entities = [*entities]

    def is_opening(self, i):
        """Return True when entity ``i`` exists and its whole text is an opener."""
        return i < len(self.entities) and self.entities[i].lower() in Brackets.OPENING

    def is_closing(self, i):
        """Return True when entity ``i`` exists and its text starts with a closer."""
        return i < len(self.entities) and self.entities[i].lower()[0] in Brackets.CLOSING

    def closes(self, i):
        """Return True when entity ``i`` closes the innermost open bracket.

        Returns False when no bracket is open or ``i`` is out of range (both
        cases used to raise IndexError because the entities were indexed
        before the bounds were checked).
        """
        if not self.stack or i >= len(self.entities):
            return False

        opener = self.entities[self.stack[-1]].lower()[0]
        closer = self.entities[i].lower()[0]
        return Brackets.MATCHES[opener] == closer

    def identify(self):
        """Fold each outermost bracket pair into its opener and return the list.

        The outermost opener is labelled ``Entity.BRACKET``; while a bracket
        is open, ordinary entities are removed from the list and the
        opener's right edge is moved to the right edge of each removed
        entity. A bracket that is never closed therefore runs to the end.

        NOTE(review): a nested opener and the closer matching it are stepped
        over rather than absorbed, so they stay in the list as separate
        entities even though the outer span grows past them - confirm
        whether nesting is expected in the input.
        """
        self.stack = []

        i = 0
        while i < len(self.entities):
            if self.is_closing(i) and self.stack:
                j = self.stack.pop() if self.closes(i) else None

                if j is not None and not self.stack:
                    # Outermost bracket closed: take the closer into its span.
                    absorbed = self.entities.pop(i)
                    self.entities[j].r = absorbed.r
                else:
                    i += 1

            elif self.is_opening(i):
                if not self.stack:
                    self.entities[i].label = Entity.BRACKET
                self.stack.append(i)
                i += 1

            elif self.stack:
                # Inside a bracket: extend the outermost opener to the real
                # right edge of the absorbed entity, which may itself span
                # several tokens (e.g. a merged quotation).
                absorbed = self.entities.pop(i)
                self.entities[self.stack[0]].r = absorbed.r

            else:
                i += 1

        return self.entities

In [None]:
class Separators:
    """Label ``,`` / ``;`` entities as breaks or as conjunction-led endings."""

    def __init__(self, entities):
        self.entities = list(entities)

    def is_break(self, i):
        """Return True for a ``,``/``;`` entity not followed by a lone CCONJ token."""
        if i >= len(self.entities) or self.entities[i].lower() not in (";", ","):
            return False

        follower = i + 1
        if follower >= len(self.entities):
            return True

        neighbour = self.entities[follower]
        return not (neighbour.length() == 1 and neighbour.tokens()[0].pos_ in ("CCONJ",))

    def is_end(self, i):
        """Return True for a ``,``/``;`` entity that is followed by a lone CCONJ token."""
        in_range = i < len(self.entities)
        if in_range and self.entities[i].lower() in (";", ","):
            return not self.is_break(i)
        return False

    def identify(self):
        """Label the separators and return the entity list.

        A break gets ``Entity.BREAK``. An ending absorbs the conjunction
        entity that follows it and gets ``Entity.AND_OR_END`` when that
        conjunction is "and"/"or", else ``Entity.END``.
        """
        i = 0

        while i < len(self.entities):
            current = self.entities[i]

            if self.is_break(i):
                current.label = Entity.BREAK
                i += 1
                continue

            if not self.is_end(i):
                i += 1
                continue

            # `i` is deliberately not advanced here: the merged entity no
            # longer reads as a bare separator, so the next pass steps over it.
            conjunction = self.entities[i + 1].tokens()[0]
            if conjunction.lower_ in ("and", "or"):
                current.label = Entity.AND_OR_END
            else:
                current.label = Entity.END
            current.r += 1
            del self.entities[i + 1]

        return self.entities

In [None]:
class Colons:
    """Collapse everything after the first colon into one COLON entity."""

    def __init__(self, entities):
        self.entities = [*entities]

    def identify(self):
        """Handle the first entity whose text ends with ``:`` and return the list.

        That entity is labelled ``Entity.COLON_BREAK`` unless it already has
        a truthy label. The entity right after it is labelled
        ``Entity.COLON``, stretched to the right edge of the last entity,
        and every entity behind it is dropped. A colon with nothing after it
        is only labelled (this used to raise IndexError). Without any colon
        the list is returned unchanged.
        """
        for i, entity in enumerate(self.entities):
            if entity.lower()[-1] != ":":
                continue

            if not entity.label:
                entity.label = Entity.COLON_BREAK

            if i + 1 < len(self.entities):
                remainder = self.entities[i + 1]
                remainder.label = Entity.COLON
                remainder.r = self.entities[-1].r
                self.entities = self.entities[:i + 2]

            # Only the first colon is considered.
            break

        return self.entities

In [None]:
class Independent_Clauses:
    """Grow CLAUSE entities out of entities that carry one of the allowed labels."""

    def __init__(self, entities):
        self.entities = [*entities]
        self.allowed = []

    def end(self, i):
        """Return True when a clause being grown must stop before entity ``i``.

        That is the case when ``i`` is past the end of the list, when the
        entity after ``i`` is labelled CLAUSE or COLON, or when entity ``i``
        itself carries an allowed label.
        """
        if i >= len(self.entities):
            return True
        if i + 1 < len(self.entities) and self.entities[i+1].label in [Entity.CLAUSE, Entity.COLON]:
            return True
        if self.entities[i].label in self.allowed:
            return True
        return False

    def identify(self, allowed):
        """Turn every entity labelled with one of ``allowed`` into a clause.

        Such an entity is relabelled ``Entity.CLAUSE`` and absorbs the
        entities that follow it until :meth:`end` says stop. Entities that
        already are clauses are stepped over. Returns the entity list.
        """
        self.allowed = allowed

        i = 0

        while i < len(self.entities):
            head = self.entities[i]

            # Skip entities that cannot start a clause, and clauses that
            # already exist. The old code jumped to `entities[i].r + 1` here,
            # which read a module-level name instead of self.entities and
            # used a token index as a position in the entity list.
            if head.label not in self.allowed or head.label == Entity.CLAUSE:
                i += 1
                continue

            head.label = Entity.CLAUSE
            while not self.end(i+1):
                # Extend to the real right edge of the absorbed entity, which
                # may span several tokens.
                head.r = self.entities.pop(i+1).r

            i += 1

        return self.entities

In [None]:
class Dependent_Clauses:
    """Grow CLAUSE entities out of relative pronouns and subordinating conjunctions."""

    RELATIVE_NOUNS = [
        "who",
        "whom",
        "which",
        "what",
        "that",
        "whose",
        "whomever",
        "whoever",
        "whichever",
        "whatever"
    ]

    def __init__(self, entities):
        # The caller's list is kept as-is (not copied) and edited in place
        # by identify().
        self.entities = entities
        self.separator = None

    def end(self, i):
        """Return True when a clause being grown must stop before entity ``i``.

        That is the case when ``i`` is past the end of the list, when the
        entity after ``i`` is labelled COLON or CLAUSE, or when entity ``i``
        starts with the separator, is a relative pronoun, or begins with a
        token tagged SCONJ.
        """
        if i >= len(self.entities):
            return True
        if i + 1 < len(self.entities) and self.entities[i+1].label in [Entity.COLON, Entity.CLAUSE]:
            return True
        if self.entities[i].lower()[0] == self.separator:
            return True
        if self.entities[i].lower() in Dependent_Clauses.RELATIVE_NOUNS:
            return True
        if self.entities[i].first().pos_ in ["SCONJ"]:
            return True
        return False

    def identify(self, separator):
        """Turn each relative pronoun / SCONJ entity into a clause.

        ``separator`` is the character (``","`` or ``";"``) that ends a
        clause. A qualifying entity is labelled ``Entity.CLAUSE`` and absorbs
        the entities that follow it until :meth:`end` says stop. Entities
        already labelled COLON or CLAUSE are stepped over. Returns the
        entity list.
        """
        self.separator = separator

        i = 0
        while i < len(self.entities):
            head = self.entities[i]

            # Step over finished entities one list position at a time. The
            # old code jumped to `r + 1`, a token index, which is not a
            # position in the entity list and could skip live entities.
            if head.label in [Entity.COLON, Entity.CLAUSE]:
                i += 1
                continue

            rel = head.lower() in Dependent_Clauses.RELATIVE_NOUNS
            sub = head.first().pos_ == "SCONJ"

            if not sub and not rel:
                i += 1
                continue

            head.label = Entity.CLAUSE
            while not self.end(i+1):
                # Extend to the real right edge of the absorbed entity, which
                # may span several tokens.
                head.r = self.entities.pop(i+1).r

            i += 1

        return self.entities

In [None]:
class Prepositional_Clauses:
    """Grow CLAUSE entities out of single-token prepositions (ADP)."""

    NOUN_SPEECH = ["NOUN", "PROPN", "PRON"]

    def __init__(self, entities):
        self.entities = [*entities]

    def last_noun(self, i):
        """Return True when entity ``i`` starts with a noun that ends a noun run.

        The entity must exist and begin with a NOUN/PROPN/PRON token, and it
        must either be the last entity or be followed by a single-token
        entity that is neither noun-like nor a PART.
        """
        if i >= len(self.entities):
            return False

        if self.entities[i].first().pos_ in Prepositional_Clauses.NOUN_SPEECH:
            if bool(
                i + 1 > len(self.entities) - 1 or
                (
                    self.entities[i+1].length() == 1 and
                    self.entities[i+1].first().pos_ not in [*Prepositional_Clauses.NOUN_SPEECH, "PART"]
                )
            ):
                return True

        return False

    def end(self, i):
        """Return True when a clause being grown must stop before entity ``i``.

        That is the case when ``i`` is the last entity or beyond, when the
        entity after ``i`` is labelled COLON or CLAUSE, or when entity ``i``
        is the closing noun of a noun run (see :meth:`last_noun`).
        """
        if i + 1 >= len(self.entities):
            return True
        if self.entities[i+1].label in [Entity.COLON, Entity.CLAUSE]:
            return True
        return self.last_noun(i)

    def identify(self):
        """Turn each single-token ADP entity into a clause and return the list.

        The preposition is labelled ``Entity.CLAUSE`` and absorbs following
        entities until :meth:`end` says stop; if the entity it stopped at is
        a closing noun, that noun is absorbed as well.
        """
        i = 0

        while i < len(self.entities):
            head = self.entities[i]

            if head.length() != 1 or head.first().pos_ != "ADP":
                i += 1
                continue

            head.label = Entity.CLAUSE
            while not self.end(i+1):
                # Extend to the real right edge of the absorbed entity, which
                # may span several tokens (quotes, brackets, merged endings).
                head.r = self.entities.pop(i+1).r

            if self.last_noun(i+1):
                head.r = self.entities.pop(i+1).r

            i += 1

        return self.entities

In [None]:
class Lists:
    """Find enumerations among the entities and fold them into LIST entities.

    ``identify`` runs three passes: ``find_lists`` marks candidate lists as
    runs of ``[left, right]`` index pairs into ``self.entities`` (one pair
    per item; a candidate holding a single pair is a two-item "x and y"
    pair), ``clean_lists`` untangles neighbouring candidates that share an
    item, and ``bound_lists`` trims each candidate to its real extent and
    splices the resulting LIST entity into ``self.entities``.
    """

    NOUNS = ["NOUN", "PRON", "PROPN"]
    # Parts of speech that may anchor the edge of a list item.
    SPEECH = ["NOUN", "PROPN", "PRON", "VERB", "NUM"]

    def __init__(self, entities):
        self.entities = [*entities]
        self.separator = None

    def is_stop(self, entity):
        """Return True when ``entity`` ends the last item of a list.

        Stops are a BREAK that starts with the current separator, or any
        entity labelled CLAUSE, COLON or COLON_BREAK.
        """
        is_break = entity.label == Entity.BREAK and entity.lower()[0] == self.separator
        is_clause = entity.label in [Entity.CLAUSE, Entity.COLON, Entity.COLON_BREAK]
        return is_break or is_clause

    def find_lists(self, sep):
        """Scan the entities for candidate lists separated by ``sep``.

        Returns a list of candidates. Each candidate is a list of
        ``[left, right]`` pairs of inclusive positions in ``self.entities``;
        a candidate with one pair marks a stretch containing exactly one
        "and"/"or" (a possible two-item list).
        """
        self.separator = sep

        # The candidate under construction is always lists[-1]; a first item
        # of [None, None] means it has not been opened yet.
        lists = [
            [
                [None, None]
            ]
        ]

        i = 0
        while i < len(self.entities):
            print(lists)
            entity = self.entities[i]

            opened = lists[-1][0] != [None, None]

            kill_list = entity.label in [Entity.COLON, Entity.COLON_BREAK]
            close_list = entity.label in [Entity.AND_OR_END] and entity.lower()[0] == sep
            close_item = entity.label in [Entity.BREAK] and entity.lower() == sep

            print(f"\tOpened: {opened}")
            print(f"\tKill List: {kill_list}")
            print(f"\tClose List: {close_list}")
            print(f"\tClose Item: {close_item}")

            # Close List, Open List
            if opened and close_list:
                # Invalid List
                if len(lists[-1]) < 2:
                    lists.pop()

                # Add Last Item
                else:
                    last_item_l = i + 1
                    last_item_r = last_item_l

                    # Distance from the start of the last item to the next stop.
                    length = index(self.entities[last_item_l:], lambda e: self.is_stop(e))

                    if length > 0:
                        last_item_r += length - 1
                    elif length == -1:
                        last_item_r = len(self.entities) - 1

                    print(f"ADDING LAST ITEM: {[last_item_l, last_item_r]}")
                    lists[-1].append([last_item_l, last_item_r])

                # Close List, Open List
                lists.append([[None, None]])
                i += 1

            # Close Item, Open Item
            elif opened and close_item:
                # The new item starts empty ([i + 1, i]) and is widened by the
                # "Increment Item" branch as entities are consumed.
                lists[-1].append([i + 1, i])
                i += 1

            # Kill List, Open List
            elif opened and kill_list:
                lists.pop()
                lists.append([[None, None]])
                i += 1

            # Increment Item
            else:
                if not opened:
                    lists[-1][0] = [i, i]
                else:
                    lists[-1][-1][1] += 1
                i += 1

        print(lists)
        print("ENDDDD")

        # In case list hasn't closed
        if len(lists[-1]) < 3:
            lists.pop()

        # Looking for Two-Item Lists
        # Only the candidates present before this loop are inspected; pairs
        # appended inside it are not re-scanned.
        num_lists = len(lists)
        for list_i, list_ in enumerate(lists):
            if list_i >= num_lists:
                break

            for l, r in list_:
                tokens = flatten([list(e.tokens()) for e in self.entities[l:r+1]])
                num_conj = len(find_all(tokens, lambda t: is_conjunction(t)))
                if num_conj == 1:
                    lists.append([[l, r]])

        # Removing Duplicates
        i = 0
        while i < len(lists):
            if lists[i] in lists[i+1:]:
                lists.pop(i)
            else:
                i += 1

        # Removing Non-Lists
        i = 0
        while i < len(lists):
            if len(lists[i]) == 1 and lists[i][0][0] == lists[i][0][1]:
                lists.pop(i)
            else:
                i += 1

        return lists

    def clean_lists(self, lists):
        """Resolve neighbouring candidates whose last and first items coincide.

        When the shared item is wider than two entities it is split: the
        earlier list keeps only its left-most entity and the later list only
        its right-most one. Otherwise both candidates are dropped. The items
        are edited in place; the filtered list of candidates is returned.
        """
        print("clean_lists")
        print(lists)

        overlaps = []

        i = 0
        while i + 1 < len(lists):
            a = lists[i]
            b = lists[i+1]

            if a[-1] != b[0]:
                i += 1
                continue

            if len(a) <= 1 or len(b) <= 1:
                i += 1
                continue

            # No Way to Split
            if a[-1][1] - a[-1][0] <= 1:
                overlaps.extend([i, i + 1])
                i += 2
            else:
                a[-1][1] = a[-1][0]
                b[0][0] = b[0][1]
                i += 2

        lists = [l for i, l in enumerate(lists) if i not in overlaps]

        print(lists)

        return lists

    @staticmethod
    def _snap_left(tokens, pos, spans):
        """Move ``pos`` left to the first of ``tokens`` lying in the span that holds ``tokens[pos]``."""
        for span in spans:
            covered = {token.i for token in span}
            if tokens[pos].i in covered:
                while pos >= 0 and tokens[pos].i in covered:
                    pos -= 1
                return pos + 1
        return pos

    @staticmethod
    def _snap_right(tokens, pos, spans):
        """Move ``pos`` right to the last of ``tokens`` lying in the span that holds ``tokens[pos]``."""
        for span in spans:
            covered = {token.i for token in span}
            if tokens[pos].i in covered:
                while pos < len(tokens) and tokens[pos].i in covered:
                    pos += 1
                return pos - 1
        return pos

    def _snap_to_phrases(self, tokens, pos, snap):
        """Apply ``snap`` against the doc's noun chunks, then its named entities."""
        doc = self.entities[0].doc
        pos = snap(tokens, pos, doc.noun_chunks)
        return snap(tokens, pos, doc.ents)

    def _assemble(self, lst, start_tokens, start_l, end_tokens, end_r):
        """Build the LIST entity for ``lst`` from its trimmed first and last items.

        NOTE(review): children are appended as first item, last item, then
        the middle items - kept as found; confirm whether consumers expect
        document order.
        """
        doc = self.entities[0].doc
        left = start_tokens[start_l].i
        right = end_tokens[end_r].i

        entity_list = Entity(doc, label=Entity.LIST, l=left, r=right)
        entity_list.children.append(Entity(doc, label=Entity.ITEM, l=left, r=start_tokens[-1].i))
        entity_list.children.append(Entity(doc, label=Entity.ITEM, l=end_tokens[0].i, r=right))

        for item in lst[1:-1]:
            tokens = to_tokens(ents=self.entities[item[0]:item[1]+1])
            if not tokens:
                # An item that covers no entity contributes no child.
                continue
            entity_list.children.append(Entity(doc, label=Entity.ITEM, l=tokens[0].i, r=tokens[-1].i))

        return entity_list

    def bound_list_(self, lst):
        """Bound a list by the word its items open with.

        The second-to-last item is the reference: every inner item must
        begin with the same (lower-cased) word, the first item is trimmed to
        start at its last occurrence of that word, and the last item is
        trimmed to end at its first NOUN/PROPN/PRON/VERB/NUM token. Noun
        edges are widened to the surrounding noun chunk / named entity.
        Returns the LIST entity, or ``None`` when the list cannot be bounded
        this way.
        """
        # Left Bound
        l_bound = to_tokens(ent=self.entities[lst[-2][0]])[0]
        l_bound_text = l_bound.lower_

        # Right Bound: the reference item needs at least one anchoring token.
        b_tokens = to_tokens(ents=self.entities[lst[-2][0]:lst[-2][1]+1]) or []
        if not any(token.pos_ in Lists.SPEECH for token in b_tokens):
            return None

        # Check Inner Items for Left Bound
        inner_items = lst[1:-2]

        print("Bound Inner Items")
        print(inner_items)

        for i, item in enumerate(inner_items):
            tokens = to_tokens(ents=self.entities[item[0]:item[1]+1])

            if not tokens or tokens[0].lower_ != l_bound_text:
                # Retry on the items behind the offender if enough remain.
                if len(inner_items) - i - 1 > 1:
                    return self.bound_list_(lst[i+2:])
                return None

        # Shift Starting Item
        start_tokens = to_tokens(ents=self.entities[lst[0][0]:lst[0][1]+1]) or []
        start_l = len(start_tokens) - 1
        print(start_tokens, start_l)

        # Compare the token's text, not the token object, with the word.
        while start_l >= 0 and start_tokens[start_l].lower_ != l_bound_text:
            start_l -= 1

        if start_l < 0:
            if len(inner_items) >= 2:
                # NOTE(review): falls back to the part-of-speech based
                # bound_list here, as the original did - confirm this is
                # intended rather than bound_list_.
                return self.bound_list(lst[1:])
            return None

        # Adjust Starting Item
        if l_bound.pos_ in Lists.NOUNS:
            start_l = self._snap_to_phrases(start_tokens, start_l, Lists._snap_left)

        print(f"start_l: {start_l}")
        print(f"Starting Item: {start_tokens[start_l:]}")

        # Shift Ending Item
        end_tokens = to_tokens(ents=self.entities[lst[-1][0]:lst[-1][1]+1]) or []
        num_end_tokens = len(end_tokens)
        end_r = 0

        while end_r < num_end_tokens and end_tokens[end_r].pos_ not in Lists.SPEECH:
            end_r += 1

        if end_r >= num_end_tokens:
            return None

        # Adjust Ending Item
        if end_tokens[end_r].pos_ in Lists.NOUNS:
            end_r = self._snap_to_phrases(end_tokens, end_r, Lists._snap_right)

        print(f"end_r: {end_r}")
        print(f"Ending Item: {end_tokens[:end_r+1]}")

        return self._assemble(lst, start_tokens, start_l, end_tokens, end_r)

    def bound_list(self, lst):
        """Bound a list by part of speech.

        The second-to-last item is the reference: its first and last
        NOUN/PROPN/PRON/VERB/NUM tokens are the left and right bounds. Every
        inner item must contain the left bound's part of speech; the first
        item is trimmed to start at its last token of that part of speech
        and the last item to end at its first such token. Noun edges are
        widened to the surrounding noun chunk / named entity. Returns the
        LIST entity, or ``None`` when the list cannot be bounded.
        """
        print("Bound List")
        print(lst)

        tokens = to_tokens(ents=self.entities[lst[0][0]:lst[-1][-1]])
        print(tokens)

        # Base
        b_tokens = to_tokens(ents=self.entities[lst[-2][0]:lst[-2][1]+1]) or []

        # Bound
        anchors = [token for token in b_tokens if token.pos_ in Lists.SPEECH]
        if not anchors:
            return None
        l_bound, r_bound = anchors[0], anchors[-1]

        # Bound Inner Items
        inner_items = lst[1:-2]

        print("Bound Inner Items")
        print(inner_items)

        for i, item in enumerate(inner_items):
            tokens = to_tokens(ents=self.entities[item[0]:item[1]+1]) or []
            item_speech = [token.pos_ for token in tokens]

            if l_bound.pos_ not in item_speech:
                # Retry on the items behind the offender if enough remain.
                if len(inner_items) - i - 1 > 1:
                    return self.bound_list(lst[i+2:])
                return None

        # Shift Starting Item
        start_tokens = to_tokens(ents=self.entities[lst[0][0]:lst[0][1]+1]) or []
        start_l = len(start_tokens) - 1
        print(start_tokens, start_l)

        while start_l >= 0 and not same_speech(start_tokens[start_l].pos_, l_bound.pos_):
            start_l -= 1

        if start_l < 0:
            if len(inner_items) >= 2:
                return self.bound_list(lst[1:])
            return None

        # Adjust Starting Item
        if l_bound.pos_ in Lists.NOUNS:
            start_l = self._snap_to_phrases(start_tokens, start_l, Lists._snap_left)

        print(f"start_l: {start_l}")
        print(f"Starting Item: {start_tokens[start_l:]}")

        # Shift Ending Item
        end_tokens = to_tokens(ents=self.entities[lst[-1][0]:lst[-1][1]+1]) or []
        num_end_tokens = len(end_tokens)
        end_r = 0

        # NOTE(review): the ending item is matched against the LEFT bound's
        # part of speech, as in the original - confirm that is intended.
        while end_r < num_end_tokens and not same_speech(end_tokens[end_r].pos_, l_bound.pos_):
            end_r += 1

        if end_r >= num_end_tokens:
            return None

        # Adjust Ending Item
        if r_bound.pos_ in Lists.NOUNS:
            end_r = self._snap_to_phrases(end_tokens, end_r, Lists._snap_right)

        print(f"end_r: {end_r}")
        print(f"Ending Item: {end_tokens[:end_r+1]}")

        return self._assemble(lst, start_tokens, start_l, end_tokens, end_r)

    def bound_pair_by_chars(self, pair):
        """Bound an "x and y" pair by repeating the word that follows the conjunction.

        The left item starts at the closest token before the conjunction
        whose lower-cased text equals that of the token right after it; the
        right item ends at the first token after the conjunction sharing the
        part of speech of the token right before it. Returns the LIST entity
        with its two ITEM children, or ``None`` when no such bounds exist.
        """
        print("Bound Pair")
        print(pair)

        tokens = flatten([list(e.tokens()) for e in self.entities[pair[0][0]:pair[0][1]+1]])
        tokens = sorted(tokens, key=lambda t: t.i)
        num_tokens = len(tokens)

        print(tokens)

        m = index(tokens, lambda t: is_conjunction(t))

        print(m - 1, m, m + 1)

        # A pair needs at least one token on each side of the conjunction.
        if m < 1 or m + 1 >= num_tokens:
            return None

        # Bound Left by Right Characters
        i = m - 1
        while i >= 0 and tokens[i].lower_ != tokens[m + 1].lower_:
            i -= 1

        if i < 0:
            return None

        # Bound Right by Left Token Speech
        j = m + 1
        while j < num_tokens and not same_speech(tokens[m-1].pos_, tokens[j].pos_):
            j += 1

        if j >= num_tokens:
            return None

        doc = self.entities[0].doc

        # i, m and j are positions in `tokens`; entities are addressed by
        # document index, hence the `.i` lookups.
        item_l = Entity(doc, label=Entity.ITEM, l=tokens[i].i, r=tokens[m-1].i)
        item_r = Entity(doc, label=Entity.ITEM, l=tokens[m+1].i, r=tokens[j].i)
        list_ = Entity(doc, label=Entity.LIST, l=tokens[i].i, r=tokens[j].i, children=[item_l, item_r])

        return list_

    def bound_pair(self, pair):
        """Bound an "x and y" pair by part of speech.

        The first NOUN/PROPN/PRON/VERB/NUM token after the conjunction gives
        the part of speech the left item must start with, and the closest
        such token before the conjunction gives the part of speech the right
        item must end with. Noun edges are widened to the surrounding noun
        chunk / named entity. Returns the LIST entity with its two ITEM
        children, or ``None`` when no such bounds exist.
        """
        print("Bound Pair")
        print(pair)

        tokens = flatten([list(e.tokens()) for e in self.entities[pair[0][0]:pair[0][1]+1]])
        tokens = sorted(tokens, key=lambda t: t.i)
        num_tokens = len(tokens)

        print(tokens)

        m = index(tokens, lambda t: is_conjunction(t))
        l = m - 1
        r = m + 1

        print(l, m, r)

        # Find L Bound (taken from the right-hand side of the conjunction)
        l_bound = None
        l_bound_i = None
        for i in range(m + 1, num_tokens):
            if tokens[i].pos_ in Lists.SPEECH:
                l_bound = tokens[i].pos_
                l_bound_i = tokens[i].i
                break

        if not l_bound:
            return None

        # Find R Bound (taken from the left-hand side of the conjunction)
        r_bound = None
        r_bound_i = None
        for i in range(m - 1, -1, -1):
            if tokens[i].pos_ in Lists.SPEECH:
                r_bound = tokens[i].pos_
                r_bound_i = tokens[i].i
                break

        if not r_bound:
            return None

        # Shift L
        while l >= 0 and not same_speech(tokens[l].pos_, l_bound):
            l -= 1

        if l < 0:
            return None

        # Adjust L if Noun
        if l_bound in Lists.NOUNS:
            l = self._snap_to_phrases(tokens, l, Lists._snap_left)

        # Shift R
        while r < num_tokens and not same_speech(tokens[r].pos_, r_bound):
            r += 1

        if r >= num_tokens:
            return None

        # Adjust R if Noun
        if r_bound in Lists.NOUNS:
            r = self._snap_to_phrases(tokens, r, Lists._snap_right)

        doc = self.entities[0].doc

        # l and r are positions in `tokens`; entities are addressed by
        # document index, hence the `.i` lookups.
        left = tokens[l].i
        right = tokens[r].i

        entity_list = Entity(doc, label=Entity.LIST, l=left, r=right)

        entity_start_item = Entity(doc, label=Entity.ITEM, l=left, r=r_bound_i)
        entity_list.children.append(entity_start_item)

        entity_end_item = Entity(doc, label=Entity.ITEM, l=l_bound_i, r=right)
        entity_list.children.append(entity_end_item)

        return entity_list

    def bound_lists(self, lists):
        """Bound every candidate and splice the surviving lists into the entities.

        Each candidate is bounded by text first and by part of speech as a
        fallback; candidates that cannot be bounded are dropped. Among
        overlapping results the widest one wins. Each winner then replaces
        the entities it covers; when it starts inside a CLAUSE entity, that
        clause is cut short just before the list. Returns the entity list.
        """
        print("bound_lists")
        print(lists)
        bound_lists = []

        for lst in lists:
            if len(lst) == 1:
                print(f"Pair: {lst}")
                bound = self.bound_pair_by_chars(lst) or self.bound_pair(lst)
            else:
                print(f"List: {lst}")
                bound = self.bound_list_(lst) or self.bound_list(lst)

            print("---")
            if bound is None:
                # Neither strategy could bound this candidate.
                continue

            print(f"List: {bound.tokens()}")
            for item in bound.children:
                print(f"\tItem: {item.tokens()}")

            bound_lists.append(bound)

        # Map (L, R) to Entity List
        mapped_bounds = {}
        for lst in bound_lists:
            mapped_bounds[(lst.l, lst.r)] = lst

        bounds = list(mapped_bounds.keys())
        print(f"Bounds: {bounds}")

        max_coverage = []

        for bound in bounds:
            overlap = False
            for i, max_bound in enumerate(max_coverage):
                contains = max_bound[0] <= bound[0] <= max_bound[1] or max_bound[0] <= bound[1] <= max_bound[1]
                surround = bound[0] <= max_bound[0] <= bound[1] or bound[0] <= max_bound[1] <= bound[1]

                print(f"{bound} v. {max_bound}")
                print(f"Contains: {contains}")
                print(f"Surround: {surround}")

                if contains or surround:
                    overlap = True

                    # Keep whichever of the two covers more tokens.
                    if bound[1] - bound[0] > max_bound[1] - max_bound[0]:
                        max_coverage[i] = bound

            if not overlap:
                max_coverage.append(bound)

        print("Max Coverage")
        print(max_coverage)

        # Integrate Lists
        for bound in max_coverage:
            new_list = mapped_bounds[bound]

            # Positions of the first entities containing the list's edges.
            l_overlap_i = index(self.entities, lambda e: e.l <= bound[0] <= e.r)
            r_overlap_i = index(self.entities, lambda e: e.l <= bound[1] <= e.r)

            if l_overlap_i < 0 or r_overlap_i < 0:
                # No entity covers one of the edges: nothing to replace.
                continue

            l_overlap = self.entities[l_overlap_i]

            if l_overlap.label == Entity.CLAUSE and l_overlap.l != new_list.l:
                # The list starts inside a clause: keep the clause's head and
                # end it right before the list.
                l_overlap.r = new_list.l - 1
                keep = l_overlap_i + 1
            else:
                # The list starts exactly where the overlapped entity does,
                # so that entity is replaced (previously a clause in this
                # position always overwrote entity 0, wherever it was).
                keep = l_overlap_i

            self.entities = self.entities[:keep] + [new_list] + self.entities[r_overlap_i+1:]

        return self.entities

    def identify(self, sep):
        """Find, clean and integrate all lists separated by ``sep``; return the entities."""
        lists = self.find_lists(sep)
        lists = self.clean_lists(lists)
        lists = self.bound_lists(lists)
        return lists

In [None]:
class Sentence:
    """Wrap a parsed document and segment it into labelled entities."""

    def __init__(self, doc):
        self.doc = doc
        self.root = Entity(self.doc)

    def update(self, tokens):
        """Run the full segmentation pipeline over ``tokens`` and return the entities.

        Every token starts out as its own one-token entity; the stages then
        run in a fixed order: quotes, brackets, separators, colons,
        dependent clauses, independent clauses, prepositional clauses and
        lists.
        """
        entities = [Entity(self.doc, l=token.i, r=token.i) for token in tokens]

        for stage in (Quotes, Brackets, Separators):
            entities = stage(entities).identify()

        # A semicolon anywhere makes ";" the list separator; otherwise ",".
        has_semicolon = any(";" == entity.lower()[0] for entity in entities)
        sep = ";" if has_semicolon else ","
        print(f"Separator: '{sep}'")

        entities = Colons(entities).identify()
        entities = Dependent_Clauses(entities).identify(sep)
        entities = Independent_Clauses(entities).identify([Entity.END])
        entities = Prepositional_Clauses(entities).identify()
        return Lists(entities).identify(sep)

# Demo: parse one sentence and print every entity the pipeline produces.
doc = nlp("By causing Nucella to consume fewer barnacles, crab predation risk allowed fucoids that had settled on or between barnacles to remain in the community.")
snt = Sentence(doc)

# Feed the whole document in; each of its tokens becomes a starting entity.
entities = snt.update(snt.doc)
for entity in entities:
    # Shows the numeric label (None when unlabelled) and the covered text;
    # entity.r is inclusive, hence the +1.
    print(f"({entity.label}) '{doc[entity.l:entity.r+1]}'")

In [None]:
# Print each noun chunk of the current `doc`, one per line.
for nc in doc.noun_chunks:
    print(nc)

In [None]:
# Re-parse the same sentence, this time with a trailing space after the period.
doc = nlp("By causing Nucella to consume fewer barnacles, crab predation risk allowed fucoids that had settled on or between barnacles to remain in the community. ")

In [None]:
# Print every token of the current `doc`, one per line.
for token in doc:
    print(token)