In [None]:
def find(elements, bool_lambda):
    """Return the first item of ``elements`` that satisfies ``bool_lambda``.

    Items are tested in iteration order; ``None`` is returned when no item
    makes ``bool_lambda`` return a truthy value.
    """
    matches = (candidate for candidate in elements if bool_lambda(candidate))
    return next(matches, None)

def same_speech(token_a, token_b):
    """Tell whether two tokens share a part of speech.

    The tags ``NOUN``, ``PRON`` and ``PROPN`` are treated as interchangeable;
    any other pair must have identical ``pos_`` values.
    """
    noun_like = ("NOUN", "PRON", "PROPN")
    tag_a, tag_b = token_a.pos_, token_b.pos_
    both_noun_like = tag_a in noun_like and tag_b in noun_like
    return both_noun_like or tag_a == tag_b

def is_conjunction(token):
    """Return True when the token's lowercase text is ``and`` or ``or``."""
    lowered = token.lower_
    return lowered == "and" or lowered == "or"

def find_separator(tokens):
    """Pick the list separator used by ``tokens``.

    Returns ``";"`` when at least one token's ``text`` is a semicolon and
    ``","`` otherwise (including for an empty sequence).
    """
    # `any` tests the comparison itself rather than the truthiness of a
    # returned token object.
    if any(token.text == ";" for token in tokens):
        return ";"
    return ","

In [None]:
class Entity:
    """A labelled span that may hold nested entities.

    ``label`` is expected to be one of the kind constants below, ``position``
    is stored exactly as given, and ``children`` is the list of nested
    entities.
    """

    # Entity kinds.
    LIST, QUOTE, CLAUSE, BRACKET = 1, 2, 3, 4

    def __init__(self, label=None, position=None, children=None):
        # A missing or empty ``children`` argument gets a fresh list so
        # instances never share one by accident.
        self.children = children if children else []
        self.position = position
        self.label = label

class Unit:
    """A labelled run of tokens.

    ``tokens`` are kept sorted by their ``i`` attribute and ``position`` is
    ``[lowest i, highest i]``; both stay as given / ``None`` when no tokens
    are supplied.
    """

    # Unit kinds.
    END = 1
    ITEM = 2
    BREAK = 3

    def __init__(self, label=None, tokens=None):
        self.label = label
        self.set_tokens(tokens)

    def set_tokens(self, tokens):
        """Store ``tokens`` in ``i`` order and recompute ``position``.

        With ``None`` or an empty sequence, ``tokens`` is stored unchanged and
        ``position`` is ``None``.
        """
        self.tokens = tokens
        self.position = None
        if tokens:
            ordered = sorted(tokens, key=lambda token: token.i)
            self.tokens = ordered
            # Read the bounds from the sorted list, not the caller's
            # (possibly unordered) input.
            self.position = [ordered[0].i, ordered[-1].i]

class Sentence:
    """Parses tokens into a tree of ``Entity`` objects rooted at ``self.root``.

    Tokens must expose ``i`` (their index) and ``text``; list detection also
    relies on ``is_conjunction`` and ``find_separator``. All entity positions
    are inclusive ``[start, end]`` pairs expressed in token ``i`` values.
    """

    def __init__(self):
        self.root = Entity()
        # Maps a start index to the list of entities that begin there.
        self.registry = {}

    def update(self, tokens):
        """Discard any previous result and parse ``tokens`` from scratch."""
        self.root = Entity()
        self.registry = {}
        self.parse(tokens, self.root)

    def update_registry(self, registry, *entities):
        """Record each entity in ``registry`` under its start position."""
        for entity in entities:
            start, _end = entity.position
            registry.setdefault(start, []).append(entity)

    def extract_quotes(self, tokens):
        """Return a QUOTE entity for every balanced pair of ``"`` tokens.

        The position spans the tokens strictly between the two marks. An
        opening mark that is never closed produces no entity.
        """
        entities = []
        start = None  # index just after the currently open quote mark
        for token in tokens:
            if token.text != "\"":
                continue
            if start is None:
                start = token.i + 1
            else:
                entities.append(Entity(Entity.QUOTE, [start, token.i - 1]))
                start = None
        return entities

    def extract_brackets(self, tokens):
        """Return a BRACKET entity for every outermost delimiter pair.

        Recognised pairs are ``[ ]``, ``( )``, ``— —`` and ``, ,``. The
        position spans the tokens strictly between the delimiters. Pairs
        nested inside another open pair are matched but not emitted.
        """
        pairs = {
            "[": "]",
            "(": ")",
            "—": "—",
            ",": ",",
        }
        symmetric = {opener for opener, closer in pairs.items() if opener == closer}

        entities = []
        stack = []  # currently open delimiter tokens, innermost last
        for token in tokens:
            text = token.text

            # Closing is checked before opening so that a symmetric delimiter
            # can close an earlier occurrence of itself. Unclosed symmetric
            # delimiters (e.g. a lone comma inside parentheses) are skipped
            # over so they cannot block the enclosing pair from closing.
            depth = len(stack)
            while (
                depth
                and pairs[stack[depth - 1].text] != text
                and stack[depth - 1].text in symmetric
            ):
                depth -= 1

            if depth and pairs[stack[depth - 1].text] == text:
                opener = stack[depth - 1]
                del stack[depth - 1:]
                if not stack:
                    entities.append(
                        Entity(Entity.BRACKET, [opener.i + 1, token.i - 1])
                    )
            elif text in pairs:
                stack.append(token)

        return entities

    def extract_units(self, tokens, disallowed):
        """Split ``tokens`` into ITEM / BREAK / END units at the separator.

        ``disallowed`` is a list of inclusive ``[start, end]`` index intervals
        whose tokens are always kept inside the current item, even if they
        are separators. A separator followed by a conjunction becomes an END
        unit (holding both tokens); any other separator becomes a BREAK unit.
        Leading and trailing BREAK/END units are dropped.
        """
        tokens = list(tokens)
        total = len(tokens)
        sep = find_separator(tokens)

        units = []
        buffer = []
        i = 0
        while i < total:
            token = tokens[i]

            # Tokens inside a protected interval are copied verbatim.
            interval = next(
                (span for span in disallowed if span[0] <= token.i <= span[1]),
                None,
            )
            if interval is not None:
                while i < total and tokens[i].i <= interval[1]:
                    buffer.append(tokens[i])
                    i += 1
                # Re-enter the loop so the bounds and any adjacent protected
                # interval are checked before the next token is read.
                continue

            if token.text != sep:
                buffer.append(token)
                i += 1
                continue

            # Empty items (leading or consecutive separators) are skipped:
            # they would have no position.
            if buffer:
                units.append(Unit(Unit.ITEM, buffer))
                buffer = []

            if i + 1 < total and is_conjunction(tokens[i + 1]):
                units.append(Unit(Unit.END, [token, tokens[i + 1]]))
                i += 2
            else:
                units.append(Unit(Unit.BREAK, [token]))
                i += 1

        if buffer:
            units.append(Unit(Unit.ITEM, buffer))

        boundary = (Unit.BREAK, Unit.END)
        while units and units[0].label in boundary:
            units.pop(0)
        while units and units[-1].label in boundary:
            units.pop()

        return units

    def extract_unit_lists(self, units):
        """Group units into lists; each list is a list of ITEM units.

        Items are accumulated until an END unit. If at least two items were
        collected, the item following the END completes the list. That item
        is also the first candidate of the next list, so consecutive lists
        may share a unit. Leftover items that contain exactly one conjunction
        may be added as single-unit lists.
        """
        lists = []

        buffer = []
        for i, unit in enumerate(units):
            if unit.label == Unit.ITEM:
                buffer.append(unit)
            elif unit.label == Unit.END:
                has_final_item = (
                    i + 1 < len(units) and units[i + 1].label == Unit.ITEM
                )
                if len(buffer) >= 2 and has_final_item:
                    buffer.append(units[i + 1])
                    lists.append(buffer)
                buffer = []

        # Items left over after the last END.
        for i, unit in enumerate(buffer):
            simple = sum(1 for t in unit.tokens if is_conjunction(t)) == 1
            not_seen = i - 1 >= 0 and buffer[i - 1].label != Unit.END
            if (not_seen or not lists) and simple:
                lists.append([unit])

        return lists

    def clean_lists(self, lists):
        """Resolve units shared by adjacent lists and add conjunction items.

        When two adjacent multi-item lists share their boundary unit:
        a single-token shared unit causes both lists to be dropped;
        otherwise the earlier list keeps the unit's first token and the
        later list keeps its last token. Afterwards, every unit of a
        multi-item list that contains exactly one conjunction is appended as
        an additional single-unit list.
        """
        overlaps = []

        i = 0
        while i + 1 < len(lists):
            a = lists[i]
            b = lists[i + 1]

            if a[-1] != b[0] or len(a) <= 1 or len(b) <= 1:
                i += 1
                continue

            shared = a[-1]
            if len(shared.tokens) == 1:
                overlaps.extend([a, b])
                i += 2
            else:
                a[-1] = Unit(Unit.ITEM, [shared.tokens[0]])
                b[0] = Unit(Unit.ITEM, [shared.tokens[-1]])
                i += 1

        lists = [group for group in lists if group not in overlaps]

        # Snapshot the current lists so that the single-unit lists appended
        # below are not themselves inspected.
        for group in list(lists):
            if len(group) == 1:
                continue
            for unit in group:
                if sum(1 for t in unit.tokens if is_conjunction(t)) == 1:
                    lists.append([unit])

        return lists

    def bound_lists(self, lists):
        """Placeholder for list-boundary refinement; returns ``lists`` as is."""
        return lists

    def extract_lists(self, tokens, disallowed):
        """Return LIST entities whose children are one CLAUSE entity per item.

        ``disallowed`` holds the inclusive index intervals that must not be
        split (see ``extract_units``).
        """
        units = self.extract_units(tokens, disallowed)
        lists = self.extract_unit_lists(units)
        lists = self.clean_lists(lists)
        lists = self.bound_lists(lists)

        entities = []
        for items in lists:
            span = [items[0].tokens[0].i, items[-1].tokens[-1].i]
            entity_list = Entity(Entity.LIST, span)
            for item in items:
                entity_list.children.append(
                    Entity(Entity.CLAUSE, list(item.position))
                )
            entities.append(entity_list)

        return entities

    def extract_clauses(self, tokens):
        """Return CLAUSE entities for runs of tokens between registry starts.

        A token whose index is a key of ``self.registry`` ends the current
        run and is itself left out.
        """
        # NOTE(review): only the token at an entity's start index is skipped;
        # the remaining tokens of that entity are collected into the next
        # clause. Confirm whether whole entity spans should be excluded.
        entities = []
        buffer = []

        def flush():
            if buffer:
                entities.append(
                    Entity(Entity.CLAUSE, [buffer[0].i, buffer[-1].i])
                )
                buffer.clear()

        for token in tokens:
            if token.i in self.registry:
                flush()
            else:
                buffer.append(token)
        flush()

        return entities

    def parse(self, tokens, parent):
        """Extract the entities in ``tokens``, attach them to ``parent``, recurse.

        Quotes, brackets and lists are recorded in ``self.registry``. Quotes,
        brackets and list items are then parsed again on their own tokens
        with the entity as the new parent.
        """
        tokens = list(tokens)

        # Quotes
        quotes = self.extract_quotes(tokens)
        self.update_registry(self.registry, *quotes)

        # Brackets
        brackets = self.extract_brackets(tokens)
        self.update_registry(self.registry, *brackets)

        # Lists (quote and bracket interiors must not be split)
        protected = [entity.position for entity in (*quotes, *brackets)]
        lists = self.extract_lists(tokens, protected)
        self.update_registry(self.registry, *lists)

        # Clauses
        clauses = self.extract_clauses(tokens)

        # Attach everything found at this level, ordered by start index.
        parent.children.extend(
            sorted(
                [*quotes, *brackets, *lists, *clauses],
                key=lambda entity: entity.position[0],
            )
        )

        # Next Level of Parsing
        items = [item for entity_list in lists for item in entity_list.children]
        for entity in [*quotes, *brackets, *items]:
            left, right = entity.position
            inner = [token for token in tokens if left <= token.i <= right]
            # Recurse only into a strictly smaller span: a list item can
            # cover every token of this level, which would never terminate.
            if inner and len(inner) < len(tokens):
                self.parse(inner, entity)