In [None]:
import spacy

In [None]:
# Load the "en_core_web_lg" spaCy pipeline once; later cells call `nlp(...)` to parse text.
nlp = spacy.load("en_core_web_lg")

In [None]:
def find(l, f):
    """Return the first element of ``l`` for which ``f(element)`` is truthy.

    Returns ``None`` when no element satisfies the predicate.
    """
    return next((item for item in l if f(item)), None)

def find_all(l, f):
    """Return a list of every element of ``l`` for which ``f(element)`` is truthy.

    The elements keep their original order; the result is empty when nothing
    matches.
    """
    return [item for item in l if f(item)]

def find_separator(tokens):
    """Pick the list separator used by ``tokens``.

    Args:
        tokens: Iterable of objects exposing a ``text`` attribute.

    Returns:
        ``";"`` if any token's text is a semicolon, otherwise ``","``.
    """
    # The original `if` line had no trailing colon, so the cell could not be
    # parsed at all.
    if any(token.text == ";" for token in tokens):
        return ";"
    return ","

In [None]:
def is_conjunction(t):
    """Return True when the token's ``lower_`` text is "and" or "or"."""
    lowered = t.lower_
    return lowered == "and" or lowered == "or"

In [None]:
def is_noun(tokens):
    """Return True when ``tokens`` reads as a noun phrase.

    The sequence qualifies when at least one element is tagged PROPN, NOUN or
    PRON and every element carries one of the tolerated tags (those three plus
    PART, NUM, ADJ, CCONJ and DET).

    Args:
        tokens: Iterable of objects exposing a ``pos_`` tag. Plain tag strings
            are also accepted, matching what the original comparison handled.

    Returns:
        bool: False for an empty sequence or as soon as any other tag is seen.
    """
    accept = ("PROPN", "NOUN", "PRON")
    ignore = (*accept, "PART", "NUM", "ADJ", "CCONJ", "DET")

    found = False
    for token in tokens:
        # Compare the part-of-speech tag, not the token object itself: a token
        # object is never equal to a tag string, so the original test could
        # not succeed for real tokens.
        pos = getattr(token, "pos_", token)
        if pos not in ignore:
            return False
        found = found or pos in accept

    return found

def is_verb(tokens):
    """Return True when ``tokens`` reads as a verb phrase.

    The sequence qualifies when at least one element is tagged VERB and every
    element carries one of the tolerated tags (VERB plus AUX, ADV, ADP, NOUN,
    PRON, PROPN, CCONJ, PART and NUM).

    Args:
        tokens: Iterable of objects exposing a ``pos_`` tag. Plain tag strings
            are also accepted, matching what the original comparison handled.

    Returns:
        bool: False for an empty sequence or as soon as any other tag is seen.
    """
    accept = ("VERB",)
    ignore = (*accept, "AUX", "ADV", "ADP", "NOUN", "PRON", "PROPN", "CCONJ", "PART", "NUM")

    found = False
    for token in tokens:
        # Compare the part-of-speech tag, not the token object itself: a token
        # object is never equal to a tag string, so the original test could
        # not succeed for real tokens.
        pos = getattr(token, "pos_", token)
        if pos not in ignore:
            return False
        found = found or pos in accept

    return found

In [None]:
def bound_by_noun(tokens, noun_chunks):
    """Locate noun-based boundaries inside ``tokens``.

    ``l`` is found by scanning from the right for the last PROPN/NOUN/PRON
    token and then moving left to the start of the noun chunk containing it.
    ``r`` is found by scanning from the left for the first such token and then
    moving right to the last token of its noun chunk.

    Args:
        tokens: Indexable sequence of objects exposing ``pos_`` and ``i``.
        noun_chunks: Iterable of objects exposing ``start`` and ``end``, which
            are compared against ``token.i`` as a half-open range.

    Returns:
        ``None`` for empty ``tokens``, otherwise ``(l, r)`` as positions in
        ``tokens``. When no noun is present, ``l`` ends at 0 and ``r`` equals
        ``len(tokens)``.
    """
    noun_tags = ("PROPN", "NOUN", "PRON")

    number_tokens = len(tokens)
    if not number_tokens:
        return None

    # The chunks are walked twice below; materialise them so a one-shot
    # iterator is not exhausted by the first pass.
    noun_chunks = list(noun_chunks)

    # Start on the last valid index; starting at len(tokens) raised IndexError
    # on the very first comparison.
    l = number_tokens - 1
    while l > 0 and tokens[l].pos_ not in noun_tags:
        l -= 1

    for chunk in noun_chunks:
        if chunk.start <= tokens[l].i < chunk.end:
            while l > 0 and tokens[l].i != chunk.start:
                l -= 1
            break

    r = 0
    while r < number_tokens and tokens[r].pos_ not in noun_tags:
        r += 1

    # r == number_tokens means no noun was found; there is nothing to index.
    if r < number_tokens:
        for chunk in noun_chunks:
            if chunk.start <= tokens[r].i < chunk.end:
                # Stop on the last token if the chunk runs past the sequence.
                while r < number_tokens - 1 and tokens[r].i != chunk.end - 1:
                    r += 1
                break

    return (l, r)

def bound_by_verb(tokens):
    """Locate verb-based boundaries inside ``tokens``.

    ``l`` is the position of the last VERB token found scanning from the
    right; ``r`` is the position of the first VERB token found scanning from
    the left.

    Args:
        tokens: Indexable sequence of objects exposing ``pos_``.

    Returns:
        ``None`` for empty ``tokens``, otherwise ``(l, r)``. When no verb is
        present, ``l`` ends at 0 and ``r`` equals ``len(tokens)``.
    """
    number_tokens = len(tokens)
    if not number_tokens:
        return None

    # Start on the last valid index; starting at len(tokens) raised IndexError
    # on the very first comparison.
    l = number_tokens - 1
    while l > 0 and tokens[l].pos_ != "VERB":
        l -= 1

    r = 0
    while r < number_tokens and tokens[r].pos_ != "VERB":
        r += 1

    return (l, r)

In [48]:
class Entity:
    """A labelled, inclusive range ``[l, r]`` of positions within ``doc``."""

    # Label codes stored in ``Entity.label``.
    LIST, QUOTE, CLAUSE, BRACKET, END, ITEM, SEPARATOR, COLON = range(1, 9)

    def __init__(self, doc, label=None, l=None, r=None, children=None):
        """Store the document plus the optional label, bounds and children."""
        self.doc = doc
        self.label = label
        self.l, self.r = l, r
        self.children = children

    def length(self):
        """Return the difference between the right and left bounds."""
        left, right = self.l, self.r
        return right - left

    def tokens(self):
        """Return the slice of ``doc`` from ``l`` through ``r`` inclusive."""
        stop = self.r + 1
        return self.doc[self.l:stop]

class Sentence:
    """Group the tokens of a document into quote and bracket entities.

    ``update`` wraps every token in its own ``Entity`` and then collapses each
    quoted or bracketed run into a single entity.
    """

    # Opening delimiter text -> closing delimiter text. The dash and comma
    # entries are symmetric: the same text both opens and closes a pair.
    _BRACKET_PAIRS = {"[": "]", "(": ")", "—": "—", ",": ","}
    _QUOTE_PAIRS = {"\"": "\""}

    def __init__(self, doc):
        """Keep ``doc`` and create an unlabelled root entity for it."""
        self.doc = doc
        self.root = Entity(self.doc)

    @staticmethod
    def _matching_close(texts, start, pairs):
        """Return the index that closes the delimiter at ``start``, or None.

        ``texts`` is a list of strings and ``pairs`` maps opening text to
        closing text. Nested delimiters are tracked on a stack.
        """
        if texts[start] not in pairs:
            return None

        stack = [texts[start]]
        for j in range(start + 1, len(texts)):
            text = texts[j]
            if pairs[stack[-1]] == text:
                # Test "closes the innermost opener" before "is an opener" so
                # a symmetric delimiter can close its partner instead of
                # always being pushed as a new opener.
                stack.pop()
            elif text in pairs:
                stack.append(text)
                continue
            else:
                # A closer for an outer opener abandons any unclosed inner
                # openers (for example a lone comma inside parentheses).
                depth = next(
                    (d for d in reversed(range(len(stack))) if pairs[stack[d]] == text),
                    None,
                )
                if depth is None:
                    continue
                del stack[depth:]
            if not stack:
                return j
        return None

    @staticmethod
    def _top_level_pairs(texts, pairs):
        """Return ``(open, close)`` index pairs for each outermost delimited run."""
        spans = []
        i = 0
        while i < len(texts):
            close = Sentence._matching_close(texts, i, pairs)
            if close is None:
                i += 1
            else:
                spans.append((i, close))
                i = close + 1
        return spans

    @staticmethod
    def _covers_everything(bounds, tokens):
        """Return True when a single bound spans the first through last token."""
        return bool(
            len(bounds) == 1 and
            bounds[0][0] == tokens[0].i and
            bounds[0][-1] == tokens[-1].i
        )

    def _merge_pairs(self, entities, pairs, label):
        """Collapse each outermost delimited run of ``entities`` in place.

        The opening entity receives ``label`` and is extended to the right
        edge of the closing entity; the absorbed entities are removed.
        """
        # Compare plain text: single delimiters have no case, and `text` is
        # available on the slice returned by Entity.tokens().
        texts = [entity.tokens().text for entity in entities]
        # Work right to left so earlier indices stay valid while deleting.
        for start, stop in reversed(self._top_level_pairs(texts, pairs)):
            opener = entities[start]
            opener.label = label
            opener.r = entities[stop].r
            del entities[start + 1:stop + 1]

    def find_quotes(self, entities):
        """Return ``(start, end)`` token indices of each double-quoted run.

        Args:
            entities: Sequence of tokens exposing ``text`` and ``i``. The
                parameter name is kept for compatibility; the body has always
                treated the argument as tokens.

        Returns:
            A list of inclusive ``(open.i, close.i)`` tuples. An unterminated
            quote yields no bound. The list is empty when one quote spans the
            entire sequence.
        """
        # The original read an undefined `tokens` name and indexed `self.doc`
        # by position instead of the sequence it was given.
        tokens = entities
        texts = [token.text for token in tokens]
        bounds = [
            (tokens[a].i, tokens[b].i)
            for a, b in self._top_level_pairs(texts, self._QUOTE_PAIRS)
        ]

        print(f"Quote Bounds: {bounds}")

        if self._covers_everything(bounds, tokens):
            return []
        return bounds

    def identify_quotes(self, entities):
        """Merge every double-quoted run of ``entities`` into one QUOTE entity.

        The list is modified in place and also returned. A quote mark with no
        closing partner is left untouched rather than running off the end of
        the list.
        """
        self._merge_pairs(entities, self._QUOTE_PAIRS, Entity.QUOTE)
        return entities

    def identify_brackets(self, entities):
        """Merge every outermost bracketed run into one BRACKET entity.

        Recognised pairs are ``[ ]``, ``( )``, ``— —`` and ``, ,``. Nested
        pairs are absorbed into the outermost one. The list is modified in
        place and also returned; an opener with no closing partner is left
        untouched.
        """
        self._merge_pairs(entities, self._BRACKET_PAIRS, Entity.BRACKET)
        return entities

    def find_brackets(self, tokens):
        """Return ``(start, end)`` token indices of each outermost bracketed run.

        Args:
            tokens: Sequence of tokens exposing ``text`` and ``i``.

        Returns:
            A list of inclusive ``(open.i, close.i)`` tuples for the pairs
            ``[ ]``, ``( )``, ``— —`` and ``, ,``. The list is empty when one
            pair spans the entire sequence.
        """
        texts = [token.text for token in tokens]
        bounds = [
            (tokens[a].i, tokens[b].i)
            for a, b in self._top_level_pairs(texts, self._BRACKET_PAIRS)
        ]

        if self._covers_everything(bounds, tokens):
            return []
        return bounds

    def merge(self, entities, bound):
        """Fold the entities lying inside ``bound`` into the first of them.

        Args:
            entities: List of objects exposing ``l`` and ``r``.
            bound: Inclusive ``(left, right)`` pair.

        Returns:
            The same list, modified in place. Nothing changes when no entity
            lies fully inside ``bound``.
        """
        left, right = bound[0], bound[1]
        first = next(
            (k for k, entity in enumerate(entities) if entity.l >= left and entity.r <= right),
            None,
        )
        if first is None:
            return entities

        while first + 1 < len(entities) and entities[first + 1].r <= right:
            # Adopt the absorbed entity's right edge; adding 1 was only
            # correct when that entity was a single token wide.
            entities[first].r = entities.pop(first + 1).r

        return entities

    def find_separators(self, entities, sep):
        """Placeholder: separator detection has not been written yet.

        Raises:
            NotImplementedError: always.
        """
        # TODO: the original body was an empty `while` loop, which made the
        # whole cell unparsable. The intended behaviour is not recoverable
        # from this file, so fail loudly instead of guessing.
        raise NotImplementedError("Sentence.find_separators is not implemented")

    def update(self, tokens):
        """Build one entity per token, then collapse quotes and brackets.

        Args:
            tokens: Iterable of tokens exposing ``i``.

        Returns:
            The resulting list of ``Entity`` objects.
        """
        entities = [Entity(self.doc, l=token.i, r=token.i) for token in tokens]

        self.identify_quotes(entities)
        self.identify_brackets(entities)

        return entities

# Demo: parse one sentence containing a quotation and two parenthetical
# asides, run Sentence.update over its tokens, and print the inclusive
# (l, r) bounds of each resulting entity.
# NOTE(review): the recorded output below this cell begins with a
# "Quote Bounds" line, which only Sentence.find_quotes prints. update() does
# not call find_quotes, so that output appears to come from an earlier
# revision of the class - re-run to confirm.
doc = nlp("\"My name is Bob\", he (the friend of Gary) (not the friend of Joe) replied.")
snt = Sentence(doc)
bnd = snt.update(snt.doc)
for b in bnd:
    print(b.l, b.r)

Quote Bounds: [(0, 5)]
0 5
6 6
7 7
8 13
14 22
23 23
24 24
