In [188]:
import spacy

In [189]:
nlp = spacy.load("en_core_web_sm")

In [504]:
def flatten(arr):
        flat_arr = []

        if not isinstance(arr, list):
            return [arr]

        for element in arr:
            flat_arr.extend(flatten(element))

        return flat_arr

In [190]:
class Data:
    def __init__(self, node_type=None, node_tokens=None):
        self.type = node_type
        self.tokens = node_tokens
        self.complex = False

class ConjunctionNode:
    def __init__(self, node_type=None, node_tokens=None):
        self.children = []
        self.data = Data(
            node_type=node_type, 
            node_tokens=node_tokens or []
        )

    def type(self):
        return self.data.type
    
    def tokens(self):
        return self.data.tokens
        
    def add_token(self, token):
        tokens = self.tokens()

        if self.data.type == "CCONJ":
            self.children[-1].add_token(token)
            return
        
        if self.data.complex:
            tokens[-1].append(token)
        else:
            tokens.append(token)
    
    def add_child(self, child):
        self.children.append(child)

In [510]:
class ConjunctionTree:
    def __init__(self, doc):
        self.root = self.create(doc)
        
    def is_conj(self, token):
        return token.pos_ == "CCONJ" and token.lower_ in ["and", "or"]

    def same_pos(self, pos_1, pos_2):
        nouns = ["PROPN", "PRON", "NOUN"]
        if pos_1 in nouns and pos_2 in nouns:
            return True
        return pos_1 == pos_2

    def create(self, tokens):
        tokens = [*tokens]
        
        while tokens and (tokens[-1].text == "," or tokens[-1].pos_ == "CCONJ"):
            tokens.pop()

        while tokens and (tokens[0].text == "," or tokens[0].pos_ == "CCONJ"):
            tokens.pop(0)

        number_tokens = len(tokens)
        if not number_tokens:
            return None

        sep = ","
        for i, token in enumerate(tokens):
            if token.text == ";" and i < number_tokens - 1 and self.is_conj(tokens[i+1]):
                sep = ";"
                break
        
        root = ConjunctionNode()
        quote = False
    
        for i, token in enumerate(reversed(tokens)):
            i = number_tokens - i - 1

            if self.is_conj(token):
                if root.data.type == "CCONJ":
                    new_root = ConjunctionNode(node_type="CCONJ", node_tokens=[token])
                    new_root.add_child(root)
                    new_root.add_child(self.create(tokens[:i]))
                    root = new_root
                    break
                else:
                    new_root = ConjunctionNode(node_type="CCONJ", node_tokens=[token])
                    new_root.add_child(root)
                    new_root.add_child(ConjunctionNode())
                    root = new_root
            elif token.text == sep and not quote:
                if root.data.type != "CCONJ":
                    if root.data.complex:
                        if i > 0 and self.is_conj(tokens[i-1]):
                            root.data.tokens = root.data.tokens[0]
                        else:
                            root.data.tokens = [*root.data.tokens, []]
                    else:
                        root.data.tokens = [[*root.data.tokens], []]
                        root.data.complex = True
                elif root.children[-1].tokens():
                    root.add_child(ConjunctionNode())
            else:
                root.add_token(token)
                if token.text == "\"":
                    quote = not quote

        return root

    def restrict(self):
        stack = [self.root]

        while stack:
            curr = stack.pop()

            # No Children
            if not curr or not curr.children:
                continue

            # Parent of Sub-Tree
            if curr.children[0].data.type == "CCONJ":
                stack.extend(curr.children)
                continue

            # Sub-Tree
            items = []
            for i, n in enumerate(curr.children):
                # print(i, n.data.tokens)
                items.append(flatten(list(reversed(n.data.tokens))))

            items = list(reversed(items))
            print(items)

            # Case 1: Parallel Structure
            for i in [0, -1]:
                # print(i)
                bounded_items = [*items]
                bounded_items.pop(i)

                bounded_items_pos = [item[i].pos_ for item in bounded_items]
                bounded_items_pos.sort()
                
                bounded_items_txt = [item[i].lower_ for item in bounded_items]
                bounded_items_txt.sort()
                
                # print(bounded_items_pos)
                # print(bounded_items_txt)

                parallel_txt = bounded_items_pos[0] == bounded_items_pos[-1]
                parallel_pos = self.same_pos(bounded_items_pos[0], bounded_items_pos[-1])

                if not parallel_txt and not parallel_pos:
                    continue
            
                unbounded_items = items[i]
                num_unbounded_items = len(unbounded_items)
                j = 0 if not i else num_unbounded_items - 1
                mj = j
                
                while 0 <= j < num_unbounded_items:
                    if (
                        unbounded_items[j].lower_ == bounded_items_txt[i] or
                        self.same_pos(unbounded_items[j].pos_, bounded_items_pos[i])
                    ):
                        mj = j
                    j += -1 if i == -1 else 1

                j = mj
                if j < 0 or j >= num_unbounded_items:
                    continue

                if not i:
                    items[i] = items[i][j:]
                else:
                    items[i] = items[i][:j+1]

                print(items)

In [511]:
doc = nlp("After Chad clutched his heart, swooned, and fell to the floor, Professor Borglum nudged him with her foot, ordering him to the front of the class to make his speech.")

In [512]:
tree = ConjunctionTree(doc)

In [513]:
tree.restrict()

[[After, Chad, clutched, his, heart], [swooned], [floor, the, to, fell, foot, her, with, him, nudged, Borglum, Professor, ., speech, his, make, to, class, the, of, front, the, to, him, ordering]]
