In [1]:
import math
import spacy
import itertools

In [2]:
nlp = spacy.load("en_core_web_lg")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Unit:
    def __init__(self, sort=None, tokens=None):
        self.sort = sort
        self.set_tokens(tokens)

    def set_tokens(self, tokens):
        self.tokens = tokens
        self.tokens = sorted(self.tokens, key=lambda token: token.i)
        self.l = None if not tokens else tokens[0].i
        self.r = None if not tokens else tokens[-1].i

In [4]:
def find(elements, bool_lambda):
    for element in elements:
        if bool_lambda(element):
            return element
    return None

In [5]:
def is_conjunction(token):
    return token.lower_ in ["and", "or"]

In [6]:
def find_units(doc):
    units = []

    i = 0
    buffer = []
    while i < len(doc):
        if doc[i].text != sep:
            buffer.append(doc[i])
            i += 1
            continue
        else:
            units.append(Unit("I", buffer))
            buffer = []
    
            if i+1 < len(doc) and is_conjunction(doc[i+1]):
                units.append(Unit("E", [doc[i], doc[i+1]]))
                i += 2
            else:
                units.append(Unit("B", [doc[i]]))
                i += 1
    
    if buffer:
        units.append(Unit("I", buffer))
    
    return units

In [7]:
def find_lists(units):
    lists = []

    i = 0
    buffer = []
    while i < len(units):
        unit = units[i]
        if unit.sort == "I":
            buffer.append(unit)
        if unit.sort == "E":
            if len(buffer) < 2:
                buffer = []
            elif i < len(units):
                buffer.append(units[i+1])
                lists.append(buffer)
                buffer = []
        i += 1

    i = 0
    while i < len(buffer):
        unit = buffer[i]
        simple = len([t for t in unit.tokens if is_conjunction(t)]) == 1
        not_seen = bool(i - 1 >= 0 and buffer[i-1].sort != "E")
        
        if (not_seen or not lists) and simple:
            lists.append([unit])
        
        i += 1
        
    return lists

In [8]:
def fix_lists(lists):
    overlaps = []
    
    i = 0
    while i + 1 < len(lists):
        a = lists[i]
        b = lists[i+1]
        
        if a[-1] != b[0]:
            i += 1
            continue

        if len(a) <= 1 or len(b) <= 1:
            a[-1].tokens = [*a[-1].tokens]
            b[0].tokens = [*b[0].tokens]
            i += 1
            continue

        if len(a[-1].tokens) == 1:
            overlaps.extend([a, b])
            i += 2
        else:
            a[-1] = Unit("I", [a[-1].tokens[0]])
            b[0] = Unit("I", [b[0].tokens[-1]])
            i += 1

    lists = [l for l in lists if l not in overlaps]

    i = 0
    num_lists = len(lists)
    while i < num_lists:
        if len(lists[i]) == 1:
            i += 1
            continue
        
        for unit in lists[i]:
            if len([t for t in unit.tokens if is_conjunction(t)]) == 1:
                lists.append([unit])
        i += 1
    
    return lists

In [9]:
def same_speech(a, b):
    nouns = []
    if a.pos_ in nouns and b.pos_ in nouns:
        return True
    return a.pos_ == b.pos_

In [14]:
def bound_list(lst):
    if len(lst) == 1:
        i_conj = 0
        for i, token in enumerate(lst[0].tokens):
            if is_conjunction(token):
                i_conj = i
                break

        a = Unit("I", lst[0].tokens[:i_conj])
        b = Unit("I", lst[0].tokens[i_conj+1:])

        print(a.tokens)
        print(b.tokens)

        ignore = ["ADV", "ADJ", "ADP", "DET", "SYM"]

        l_bound = b.tokens[0].pos_
        if l_bound in ignore:
            bi = 0
            while bi < len(b.tokens) and b.tokens[bi] in ignore:
                l_bound = b.tokens[bi].pos_
                bi += 1
            l_bound = [*ignore, l_bound]
        else:
            l_bound = [l_bound]

        r_bound = a.tokens[-1].pos_
        if r_bound in ignore:
            ai = len(a.tokens) - 1
            while ai >= 0 and a.tokens[ai] in ignore:
                r_bound = a.tokens[ai].pos_
                ai -= 1
            r_bound = [*ignore, r_bound]
        else:
            r_bound = [r_bound]
        
        print(l_bound)
        print(r_bound)
        
        i = len(a.tokens) - 1
        while i >= 0 and a.tokens[i].pos_ not in l_bound:
            i -= 1

        if i < 0:
            return None
        
        a.tokens = [*a.tokens[i:]]
        
        j = 0
        while j < len(b.tokens) and b.tokens[j].pos_ not in r_bound:
            j += 1
        
        if j > len(b.tokens):
            return None
            
        b.tokens = [*b.tokens[:j+1]]
        
        return [a, b]
    else:
        l_bound = [unit.tokens[0].pos_ for unit in lst[-2:]]
        l_bound.sort()
        if l_bound[0] != l_bound[-1]:
            return None
        
        print(f"Left Bound: {l_bound}")
        l_bound = l_bound[0]

        b_lst = [*lst[-2:]]
        for i in range(len(lst) - 3, 0, -1):
            unit = lst[i]
            match = find(unit.tokens, lambda token: token.pos_ == l_bound)
            print(f"...{match}")
            if match:
                b_lst.insert(0, unit)
            else:
                break

        i = 0
        while i >= 0 and b_lst[0].tokens[i].pos != l_bound:
            i -= 1

        b_lst[0].tokens = [*b_lst[0].tokens[i:]] 
        
        r_bound = [unit.tokens[-1].pos_ for unit in b_lst[:-1]]
        print(f"Right Bound: {r_bound}")
        
        r_bound.sort()
        
        if r_bound[0] != r_bound[-1]:
            return None
        
        r_bound = r_bound[0]
        
        j = 0
        while j < len(b_lst[-1].tokens) and b_lst[-1].tokens[j].pos != r_bound:
            j += 1

        b_lst[-1].tokens = [*b_lst[-1].tokens[:j+1]] 
        return b_lst

In [15]:
doc = nlp("Last week, Mr. Macron said his government would recognize a Palestinian state, setting France apart from the United States and most of its close allies, and risking friction with Mr. Trump.")
sep = ";" if find(doc, lambda token: token.text == ";" and (token.nbor() and token.nbor().lower_ in ["and", "or"])) else ","

units = find_units(doc)
for unit in units:
    print(f"({unit.sort}) \'{' '.join([token.text for token in unit.tokens])}\'")

lists = find_lists(units)
for l in lists:
    print(f"List: {[' '.join([token.text for token in unit.tokens]) for unit in l]}")

f_lists = fix_lists(lists)
for l in f_lists:
    print(f"List: {[' '.join([token.text for token in unit.tokens]) for unit in l]}")

f_lists = fix_lists(lists)
for l in f_lists:
    b_l = bound_list(l)
    print(f"Bound List: {[' '.join([token.text for token in unit.tokens]) for unit in b_l]}")

(I) 'Last week'
(B) ','
(I) 'Mr. Macron said his government would recognize a Palestinian state'
(B) ','
(I) 'setting France apart from the United States and most of its close allies'
(E) ', and'
(I) 'risking friction with Mr. Trump .'
List: ['Last week', 'Mr. Macron said his government would recognize a Palestinian state', 'setting France apart from the United States and most of its close allies', 'risking friction with Mr. Trump .']
List: ['Last week', 'Mr. Macron said his government would recognize a Palestinian state', 'setting France apart from the United States and most of its close allies', 'risking friction with Mr. Trump .']
List: ['setting France apart from the United States and most of its close allies']
Bound List: ['state', 'setting France apart from the United States and most of its close allies', 'risking friction with Mr. Trump .']
[setting, France, apart, from, the, United, States]
[most, of, its, close, allies]
['ADV', 'ADJ', 'ADP', 'DET', 'SYM', 'ADJ']
['PROPN']
Boun

In [16]:
for token in doc:
    print(token, token.pos_)

Last ADJ
week NOUN
, PUNCT
Mr. PROPN
Macron PROPN
said VERB
his PRON
government NOUN
would AUX
recognize VERB
a DET
Palestinian ADJ
state NOUN
, PUNCT
setting VERB
France PROPN
apart ADV
from ADP
the DET
United PROPN
States PROPN
and CCONJ
most ADJ
of ADP
its PRON
close ADJ
allies NOUN
, PUNCT
and CCONJ
risking VERB
friction NOUN
with ADP
Mr. PROPN
Trump PROPN
. PUNCT
