In [1]:
import re
import csv
import sys
import json
import math
import spacy
import textacy
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from taxonerd import TaxoNERD
from fastcoref import spacy_component
from spacy.matcher import Matcher, DependencyMatcher, PhraseMatcher
%run "./Main.ipynb"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Colors:
    HEADER = '\033[95m'
    RED = '\033[91m'
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [70]:
# df = pd.read_csv("../Week 16/Datasets/Baseline-1.csv")

# i = 0
# title = df.loc[i, "Title"]
# abstract = df.loc[i, "Abstract"]

title = "Example"
abstract = "AOL caused GMAIL, YAHOO, and OUTLOOK to shut down."

nlp = spacy.load("en_core_web_trf")
doc = nlp(abstract)

# Title
print(f"{Colors.BOLD}{title}{Colors.ENDC}")

# Abstract
verbs = []
objects = []
subjects = []

for triple in textacy.extract.subject_verb_object_triples(doc):
    verbs.extend(triple.verb)
    objects.extend(triple.object)
    subjects.extend(triple.subject)
    
for token in doc:
    color = Colors.ENDC
    if token in objects:
        color = Colors.YELLOW
    if token in subjects:
        color = Colors.BLUE
    if token in verbs:
        color = Colors.RED

    if token.sent.end == token.i + 1:
        print(f"{color}{token.text} ", end=f"{Colors.ENDC}")
    elif token.nbor() and token.nbor().text in [".","?","!",";", ")", ",", "]"]:
        print(f"{color}{token.text}", end=f"{Colors.ENDC}")
    elif token.sent.start == token.i:
        print(f"{color}{token.text} ", end=f"{Colors.ENDC}")
    elif token.text not in ["(", "["]:
        print(f"{color}{token.text} ", end=f"{Colors.ENDC}")
    else:
        print(f"{color}{token.text}", end=f"{Colors.ENDC}")

[1mExample[0m
[0mAOL [0m[0mcaused [0m[0mGMAIL[0m[0m, [0m[0mYAHOO[0m[0m, [0m[0mand [0m[0mOUTLOOK [0m[0mto [0m[0mshut [0m[0mdown[0m[0m. [0m

In [71]:
main = Main()
main.update_text(abstract)

09/04/2025 12:22:30 - INFO - 	 missing_keys: []
09/04/2025 12:22:30 - INFO - 	 unexpected_keys: []
09/04/2025 12:22:30 - INFO - 	 mismatched_keys: []
09/04/2025 12:22:30 - INFO - 	 error_msgs: []
09/04/2025 12:22:30 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M
09/04/2025 12:22:57 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00,  5.28 examples/s]
09/04/2025 12:23:03 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:04<00:00,  4.40s/it]


In [72]:
class Node:
    def __init__(self, main, is_action=False, tokens=None):
        self.main = main
        self.tokens = tokens or []
        self.expanded_tokens = []
        self.is_action = is_action

    def get_traits(self):
        tokens = set([*self.tokens, *self.expanded_tokens])
        tokens = tokens & set(self.main.trait.tokens)
        return tokens

    def get_species(self):
        tokens = set([*self.tokens, *self.expanded_tokens])
        tokens = tokens & set(self.main.species.tokens)
        return tokens

    def copy(self):
        node = Node(self.main)
        node.tokens = [*self.tokens]
        node.expanded_tokens = [*self.expanded_tokens]
        node.is_action = self.is_action
        return node

    def start(self):
        i = math.inf
        for token in self.tokens:
            i = min(i, token.i)
        return i

    def end(self):
        i = -math.inf
        for token in self.tokens:
            i = max(i, token.i)
        return i

    def get_tokens(self):
        return self.tokens

    def get_expanded_tokens(self):
        return [*self.tokens, *self.expanded_tokens]

    def __str__(self):
        tokens = list(set([*self.tokens, *self.expanded_tokens]))
        tokens = sorted(tokens, key=lambda token: token.i)
        return f"{','.join([t.text for t in tokens])}"

class Order:
    def __init__(self, main):
        self.main = main
        self.order = []

    def start(self):
        i = math.inf
        for item in self.order:
            i = min(i, item.start())
        return i

    def end(self):
        i = -math.inf
        for item in self.order:
            i = max(i, item.end())
        return i

    def copy(self):
        order = Order(self.main)
        order.order = []
        for item in self.order:
            order.order.append(item.copy())
        return order
        
    def get_tokens(self):
        ret = []
        for item in self.order:
            ret.extend(item.get_tokens())
        return ret

    def get_expanded_tokens(self):
        ret = []
        for item in self.order:
            ret.extend(item.get_expanded_tokens())
        return ret

    def get_species(self):
        ret = []
        for item in self.order:
            ret.extend(item.get_species())
        return ret

    def get_traits(self):
        ret = []
        for item in self.order:
            ret.extend(item.get_traits())
        return ret

    def __str__(self):
        ret = ""

        i = 0
        size = len(self.order)
        while i < size:
            ret += f"({self.order[i]})"
            if i != size - 1:
                ret += "->"
            i += 1
        
        return ret

In [73]:
def distinct_bounds(bounds, larger=True):
    d_bounds = []

    for bound in bounds:
        overlap = False
        for i, d_bound in enumerate(d_bounds):
            surround = bound[0] <= d_bound[0] <= bound[1] and bound[0] <= d_bound[1] <= bound[1]
            contains = d_bound[0] <= bound[0] <= d_bound[1] and d_bound[0] <= bound[1] <= d_bound[1]

            overlap = surround or contains

            bound_length = bound[1] - bound[0]
            d_bound_length = d_bound[1] - d_bound[0]
            
            if (surround and larger) or (contains and not larger):
                d_bounds[i] = bound
                
        if not overlap:
            d_bounds.append(bound)

    return list(set(d_bounds))

In [113]:




def expand_token(main, token):
    expanded_token = [token]

    # 1. Expand by Coreference
    if token.pos_ == "PRON":
        expanded_token = [*main.coref_map.get(token, [token])]

    # 2. Expand by Unit Map
    i = 0
    size = len(expanded_token)
    while i < size:
        exp_token = expanded_token[i]
        for unit_bound, unit in main.units.unit_map.items():
            if unit_bound[0] <= exp_token.i <= unit_bound[1] and unit.label_has([Unit.ITEM]):
                expanded_token = [*main.sp_doc[unit_bound[0]:unit_bound[1]+1]]
                break
        i += 1
    
    i = 0
    size = len(expanded_token)
    while i < size:
        exp_token = expanded_token[i]

        # 3. Expand by Noun Chunk
        if exp_token in main.noun_chunk_map:
            expanded_token.extend([*main.noun_chunk_map[exp_token]])

        # 4. Expand by Unit
        if exp_token in main.entity_map:
            expanded_token.extend([*main.entity_map[exp_token]])

        i += 1

    # Remove Duplicates and Sort
    expanded_token = list(set(expanded_token))
    expanded_token = sorted(expanded_token, key=lambda token: token.i)
    
    return expanded_token



def create_order(s_node, v_node, o_node):
    # 1. Subject Tokens Transferred to Object
    sub_transfer_tokens = []
    for token in s_node.tokens:
        if token.pos_ == "VERB":
            sub_transfer_tokens.append(token)
    
    s_node.tokens = [tkn for tkn in s_node.tokens if tkn not in sub_transfer_tokens]
    o_node.tokens.extend(sub_transfer_tokens)
    
    # 2. Object Tokens Transferred to Verb
    obj_transfer_tokens = []
    for token in o_node.tokens:
        if token in main.cause.tokens or token in main.change.tokens:
            obj_transfer_tokens.append(token)
    
    o_node.tokens = [tkn for tkn in o_node.tokens if tkn not in obj_transfer_tokens]
    v_node.tokens.extend(obj_transfer_tokens)

    # 3. Expand Tokens in Subject
    s_expanded_tokens = flatten([expand_token(main, tkn) for tkn in s_node.tokens])
    s_node.expanded_tokens = s_expanded_tokens

    # 3. Expand Tokens in Object
    o_expanded_tokens = flatten([expand_token(main, tkn) for tkn in o_node.tokens])
    o_node.expanded_tokens = o_expanded_tokens

    # Create Order S -> V -> 0
    if (
        not s_node.tokens or 
        not v_node.tokens or 
        not o_node.tokens
    ):
        return None
    
    order = Order(main)
    order.order = [s_node, v_node, o_node]
    
    return order



def swap_subject_object(verb):
    aux = verb.nbor(-1) and verb.nbor(-1).pos_ == "AUX"
    adp = verb.nbor(1) and verb.nbor(1).lower_ == "by"
    return aux or adp



def order_tokens(main, tokens):
    if len(tokens) <= 1:
        return None
    
    # Sort Tokens by Position in Doc
    tokens = sorted(tokens, key=lambda token: token.i)
    
    verbs = [token for token in tokens if token.pos_ == "VERB"]
    if not verbs:
        # Check for "(ADP|SCONJ) ..."
        if tokens[0].pos_ in ["ADP", "SCONJ"]:
            order = Order(main)
            order.order = [Node(main, tokens=tokens)]
            return order
        else:
            return None

    if len(tokens) <= 2:
        return None
    
    # Sort Verbs by Token Position in Doc
    verbs = sorted(verbs, key=lambda token: token.i)
    verb = verbs[0]

    if verb == tokens[0]:
        return None

    # Partition Tokens in Doc (L)
    l = tokens[0].i

    # Partition Tokens in Doc (R)
    # As we don't know the S/O, we stop at the closest noun.
    i = tokens.index(verb) + 1
    while i < len(tokens) and tokens[i].pos_ not in ["PROPN", "NOUN", "PRON"]:
        i += 1
    
    if i <= 0 or i >= len(tokens):
        return None
    
    r = tokens[i].i

    r_units = main.units.units_at_i(r)
    if r_units:
        start = False

        unit_map_values = list(main.units.unit_map.values())
        i = 0
        while i < len(unit_map_values):
            unit = unit_map_values[i]
            
            if r_units[0].sent_start() != unit.sent_start():
                i += 1
                continue
            
            if unit in r_units:
                start = True
            
            if not start:
                i += 1
                continue


            print("HELLO!!!!")
            print(unit, unit.label_())
            if unit.label_has([Unit.FRAGMENT, Unit.LIST, Unit.ITEM, Unit.D_CLAUSE, Unit.P_PHRASE]) and unit.r <= tokens[-1].i:
                r = max(r, unit.r)

                brk = True
                if i + 1 < len(unit_map_values):
                    print("HEY VENUS")
                    print(unit_map_values[i+1], unit_map_values[i+1].label_())
                    print("WOO\n")
                    
                if i + 1 < len(unit_map_values) and unit_map_values[i+1].label_has([Unit.LIST, Unit.ITEM, Unit.D_CLAUSE, Unit.P_PHRASE]):
                    brk = False

                if brk:
                    break
            else:
                print("nope")
                break

            i += 1
                
    
    # Create Nodes
    v_node = Node(main, is_action=True)
    v_node.tokens = [verb]
    # print("v_node.tokens", v_node.tokens)

    a_node = Node(main)
    a_node.tokens = [main.sp_doc[i] for i in range(l, verb.i)]
    # print("a_node.tokens", a_node.tokens)
    if not a_node.tokens:
        return None

    b_node = Node(main)
    b_node.tokens = [main.sp_doc[i] for i in range(verb.i+1, r+1)]
    # print("b_node.tokens", b_node.tokens)
    if not b_node.tokens:
        return None

    # Swap Subject and Object
    if swap_subject_object(verb):
        s_node = b_node
        o_node = a_node
    else:
        s_node = a_node
        o_node = b_node

    order = create_order(s_node, v_node, o_node)
    return order



def order_triple(main, triple):
    # Create Verb Node
    v_node = Node(main, is_action=True)
    v_node.tokens = [*triple.verb]

    # Find Subject and Object Nodes
    # We assume that the S and O nodes are on the L and R sides
    # of the verb.
    svo_tokens = [*triple.subject, *triple.object]
    svo_tokens = sorted(svo_tokens, key=lambda token: token.i)

    # Positions
    l = svo_tokens[0].i
    r = svo_tokens[-1].i

    # L and R Positions of Verb (Middle, M) Tokens
    l_m = triple.verb[0].i
    r_m = triple.verb[-1].i

    # Create Nodes
    a_node = Node(main)
    a_node.tokens = [main.sp_doc[i] for i in range(l, l_m)]
    a_node.expanded_tokens =  flatten([expand_token(main, tkn) for tkn in a_node.tokens])
    
    b_node = Node(main)
    b_node.tokens = [main.sp_doc[i] for i in range(r_m+1, r+1)]
    b_node.expanded_tokens =  flatten([expand_token(main, tkn) for tkn in b_node.tokens])

    # Swap Subject and Object
    if swap_subject_object(triple.verb[0]):
        s_node = b_node
        o_node = a_node
    else:
        s_node = a_node
        o_node = b_node

    ret = []
    
    order = create_order(s_node, v_node, o_node)
    if order:
        ret.append(order)
    
    sub_order = order_tokens(main, s_node.tokens)
    if sub_order:
        ret.append(sub_order)
    
    obj_order = order_tokens(main, o_node.tokens)
    if obj_order:
        ret.append(obj_order)

    return ret



def order_unit(main, unit):
    unit_tokens = [*main.sp_doc[unit.l:unit.r+1]]
    return order_tokens(main, unit_tokens)



def order_text(main):
    sents = list(main.sp_doc.sents)
    sents_orders = {sent.start: [] for sent in sents}
    sents_triples = {sent.start: [] for sent in sents}

    # 1. Parse Subject-Verb-Object Triples
    for triple in textacy.extract.subject_verb_object_triples(doc):
        sents_triples[triple.verb[0].sent.start].append(triple)

    for sent in sents:
        for triple in sents_triples[sent.start]:
            orders = order_triple(main, triple)
            sents_orders[sent.start].extend(orders)
    
    # 2. Parse Units
    unit_bounds = list(main.units.unit_map.keys())
    distinct_unit_bounds = distinct_bounds(unit_bounds)
    units = [unit[1] for unit in main.units.unit_map.items() if unit[0] in distinct_unit_bounds]

    i = 0
    tokens = []
    while i < len(units):
        unit = units[i]
        tokens.extend([*unit.span()])
        
        next_unit = None if i + 1 >= len(units) else units[i+1]
        if next_unit and next_unit.label_has([Unit.P_PHRASE, Unit.LIST]):
            i += 1
            continue
        
        unit_order = order_tokens(main, tokens)
        unit_sent_start = unit.sent_start()
        tokens = []
        
        if not unit_order or unit_sent_start == -1:
            i += 1
            continue

        sents_orders[unit_sent_start].append(unit_order)
        i += 1

    # 3. Remove Duplicates
    sents_orders = {k: discrete_events(v) for k, v in sents_orders.items()}

    return sents_orders

In [114]:
def discrete_events(rels):
    rel_bounds_mapped = {(rel.start(), rel.end()): rel for rel in rels}
    rel_bounds = rel_bounds_mapped.keys()    
    disc_rel_bounds = distinct_bounds(rel_bounds, larger=True)    
    disc_rels = [rel_bounds_mapped[bound] for bound in disc_rel_bounds]
    return disc_rels

In [115]:
sent_orders = order_text(main)
orders = []
for k, v in sent_orders.items():
    for r in v:
        orders.append(r)
        print(k, r)

HELLO!!!!
GMAIL, YAHOO, and OUTLOOK List
HEY VENUS
GMAIL Item
WOO

HELLO!!!!
GMAIL Item
HEY VENUS
OUTLOOK Item
WOO

HELLO!!!!
OUTLOOK Item
HEY VENUS
YAHOO Item
WOO

HELLO!!!!
YAHOO Item
HEY VENUS
to shut down Prepositional Phrase
WOO

HELLO!!!!
to shut down Prepositional Phrase
HEY VENUS
. Fragment
WOO

0 (AOL)->(caused)->(GMAIL,,,YAHOO,,,and,OUTLOOK,to,shut,down)


In [85]:
def causal_link_between_orders(X, Y):
    if X.start() < Y.start():
        l = X.order[-1].start()
        r = Y.order[0].end()
    else:
        l = Y.order[-1].start()
        r = X.order[0].end()
        
    tokens = X.main.sp_doc[l+1:r]
    print(f"tokens in between X and Y: {tokens}")
    tokens_speech = [token.pos_ for token in tokens]
    
    cc_tokens = set(X.main.cause.tokens)
    
    if "VERB" in tokens_speech or cc_tokens.intersection(tokens):
        print("ret True")
        return True
    print("ret False")
    return False

In [78]:
def merge_sentence_orders(orders):
    orders = [order.copy() for order in orders]
    orders = sorted(orders, key=lambda order: order.start())

    i = 0
    while i + 1 < len(orders):
        # print("HERE")
        a_order = orders[i]
        b_order = orders[i+1]

        a_order_is_adp = "ADP" in [token.pos_ for token in a_order.get_tokens()]

        # Merge Current and Next Order
        if a_order_is_adp:
            print(1)
            print("a_order_is_adp branch")
            print(a_order)
            print(b_order)
            print()
            print()
            a_order.order.extend([Node(b_order.main), *b_order.order])
            orders.pop(i+1)
            continue
        else:
            print(2)
            print("else branch")
            print(a_order)
            print(b_order)
            merged_order = merge(a_order, b_order)
            print(merged_order)
            print(bool(merged_order))
            print()
            print()
            if merged_order:
                orders[i] = merged_order
                orders.pop(i+1)
                continue
            elif causal_link_between_orders(a_order, b_order):
                print(3)
                a_order.order.extend([Node(b_order.main), *b_order.order])
                orders.pop(i+1)
                continue
        
        i += 1

    return orders

In [79]:
def overlap(X, Y):
    X_str = set([x.text for x in X])
    Y_str = set([y.text for y in Y])

    if X_str & Y_str:
        return True
    return False
    
def merge(X, Y):
    X = X.copy()
    Y = Y.copy()
    
    if X.start() < Y.start():
        A = X
        B = Y
    else:
        A = Y
        B = X
    
    A_tokens = flatten([item.get_expanded_tokens() for item in A.order[-2:]])
    B_tokens = flatten([item.get_expanded_tokens() for item in B.order[:1]])
    
    if not overlap(A_tokens, B_tokens):
        return None

    A.order.extend([Node(B.main), *B.order])
    return A

In [80]:
merged_sent_orders = []
for orders in sent_orders.values():
    result = merge_sentence_orders(orders)
    merged_sent_orders.extend(result)

print("merged_sent_orders")
for order in merged_sent_orders:
    print(order)
print()


merged_orders = merge_sentence_orders(merged_sent_orders)
print("merged_orders")
for order in merged_orders:
    print(order)
print()

# ret = merge_sentence_orders(orders)
# if ret:
#     for order in ret:
#         print(order)
#         print(len(order.order))

merged_sent_orders
(AOL)->(caused)->(GMAIL,,,YAHOO,,,and,OUTLOOK,to,shut,down)

merged_orders
(AOL)->(caused)->(GMAIL,,,YAHOO,,,and,OUTLOOK,to,shut,down)



In [81]:
unit_map_bounds = main.units.unit_map.keys()
unit_map_bounds = sorted(unit_map_bounds)

for bound in unit_map_bounds:
    unit = main.units.unit_map[bound]
    print(f"({unit.l}, {unit.r}) ({unit.label_()}) -> {main.sp_doc[unit.l:unit.r+1]}")

(0, 1) (Fragment) -> AOL caused
(2, 2) (Item) -> GMAIL
(2, 7) (List) -> GMAIL, YAHOO, and OUTLOOK
(4, 4) (Item) -> YAHOO
(7, 7) (Item) -> OUTLOOK
(8, 10) (Prepositional Phrase) -> to shut down
(11, 11) (Fragment) -> .
