In [None]:
import re
import csv
import sys
import json
import math
import spacy
import textacy
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from taxonerd import TaxoNERD
from fastcoref import spacy_component
from spacy.matcher import Matcher, DependencyMatcher, PhraseMatcher
%run "./Main.ipynb"

In [33]:
class Colors:
    """ANSI escape sequences used to style text printed to the cell output."""

    # Reset: terminates any styling started by one of the codes below.
    ENDC = "\033[0m"

    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # Bright foreground colours (SGR codes 91-96).
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    HEADER = "\033[95m"
    CYAN = "\033[96m"

In [None]:
# Load the dataset and pick the record whose abstract is analysed below.
df = pd.read_csv("../Week 16/Datasets/Baseline-1.csv")

i = 0
title, abstract = df.loc[i, "Title"], df.loc[i, "Abstract"]

nlp = spacy.load("en_core_web_trf")
doc = nlp(abstract)

# Title
print(f"{Colors.BOLD}{title}{Colors.ENDC}")

# Abstract: gather the tokens of every subject-verb-object triple by role.
verbs, objects, subjects = [], [], []
for triple in textacy.extract.subject_verb_object_triples(doc):
    verbs += triple.verb
    objects += triple.object
    subjects += triple.subject

for token in doc:
    # Role priority when a token has several roles: verb > subject > object.
    if token in verbs:
        color = Colors.RED
    elif token in subjects:
        color = Colors.BLUE
    elif token in objects:
        color = Colors.YELLOW
    else:
        color = Colors.ENDC

    # Decide whether a space is printed after the token.
    if token.sent.end == token.i + 1:
        # Last token of its sentence: always followed by a space.
        spacer = " "
    elif token.nbor() and token.nbor().text in (".", "?", "!", ";", ")", ",", "]"):
        # Next token is closing punctuation: keep it attached to this token.
        spacer = ""
    elif token.sent.start == token.i or token.text not in ("(", "["):
        spacer = " "
    else:
        # Opening bracket inside a sentence: keep the next token attached.
        spacer = ""

    print(f"{color}{token.text}{spacer}", end=Colors.ENDC)

In [12]:
# Build the analysis object and hand it the abstract selected in the earlier cell.
# `Main` is not defined in the cells of this notebook -- presumably it is provided by
# the `%run "./Main.ipynb"` in the import cell; confirm there what update_text() does.
# Later cells read main.sp_doc, main.coref_map, main.parts, main.noun_chunk_map and
# main.ent_map from this object.
main = Main()
main.update_text(abstract)

  model.load_state_dict(torch.load(filelike, map_location=device))
08/27/2025 16:18:11 - INFO - 	 missing_keys: []
08/27/2025 16:18:11 - INFO - 	 unexpected_keys: []
08/27/2025 16:18:11 - INFO - 	 mismatched_keys: []
08/27/2025 16:18:11 - INFO - 	 error_msgs: []
08/27/2025 16:18:11 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M
08/27/2025 16:18:53 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00,  4.33 examples/s]
08/27/2025 16:19:02 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:16<00:00, 16.95s/it]


In [54]:
class Node:
    """One element of an ``Order``: a group of tokens plus their expansion.

    Attributes are plain lists filled in by the caller after construction:
    ``tokens`` holds the node's own tokens and ``expanded_tokens`` holds the
    additional tokens they were expanded to. ``is_action`` flags verb nodes.
    """

    def __init__(self, main, is_action=False):
        self.main = main
        self.is_action = is_action
        self.tokens = []
        self.expanded_tokens = []

    def _all_tokens(self):
        """Return the union of ``tokens`` and ``expanded_tokens`` as a set."""
        return {*self.tokens, *self.expanded_tokens}

    def traits(self):
        """Return the node's tokens that also appear in ``main.trait.tokens``."""
        return self._all_tokens() & set(self.main.trait.tokens)

    def species(self):
        """Return the node's tokens that also appear in ``main.species.tokens``."""
        return self._all_tokens() & set(self.main.species.tokens)

    def start(self):
        """Return the smallest ``token.i`` in ``tokens`` (``math.inf`` if empty)."""
        return min((token.i for token in self.tokens), default=math.inf)

    def end(self):
        """Return the largest ``token.i`` in ``tokens`` (``-math.inf`` if empty)."""
        return max((token.i for token in self.tokens), default=-math.inf)

    def __str__(self):
        """Return the texts of all (own + expanded) tokens, in document order, comma-joined."""
        in_doc_order = sorted(self._all_tokens(), key=lambda token: token.i)
        return ",".join(token.text for token in in_doc_order)

class Order:
    """An ordered chain of items (e.g. subject -> verb -> object nodes).

    ``order`` is a plain list filled in by the caller. Each item must provide
    ``start()`` and ``end()`` and be convertible to ``str``.
    """

    def __init__(self, main):
        self.main = main
        self.order = []

    def start(self):
        """Return the smallest ``start()`` among the items (``math.inf`` if empty)."""
        return min((item.start() for item in self.order), default=math.inf)

    def end(self):
        """Return the largest ``end()`` among the items (``-math.inf`` if empty).

        Uses each item's ``end()``; taking the maximum of ``start()`` values
        would under-report the extent of the last item.
        """
        return max((item.end() for item in self.order), default=-math.inf)

    def __str__(self):
        """Render the chain as ``(item)->(item)->...``; empty string if no items."""
        return "->".join(f"({item})" for item in self.order)

In [86]:
def expand_token(main, sent_i, token):
    """Expand ``token`` into the tokens it stands for.

    Steps:
      1. A pronoun (``pos_ == "PRON"``) is replaced by its entry in
         ``main.coref_map``; an unmapped pronoun is kept as-is.
      2. If one of the resulting tokens lies inside an entity of
         ``main.parts.reg[sent_i]`` labelled ``Entity.LIST``, the working list
         is replaced by all tokens of that entity's span in ``main.sp_doc``.
      3. Every token left after step 2 is extended with its entries in
         ``main.noun_chunk_map`` and ``main.ent_map`` (tokens added in this
         step are not expanded again).

    Args:
        main: object exposing ``coref_map``, ``parts.reg``, ``sp_doc``,
            ``noun_chunk_map`` and ``ent_map``.
        sent_i: index into ``main.parts.reg`` selecting the sentence's entities.
        token: the token to expand; must provide ``pos_`` and ``i``.

    Returns:
        A list of the unique expanded tokens. The list is built from a set, so
        its order is unspecified.
    """
    expanded_token = [token]

    if token.pos_ == "PRON":
        expanded_token = [*main.coref_map.get(token, [token])]

    # Step 2: replace by the enclosing LIST entity's tokens, if any.
    # The number of passes is fixed up front, but the list may be swapped for a
    # shorter one inside the loop, so the index is re-checked on every pass to
    # avoid running past the end of the replacement.
    for idx in range(len(expanded_token)):
        if idx >= len(expanded_token):
            break
        exp_token = expanded_token[idx]
        for ent_pos, ent in main.parts.reg[sent_i].items():
            # ent_pos is an inclusive (first, last) pair of document indices.
            if ent_pos[0] <= exp_token.i <= ent_pos[1] and ent.label in [Entity.LIST]:
                expanded_token = [*main.sp_doc[ent_pos[0]:ent_pos[1]+1]]
                break

    # Step 3: iterate over a snapshot so that only the tokens present before
    # this step are looked up, while the additions go to the working list.
    for exp_token in list(expanded_token):
        if exp_token in main.noun_chunk_map:
            expanded_token.extend(main.noun_chunk_map[exp_token])

        if exp_token in main.ent_map:
            expanded_token.extend(main.ent_map[exp_token])

    # De-duplicate.
    return list(set(expanded_token))

def order_tokens(main, sent_i, tokens):
    """Split ``tokens`` around their first verb into a subject -> verb -> object ``Order``.

    The tokens are sorted by document position, the first ``VERB`` is taken as
    the action, and the document tokens before / after it become the two
    argument nodes. Several intermediate values are printed as debug output.

    Args:
        main: object exposing ``sp_doc``, ``cause.tokens`` and ``change.tokens``
            (and whatever ``expand_token`` reads).
        sent_i: sentence index, forwarded to ``expand_token``.
        tokens: tokens to analyse; each must provide ``i`` and ``pos_``.

    Returns:
        An ``Order`` whose ``order`` is ``[subject_node, verb_node, object_node]``,
        or ``None`` when there are two or fewer tokens, there is no verb, the
        verb is the first token, no noun-like token follows the verb, or any of
        the three nodes ends up without tokens.
    """
    # Too few tokens to hold a subject, a verb and an object.
    if len(tokens) <= 2:
        return None
    
    # Sort Tokens by Position in Doc
    tokens = sorted(tokens, key=lambda token: token.i)
    print(tokens)
    
    verbs = [token for token in tokens if token.pos_ == "VERB"]
    if not verbs:
        # Bare return: yields None like the other early exits.
        return

    # Sort Verbs by Token Position in Doc
    verbs = sorted(verbs, key=lambda token: token.i)
    verb = verbs[0]

    # Nothing precedes the verb, so there is no left-hand argument.
    if verb == tokens[0]:
        return None

    # Partition Tokens in Doc (L): document index of the first token.
    l = tokens[0].i

    # Partition Tokens in Doc (R): advance to the first noun-like token
    # (PROPN / NOUN / PRON) that follows the verb in `tokens`.
    i = tokens.index(verb) + 1
    while i < len(tokens) and tokens[i].pos_ not in ["PROPN", "NOUN", "PRON"]:
        i += 1
    
    # NOTE(review): `i` starts at index(verb) + 1, so `i <= 0` can never hold;
    # only the `i >= len(tokens)` half (no noun-like token found) is reachable.
    if i <= 0 or i >= len(tokens):
        return None
    
    # Document index of that noun-like token; it closes the right-hand span.
    r = tokens[i].i
    print(r)

    # Swap Subj. and Obj. when the verb is directly preceded by an AUX token or
    # directly followed by "by" -- presumably a passive-voice heuristic.
    # NOTE(review): any AUX before the verb triggers the swap, which would also
    # cover non-passive forms (e.g. "had settled") -- confirm this is intended.
    aux = verb.nbor(-1) and verb.nbor(-1).pos_ == "AUX"
    adp = verb.nbor(1) and verb.nbor(1).lower_ == "by"
    swap = aux or adp

    # Creating Nodes
    v_node = Node(main, is_action=True)
    v_node.tokens = [verb]

    # Left node: every document token from `l` up to (excluding) the verb.
    # This reads main.sp_doc directly, so tokens absent from `tokens` but
    # lying inside that range are included as well.
    a_node = Node(main)
    a_node.tokens = [main.sp_doc[i] for i in range(l, verb.i)]
    if not a_node.tokens:
        return None
    print(a_node.tokens)

    # Right node: every document token after the verb up to and including `r`.
    b_node = Node(main)
    b_node.tokens = [main.sp_doc[i] for i in range(verb.i+1, r+1)]
    if not b_node.tokens:
        return None
    print(b_node.tokens)

    # s_node / o_node are aliases of a_node / b_node, not copies.
    if swap:
        s_node = b_node
        o_node = a_node
    else:
        s_node = a_node
        o_node = b_node

    # Subject Tokens Transferred to Object: VERB tokens found in the subject
    # node are removed from it and appended to the object node.
    sub_transfer_tokens = []
    for token in s_node.tokens:
        if token.pos_ == "VERB":
            sub_transfer_tokens.append(token)
    s_node.tokens = [tkn for tkn in s_node.tokens if tkn not in sub_transfer_tokens]
    o_node.tokens.extend(sub_transfer_tokens)

    # Object Tokens Transferred to Verb: object tokens listed in
    # main.cause.tokens or main.change.tokens are moved to the verb node.
    obj_transfer_tokens = []
    for token in o_node.tokens:
        if token in main.cause.tokens or token in main.change.tokens:
            obj_transfer_tokens.append(token)
    o_node.tokens = [tkn for tkn in o_node.tokens if tkn not in obj_transfer_tokens]
    v_node.tokens.extend(obj_transfer_tokens)

    # Expand Tokens in Subject and Object (`flatten` is defined outside this
    # cell; presumably it merges the per-token lists into one -- verify).
    s_expanded_tokens = flatten([expand_token(main, sent_i, tkn) for tkn in s_node.tokens])
    s_node.expanded_tokens = s_expanded_tokens
    
    o_expanded_tokens = flatten([expand_token(main, sent_i, tkn) for tkn in o_node.tokens])
    o_node.expanded_tokens = o_expanded_tokens

    # Create Order S -> V -> O, unless the transfers above emptied a node.
    if (
        not s_node.tokens or 
        not v_node.tokens or 
        not o_node.tokens
    ):
        return None
    
    order = Order(main)
    order.order = [s_node, v_node, o_node]
    return order

def order_entity(main, sent_i, ent):
    """Run ``order_tokens`` on the document tokens covered by ``ent``.

    ``ent`` must belong to ``main.sp_doc`` (asserted) and expose ``l`` / ``r``,
    the inclusive first and last document indices of the entity. The collected
    tokens are printed as debug output before being ordered.

    Returns whatever ``order_tokens`` returns for those tokens.
    """
    assert ent.doc == main.sp_doc
    first, last = ent.l, ent.r
    span_tokens = [main.sp_doc[pos] for pos in range(first, last + 1)]
    print(span_tokens)
    return order_tokens(main, sent_i, span_tokens)

def order_document(main, debug_entity_start=219):
    """Build a subject -> verb -> object ``Order`` for every SVO triple in ``main.sp_doc``.

    Triples are extracted with textacy from ``main.sp_doc`` itself, so every
    token index used below refers to the same document as ``main.sp_doc``,
    ``main.parts.reg`` and the maps read by ``expand_token``. The function does
    not depend on any notebook-level ``doc`` variable.

    Args:
        main: object exposing ``sp_doc`` and ``parts.reg`` (plus whatever
            ``Node``, ``expand_token`` and ``order_entity`` read from it).
        debug_entity_start: document index used for debug tracing. For every
            entity in ``main.parts.reg[sent_i]`` whose range starts at this
            index, the entity text and the result of ``order_entity`` are
            printed. Defaults to 219, the value that used to be hard-coded;
            pass ``None`` to disable the tracing. The traced orders are not
            added to the result.

    Returns:
        A dict mapping each sentence's start token index to the list of
        ``Order`` objects built from the triples whose verb is in that sentence.
    """
    sents = list(main.sp_doc.sents)
    sents_orders = {sent.start: [] for sent in sents}
    sents_triples = {sent.start: [] for sent in sents}

    # Subject-Verb-Object triples, grouped by the sentence holding the verb.
    for triple in textacy.extract.subject_verb_object_triples(main.sp_doc):
        sents_triples[triple.verb[0].sent.start].append(triple)

    for sent_i, sent in enumerate(sents):
        # Sentence SVOs
        for triple in sents_triples[sent.start]:
            # Document indices of the first and last verb token.
            l_m = triple.verb[0].i
            r_m = triple.verb[-1].i

            v_node = Node(main, is_action=True)
            v_node.tokens = [*triple.verb]

            # We assume that the verb is between the subject and object, so
            # the outermost subject/object tokens bound the two argument spans.
            svo_tokens = sorted(
                [*triple.subject, *triple.object],
                key=lambda token: token.i,
            )
            l = svo_tokens[0].i
            r = svo_tokens[-1].i

            # Left span: from the leftmost argument token up to the verb.
            a_node = Node(main)
            a_node.tokens = [main.sp_doc[i] for i in range(l, l_m)]
            a_node.expanded_tokens = flatten(
                [expand_token(main, sent_i, tkn) for tkn in a_node.tokens]
            )

            # Right span: from after the verb through the rightmost argument token.
            b_node = Node(main)
            b_node.tokens = [main.sp_doc[i] for i in range(r_m + 1, r + 1)]
            b_node.expanded_tokens = flatten(
                [expand_token(main, sent_i, tkn) for tkn in b_node.tokens]
            )

            # TODO: decide when subject and object must be swapped; until then
            # the left span is always the subject and the right span the object.
            s_node, o_node = a_node, b_node

            order = Order(main)
            order.order = [s_node, v_node, o_node]
            sents_orders[sent.start].append(order)

            # TODO: also order the subject and object spans themselves with
            # order_tokens() and add the resulting orders to sents_orders.

        # Debug tracing of a single entity, selected by its start index.
        # TODO: append the entity orders to sents_orders once they are reliable.
        if debug_entity_start is not None:
            for entity_range, entity in list(main.parts.reg[sent_i].items()):
                if entity_range[0] == debug_entity_start:
                    print(entity.lower())
                    e_order = order_entity(main, sent_i, entity)
                    if e_order:
                        print(e_order)

    return sents_orders

In [87]:
# Run the ordering over the whole document. `ret` maps each sentence's start token
# index to the list of Order objects built for that sentence; the text printed
# below the cell is debug output emitted while the orders are built.
ret = order_document(main)

that had settled on or between barnacles to remain in the community.
[that, had, settled, on, or, between, barnacles, to, remain, in, the, community, .]
[that, had, settled, on, or, between, barnacles, to, remain, in, the, community, .]
225
[that, had]
[on, or, between, barnacles]
(that,had,settled,on,or,between,barnacles,to,remain)->(settled)->(that,had,settled,on,or,between,barnacles,to,remain)
that had settled on or between barnacles to remain
[that, had, settled, on, or, between, barnacles, to, remain]
[that, had, settled, on, or, between, barnacles, to, remain]
225
[that, had]
[on, or, between, barnacles]
(that,had,settled,on,or,between,barnacles,to,remain)->(settled)->(that,had,settled,on,or,between,barnacles,to,remain)
that had settled
[that, had, settled]
[that, had, settled]
