In [1]:
import re
import spacy
import stanza
import textacy
from fastcoref import FCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [2]:
sp_nlp = spacy.load("en_core_web_md")
st_nlp = stanza.Pipeline(lang='en', processors='tokenize')
tn_nlp = TaxoNERD().load(model="en_ner_eco_biobert")
fcoref = FCoref(enable_progress_bar=False)

2025-04-29 12:53:28 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
04/29/2025 12:53:28 - INFO - 	 Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /stanfordnlp/stanza-resources/main/resources_1.10.0.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000025DDE81A980>: Failed to resolve 'raw.githubusercontent.com' ([Errno 11001] getaddrinfo failed)"))

In [None]:
OWNER = "owner"
OWNED = "owned"

# Pattern 1
pattern_1 = [
    {
        "RIGHT_ID": OWNED,
        "RIGHT_ATTRS": {
            "POS": {
                "IN": ["NOUN"]
            }
        }
    },
    {
        "LEFT_ID": OWNED,
        "REL_OP": ">",
        "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {
            "DEP": "poss"
        }
    }
]

# Pattern 2
pattern_2 = [
     {
        "RIGHT_ID": OWNED,
        "RIGHT_ATTRS": {
            "POS": {
                "IN": ["NOUN"]
            }
        }
    },
    {
        "LEFT_ID": OWNED,
        "REL_OP": ">",
        "RIGHT_ID": "adp",
        "RIGHT_ATTRS": {
            "DEP": "prep",
            "POS": {
                "IN": ["ADP"]
            }
        }
    },
    {
        "LEFT_ID": "adp",
        "REL_OP": ">",
        "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {
            "DEP": "pobj",
            "POS": {
                "IN": ["NOUN"]
            }
        }
    }
]

# Pattern 3
pattern_3 = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {
            "DEP": "nsubj",
            "POS": {"IN": ["PRON"]}
        }
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": OWNED,
        "RIGHT_ATTRS": {
            "DEP": "dobj",
            "POS": {"IN": ["NOUN"]}
        }
    }
]

# Pattern 4:
pattern_4 = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": OWNED,
        "RIGHT_ATTRS": {
            "DEP": "nsubj",
            "POS": {"IN": ["NOUN"]}
        }
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "adp",
        "RIGHT_ATTRS": {
            "DEP": "prep",
            "POS": {"IN": ["ADP"]}
        }
    },
    {
        "LEFT_ID": "adp",
        "REL_OP": ">",
        "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {
            "DEP": "pobj",
            "POS": {"IN": ["NOUN"]}
        }
    }
]

patterns = {
    "Pattern1": pattern_1,
    "Pattern2": pattern_2,
    "Pattern3": pattern_3,
    "Pattern4": pattern_4
}

def what_matcher(sp_nlp):
    matcher = DependencyMatcher(sp_nlp.vocab)
    for pattern_id, pattern in patterns.items():
        matcher.add(pattern_id, [pattern])
    return matcher

def what_mapping(sp_nlp, sp_doc):
    matcher = what_matcher(sp_nlp)
    what_matches = matcher(sp_doc)
    
    index_to_what_map = {}
    for match_id, token_ids in what_matches:
        pattern_id = sp_nlp.vocab.strings[match_id]
        # print(pattern_id)
        owner = None
        owned = None
        for i in range(len(token_ids)):
            right_id = patterns[pattern_id][i]["RIGHT_ID"]
            if right_id == OWNER:
                owner = sp_doc[token_ids[i]]
            if right_id == OWNED:
                owned = sp_doc[token_ids[i]]
        if owner.i not in index_to_what_map:
            index_to_what_map[owner.i] = []
        index_to_what_map[owner.i].append(owned)
        if owned.i not in index_to_what_map:
            index_to_what_map[owned.i] = []
        index_to_what_map[owned.i].append(owner)

    return index_to_what_map

In [None]:
sent = "Grasshoppers exhibited significant diet shifts from grass to herbs (Kruskal-Wallis test, P 0.01, df 3) when they were in the presence of the comparatively sedentary species (the smaller Pisaurina and the larger Hogna) compared to controls without spiders (Fig. 2)."
sent = re.sub("[\(\[].*?[\)\]]", "", sent)
sent = re.sub("\s+", " ", sent)
# print(f"Sentence: {sent}")

sp_doc = sp_nlp(sent)

def process(text):
    sp_doc = sp_nlp(text)
    # print(f"\tSentence: {text}")

    # Find Object
    root = list(sp_doc.sents)[0].root
    if root.pos_ not in ["NOUN", "PROPN"]:
        for child in root.children:
            if child.pos_ in ["NOUN", "PROPN"]:
                root = child
                break
    # print(f"\t\tRoot: {root}")

    # Find Characteristic
    what_map = what_mapping(sp_nlp, sp_doc)
    # print(what_map)

    characteristic = None
    if root.i in what_map:
        characteristic = what_map[root.i]
    # print(f"Characteristic: {characteristic}")

    # Find Cause
    cause = None
    cause_li = 0
    cause_ri = 0
    for token in sp_doc:
        if token.pos_ != "SCONJ":
            continue
        cause_li = token.i
        cause_ri = cause_li + 1
        while cause_ri < len(sp_doc) and sp_doc[cause_ri].pos_ in ["DET", "NOUN", "PRON", "PROPN", "ADP", "ADV", "ADJ", "AUX"]:
            cause_ri += 1
        cause = sp_doc[cause_li:cause_ri]
    # print(f"Cause: {cause}")

    data = {
        "object": root,
        "characteristic": characteristic,
        "cause": cause
    }
    # print(data)
    
    return data
    
def process_sentence(sp_doc, l_i, r_i):
    verb = None
    for token in sp_doc[l_i:r_i]:
        if token.pos_ == "VERB":
            verb = token
            break

    # print(f"Verb: {verb}")
    
    if verb == None:
        return process(sp_doc[l_i:r_i+1].text)
    else:
        l_data = process_sentence(sp_doc, l_i, verb.i - 1)
        r_data = process_sentence(sp_doc, verb.i + 1, r_i)
        
        return {
            "left": l_data,
            "right": r_data
        }

process_sentence(sp_doc, 0, len(sp_doc))

In [None]:
class SentenceParser:
    def __init__(self, name, breed):
        self.what_map = {}
        self.refs_map = {}
    
    

In [None]:
from pprint import pprint
pprint(process_sentence(sp_doc, 0, len(sp_doc)), indent=4)

In [None]:
!pip uninstall nltk