In [1]:
import stanza
import spacy
import fastcoref
from fastcoref import FCoref
import spacy
from spacy.matcher import Matcher

In [2]:
# Fetch the default English Stanza model package; per the log output below it is
# stored under the local `stanza_resources` directory.
stanza.download("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-14 16:48:24 INFO: Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
03/14/2025 16:48:24 - INFO - 	 Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
2025-03-14 16:48:24 INFO: Downloading default packages for language: en (English) ...
03/14/2025 16:48:24 - INFO - 	 Downloading default packages for language: en (English) ...
2025-03-14 16:48:28 INFO: File exists: C:\Users\lbeln\stanza_resources\en\default.zip
03/14/2025 16:48:28 - INFO - 	 File exists: C:\Users\lbeln\stanza_resources\en\default.zip
2025-03-14 16:48:37 INFO: Finished downloading models and saved to C:\Users\lbeln\stanza_resources
03/14/2025 16:48:37 - INFO - 	 Finished downloading models and saved to C:\Users\lbeln\stanza_resources


In [3]:
# Shared English Stanza pipeline built with the default processors (listed in the
# log output below). Later cells reach it through the global name `nlp`.
nlp = stanza.Pipeline('en')

2025-03-14 16:50:22 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
03/14/2025 16:50:22 - INFO - 	 Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-14 16:50:23 INFO: Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
03/14/2025 16:50:23 - INFO - 	 Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
2025-03-14 16:50:26 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

03/14/2025 16:50:26 - INFO - 	 Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined

In [20]:
def find_dependents(id, doc, sentence=None):
    """Collect every word whose dependency head equals ``id``.

    Stanza numbers words per sentence (ids restart at 1 in each sentence), so
    an id by itself is ambiguous in a multi-sentence document: scanning the
    whole document also returns words from *other* sentences that happen to
    point at the same head index. Pass ``sentence`` to restrict the search to
    the sentence that actually contains the head word.

    Args:
        id: Sentence-local id of the head word (``word.id``).
        doc: Parsed document exposing ``.sentences``, each with ``.words``.
        sentence: Optional sentence object (with ``.words``). When given, only
            this sentence is searched; when omitted, every sentence in ``doc``
            is searched, as before.

    Returns:
        A list of the matching words, in document order (possibly empty).
    """
    sentences = doc.sentences if sentence is None else [sentence]
    return [
        word
        for sent in sentences
        for word in sent.words
        if word.head == id
    ]

def find_start_verb(doc):
    """Return the first word in the document tagged ``VERB``, or ``None``.

    Words are visited sentence by sentence, in order; the first one whose
    ``upos`` equals ``"VERB"`` wins.
    """
    every_word = (token for sent in doc.sentences for token in sent.words)
    return next((token for token in every_word if token.upos == "VERB"), None)

def find_object(doc, verb):
    """Return the first dependent of ``verb`` acting as its object.

    A dependent qualifies when its relation is ``obj`` or ``nsubj:pass``
    (the passive subject is treated as the object). Returns ``None`` when
    no dependent qualifies.
    """
    object_relations = ("obj", "nsubj:pass")
    candidates = find_dependents(verb.id, doc)
    return next(
        (child for child in candidates if child.deprel in object_relations),
        None,
    )

def find_subject(doc, verb):
    """Return the word acting as the subject of ``verb``, or ``None``.

    Direct dependents are checked first: the earliest one whose relation is
    ``nsubj`` or ``obl:agent`` is returned. If there is none, each ``advcl``
    dependent is searched recursively, in order, and the first subject found
    that way is returned.
    """
    children = find_dependents(verb.id, doc)

    direct = next(
        (child for child in children if child.deprel in ("nsubj", "obl:agent")),
        None,
    )
    if direct is not None:
        return direct

    # No direct subject: look inside adverbial clauses attached to this verb.
    for clause_head in (child for child in children if child.deprel == "advcl"):
        nested = find_subject(doc, clause_head)
        if nested:
            return nested

    return None

In [21]:
def find_noun(doc, word):
    """Resolve ``word`` to the noun or pronoun it stands for.

    If ``word`` has an ``nmod:poss`` dependent, the search continues from the
    first such dependent and its result is returned as-is. Otherwise ``word``
    itself is returned when it is tagged ``NOUN`` or ``PRON``, else ``None``.
    """
    possessor = next(
        (child for child in find_dependents(word.id, doc) if child.deprel == "nmod:poss"),
        None,
    )
    if possessor is not None:
        return find_noun(doc, possessor)

    return word if word.upos in ("NOUN", "PRON") else None

In [22]:
# Lazily created coreference model, shared by every call to same_reference().
_COREF_MODEL = None


def _get_coref_model():
    """Return the shared FCoref model, building it on first use."""
    global _COREF_MODEL
    if _COREF_MODEL is None:
        _COREF_MODEL = FCoref(enable_progress_bar=False)
    return _COREF_MODEL


def _span_within(inner, outer):
    """Return True when the (start, end) span ``inner`` lies inside ``outer``."""
    return outer[0] <= inner[0] and inner[1] <= outer[1]


def same_reference(sentence, a, b, doc=None):
    """Tell whether words ``a`` and ``b`` refer to the same entity.

    Both words are first resolved with ``find_noun``; the character span of
    each resolved word is then looked up in the coreference clusters predicted
    for ``sentence``. The words corefer only if a *single* cluster has a
    mention covering ``a`` and a mention covering ``b``.

    Args:
        sentence: Raw text that ``a`` and ``b`` were parsed from.
        a: First Stanza word.
        b: Second Stanza word.
        doc: Optional already-parsed Stanza document for ``sentence``; parsed
            with the global ``nlp`` pipeline when omitted.

    Returns:
        True if both words fall inside mentions of the same cluster, False
        otherwise (including when either word does not resolve to a noun or
        pronoun).
    """
    if doc is None:
        doc = nlp(sentence)

    noun_a = find_noun(doc, a)
    noun_b = find_noun(doc, b)

    # Bail out before touching the coreference model when there is nothing to compare.
    if not noun_a or not noun_b:
        return False

    a_pos = (noun_a.start_char, noun_a.end_char)
    b_pos = (noun_b.start_char, noun_b.end_char)

    prediction = _get_coref_model().predict(texts=[sentence])[0]
    for cluster in prediction.get_clusters(as_strings=False):
        # Membership is evaluated per cluster: hits in two different clusters
        # mean two different entities and must not count as a match.
        a_found = any(_span_within(a_pos, mention) for mention in cluster)
        b_found = any(_span_within(b_pos, mention) for mention in cluster)
        if a_found and b_found:
            return True

    return False

In [34]:
def find_relations(sentence):
    """Extract verb/object/subject relations from ``sentence``.

    Every word tagged ``VERB`` is considered. A relation is kept only when the
    verb has both an object and a subject, the two do not refer to the same
    entity, and both pass ``is_species_or_trait``.

    Returns:
        A list of dicts with the keys ``"verb"``, ``"object"`` and
        ``"subject"``, each holding a Stanza word.
    """
    doc = nlp(sentence)
    found = []

    verbs = (token for sent in doc.sentences for token in sent.words if token.upos == "VERB")
    for verb in verbs:
        obj = find_object(doc, verb)
        subj = find_subject(doc, verb)

        # Guard clauses, evaluated in the same order as the original condition.
        if not (verb and obj and subj):
            continue
        if same_reference(sentence, obj, subj, doc=doc):
            continue
        if not is_species_or_trait(sentence, obj):
            continue
        if not is_species_or_trait(sentence, subj):
            continue

        found.append({"verb": verb, "object": obj, "subject": subj})

    return found

In [24]:
# Lazily created spaCy pipeline and noun-phrase matcher, shared by every call
# to find_context().
_CONTEXT_NLP = None
_CONTEXT_MATCHER = None


def _get_context_tools():
    """Return the shared (spaCy pipeline, noun-phrase matcher) pair, building it on first use."""
    global _CONTEXT_NLP, _CONTEXT_MATCHER
    if _CONTEXT_NLP is None or _CONTEXT_MATCHER is None:
        snlp = spacy.load("en_core_web_sm")
        matcher = Matcher(snlp.vocab)

        # One or more nominal tokens, an optional adposition, then one or more
        # nominal tokens again (e.g. "the TRAIT of SPECIES2").
        pattern = [
            {"POS": {"IN": ["DET", "ADJ", "NOUN", "PROPN"]}, "OP": "+"},
            {"POS": "ADP", "OP": "?"},
            {"POS": {"IN": ["DET", "ADJ", "NOUN", "PROPN"]}, "OP": "+"}
        ]
        matcher.add("NOUN_PHRASE", [pattern])

        _CONTEXT_NLP, _CONTEXT_MATCHER = snlp, matcher
    return _CONTEXT_NLP, _CONTEXT_MATCHER


def find_context(sentence, stanza_word):
    """Return the noun-phrase span of ``sentence`` that contains ``stanza_word``.

    The sentence is re-parsed with spaCy, noun-phrase candidates are matched
    and reduced to non-overlapping spans with ``spacy.util.filter_spans``, and
    the first span holding a token whose character offsets equal those of
    ``stanza_word`` is returned.

    The spaCy model and the matcher are built once and reused, instead of
    being reloaded on every call.

    Args:
        sentence: Raw text that ``stanza_word`` was parsed from.
        stanza_word: Stanza word exposing ``start_char`` and ``end_char``.

    Returns:
        The matching spaCy span, or ``None`` when no matched phrase contains
        the word.
    """
    snlp, matcher = _get_context_tools()

    doc = snlp(sentence)
    spans = [doc[start:end] for _, start, end in matcher(doc)]
    for span in spacy.util.filter_spans(spans):
        for word in span:
            # Align the two tokenizations by exact character offsets.
            if stanza_word.start_char == word.idx and stanza_word.end_char == (word.idx + len(word)):
                return span

    return None

In [30]:
def find_species_or_trait(context_span):
    """Report which placeholder kinds occur in ``context_span``.

    Returns a two-element list ``[traits, species]``: the first entry is
    ``"TR"`` when some token's text begins with ``"TR"`` (else ``""``), the
    second is ``"SP"`` when some token's text begins with ``"SP"`` (else ``""``).
    """
    # Species/trait detection is hard-coded on the token prefix for now;
    # recognising them properly would require a fine-tuned model.
    prefixes = [token.text[0:2] for token in context_span]

    trait_marker = "TR" if "TR" in prefixes else ""
    species_marker = "SP" if "SP" in prefixes else ""
    return [trait_marker, species_marker]

In [29]:
def is_species_or_trait(sentence, stanza_word):
    """Return True when the phrase around ``stanza_word`` names a species or trait.

    The surrounding phrase comes from ``find_context``; when there is none the
    answer is False. Otherwise the answer is True if ``find_species_or_trait``
    reports a trait marker or a species marker for that phrase.
    """
    phrase = find_context(sentence, stanza_word)
    if phrase:
        markers = find_species_or_trait(phrase)
        return any(marker != "" for marker in markers[:2])
    return False

In [39]:
# Demo: extract relations from one placeholder sentence and print each relation
# together with the noun-phrase context of its object and subject.
sentence_basic = "Presence of SPECIES1 reduced the TRAIT of SPECIES2 on SPECIES3"
relations = find_relations(sentence_basic)

if relations:
    for r in relations:
        print(f"Verb: {r['verb'].text}, Object: {r['object'].text}, Subject: {r['subject'].text}")
        print(f"Object Context: \"{find_context(sentence_basic, r['object'])}\"")
        print(f"Subject Context: \"{find_context(sentence_basic, r['subject'])}\"")
else:
    print("No Relations")

03/14/2025 18:20:44 - INFO - 	 missing_keys: []
03/14/2025 18:20:44 - INFO - 	 unexpected_keys: []
03/14/2025 18:20:44 - INFO - 	 mismatched_keys: []
03/14/2025 18:20:44 - INFO - 	 error_msgs: []
03/14/2025 18:20:44 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/14/2025 18:20:44 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

03/14/2025 18:20:44 - INFO - 	 ***** Running Inference on 1 texts *****


Verb: reduced, Object: TRAIT, Subject: Presence
Object Context: "the TRAIT of SPECIES2"
Subject Context: "Presence of SPECIES1"
