In [1]:
import stanza
import spacy
import fastcoref
from fastcoref import FCoref
import spacy
from spacy.matcher import Matcher

In [31]:
# Show the installed spaCy version so the environment is recorded in the cell output.
print(spacy.__version__)

3.8.2


In [2]:
# Fetch stanza's default English resources; per the log output they are
# saved under the user's stanza_resources directory.
stanza.download("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-25 17:04:51 INFO: Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
03/25/2025 17:04:51 - INFO - 	 Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
2025-03-25 17:04:51 INFO: Downloading default packages for language: en (English) ...
03/25/2025 17:04:51 - INFO - 	 Downloading default packages for language: en (English) ...
2025-03-25 17:04:53 INFO: File exists: C:\Users\lbeln\stanza_resources\en\default.zip
03/25/2025 17:04:53 - INFO - 	 File exists: C:\Users\lbeln\stanza_resources\en\default.zip
2025-03-25 17:04:58 INFO: Finished downloading models and saved to C:\Users\lbeln\stanza_resources
03/25/2025 17:04:58 - INFO - 	 Finished downloading models and saved to C:\Users\lbeln\stanza_resources


In [3]:
# Global English stanza pipeline; the helper functions in later cells read it as `nlp`.
# The load log lists the tokenize, mwt, pos, lemma, constituency, depparse,
# sentiment and ner processors for this default configuration.
nlp = stanza.Pipeline('en')

2025-03-25 17:04:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
03/25/2025 17:04:59 - INFO - 	 Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-25 17:04:59 INFO: Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
03/25/2025 17:04:59 - INFO - 	 Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
2025-03-25 17:05:01 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

03/25/2025 17:05:01 - INFO - 	 Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined

In [4]:
def find_dependents(id, doc):
    """Collect every word in ``doc`` whose ``head`` equals ``id``.

    All sentences of ``doc`` are scanned in order, so the result keeps
    document order.

    NOTE(review): the match is on the bare ``head`` value and is not limited
    to a single sentence. If word ids restart in every sentence (as they
    appear to in stanza), a multi-sentence ``doc`` can also return words from
    unrelated sentences -- confirm before relying on multi-sentence input.

    Args:
        id: Value compared against each word's ``head``.
        doc: Parsed document exposing ``sentences``, each with ``words``.

    Returns:
        list: The matching word objects (empty when nothing matches).
    """
    return [
        token
        for sent in doc.sentences
        for token in sent.words
        if token.head == id
    ]

def find_start_verb(doc):
    """Return the first word tagged ``VERB`` in document order.

    Args:
        doc: Parsed document exposing ``sentences``, each with ``words``
            whose ``upos`` attribute holds the part-of-speech tag.

    Returns:
        The first word whose ``upos`` is ``"VERB"``, or ``None`` if the
        document contains no such word.
    """
    verbs = (
        token
        for sent in doc.sentences
        for token in sent.words
        if token.upos == "VERB"
    )
    return next(verbs, None)

def find_object(doc, verb):
    """Return the first dependent of ``verb`` acting as its object.

    A dependent qualifies when its ``deprel`` is ``"obj"`` or
    ``"nsubj:pass"`` (the passive subject is treated as the object).

    Args:
        doc: Parsed document searched for dependents.
        verb: Word whose ``id`` identifies the governing verb.

    Returns:
        The first qualifying dependent, or ``None`` when there is none.
    """
    object_relations = ("obj", "nsubj:pass")
    candidates = (
        dependent
        for dependent in find_dependents(verb.id, doc)
        if dependent.deprel in object_relations
    )
    return next(candidates, None)

def find_subject(doc, verb):
    """Return the subject of ``verb``, looking into adverbial clauses if needed.

    Direct dependents are checked first: the first one whose ``deprel`` is
    ``"nsubj"`` or ``"obl:agent"`` wins. If none exists, each ``"advcl"``
    dependent is searched recursively and the first subject found is returned.

    Args:
        doc: Parsed document searched for dependents.
        verb: Word whose ``id`` identifies the governing verb.

    Returns:
        The subject word, or ``None`` when no subject can be found.
    """
    children = find_dependents(verb.id, doc)

    direct = next(
        (child for child in children if child.deprel in ("nsubj", "obl:agent")),
        None,
    )
    if direct is not None:
        return direct

    # No direct subject: fall back to the subject of an adverbial clause.
    for child in children:
        if child.deprel != "advcl":
            continue
        inherited = find_subject(doc, child)
        if inherited:
            return inherited

    return None

In [5]:
def find_noun(doc, word):
    """Resolve ``word`` to the noun or pronoun that should represent it.

    If ``word`` has a possessive modifier (a dependent with ``deprel``
    ``"nmod:poss"``), the possessor is resolved recursively and returned
    instead. Otherwise ``word`` itself is returned when it is a ``NOUN`` or
    ``PRON``.

    Args:
        doc: Parsed document searched for dependents of ``word``.
        word: Word to resolve. ``None`` is accepted because callers may pass
            on a missing object or subject.

    Returns:
        The resolved word, or ``None`` when ``word`` is ``None`` or no
        noun/pronoun can be resolved.
    """
    # A verb may have no object or subject; treat that as "nothing to
    # resolve" instead of failing on ``word.id``.
    if word is None:
        return None

    for dependent in find_dependents(word.id, doc):
        if dependent.deprel == "nmod:poss":
            return find_noun(doc, dependent)
    if word.upos in ("NOUN", "PRON"):
        return word
    return None

In [6]:
# Lazily-built coreference model shared by all calls to same_reference().
_COREF_MODEL = None


def _get_coref_model():
    """Build the FCoref model on first use and reuse it afterwards."""
    global _COREF_MODEL
    if _COREF_MODEL is None:
        _COREF_MODEL = FCoref(enable_progress_bar=False)
    return _COREF_MODEL


def _span_within(span, mention):
    """Return True when both ends of ``span`` lie inside ``mention`` (inclusive)."""
    start, end = span
    return mention[0] <= start <= mention[1] and mention[0] <= end <= mention[1]


def same_reference(sentence, a, b, doc=None):
    """Tell whether words ``a`` and ``b`` belong to the same coreference cluster.

    Both words are first resolved with ``find_noun``; their character spans
    are then compared against the mention spans of every cluster predicted
    for ``sentence``. The words corefer when a single cluster has a mention
    containing ``a`` and a mention containing ``b``.

    Args:
        sentence: Raw text handed to the coreference model.
        a: First word (must expose ``start_char``/``end_char`` once resolved).
        b: Second word, same requirements as ``a``.
        doc: Optional pre-parsed document for ``sentence``; parsed with the
            global ``nlp`` pipeline when omitted.

    Returns:
        bool: ``True`` if both words fall inside mentions of one cluster,
        ``False`` otherwise (including when either word cannot be resolved
        to a noun or pronoun).
    """
    if doc is None:
        doc = nlp(sentence)

    noun_a = find_noun(doc, a)
    noun_b = find_noun(doc, b)
    if not noun_a or not noun_b:
        # Nothing to compare; also avoids loading the model needlessly.
        return False

    a_pos = (noun_a.start_char, noun_a.end_char)
    b_pos = (noun_b.start_char, noun_b.end_char)

    # The model is cached: constructing it on every call reloads all weights.
    model = _get_coref_model()
    clusters = model.predict(texts=[sentence])[0].get_clusters(as_strings=False)

    for cluster in clusters:
        # Membership is evaluated per cluster. Carrying the flags over from a
        # previous cluster would report a match for words that sit in two
        # different clusters.
        a_found = any(_span_within(a_pos, mention) for mention in cluster)
        b_found = any(_span_within(b_pos, mention) for mention in cluster)
        if a_found and b_found:
            return True

    return False

In [20]:
def find_relations(sentence):
    """Extract subject-verb-object relations between species/trait mentions.

    Every word tagged ``VERB`` is examined. A relation is kept when the verb
    has both an object and a subject, the two do not corefer, and both are
    recognised as a species or trait by ``is_species_or_trait``.

    Args:
        sentence: Raw text to analyse; parsed with the global ``nlp`` pipeline.

    Returns:
        list[dict]: One dict per relation with the keys ``"verb"``,
        ``"object"`` and ``"subject"``, each holding the corresponding word.
    """
    relations = []
    doc = nlp(sentence)
    for sent in doc.sentences:
        for word in sent.words:
            print(f"Word: {word.text}, {word.upos}")
            if word.upos != "VERB":
                continue

            verb = word
            obj = find_object(doc, verb)
            subject = find_subject(doc, verb)
            # A verb without an object or a subject cannot form a relation.
            # Skipping here also keeps ``None`` away from the helpers below,
            # which read attributes of the word they receive.
            if obj is None or subject is None:
                continue

            same_ref = same_reference(sentence, obj, subject, doc=doc)
            rel_obj = is_species_or_trait(sentence, obj)
            rel_sub = is_species_or_trait(sentence, subject)
            if not same_ref and rel_obj and rel_sub:
                relations.append({"verb": verb, "object": obj, "subject": subject})
    return relations

In [34]:
# Lazily-built (spaCy pipeline, Matcher) pair shared by all find_context() calls.
_CONTEXT_PIPELINE = None


def _get_context_pipeline():
    """Load the spaCy model and build the noun-phrase matcher once, then reuse them."""
    global _CONTEXT_PIPELINE
    if _CONTEXT_PIPELINE is None:
        snlp = spacy.load("en_core_web_sm")
        matcher = Matcher(snlp.vocab)
        # One or more nominal tokens, an optional adposition, then one or
        # more nominal tokens (e.g. "the TRAIT of SPECIES_X").
        pattern = [
            {"POS": {"IN": ["DET", "ADJ", "NOUN", "PROPN"]}, "OP": "+"},
            {"POS": "ADP", "OP": "?"},
            {"POS": {"IN": ["DET", "ADJ", "NOUN", "PROPN"]}, "OP": "+"},
        ]
        matcher.add("NOUN_PHRASE", [pattern])
        _CONTEXT_PIPELINE = (snlp, matcher)
    return _CONTEXT_PIPELINE


def find_context(sentence, stanza_word):
    """Return the noun-phrase span of ``sentence`` that contains ``stanza_word``.

    ``sentence`` is processed with spaCy and matched against a noun-phrase
    pattern; overlapping matches are reduced with ``spacy.util.filter_spans``.
    The first remaining span holding a token whose character offsets equal
    those of ``stanza_word`` is returned.

    Args:
        sentence: Raw text to search.
        stanza_word: Word exposing ``start_char`` and ``end_char`` offsets
            into ``sentence``.

    Returns:
        The matching spaCy span, or ``None`` when no matched phrase contains
        the word.
    """
    # Loading the model and compiling the matcher is expensive and does not
    # depend on the input, so both are created once and cached.
    snlp, matcher = _get_context_pipeline()

    doc = snlp(sentence)
    spans = [doc[start:end] for _, start, end in matcher(doc)]
    print(spans)
    for span in spacy.util.filter_spans(spans):
        for token in span:
            token_end = token.idx + len(token)
            if stanza_word.start_char == token.idx and stanza_word.end_char == token_end:
                return span

    return None

In [9]:
def find_species_or_trait(context_span):
    """Report whether a span mentions a trait and/or a species placeholder.

    Detection is hard-coded: a token counts as a trait when its text starts
    with ``"TR"`` and as a species when it starts with ``"SP"``. Recognising
    real species names would need a fine-tuned model.

    Args:
        context_span: Iterable of tokens exposing a ``text`` attribute.

    Returns:
        list: ``[trait_marker, species_marker]`` where each entry is the
        marker string (``"TR"`` / ``"SP"``) when present and ``""`` otherwise.
    """
    prefixes = [token.text[:2] for token in context_span]
    trait_marker = "TR" if "TR" in prefixes else ""
    species_marker = "SP" if "SP" in prefixes else ""
    return [trait_marker, species_marker]

In [10]:
def is_species_or_trait(sentence, stanza_word):
    """Decide whether ``stanza_word`` refers to a species or a trait.

    The word's noun-phrase context is looked up with ``find_context``. When a
    context exists, the word qualifies if any token in it carries a trait or
    species marker. Without a context, the word's own text is checked for the
    ``"SP"`` / ``"TR"`` prefix.

    Args:
        sentence: Raw text containing the word.
        stanza_word: Word exposing ``text`` plus the offsets ``find_context``
            needs.

    Returns:
        bool: ``True`` when the word or its context is marked as a species
        or trait.
    """
    context = find_context(sentence, stanza_word)
    print(f"Context of '{stanza_word.text}': {context}")
    if not context:
        # No phrase matched around the word: judge the bare word by its
        # prefix and log the value that is actually returned.
        fallback = stanza_word.text[:2] == "SP" or stanza_word.text[:2] == "TR"
        print(f"No context for '{stanza_word.text}'; prefix check returned {fallback}")
        return fallback
    st_context = find_species_or_trait(context)
    return st_context[0] != "" or st_context[1] != ""

In [38]:
# Demo: run relation extraction on one hand-written sentence and print each
# relation together with the noun-phrase context of its subject and object.
sentence_basic = "Presence of nonlethal tigers reduced the TRAIT of SPECIES_TRAMMEA on SPECIES_SMALL_GREEN_FROGS"
print(f"Sentence: \"{sentence_basic}\"")
relations = find_relations(sentence_basic)

if relations:
    for r in relations:
        # Each context lookup happens right before its own line is printed,
        # so find_context's debug output keeps its position in the log.
        print(f"Subject: \"{r['subject'].text}\"")
        subject_context = find_context(sentence_basic, r['subject'])
        print(f"Subject Context: \"{subject_context}\"\n")
        print(f"Verb: \"{r['verb'].text}\"\n")
        print(f"Object: \"{r['object'].text}\"")
        object_context = find_context(sentence_basic, r['object'])
        print(f"Object Context: \"{object_context}\"")
else:
    print("No Relations")

Sentence: "Presence of nonlethal tigers reduced the TRAIT of SPECIES_TRAMMEA on SPECIES_SMALL_GREEN_FROGS"
Word: Presence, NOUN
Word: of, ADP
Word: nonlethal, ADJ
Word: tigers, NOUN
Word: reduced, VERB


03/25/2025 17:55:53 - INFO - 	 missing_keys: []
03/25/2025 17:55:53 - INFO - 	 unexpected_keys: []
03/25/2025 17:55:53 - INFO - 	 mismatched_keys: []
03/25/2025 17:55:53 - INFO - 	 error_msgs: []
03/25/2025 17:55:53 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/25/2025 17:55:53 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

03/25/2025 17:55:54 - INFO - 	 ***** Running Inference on 1 texts *****


[Presence of nonlethal, Presence of nonlethal tigers, nonlethal tigers, the TRAIT, TRAIT of SPECIES_TRAMMEA, the TRAIT of SPECIES_TRAMMEA, SPECIES_TRAMMEA on SPECIES_SMALL_GREEN_FROGS]
Context of 'TRAIT': the TRAIT of SPECIES_TRAMMEA
[Presence of nonlethal, Presence of nonlethal tigers, nonlethal tigers, the TRAIT, TRAIT of SPECIES_TRAMMEA, the TRAIT of SPECIES_TRAMMEA, SPECIES_TRAMMEA on SPECIES_SMALL_GREEN_FROGS]
Context of 'Presence': Presence of nonlethal tigers
Word: the, DET
Word: TRAIT, NOUN
Word: of, ADP
Word: SPECIES_TRAMMEA, PROPN
Word: on, ADP
Word: SPECIES_SMALL_GREEN_FROGS, PROPN
No Relations


In [27]:
# Inspect how spaCy's small English model tags each token of `sentence_basic`;
# these POS tags are what the noun-phrase pattern in find_context matches on.
# NOTE(review): the saved output of this cell lists tokens (e.g. SPECIES_ANAX)
# that do not occur in `sentence_basic` as assigned in this notebook, so it was
# presumably produced by an earlier run with a different sentence -- re-run to refresh.
snlp = spacy.load('en_core_web_sm')
doc = snlp(sentence_basic)

# One line per token: surface text followed by its coarse POS tag.
for word in doc:
    print(word, word.pos_)

( PUNCT
2 X
) PUNCT
Presence PROPN
of ADP
nonlethal PROPN
SPECIES_ANAX PRON
reduced VERB
the DET
TRAIT PROPN
of ADP
SPECIES_TRAMEA PROPN
on ADP
SPECIES_SMALL_GREEN_FROGS ADV
. PUNCT
