In [22]:
import spacy
import stanza
import textacy
from fastcoref import FCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher
# !pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_md-1.1.0.tar.gz
# !pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_biobert-1.1.0.tar.gz
# !pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_md_weak-1.1.0.tar.gz
# !pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_biobert_weak-1.1.0.tar.gz

In [23]:
# spaCy English pipeline; parse() runs it on the input text and the
# dependency matcher is built on its vocab.
sp_nlp = spacy.load("en_core_web_sm")
# Stanza English pipeline restricted to the "tokenize" processor.
# NOTE(review): st_nlp is not referenced by any other visible cell --
# confirm it is still needed before keeping the (slow) load.
st_nlp = stanza.Pipeline(lang='en', processors='tokenize')
# TaxoNERD model; parse() treats the entities it returns (tn_doc.ents)
# as species mentions.
tn_nlp = TaxoNERD().load(model="en_ner_eco_biobert")
# fastcoref coreference model with its progress bar disabled.
fcoref = FCoref(enable_progress_bar=False)

2025-04-28 16:19:26 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
04/28/2025 16:19:26 - INFO - 	 Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 2.56MB/s]                    
2025-04-28 16:19:28 INFO: Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
04/28/2025 16:19:28 - INFO - 	 Downloaded file to C:\Users\lbeln\stanza_resources\resources.json
2025-04-28 16:19:28 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

04/28/2025 16:19:28 - INFO - 	 Loading these models for

In [None]:
# !pip freeze > requirements1.txt
# !pip install nltk
# import nltk
# nltk.download('wordnet')

In [24]:
# Dependency patterns used to extract possessive ("owner" / "owned")
# relationships from a sentence. The patterns are simply numbered because
# there are no established names for them, and OWNER / OWNED below are
# likewise working names for the two roles captured by each pattern.
# Node ids shared by every pattern: the possessor and the thing possessed.
OWNER = "owner"
OWNED = "owned"


def _anchor(node_id, pos):
    """Build the root node of a pattern: a token whose POS tag is `pos`."""
    return {"RIGHT_ID": node_id, "RIGHT_ATTRS": {"POS": {"IN": [pos]}}}


def _child(head_id, node_id, dep, pos=None):
    """Build a node attached to `head_id` through the ">" operator.

    The node must carry dependency label `dep` and, when `pos` is given,
    that POS tag as well.
    """
    attrs = {"DEP": dep}
    if pos is not None:
        attrs["POS"] = {"IN": [pos]}
    return {
        "LEFT_ID": head_id,
        "REL_OP": ">",
        "RIGHT_ID": node_id,
        "RIGHT_ATTRS": attrs,
    }


# Owned NOUN with a `poss` child as owner.
pattern_1 = [
    _anchor(OWNED, "NOUN"),
    _child(OWNED, OWNER, "poss"),
]

# Owned NOUN -> prep (ADP) -> pobj NOUN as owner.
pattern_2 = [
    _anchor(OWNED, "NOUN"),
    _child(OWNED, "adp", "prep", "ADP"),
    _child("adp", OWNER, "pobj", "NOUN"),
]

# VERB with a pronoun subject (owner) and a noun direct object (owned).
pattern_3 = [
    _anchor("verb", "VERB"),
    _child("verb", OWNER, "nsubj", "PRON"),
    _child("verb", OWNED, "dobj", "NOUN"),
]

# VERB with a noun subject (owned) and a prep -> pobj NOUN (owner).
pattern_4 = [
    _anchor("verb", "VERB"),
    _child("verb", OWNED, "nsubj", "NOUN"),
    _child("verb", "adp", "prep", "ADP"),
    _child("adp", OWNER, "pobj", "NOUN"),
]

# Registry keyed by the id each pattern is added to the matcher under.
patterns = {
    "Pattern1": pattern_1,
    "Pattern2": pattern_2,
    "Pattern3": pattern_3,
    "Pattern4": pattern_4,
}

def dependency_matcher(sp_nlp):
    """Return a DependencyMatcher loaded with every entry of `patterns`.

    The matcher is created on `sp_nlp.vocab`; each pattern is registered
    under its key in the module-level `patterns` dict.
    """
    dep_matcher = DependencyMatcher(sp_nlp.vocab)
    for name in patterns:
        dep_matcher.add(name, [patterns[name]])
    return dep_matcher

def index_to_what(sp_nlp, sp_doc, what_matches):
    """Map token indices to the tokens they are possessively related to.

    Args:
        sp_nlp: Pipeline whose `vocab.strings` turns a match id back into
            the pattern key it was registered under.
        sp_doc: Document the matches were produced from; indexed by token id.
        what_matches: Iterable of `(match_id, token_ids)` pairs, where
            `token_ids[k]` is the token matched by the k-th node of the
            pattern.

    Returns:
        dict: `token.i -> list of related tokens`. The relation is stored
        in both directions (owner -> owned and owned -> owner).

    Matches whose pattern does not define both an OWNER and an OWNED node
    are skipped instead of failing on a missing token.
    """
    index_to_what_map = {}
    for match_id, token_ids in what_matches:
        pattern_id = sp_nlp.vocab.strings[match_id]

        owner = None
        owned = None
        # Pattern nodes and matched token ids are aligned position by position.
        for node, token_id in zip(patterns[pattern_id], token_ids):
            right_id = node["RIGHT_ID"]
            if right_id == OWNER:
                owner = sp_doc[token_id]
            elif right_id == OWNED:
                owned = sp_doc[token_id]

        # Without both roles there is no relationship to record.
        if owner is None or owned is None:
            continue

        index_to_what_map.setdefault(owner.i, []).append(owned)
        index_to_what_map.setdefault(owned.i, []).append(owner)

    return index_to_what_map

In [25]:
def split_sentence(sp_doc, svo, prev_svo, next_svo):
    """Return the spans of `sp_doc` on either side of a triple's verb.

    The subject-side span starts right after the last object token of
    `prev_svo` (or at the start of the document), is pushed forward to the
    first token that sits in the subject's sentence, and stops just before
    the first verb token.

    The object-side span starts right after the last verb token and ends,
    inclusively, at the first subject token of `next_svo` (or at the last
    token of the document), pulled back to the nearest token that sits in
    the object's sentence.

    Returns:
        tuple: `(subject_side, object_side)` slices of `sp_doc`.
    """
    # --- subject side -----------------------------------------------------
    if prev_svo:
        left = prev_svo.object[-1].i + 1
    else:
        left = 0
    subject_stop = svo.verb[0].i

    while left < len(sp_doc):
        if sp_doc[left].sent.start == svo.subject[0].sent.start:
            break
        left += 1

    # --- object side ------------------------------------------------------
    object_start = svo.verb[-1].i + 1
    if next_svo:
        right = next_svo.subject[0].i
    else:
        right = len(sp_doc) - 1

    while right >= 0:
        if sp_doc[right].sent.start == svo.object[0].sent.start:
            break
        right -= 1

    return sp_doc[left:subject_stop], sp_doc[object_start:right + 1]


def index_to_cluster(fc_predictions):
    """Index every coreference mention by its first element.

    For each prediction, `get_clusters(as_strings=False)` is read and each
    mention `m` of a cluster is stored under `m[0]`, mapped to the other
    mentions of that cluster (those whose first element differs).
    A later mention with the same key overwrites an earlier one.

    Note: mentions look like `(start, end)` offset pairs -- presumably
    character offsets, since callers look them up by `token.idx`.
    """
    mentions_by_start = {}
    for prediction in fc_predictions:
        for cluster in prediction.get_clusters(as_strings=False):
            for mention in cluster:
                start = mention[0]
                mentions_by_start[start] = [
                    other for other in cluster if other[0] != start
                ]
    return mentions_by_start

def species_indices(tn_doc):
    """Collect the `idx` of every token inside every entity of `tn_doc`.

    Returns a flat list, in entity order and then token order.
    """
    return [token.idx for entity in tn_doc.ents for token in entity]

def is_species(tokens, context, species_indices):
    """Tell whether any token of `tokens` or `context` is a species token.

    A token counts as a species token when its `idx` is contained in
    `species_indices`.
    """
    candidates = [*tokens, *context]
    return any(candidate.idx in species_indices for candidate in candidates)


def get_holder(tokens, cluster_map, what_map):
    """Gather the tokens that `tokens` refer to or are possessively tied to.

    Args:
        tokens: Tokens to look up (each exposes `idx`, `i` and `doc`).
        cluster_map: `token.idx -> list of (start, end)` coreference
            mentions, expressed as character offsets into the document.
        what_map: `token.i -> list of tokens` from the possessive patterns.

    Returns:
        list: Tokens only. For every input token, the tokens covered by its
        coreferent mentions come first, followed by its possessive partners.

    Coreference mentions are offset pairs, not tokens, so they are resolved
    to the tokens they cover through `token.doc.char_span`; returning the
    raw pairs would break callers that read token attributes such as
    `pos_`. Mentions that cannot be mapped onto the document are dropped.
    """
    holder = []
    for token in tokens:
        for start_char, end_char in cluster_map.get(token.idx, []):
            # "expand" snaps offsets outward to whole tokens so a slightly
            # misaligned mention still resolves.
            span = token.doc.char_span(start_char, end_char, alignment_mode="expand")
            if span is not None:
                holder.extend(span)
        holder.extend(what_map.get(token.i, []))
    return holder

def same_reference(a, b, cluster_map, what_map, species_indices):
    """Decide whether token groups `a` and `b` point at the same thing.

    A side that contains no species token is first replaced by its holder
    (see `get_holder`); if that holder is empty the answer is False.
    Afterwards every pair `(token_a, token_b)` whose POS tags are equal and
    one of NOUN / PROPN / ADJ is compared case-insensitively: the first
    differing pair yields False, otherwise the result is True (including
    when no pair was comparable).

    Args:
        a, b: Token sequences to compare.
        cluster_map: Coreference lookup forwarded to `get_holder`.
        what_map: Possessive lookup forwarded to `get_holder`.
        species_indices: Character offsets of species tokens.

    Returns:
        bool

    Progress is printed to stdout for debugging.
    """
    comparable_pos = ("NOUN", "PROPN", "ADJ")

    a_is_species = is_species(a, [], species_indices)
    print(f"A is Species: {a_is_species}")
    if not a_is_species:
        a = get_holder(a, cluster_map, what_map)
        if len(a) == 0:
            return False
    print(f"A: {a}")

    b_is_species = is_species(b, [], species_indices)
    print(f"B is Species: {b_is_species}")
    if not b_is_species:
        b = get_holder(b, cluster_map, what_map)
        if len(b) == 0:
            return False
    # This line used to repeat the "B is Species" label for the token list.
    print(f"B: {b}")

    for token_a in a:
        for token_b in b:
            print(f"\tToken A: {token_a}, Token B: {token_b}")
            # Holder entries that are not tokens (e.g. raw coreference
            # offset pairs) have no POS or text to compare.
            if not (hasattr(token_a, "pos_") and hasattr(token_b, "pos_")):
                continue
            if token_a.pos_ not in comparable_pos:
                continue
            if token_b.pos_ not in comparable_pos:
                continue
            if token_a.pos_ != token_b.pos_:
                continue
            # Compare the lowercase *strings*: `lower` (no underscore) is
            # the integer hash and never equals a string, which made every
            # comparable pair look different.
            differs = token_a.lower_ != token_b.lower_
            print(f"Comparison: {differs}")
            if differs:
                return False
    return True

def get_related_possessions(tokens, what_map):
    """Return the possessive partners of `tokens`.

    For each token whose `i` is a key of `what_map`, the mapped entries are
    appended in order; tokens without an entry contribute nothing.
    """
    return [
        partner
        for token in tokens
        if token.i in what_map
        for partner in what_map[token.i]
    ]

def filter_by_species(tokens, species_indices):
    """Keep, in order, the tokens whose `idx` is in `species_indices`."""
    return [token for token in tokens if token.idx in species_indices]

# def same_reference(tokensA, tokensB, cluster_map):
#     clustersA = set()
#     for token in tokensA:
#         if token.idx in cluster_map:
#             for cluster_token in cluster_map[token.idx]:
#                 clustersA.add(cluster_token)
#     clustersB = set()
#     for token in tokensB:
#         if token.idx in cluster_map:
#             for cluster_token in cluster_map[token.idx]:
#                 clustersB.add(cluster_token)

#     if not set(clustersA).isdisjoint(clustersB):
#         return False
#     return True
    
def parse(text, verbose=False):
    """Run the extraction pipeline on `text` and print one record per triple.

    Steps:
        1. Parse `text` with spaCy (`sp_nlp`) and TaxoNERD (`tn_nlp`); the
           TaxoNERD entities give the species token offsets.
        2. Build the possessive lookup from the dependency patterns.
        3. Build the coreference lookup from fastcoref (`fcoref`).
        4. For every subject-verb-object triple found by textacy, fill a
           record with `source` / `source_trait` / `target` / `target_trait`
           and print it.

    Args:
        text: Raw text to analyse.
        verbose: Currently unused; kept so existing calls stay valid.

    Returns:
        None. The coreference map, each triple and each record are printed.
    """
    sp_doc = sp_nlp(text)
    tn_doc = tn_nlp(text)
    species = species_indices(tn_doc)

    matcher = dependency_matcher(sp_nlp)
    matches = matcher(sp_doc)
    what_map = index_to_what(sp_nlp, sp_doc, matches)

    predictions = fcoref.predict(texts=[text])
    cluster_map = index_to_cluster(predictions)
    print(cluster_map)

    svo_triples = list(textacy.extract.subject_verb_object_triples(sp_doc))
    for index, svo_triple in enumerate(svo_triples):
        # Data to Mine
        data = {
            "source": "",
            "source_trait": "",
            "source_change": "",
            "target": "",
            "target_trait": "",
            "target_change": ""
        }

        print(svo_triple)

        prev_svo = None if index == 0 else svo_triples[index - 1]
        next_svo = None if index >= len(svo_triples) - 1 else svo_triples[index + 1]

        # NOTE(review): the contexts are computed but not consumed yet
        # (is_species is always called with an empty context below).
        sub_context, obj_context = split_sentence(sp_doc, svo_triple, prev_svo, next_svo)

        subject, obj = svo_triple.subject, svo_triple.object

        same = same_reference(subject, obj, cluster_map, what_map, species)
        sub_is_species = is_species(subject, [], species)

        if same:
            # Subject and object describe one entity: whichever side names
            # the species is the source, the other side is its trait.
            print("!!!")
            if sub_is_species:
                data["source"] = subject
                data["source_trait"] = obj
            else:
                data["source_trait"] = subject
                data["source"] = obj
        else:
            if sub_is_species:
                data["source"] = subject
            else:
                # A non-species subject is a trait; its species is looked
                # up among its possessive partners.
                data["source_trait"] = subject
                data["source"] = filter_by_species(
                    get_related_possessions(subject, what_map), species
                )

            if is_species(obj, [], species):
                data["target"] = obj
            else:
                data["target_trait"] = obj
                data["target"] = filter_by_species(
                    get_related_possessions(obj, what_map), species
                )

        print(data)
            
        

In [26]:
# Example 1
# Single sentence on grasshopper diet shifts; stored in `text` and run
# through the whole pipeline with parse().
text = "Grasshoppers exhibited significant diet shifts from grass to herbs (Kruskal-Wallis test, P 0.01, df 3) when they were in the presence of the comparatively sedentary species (the smaller Pisaurina and the larger Hogna) compared to controls without spiders (Fig. 2)."
parse(text)

04/28/2025 16:20:10 - INFO - 	 Tokenize 1 inputs...
04/28/2025 16:20:11 - INFO - 	 ***** Running Inference on 1 texts *****


{0: [(108, 112)], 108: [(0, 12)]}
SVOTriple(subject=[Grasshoppers], verb=[exhibited], object=[diet, shifts])
A is Species: True
A: [Grasshoppers]
B is Species: False
B is Species: [grass]
	Token A: Grasshoppers, Token B: grass
Comparison: True
{'source': [Grasshoppers], 'source_trait': '', 'source_change': '', 'target': [], 'target_trait': [diet, shifts], 'target_change': ''}


In [None]:
# Example 2
# Single sentence on phototrophs and heterotrophic bacteria.
text = "Our results show that phototrophs can indirectly decrease the population density of heterotrophic bacteria by modification of the nature of bacterial interactions with predators."
parse(text)

In [None]:
# Example 3
# Single sentence on predator-inflicted prey mortality.
text = "All predators inflicted significant mortality on the prey at each prey density compared to the predator-free control for that density"
parse(text)

In [None]:
# Example 4
# Single sentence naming two species (Loripes lucinalis, Cymodocea nodosa).
text = "Our results show that an increase in sediment organic matter content is associated to a decline in the abundance of Loripes lucinalis (lucinid bivalve) in the Cymodocea nodosa meadows studied, which potentially may weaken the mutualism between the two species."
parse(text)

In [None]:
# Example 5
# Single sentence in which one subject governs two "showed" clauses.
text = "The abundance of lucinids showed a negative correlation with the organic matter content in vegetated sediments (Fig. 3a), but showed no correlation in bare ones (Fig. 3b)."
parse(text)

In [None]:
# Example 6
# Single sentence listing several taxa (Tramea, Anax, bullfrog tadpoles).
text = "The MANOVA on the cattle tank experiment showed that the presence of Tramea, nonlethal Anax, and large bullfrog tadpoles all had significant effects on both small tadpole species (Table 1)."
parse(text)

In [None]:
# Example 7
# Single sentence with an apposition between subject and verb.
text = "Thus the presence of predators, both nonlethal Anax and lethal Tramea, modified the tank environment in a way that facilitated invasion by midges, but only in the absence of large bullfrogs."
parse(text)

In [None]:
# Example 8
# Single sentence chaining two "would decrease" clauses.
text = "We hypothesized that the presence of Anax would decrease foraging activity of small tadpoles, which in turn would decrease predation by Tramea on the small tadpoles"
parse(text)

In [None]:
# Example 9
# Multi-sentence abstract (trait-mediated interactions); triple-quoted
# because the text itself contains double quotes.
text = '''Only a fraction of the individuals in a given prey population are likely to be killed and consumed by predators. In contrast, nearly all individuals experience the chronic effects of predation risk. When threatened by predators, prey adopt defensive tactics whole costs can lead to reduced growth, maturation rates, survivorship, fecundity, or population density. This nonconsumptive impact of predation risk on prey is known as a "trait-mediated interaction" (TMI) because it results from changes in prey traits such as behavior or physiology. Ecological theory suggests that the strength of TMI effects will reflect a balance between the conflicting demands of reproduction vs. predator avoidance. Competitor density and resource availability are expected to alter the balance between these conflicting forces. We conducted a meta-analysis of experimental studies that measured TMI effect size while varying competitor and/or resource density. The threat of predation had an overall negative effect on prey performance, but the strength of this effect varied with the level of competition. High competition exacerbated the negative effect of intimidation on prey density but moderated the negative effect of intimidation on prey life history and growth. We discuss these results in light of previously published theoretical expectations. Our results highlight the variable and context-dependent nature of interspecific interactions.'''
parse(text)

In [None]:
# Example 10
# Multi-sentence abstract (hunting-spider guild and grasshoppers).
# The text is written as two adjacent string literals, which Python joins
# into one string: a double-quoted literal cannot span a physical line
# break, so the previous two-line form was a syntax error.
text = (
    "Current theory on trophic interactions in food webs assumes that ecologically similar species can be treated collectively as a single functional unit such as a guild or trophic level. This theory implies that all species within that unit transmit identical direct and indirect effects throughout the community. We evaluated this assumption by conducting experiments to compare the direct and indirect effects of three top-predator species, belonging to the same hunting spider guild, on the same species of grasshopper and on old-field grasses and herbs. Observations under field conditions revealed that each spider species exhibited different hunting behavior (i.e., sit-and-wait, sit-and-pursue, and active hunting) and occupied different locations within the vegetation canopy. These differences resulted in different direct effects on grasshopper prey. Grasshoppers demonstrated significant behavioral (diet) shifts in the presence of sit-and-wait and sit-and-pursue species but not when faced with actively hunting species. Grasshopper density was significantly reduced by spider species that occupied lower parts of the vegetation canopy (sit-and-pursue and actively hunting species), but it was not significantly reduced by the sit-and-wait spider species that occupied the upper parts of the canopy. These direct effects manifested themselves differently in the plant trophic level. The sit-and-wait spider caused indirect effects on plants by changing grasshopper foraging behavior (a trait-mediated effect). The sit-and-pursue spider caused indirect effects by reducing grasshopper density (density-mediated effects); the effects of changes in grasshopper behavior were thus not reflected in the plant trophic level. The actively hunting spiders had strictly density-mediated indirect effects on plants. The study offers mechanistic insight into how predator species within the same guild can have very different trophic effects in food webs. "
    "Thus classical modeling approaches that treat all predator species as a single functional unit may not adequately capture biologically relevant details that influence community dynamics."
)
parse(text)

In [None]:
# Example 11
# Multi-sentence abstract on macrophage activation (a non-ecology text).
text = "Diversity and plasticity are hallmarks of cells of the monocyte-macrophage lineage. In response to IFNs, Toll-like receptor engagement, or IL-4/IL-13 signaling, macrophages undergo M1 (classical) or M2 (alternative) activation, which represent extremes of a continuum in a universe of activation states. Progress has now been made in defining the signaling pathways, transcriptional networks, and epigenetic mechanisms underlying M1-M2 or M2-like polarized activation. Functional skewing of mononuclear phagocytes occurs in vivo under physiological conditions (e.g., ontogenesis and pregnancy) and in pathology (allergic and chronic inflammation, tissue repair, infection, and cancer). However, in selected preclinical and clinical conditions, coexistence of cells in different activation states and unique or mixed phenotypes have been observed, a reflection of dynamic changes and complex tissue-derived signals. The identification of mechanisms and molecules associated with macrophage plasticity and polarized activation provides a basis for macrophage-centered diagnostic and therapeutic strategies."
parse(text)

In [None]:
# Example 12
# Two short clauses in which the noun "stranger" appears once as a
# subject and once as an object.
text = "The stranger hit a bystander, the man hit a stranger"
parse(text)