In [69]:
import re
import spacy
import stanza
import textacy
from fastcoref import FCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher
from pprint import pprint

In [70]:
class References:
    fcoref = None
    
    def __init__(self, corpus, token_map=None):
        if not References.fcoref:
            References.fcoref = FCoref(enable_progress_bar=False)
        if isinstance(corpus, list):
            corpus = [corpus]        
        self.predictions = References.fcoref.predict(texts=corpus)
        self.token_map = token_map
        self.cluster_map = self.index_to_cluster(self.predictions)

    def index_to_cluster(self, predictions):
        print(predictions)
        index_to_cluster_map = {}
        for prediction in predictions:
            clusters = prediction.get_clusters(as_strings=False)
            for cluster in clusters:
                for token in cluster:
                    index = token[0]
                    if char_token_map:
                        if index not in self.token_map:
                            raise Exception("Invalid Token")
                        index = self.token_map[index]
                    index_to_cluster_map[index] = list(filter(lambda t: t[0] != index, cluster))
        return index_to_cluster_map

    def get_references(self, tokens):
        refs = []
        for token in tokens:
            index = token.idx if not self.char_token_map else token.i
            if index in self.cluster_map:
                refs += self.cluster_map[index]
        return refs

In [71]:
class Species:
    tn_nlp = None
    
    def __init__(self, text, token_map=None):
        if not Species.tn_nlp:
            Species.tn_nlp = TaxoNERD().load(model="en_ner_eco_biobert")
        self.tn_doc = self.tn_nlp(text)
        self.token_map = token_map
        self.species_list = self.species_indices(self.tn_doc)

    def species_indices(self, tn_doc):
        indices = []
        for species_span in self.tn_doc.ents:
            for species in species_span:
                index = species.idx
                if self.token_map:
                    if index not in self.token_map:
                        raise Exception("Invalid Token")
                    index = self.token_map[index]
                indices.append(index)
        return indices

    def is_species(self, token):
        index = token.idx if not self.token_map else token.i
        return index in self.species_list
        
    def contains_species(self, tokens):
        for token in tokens:
            index = token.idx if not self.token_map else token.i
            if index in self.species_list:
                return True
        return False

In [72]:
class Unit:
    def __init__(self, *, species=None, trait=None, change=None, cause=None):
        self.species = species
        self.trait = trait
        self.cause = cause
        self.change = change

    def empty(self):
        if not self.species and not self.trait and not self.cause and not self.change:
            return True
        return False

    def not_empty(self):
        return not empty()

    def can_merge(self, unit):
        # Two units can merge if there's no
        # overlap.
        if self.species and unit.species:
            return False
        if self.trait and unit.trait:
            return False
        if self.cause and unit.cause:
            return False
        if self.change and unit.change:
            return False
        return True

    def merge(self, unit):
        # We take the parts that the
        # other unit has; assuming that
        # there's no overlap, there's
        # no loss of information.
        if unit.species:
            self.species = unit.species
        if unit.trait:
            self.trait = unit.trait
        if unit.cause:
            self.cause = unit.cause
        if unit.change:
            self.change = unit.change

In [73]:
class Possession:
    matcher = None
    
    # There's no definite names for these patterns as I do not know what
    # to call them. These patterns are used to extract possessive
    # relationships from a sentence. I also could not find better names for
    # the two variables below.
    OWNER = "owner"
    OWNED = "owned"
    
    patterns = {
        "Pattern1": [
            {
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {
                    "POS": {
                        "IN": ["NOUN"]
                    }
                }
            },
            {
                "LEFT_ID": OWNED,
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {
                    "DEP": "poss"
                }
            }
        ],
        "Pattern2": [
             {
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {
                    "POS": {
                        "IN": ["NOUN"]
                    }
                }
            },
            {
                "LEFT_ID": OWNED,
                "REL_OP": ">",
                "RIGHT_ID": "adp",
                "RIGHT_ATTRS": {
                    "DEP": "prep",
                    "POS": {
                        "IN": ["ADP"]
                    }
                }
            },
            {
                "LEFT_ID": "adp",
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {
                    "DEP": "pobj",
                    "POS": {
                        "IN": ["NOUN"]
                    }
                }
            }
        ],
        "Pattern3": [
            {
                "RIGHT_ID": "verb",
                "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {
                    "DEP": "nsubj",
                    "POS": {"IN": ["PRON"]}
                }
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {
                    "DEP": "dobj",
                    "POS": {"IN": ["NOUN"]}
                }
            }
        ],
        "Pattern4": [
            {
                "RIGHT_ID": "verb",
                "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {
                    "DEP": "nsubj",
                    "POS": {"IN": ["NOUN"]}
                }
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": "adp",
                "RIGHT_ATTRS": {
                    "DEP": "prep",
                    "POS": {"IN": ["ADP"]}
                }
            },
            {
                "LEFT_ID": "adp",
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {
                    "DEP": "pobj",
                    "POS": {"IN": ["NOUN"]}
                }
            }
        ],
    }
    
    def __init__(self, sp_nlp, sp_doc=None):
        self.sp_nlp = sp_nlp
        print(f"SP_NLP: {self.sp_nlp}")
        self.sp_doc = sp_doc
        print(f"SP_DOC: {self.sp_doc}")

        if not Possession.matcher:
            Possession.matcher = DependencyMatcher(self.sp_nlp.vocab)
            for pattern_id, pattern in Possession.patterns.items():
                Possession.matcher.add(pattern_id, [pattern])

        if sp_doc:
            # Processed Output of Matcher
            matches = Possession.matcher(sp_doc)
            print(f"Matches: {matches}")
            owner_map, owned_map = self.index_to_what(matches)
            self.owner_map = owner_map # Maps Owner to Owned
            self.owned_map = owned_map # Maps Owned to Owner
        
    def index_to_what(self, matches):
        owner_map = {}
        owned_map = {}

        
        print(f"SP_NLP: {self.sp_nlp}")
        for match_id, token_ids in matches:
            pattern_id = self.sp_nlp.vocab.strings[match_id]
            # print(pattern_id)
            owner = None
            owned = None
            for i in range(len(token_ids)):
                right_id = Possession.patterns[pattern_id][i]["RIGHT_ID"]
                if right_id == Possession.OWNER:
                    owner = self.sp_doc[token_ids[i]]
                if right_id == Possession.OWNED:
                    owned = self.sp_doc[token_ids[i]]

            # Owner to Owned
            if owner.i not in owner_map:
                owner_map[owner.i] = []
            owner_map[owner.i].append(owned)

            # Owned to Owner
            if owned.i not in owned_map:
                owned_map[owned.i] = []
            owned_map[owned.i].append(owner)
            
        return (owner_map, owned_map)

    def get_owner(self, tokens):
        owners = []
        for token in tokens:
            index = token.i
            if index in self.owned_map:
                owners += self.owned_map[index]
        return owners

    def get_owned(self, tokens):
        owned = []
        for token in tokens:
            index = token.i
            if index in self.owner_map:
                owned += self.owner_map[index]
        return owned

In [74]:
class Parser:
    sp_nlp = None

    def __init__(self, text):
        if not Parser.sp_nlp:
            Parser.sp_nlp = spacy.load("en_core_web_sm")
        print(Parser.sp_nlp)
        
        # Clean Text
        self.text = text
        self.text = re.sub("[\(\[].*?[\)\]]", "", self.text)
        self.text = re.sub("\s+", " ", self.text)
        
        # Process Text
        self.sp_doc = Parser.sp_nlp(text)
        self.token_map = self.index_to_token(self.sp_doc)
        
        # Tools
        self.possession = Possession(Parser.sp_nlp, self.sp_doc)
        self.species = Species(text, self.token_map)
        self.references = References(text, self.token_map)

    def index_to_token(self, sp_doc):
        index_to_token_map = {}
        for token in sp_doc:
            index_to_token_map[token.idx] = token.i
        return index_to_token_map

    def parse_segment(self, l_i, r_i):
        used = []
        
        # Find Species
        species = None
        for token in self.sp_doc[l_i:r_i+1]:
            if self.species.is_species(token):
                species = token
                used.append(species)
                break
        
        # Find Change
        change_keywords = {"increase", "decrease", "change"}
        change = []
        for token in self.sp_doc[l_i:r_i+1]:
            if token not in used and not token.is_oov:
                for keyword in change_keywords:
                    if keyword.similarity(token) > 0.7:
                        change.append(token)
                        used.append(token)
                        break
            # I only want one word that represents
            # the change for simplicity
            if change:
                break

        # Find Trait
        trait = []
        if change:
            # The trait is listed before the change (i.e. "diet shifts from ...")
            prev_i = change[0].i - 1
            prev_token = None if prev_i < 0 else self.sp_doc[prev_i]
            if prev_token and prev_token not in used and prev_token.pos_ == "NOUN":
                used.append(prev_token)
                trait.append(prev_token)
            else:
                # Look for "in" (i.e. "increase in ...")
                for child in change[0].children:
                    if child in used:
                        continue
                    if child.pos_ == "ADP" and child.children:
                        if child.children[0] not in used:
                            used.append(child.children[0])
                            trait.append(child.children[0])

        # Find Cause
        cause = []
        for token in self.sp_doc[l_i:r_i+1]:
            if token not in used and token.pos_ == "SCONJ":
                start_i = token.i + 1
                end_i = start_i
                while end_i <= r_i + 1 and self.sp_doc[end_i].pos_ in ["ADP", "DET", "NOUN", "PROPN", "AUX"]:
                    used.append(self.sp_doc[end_i])
                    end_i += 1
                used.append(token)
                break

        unit = Unit(species, trait, change, cause)
        return unit

    def parse_sentence(self, l_i, r_i):
        # Find Verb
        # The verb is used to divide
        # the "parsing" space, which
        # makes the work simpler.
        verb = None
        for token in self.sp_doc:
            if token.pos_ == "VERB":
                verb = token
                break

        # Base Case
        # If there is no verb, we have
        # reached the simplest case and
        # can extract information.
        if verb == None:
            return parse_segment(l_i, r_i)
        # Merge
        # Given two units of information,
        # we need to merge them.
        else:
            l_unit = self.parse_sentence(l_i, verb.i - 1)
            r_unit = self.parse_sentence(verb.i + 1, r_i)

            if l_unit.not_empty() and not r_unit.empty():
                l_unit.change.append(verb)
                return l_unit
            elif r_unit.not_empty() and not l_unit.empty():
                r_unit.change.append(verb)
                return r_unit    
            elif l_unit.can_merge(r_unit):
                l_unit.merge(r_unit)
                return l_unit
            else:
                r_unit.cause = l_unit
                return r_unit

    def parse(self):
        units = []
        for sent in doc.sents:
            units.append(self.parse_sentence(sent.start, sent.end))
        return units

parser = Parser("Grasshoppers exhibited significant diet shifts from grass to herbs (Kruskal-Wallis test, P 0.01, df 3) when they were in the presence of the comparatively sedentary species (the smaller Pisaurina and the larger Hogna) compared to controls without spiders (Fig. 2).")
# units = parser.parse()
# pprint(units, indent=2)

<spacy.lang.en.English object at 0x00000275ADA25480>
SP_NLP: <spacy.lang.en.English object at 0x00000275ADA25480>
SP_DOC: Grasshoppers exhibited significant diet shifts from grass to herbs (Kruskal-Wallis test, P 0.01, df 3) when they were in the presence of the comparatively sedentary species (the smaller Pisaurina and the larger Hogna) compared to controls without spiders (Fig. 2).
Matches: [(14621589392117008497, [4, 5, 6]), (14621589392117008497, [26, 27, 31]), (14621589392117008497, [43, 44, 45]), (17237321022846380202, [41, 15, 42, 43])]
SP_NLP: <spacy.lang.en.English object at 0x00000275ADA25480>


04/29/2025 13:27:33 - INFO - 	 missing_keys: []
04/29/2025 13:27:33 - INFO - 	 unexpected_keys: []
04/29/2025 13:27:33 - INFO - 	 mismatched_keys: []
04/29/2025 13:27:33 - INFO - 	 error_msgs: []
04/29/2025 13:27:33 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
04/29/2025 13:27:33 - INFO - 	 Tokenize 1 inputs...
04/29/2025 13:27:33 - INFO - 	 ***** Running Inference on 1 texts *****


CorefResult(text="Grasshoppers exhibited significant diet shifts fro...", clusters=[['Grasshoppers', 'they']])


TypeError: 'CorefResult' object is not iterable