In [1]:
import re
import time
import spacy
import textacy
import requests
from pprint import pprint
from fastcoref import FCoref
from taxonerd import TaxoNERD
from fastcoref import spacy_component
from spacy.matcher import Matcher, DependencyMatcher, PhraseMatcher

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask spaCy to run on the GPU before any pipeline is loaded; the recorded
# output of this cell shows the call returned True.
spacy.require_gpu()

True

In [3]:
class Tools:
    """Shared NLP state: the TaxoNERD/spaCy pipeline and the document being parsed.

    The other helper classes in this notebook hold a reference to one ``Tools``
    instance and read ``doc`` / ``token_map`` from it after ``update()``.
    """

    def __init__(self):
        """Load the TaxoNERD model and append the fastcoref component to it."""
        self.taxonerd = TaxoNERD(prefer_gpu=True)
        self.nlp = self.taxonerd.load(model="en_ner_eco_biobert")
        # NOTE(review): the recorded run of this notebook failed on the next
        # line with spaCy E002 (no 'fastcoref' factory registered) — confirm
        # that `from fastcoref import spacy_component` has been executed in
        # the same kernel before a Tools instance is created.
        self.nlp.add_pipe("fastcoref")
        print(self.nlp.pipe_names)
        # Current document and its lookup table; both are filled by update().
        self.doc = None
        self.token_map = None

    @staticmethod
    def clean_text(text):
        """Return ``text`` without bracketed asides and with tidy whitespace.

        Three passes, in order:
          1. drop every ``(...)`` / ``[...]`` group (shortest match),
          2. collapse each whitespace run to a single space,
          3. remove whitespace that ends up before ``?``, ``.``, ``!`` or ``,``.
        """
        # Raw strings: "\(" and "\s" are invalid escape sequences in a normal
        # string literal and trigger a warning on current Python versions.
        cleaned_text = re.sub(r"[\(\[].*?[\)\]]", "", text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
        cleaned_text = re.sub(r"\s+([?.!,])", r"\1", cleaned_text)
        return cleaned_text

    def update(self, doc):
        """Make ``doc`` the current document and rebuild ``token_map``.

        ``token_map`` maps each token's ``idx`` attribute to its ``i`` attribute
        (character offset -> token index, in spaCy terms).
        """
        self.doc = doc
        self.token_map = {token.idx: token.i for token in self.doc}

In [4]:
class References:
    """Coreference lookup built from the current document's coref clusters.

    After ``update()``, ``cluster_map`` maps a token index to the list of the
    other tokens that share a cluster with it.
    """

    def __init__(self, tools):
        self.tools = tools
        # Stays None until update() runs against a non-empty document.
        self.cluster_map = None

    def update(self):
        """Rebuild ``cluster_map`` from ``tools.doc``; no-op when the doc is falsy."""
        doc = self.tools.doc
        if doc:
            self.cluster_map = self.get_cluster_map(doc._.coref_clusters)

    def get_cluster_map(self, clusters):
        """Turn clusters of spans into a ``token index -> co-referring tokens`` dict.

        Each span is represented by the token found at its first element
        (``span[0]``), looked up through ``tools.token_map``.

        Raises:
            Exception: if a span's first element is not a key of ``token_map``.
        """
        offsets = self.tools.token_map
        mapping = {}
        for cluster in clusters:
            members = []
            for span in cluster:
                start = span[0]
                if start not in offsets:
                    raise Exception("Invalid Token")
                members.append(self.tools.doc[offsets[start]])
            # Every member points at all the *other* members of its cluster.
            for member in members:
                mapping[member.i] = [other for other in members if other != member]
        return mapping

    def get_references(self, tokens):
        """Return, concatenated in order, the co-referring tokens of each token."""
        found = []
        for token in tokens:
            if token.i in self.cluster_map:
                found.extend(self.cluster_map[token.i])
        return found

In [5]:
class Possession:
    """Extracts owner/owned token pairs from the current document.

    Four dependency patterns are registered with a spaCy ``DependencyMatcher``.
    In every pattern one node is labelled ``OWNER`` and one ``OWNED``; the
    remaining nodes ("verb", "adp") only anchor the match. The patterns and the
    two role names have no established terminology behind them — they are
    working names.
    """

    OWNER = "owner"
    OWNED = "owned"

    patterns = {
        # OWNED noun/proper noun with a direct child whose dependency is "poss".
        "Pattern1": [
            {"RIGHT_ID": OWNED, "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "PROPN"]}}},
            {"LEFT_ID": OWNED, "REL_OP": ">", "RIGHT_ID": OWNER, "RIGHT_ATTRS": {"DEP": "poss"}},
        ],
        # OWNED noun -> adposition ("prep") -> OWNER noun ("pobj").
        "Pattern2": [
            {"RIGHT_ID": OWNED, "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "PROPN"]}}},
            {"LEFT_ID": OWNED, "REL_OP": ">", "RIGHT_ID": "adp", "RIGHT_ATTRS": {"DEP": "prep", "POS": {"IN": ["ADP"]}}},
            {"LEFT_ID": "adp", "REL_OP": ">", "RIGHT_ID": OWNER, "RIGHT_ATTRS": {"DEP": "pobj", "POS": {"IN": ["NOUN", "PROPN"]}}},
        ],
        # Verb with a pronoun subject (OWNER) and a noun direct object (OWNED).
        "Pattern3": [
            {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}},
            {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": OWNER, "RIGHT_ATTRS": {"DEP": "nsubj", "POS": {"IN": ["PRON"]}}},
            {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": OWNED, "RIGHT_ATTRS": {"DEP": "dobj", "POS": {"IN": ["NOUN", "PROPN"]}}},
        ],
        # Verb with a noun subject (OWNED) and a prepositional object (OWNER).
        "Pattern4": [
            {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}},
            {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": OWNED, "RIGHT_ATTRS": {"DEP": "nsubj", "POS": {"IN": ["NOUN", "PROPN"]}}},
            {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "adp", "RIGHT_ATTRS": {"DEP": "prep", "POS": {"IN": ["ADP"]}}},
            {"LEFT_ID": "adp", "REL_OP": ">", "RIGHT_ID": OWNER, "RIGHT_ATTRS": {"DEP": "pobj", "POS": {"IN": ["NOUN", "PROPN"]}}},
        ],
    }

    def __init__(self, tools):
        self.tools = tools
        self.matcher = DependencyMatcher(self.tools.nlp.vocab)
        for name, nodes in Possession.patterns.items():
            self.matcher.add(name, [nodes])
        # Both maps stay None until update() runs against a non-empty document.
        self.owner_map = None
        self.owned_map = None
        self.update()

    def update(self):
        """Re-run the matcher on ``tools.doc`` and rebuild both ownership maps."""
        if not self.tools.doc:
            return
        found = self.matcher(self.tools.doc)
        # owner_map: owner index -> owned tokens; owned_map: owned index -> owner tokens.
        self.owner_map, self.owned_map = self.get_ownership_map(found)

    def get_ownership_map(self, matches):
        """Build ``(owner_map, owned_map)`` from ``(match_id, token_ids)`` pairs.

        ``token_ids[k]`` is taken to correspond to node ``k`` of the matched
        pattern; the node's ``RIGHT_ID`` decides whether that token is the
        owner, the owned, or an anchor that is ignored.
        """
        by_owner = {}
        by_owned = {}

        for match_id, token_ids in matches:
            pattern_id = self.tools.nlp.vocab.strings[match_id]
            owner = owned = None
            for position, token_id in enumerate(token_ids):
                role = Possession.patterns[pattern_id][position]["RIGHT_ID"]
                if role == Possession.OWNER:
                    owner = self.tools.doc[token_id]
                elif role == Possession.OWNED:
                    owned = self.tools.doc[token_id]

            by_owner.setdefault(owner.i, []).append(owned)
            by_owned.setdefault(owned.i, []).append(owner)

        return (by_owner, by_owned)

    def get_owner(self, tokens):
        """Return the owners recorded for each of ``tokens``, concatenated in order."""
        result = []
        for token in tokens:
            if token.i in self.owned_map:
                result.extend(self.owned_map[token.i])
        return result

    def get_owned(self, tokens):
        """Return the tokens owned by each of ``tokens``, concatenated in order."""
        result = []
        for token in tokens:
            if token.i in self.owner_map:
                result.extend(self.owner_map[token.i])
        return result

In [6]:
class Species:
    """Tracks which tokens of the current document belong to a named entity.

    The indices come from ``tools.doc.ents``; with the TaxoNERD model loaded by
    this notebook those entities are presumably taxon mentions.
    """

    def __init__(self, tools):
        self.tools = tools
        # Stays None until update() runs against a non-empty document.
        self.species_indices = None
        self.update()

    def update(self):
        """Recompute ``species_indices``; no-op when ``tools.doc`` is falsy."""
        if self.tools.doc:
            self.species_indices = self.get_species_indices()

    def get_species_indices(self):
        """Return the distinct token indices covered by the document's entities.

        Indices are listed in first-seen order. An earlier experiment that
        looked up each noun lemma through the iNaturalist search API is no
        longer part of this method; only the pipeline's entities are used.

        Raises:
            Exception: if an entity token's ``idx`` is missing from ``token_map``.
        """
        collected = []
        for entity in self.tools.doc.ents:
            for token in entity:
                if token.idx not in self.tools.token_map:
                    raise Exception("Invalid Token")
                position = self.tools.token_map[token.idx]
                if position not in collected:
                    collected.append(position)
        return collected

    def is_species(self, token):
        """True when ``token``'s index is one of the recorded species indices."""
        return token.i in self.species_indices

    def contains_species(self, tokens):
        """True when at least one of ``tokens`` is a recorded species token."""
        return any(token.i in self.species_indices for token in tokens)

In [7]:
class Unit:
    """One extracted statement: a species, a trait, a change and a cause.

    The class only relies on the truthiness of its four parts; the parser in
    this notebook fills them with lists of tokens (and, in one merge case,
    stores another ``Unit`` as the cause).
    """

    def __init__(self, *, species=None, trait=None, change=None, cause=None):
        self.species = species
        self.trait = trait
        self.cause = cause
        self.change = change
        # Flag set by the parser after construction.
        self.is_cause = False

    def empty(self):
        """True when none of the four parts holds anything truthy."""
        return not (self.species or self.trait or self.cause or self.change)

    def not_empty(self):
        """Negation of ``empty()``."""
        return not self.empty()

    def can_merge(self, unit):
        """True when no part is filled in on both ``self`` and ``unit``."""
        for part in ("species", "trait", "cause", "change"):
            if getattr(self, part) and getattr(unit, part):
                return False
        return True

    def merge(self, unit):
        """Copy every filled part of ``unit`` onto ``self``.

        Intended for units that passed ``can_merge``: without overlap, nothing
        already held by ``self`` is overwritten.
        """
        for part in ("species", "trait", "cause", "change"):
            incoming = getattr(unit, part)
            if incoming:
                setattr(self, part, incoming)

    def get_score(self):
        """Return how many of the four parts are filled (0-4)."""
        parts = (self.species, self.trait, self.cause, self.change)
        return sum(1 for part in parts if part)

    def __str__(self):
        return f"Species: {self.species}, Trait: {self.trait}, Cause: ({self.cause}), Change: {self.change}"

In [8]:
class Keywords:
    """Flags tokens of the current document as unit, change, quantity or cause words.

    Unit, change and quantity words are found by comparing a token's lemma with
    a small list of reference keywords through ``.similarity``; cause words are
    found with part-of-speech and dependency rules. Each ``load_*`` method
    returns a list of token indices that the matching ``is_*`` / ``has_*``
    helpers consult.
    """

    # A lemma counts as a keyword when its similarity to any reference keyword
    # is strictly above this value.
    SIMILARITY_THRESHOLD = 0.7

    def __init__(self, tools):
        self.tools = tools
        # References: each keyword is run through the pipeline once, up front.
        self.unit_keywords = [self.tools.nlp(keyword) for keyword in ("unit", "%", "percent")]
        self.change_keywords = [self.tools.nlp(keyword) for keyword in ("increase", "decrease", "change", "weaken", "shift", "cause")]
        self.quantity_keywords = [self.tools.nlp(keyword) for keyword in ("tenfold", "half", "double", "triple", "quadruple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten")]
        # Instances
        self.unit_indices = []
        self.change_indices = []
        self.quantity_indices = []
        self.cause_indices = []
        self.update()

    def update(self):
        """Recompute every index list for ``tools.doc``; no-op when it is falsy."""
        if not self.tools.doc:
            return
        # Order matters: quantities consult unit_indices and causes consult
        # change_indices.
        self.unit_indices = self.load_unit_indices()
        self.change_indices = self.load_change_indices()
        self.quantity_indices = self.load_quantity_indices()
        self.cause_indices = self.load_cause_indices()
        return

    def _resembles(self, token, keywords):
        """True when ``token``'s lemma is similar enough to any of ``keywords``."""
        lemma = self.tools.nlp(token.lemma_)
        return any(keyword.similarity(lemma) > self.SIMILARITY_THRESHOLD for keyword in keywords)

    def is_unit(self, token):
        return token.i in self.unit_indices

    def has_unit(self, tokens):
        return any(token.i in self.unit_indices for token in tokens)

    def load_unit_indices(self):
        """Return the indices of nouns whose lemma resembles a unit keyword."""
        indices = []
        for token in self.tools.doc:
            if token.pos_ not in ["NOUN"]:
                continue
            # Record each token once, even if several keywords match.
            if self._resembles(token, self.unit_keywords):
                indices.append(token.i)
        return indices

    def is_change(self, token):
        return token.i in self.change_indices

    def has_change(self, tokens):
        return any(token.i in self.change_indices for token in tokens)

    def load_change_indices(self):
        """Return the indices of change words.

        A "to" whose head is "from" always counts; otherwise the token must be
        a noun or verb whose lemma resembles a change keyword.
        """
        indices = []
        for token in self.tools.doc:
            if token.lower_ == "to" and token.head and token.head.lower_ == "from":
                indices.append(token.i)
                continue
            if token.pos_ not in ["NOUN", "VERB"]:
                continue
            if self._resembles(token, self.change_keywords):
                indices.append(token.i)
        return indices

    def is_quantity(self, token):
        return token.i in self.quantity_indices

    def has_quantity(self, tokens):
        return any(token.i in self.quantity_indices for token in tokens)

    def load_quantity_indices(self):
        """Return the indices of quantity words.

        A noun or numeral qualifies when its lemma resembles a quantity
        keyword, unless its head is a noun that is not a unit.
        """
        indices = []
        for token in self.tools.doc:
            if token.pos_ not in ["NOUN", "NUM"]:
                continue
            if not self._resembles(token, self.quantity_keywords):
                continue
            # If the quantity modifies a noun, that noun must be a unit.
            if token.head and token.head.pos_ == "NOUN" and not self.is_unit(token.head):
                continue
            indices.append(token.i)
        return indices

    def is_cause(self, token):
        return token.i in self.cause_indices

    def has_cause(self, tokens):
        return any(token.i in self.cause_indices for token in tokens)

    def load_cause_indices(self):
        """Return the indices of tokens that introduce or stand for a cause.

        Rules, by part of speech:
          * SCONJ — always.
          * PART  — when its head is a verb.
          * ADP   — "due" directly followed by "to"; or head is an AUX; or head
            is a change verb that precedes the token; or (for anything but
            "to") the head has an AUX child to the left of the token.
          * PRON  — when its head is a change verb.
        """
        doc = self.tools.doc
        indices = []
        for token in doc:
            if token.pos_ not in ["ADP", "SCONJ", "PART", "PRON"]:
                continue
            if token.pos_ == "SCONJ":
                indices.append(token.i)
                continue
            elif token.pos_ == "PART":
                if token.head and token.head.pos_ == "VERB":
                    indices.append(token.i)
                    continue
            elif token.pos_ == "ADP":
                next_i = token.i + 1
                # Bounds check first: "due" may be the very last token, in
                # which case there is no following token to inspect.
                if token.lower_ == "due" and next_i < len(doc) and doc[next_i].lower_ == "to":
                    indices.append(token.i)
                    continue
                elif token.head:
                    if token.head.pos_ == "AUX":
                        indices.append(token.i)
                        continue
                    elif token.head.pos_ == "VERB" and token.head.i < token.i and self.is_change(token.head):
                        indices.append(token.i)
                        continue
                    elif token.lower_ != "to" and "AUX" in [child.pos_ for child in list(filter(lambda t: t.i < token.i, token.head.children))]:
                        indices.append(token.i)
                        continue
                # NOTE(review): only reached when token.head is falsy —
                # confirm whether that can happen with real spaCy tokens.
                elif token.ancestors:
                    if "AUX" in [ancestor.pos_ for ancestor in token.ancestors]:
                        indices.append(token.i)
                        continue
            elif token.pos_ == "PRON":
                if token.head and token.head.pos_ == "VERB" and self.is_change(token.head):
                    indices.append(token.i)
                    continue
        return indices

In [9]:
class Parser:
    """Turns each sentence of the current document into a single ``Unit``.

    A sentence is split at its verbs into verb-free segments, each segment is
    mined for species / trait / change / cause tokens (``parse_segment``), and
    the resulting pieces are merged back left to right (``parse_sentence``).
    """

    def __init__(self):
        self.tools = Tools()
        self.species = Species(self.tools)
        self.possession = Possession(self.tools)
        self.references = References(self.tools)
        self.keywords = Keywords(self.tools)

    def update(self, doc):
        """Make ``doc`` current and refresh every helper that depends on it."""
        self.tools.update(doc)
        self.species.update()
        self.possession.update()
        self.references.update()
        self.keywords.update()

    def parse_segment(self, l_i, r_i):
        """Extract one ``Unit`` from the tokens ``l_i..r_i`` (both inclusive).

        ``used`` collects tokens already assigned to a part so that later
        passes skip them. An empty range (``r_i < l_i``) yields an empty unit.
        """
        doc = self.tools.doc
        # Parts of speech allowed inside a cause phrase.
        cause_pos = ["ADP", "DET", "NOUN", "PROPN", "AUX", "ADV", "PRON", "ADJ"]
        used = []

        # Find Cause (pass 1): everything that follows a subordinating
        # conjunction flagged as a cause word.
        cause = []
        for token in doc[l_i:r_i+1]:
            if token not in used and token.pos_ in ["SCONJ"] and self.keywords.is_cause(token):
                end_i = token.i + 1
                while end_i <= r_i and doc[end_i] not in used and doc[end_i].pos_ in cause_pos:
                    used.append(doc[end_i])
                    cause.append(doc[end_i])
                    end_i += 1
                used.append(token)

        # Find Species: first unused species token whose head is not a
        # conjunction/adposition (an "of" head is allowed).
        species = []
        for token in doc[l_i:r_i+1]:
            if self.species.is_species(token) and token not in used and (token.head and (token.head.pos_ not in ["SCONJ", "ADP"] or token.head.lower_ == "of")):
                species.append(token)
                # Mark the token itself (not the species list) as consumed so
                # later passes cannot claim it again.
                used.append(token)
                break

        # Find Change (method 1): a single change keyword is enough.
        change = []
        for token in doc[l_i:r_i+1]:
            if token not in used and not token.is_oov:
                if self.keywords.is_change(token):
                    change.append(token)
                    used.append(token)
            if change:
                break

        # Find Change (method 2): a run of number-like tokens after an
        # adposition other than "of", kept only if it contains a quantity.
        for token in doc[l_i:r_i+1]:
            if token not in used and token.pos_ == "ADP" and token.lower_ != "of":
                end_i = token.i + 1
                possible_changes = []
                while end_i <= r_i and doc[end_i].pos_ in ["NUM", "SYM", "NOUN", "ADP", "DET"]:
                    possible_changes.append(doc[end_i])
                    end_i += 1
                if not self.keywords.has_quantity(possible_changes):
                    continue
                for possible_change in possible_changes:
                    used.append(possible_change)
                    change.append(possible_change)
                used.append(token)
                break

        # Find Trait
        trait = []
        if species:
            # Things the species owns inside this segment; accepted only if at
            # least one of them sits below a change word in the tree.
            possible_traits = list(filter(lambda t: t.i >= l_i and t.i <= r_i, self.possession.get_owned(species)))
            valid_trait = False
            for possible_trait in possible_traits:
                for ancestor in possible_trait.ancestors:
                    if self.keywords.is_change(ancestor):
                        valid_trait = True
                        break
            if valid_trait:
                trait = possible_traits
        elif change:
            # The trait is listed before the change (i.e. "diet shifts from ...")
            prev_i = change[0].i - 1
            prev_token = None if prev_i < 0 else doc[prev_i]
            if prev_token and prev_token not in used and prev_token.pos_ == "NOUN":
                used.append(prev_token)
                trait.append(prev_token)
            else:
                # Look for "in" (i.e. "increase in ...")
                for child in change[0].children:
                    if child in used:
                        continue
                    if child.pos_ == "ADP":
                        # Materialise first: a generator is always truthy, so
                        # emptiness has to be checked on the list.
                        children = list(child.children)
                        if children and children[0] not in used:
                            used.append(children[0])
                            trait.append(children[0])
        else:
            for token in doc[l_i:r_i+1]:
                if token.head and self.keywords.is_change(token.head):
                    trait.append(token)
                    used.append(token)

                    possible_species = self.possession.get_owner(trait)
                    if self.species.contains_species(possible_species):
                        for sp in possible_species:
                            if token in sp.ancestors:
                                species.append(sp)
                                used.append(sp)
                                break
                    break

        # Find Cause (pass 2): cause pronouns set a flag; cause adpositions
        # contribute the phrase after them if it contains a noun-like token.
        is_cause = False
        for token in doc[l_i:r_i+1]:
            if token not in used and token.pos_ in ["PRON"] and self.keywords.is_cause(token):
                is_cause = True
                used.append(token)
            elif token not in used and token.pos_ in ["ADP"] and self.keywords.is_cause(token):
                end_i = token.i + 1
                buffer = []
                noun_found = False
                while end_i <= r_i and doc[end_i] not in used and doc[end_i].pos_ in cause_pos:
                    if doc[end_i].pos_ in ["NOUN", "PROPN", "PRON"]:
                        noun_found = True
                    buffer.append(doc[end_i])
                    end_i += 1
                if noun_found:
                    # Separate name for the inner loop: the adposition itself
                    # (``token``) must be the one marked as used afterwards.
                    for cause_token in buffer:
                        used.append(cause_token)
                        cause.append(cause_token)
                    used.append(token)

        unit = Unit(species=species, trait=trait, change=change, cause=cause)
        unit.is_cause = is_cause
        return unit

    def parse_sentence(self, l_i, r_i):
        """Parse tokens ``l_i..r_i`` (inclusive) into one merged ``Unit``."""
        units = []

        # Split: the first verb of a range divides it into a left part, the
        # verb, and a right part; verb-free ranges go to parse_segment. The
        # result alternates unit, verb, unit, ..., unit.
        def recursive_split(r_l_i, r_r_i):
            verb = None
            for token in self.tools.doc[r_l_i:r_r_i+1]:
                if token.pos_ == "VERB":
                    verb = token
                    break

            if verb is None:
                units.append(self.parse_segment(r_l_i, r_r_i))
            else:
                recursive_split(r_l_i, verb.i - 1)
                units.append(verb)
                recursive_split(verb.i + 1, r_r_i)
        recursive_split(l_i, r_i)

        # Merge: repeatedly fold the leading (unit, verb, unit) triple into a
        # single unit until only one unit is left.
        while len(units) >= 3:
            l_unit, verb, r_unit = units[0], units[1], units[2]
            verb_is_change = self.keywords.is_change(verb)

            if l_unit.empty() and r_unit.empty():
                m_unit = Unit()
            elif l_unit.not_empty() and r_unit.empty():
                if verb_is_change:
                    l_unit.change.append(verb)
                m_unit = l_unit
            elif r_unit.not_empty() and l_unit.empty():
                if verb_is_change:
                    r_unit.change.append(verb)
                m_unit = r_unit
            elif l_unit.can_merge(r_unit):
                # No overlapping parts: combine both sides.
                l_unit.merge(r_unit)
                if verb_is_change:
                    l_unit.change.append(verb)
                m_unit = l_unit
            elif verb_is_change:
                # Overlap around a change verb: the left side becomes the
                # cause of the right side.
                r_unit.cause = l_unit
                r_unit.change.append(verb)
                m_unit = r_unit
            else:
                # Overlap without a change verb: keep the fuller side.
                if l_unit.get_score() >= r_unit.get_score():
                    m_unit = l_unit
                else:
                    m_unit = r_unit
            units = [m_unit] + units[3:]

        assert len(units) == 1
        return units[0]

    def parse(self):
        """Parse every sentence of the current document, printing as it goes."""
        units = []
        for sent in self.tools.doc.sents:
            print(f"Sentence: {self.tools.doc[sent.start:sent.end].text}")
            unit = self.parse_sentence(sent.start, sent.end - 1)
            print(unit)
            units.append(unit)
            print()
        return units

In [10]:
# Build the parser (loads the models) once and time it.
t0 = time.perf_counter()
parser = Parser()
t1 = time.perf_counter()
print(f"Initialization Took {t1 - t0}s")

# The pipeline runs on the GPU (see spacy.require_gpu() above), so the texts
# are processed in a single process: spaCy warns that multiprocessing
# (n_process > 1) with GPU models is not recommended and may lead to errors.
# NOTE(review): the input is a single empty string — presumably a placeholder
# for the real texts.
for doc in parser.tools.nlp.pipe([""]):
    # Update
    t0 = time.perf_counter()
    parser.update(doc)
    t1 = time.perf_counter()
    print(f"Update Took {t1 - t0}s")

    # Parse
    t0 = time.perf_counter()
    units = parser.parse()
    t1 = time.perf_counter()
    print(f"Parsing Took {t1 - t0}s")

05/04/2025 12:00:07 - INFO - 	 GPU is available
05/04/2025 12:00:07 - INFO - 	 TaxoNERD will use GPU


ValueError: [E002] Can't find factory for 'fastcoref' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer