In [1]:
import re
import time
import spacy
import textacy
import requests
from pprint import pprint
from fastcoref import FCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# spacy.require_gpu()

In [3]:
class Main:
    """Owns the NLP pipelines and the documents parsed from the current text.

    Attributes:
        lg_nlp: spaCy pipeline loaded from "en_core_web_lg".
        sp_nlp: spaCy pipeline loaded from "en_core_web_trf".
        tn_nlp: TaxoNERD pipeline loaded from "en_ner_eco_biobert".
        fcoref: FCoref instance.
        sp_doc: Result of running ``sp_nlp`` on the current text, or None.
        tn_doc: Result of running ``tn_nlp`` on the current text, or None.
        tk_map: Maps a token's character offset (``token.idx``) in ``sp_doc``
            to its token index (``token.i``), or None before any text is set.
    """

    def __init__(self, *, text=None):
        """Load every model and, when ``text`` is truthy, parse it right away."""
        self.lg_nlp = spacy.load("en_core_web_lg")
        self.sp_nlp = spacy.load("en_core_web_trf")
        self.tn_nlp = TaxoNERD(prefer_gpu=False).load(
            model="en_ner_eco_biobert",
            exclude=["tagger", "parser", "attribute_ruler", "lemmatizer"],
        )
        self.fcoref = FCoref(enable_progress_bar=False)
        self.sp_doc = None
        self.tn_doc = None
        self.tk_map = None
        if text:
            self.update(text)

    @staticmethod
    def clean_text(text):
        """Return ``text`` with bracketed asides removed and spacing normalised.

        Three passes are applied in order: drop anything enclosed in
        parentheses or square brackets, collapse whitespace runs into a single
        space, and remove whitespace that precedes ``?``, ``.``, ``!`` or ``,``.
        """
        # Raw strings keep the backslashes for the regex engine instead of
        # relying on Python passing unknown escape sequences through.
        cleaned_text = re.sub(r"[\(\[].*?[\)\]]", "", text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
        cleaned_text = re.sub(r"\s+([?.!,])", r"\1", cleaned_text)
        return cleaned_text

    def update(self, text):
        """Parse ``text`` with both pipelines and rebuild the token map."""
        self.sp_doc = self.sp_nlp(text)
        self.tn_doc = self.tn_nlp(text)
        self.tk_map = self.load_token_map()

    def load_token_map(self):
        """Return a dict mapping ``token.idx`` to ``token.i`` for ``sp_doc``."""
        return {token.idx: token.i for token in self.sp_doc}

In [4]:
class References:
    """Coreference lookup built from FCoref predictions.

    Attributes:
        main: Object providing ``fcoref``, ``sp_doc`` and ``tk_map``.
        predictions: Raw result of ``main.fcoref.predict`` or None.
        cluster_map: Maps a token index to the other tokens of its
            coreference cluster, or None until ``update`` has run.
    """

    def __init__(self, main, texts=None):
        """Store ``main`` and, when ``texts`` is non-empty, run ``update``."""
        self.main = main
        self.predictions = None
        self.cluster_map = None
        if texts:
            self.update(texts)

    def update(self, texts):
        """Predict clusters for ``texts`` and rebuild ``cluster_map``.

        Does nothing when ``main.sp_doc`` is falsy.
        """
        if not self.main.sp_doc:
            return
        self.predictions = self.main.fcoref.predict(texts=texts)
        self.cluster_map = self.load_cluster_map(self.predictions)

    def load_cluster_map(self, predictions):
        """Build the token-index -> co-referring-tokens map.

        Each span's first element is looked up in ``main.tk_map`` to find the
        token it starts on; only that token represents the span.

        Raises:
            Exception: "Invalid Token" when a span start is not a key of
                ``main.tk_map``.
        """
        cluster_map = {}
        for prediction in predictions:
            for cluster in prediction.get_clusters(as_strings=False):
                # Converting Spans to Tokens
                token_cluster = []
                for span in cluster:
                    start = span[0]
                    if start not in self.main.tk_map:
                        raise Exception("Invalid Token")
                    token_cluster.append(self.main.sp_doc[self.main.tk_map[start]])
                # Mapping: every token points at the rest of its cluster.
                for token in token_cluster:
                    cluster_map[token.i] = [t for t in token_cluster if t != token]
        return cluster_map

    def get_references(self, tokens):
        """Return the concatenated cluster-mates of every token in ``tokens``.

        Returns an empty list when no cluster map has been built yet, instead
        of failing on the membership test against None.
        """
        if not self.cluster_map:
            return []
        refs = []
        for token in tokens:
            refs += self.cluster_map.get(token.i, [])
        return refs

In [5]:
class Possession:
    """Extracts owner/owned token pairs from ``main.sp_doc`` with a DependencyMatcher.

    There's no definite names for these patterns as I do not know what
    to call them. These patterns are used to extract possessive
    relationships from a sentence. I also could not find better names for
    the two variables below.

    Attributes:
        owner_map: Maps an owner token index to the list of tokens it owns,
            or None until ``update`` has run on a document.
        owned_map: Maps an owned token index to the list of its owner tokens,
            or None until ``update`` has run on a document.
    """

    OWNER = "owner"
    OWNED = "owned"

    patterns = {
        # OWNED noun with a direct "poss" child as OWNER.
        "Pattern1": [
            {
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "PROPN"]}}
            },
            {
                "LEFT_ID": OWNED,
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {"DEP": "poss"}
            }
        ],
        # OWNED noun -> prep (ADP) -> pobj noun as OWNER.
        "Pattern2": [
            {
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "PROPN"]}}
            },
            {
                "LEFT_ID": OWNED,
                "REL_OP": ">",
                "RIGHT_ID": "adp",
                "RIGHT_ATTRS": {"DEP": "prep", "POS": {"IN": ["ADP"]}}
            },
            {
                "LEFT_ID": "adp",
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {"DEP": "pobj", "POS": {"IN": ["NOUN", "PROPN"]}}
            }
        ],
        # Verb with a pronoun nsubj as OWNER and a noun dobj as OWNED.
        "Pattern3": [
            {
                "RIGHT_ID": "verb",
                "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {"DEP": "nsubj", "POS": {"IN": ["PRON"]}}
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {"DEP": "dobj", "POS": {"IN": ["NOUN", "PROPN"]}}
            }
        ],
        # Verb with a noun nsubj as OWNED and prep -> pobj noun as OWNER.
        "Pattern4": [
            {
                "RIGHT_ID": "verb",
                "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": OWNED,
                "RIGHT_ATTRS": {"DEP": "nsubj", "POS": {"IN": ["NOUN", "PROPN"]}}
            },
            {
                "LEFT_ID": "verb",
                "REL_OP": ">",
                "RIGHT_ID": "adp",
                "RIGHT_ATTRS": {"DEP": "prep", "POS": {"IN": ["ADP"]}}
            },
            {
                "LEFT_ID": "adp",
                "REL_OP": ">",
                "RIGHT_ID": OWNER,
                "RIGHT_ATTRS": {"DEP": "pobj", "POS": {"IN": ["NOUN", "PROPN"]}}
            }
        ],
    }

    def __init__(self, main):
        """Register every pattern on a DependencyMatcher and run ``update``."""
        self.main = main
        self.matcher = DependencyMatcher(self.main.sp_nlp.vocab)
        for pattern_id, pattern in Possession.patterns.items():
            self.matcher.add(pattern_id, [pattern])
        self.owner_map = None
        self.owned_map = None
        self.update()

    def update(self):
        """Re-run the matcher on ``main.sp_doc``; no-op when the doc is falsy."""
        if not self.main.sp_doc:
            return
        matches = self.matcher(self.main.sp_doc)
        owner_map, owned_map = self.load_ownership_maps(matches)
        self.owner_map = owner_map  # Maps Owner to Owned
        self.owned_map = owned_map  # Maps Owned to Owner

    def load_ownership_maps(self, matches):
        """Turn matcher results into ``(owner_map, owned_map)``.

        Args:
            matches: Iterable of ``(match_id, token_ids)``; ``token_ids`` are
                read positionally against the nodes of the matched pattern.

        Returns:
            Tuple of two dicts keyed by token index, each holding a list of
            tokens taken from ``main.sp_doc``.
        """
        owner_map = {}
        owned_map = {}

        for match_id, token_ids in matches:
            pattern_id = self.main.sp_nlp.vocab.strings[match_id]
            owner = None
            owned = None
            # token_ids[i] is the token matched by the i-th node of the pattern.
            for node, token_id in zip(Possession.patterns[pattern_id], token_ids):
                right_id = node["RIGHT_ID"]
                if right_id == Possession.OWNER:
                    owner = self.main.sp_doc[token_id]
                if right_id == Possession.OWNED:
                    owned = self.main.sp_doc[token_id]

            # Every pattern above names both roles; skip defensively if a
            # match is missing one rather than dereferencing None.
            if owner is None or owned is None:
                continue

            owner_map.setdefault(owner.i, []).append(owned)  # Owner to Owned
            owned_map.setdefault(owned.i, []).append(owner)  # Owned to Owner

        return (owner_map, owned_map)

    def get_owner(self, tokens):
        """Return the owners of every token in ``tokens`` (empty if no map yet)."""
        if not self.owned_map:
            return []
        owners = []
        for token in tokens:
            owners += self.owned_map.get(token.i, [])
        return owners

    def get_owned(self, tokens):
        """Return the tokens owned by any token in ``tokens`` (empty if no map yet)."""
        if not self.owner_map:
            return []
        owned = []
        for token in tokens:
            owned += self.owner_map.get(token.i, [])
        return owned

In [6]:
class Species:
    """Finds which tokens of ``main.sp_doc`` denote a species.

    Tokens are collected from TaxoNERD entities first; remaining nouns are
    then checked against the iNaturalist search API.

    Attributes:
        species_indices: List of token indices, or None until ``update`` has
            run on a document.
    """

    SEARCH_URL = "https://api.inaturalist.org/v1/search"
    # Seconds to wait for the API before giving up on one lookup.
    REQUEST_TIMEOUT = 10

    def __init__(self, main):
        """Store ``main`` and run ``update``."""
        self.main = main
        self.species_indices = None
        self.update()

    def update(self):
        """Recompute ``species_indices``; no-op when ``main.sp_doc`` is falsy."""
        if not self.main.sp_doc:
            return
        self.species_indices = self.load_species_indices()

    def _fetch_taxon_names(self, query):
        """Return the lower-cased record names the API lists for ``query``.

        The lookup is best-effort: network errors, undecodable JSON, or an
        unexpected payload shape end the lookup and whatever names were
        collected so far (possibly none) are returned.
        """
        names = []
        try:
            response = requests.get(
                self.SEARCH_URL,
                # Passing params lets requests URL-encode the query text.
                params={
                    "q": query,
                    "sources": "taxa",
                    "include_taxon_ancestors": "false",
                },
                timeout=self.REQUEST_TIMEOUT,
            )
            for result in response.json()["results"]:
                if "record" not in result or "name" not in result["record"]:
                    continue
                names.append(result["record"]["name"].lower())
        except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
            pass
        return names

    def load_species_indices(self):
        """Return the indices of species tokens, each index listed once.

        Raises:
            Exception: "Invalid Token" when a TaxoNERD token offset is not a
                key of ``main.tk_map``.
        """
        indices = []
        seen = set()

        # Try TaxoNERD
        for species_span in self.main.tn_doc.ents:
            for species in species_span:
                if species.idx not in self.main.tk_map:
                    raise Exception("Invalid Token")
                index = self.main.tk_map[species.idx]
                if index not in seen:
                    seen.add(index)
                    indices.append(index)

        # Try API Call
        lowered_text = self.main.sp_doc.text.lower()
        names_by_lemma = {}  # one request per distinct lemma
        for token in self.main.sp_doc:
            if token.pos_ not in ["NOUN", "PROPN"] or token.i in seen:
                continue
            lemma = token.lemma_
            if lemma not in names_by_lemma:
                names_by_lemma[lemma] = self._fetch_taxon_names(lemma)
            # The token counts as a species when any returned name occurs
            # somewhere in the document text.
            if any(name in lowered_text for name in names_by_lemma[lemma]):
                seen.add(token.i)
                indices.append(token.i)

        return indices

    def is_species(self, token):
        """Return True when ``token.i`` is a known species index."""
        return token.i in (self.species_indices or [])

    def has_species(self, tokens):
        """Return True when any token in ``tokens`` is a known species."""
        known = self.species_indices or []
        return any(token.i in known for token in tokens)

In [7]:
class Keywords:
    """Classifies tokens of ``main.sp_doc`` as unit / quantitative / change / cause / trait.

    Each category has a set of literal lemmas, the ``main.lg_nlp`` documents
    of those literals (used for similarity comparison), and a list of the
    token indices currently assigned to the category.
    """

    # A lemma is treated as matching a keyword above this similarity.
    SIMILARITY_THRESHOLD = 0.7

    def __init__(self, main):
        """Build the keyword documents for every category and run ``update``."""
        self.main = main
        # Unit
        self.unit_literals = {"unit", "%", "percent"}
        self.unit_keywords = [self.main.lg_nlp(literal) for literal in self.unit_literals]
        self.unit_indices = []
        # Quantitative
        self.quantitative_literals = {"tenfold", "half", "double", "triple", "quadruple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "higher", "lower"}
        self.quantitative_keywords = [self.main.lg_nlp(literal) for literal in self.quantitative_literals]
        self.quantitative_indices = []
        # Change
        self.change_literals = {"increase", "decrease", "change", "weaken", "shift", "cause", "produce"}
        self.change_keywords = [self.main.lg_nlp(literal) for literal in self.change_literals]
        self.change_indices = []
        # Cause
        self.cause_literals = {"thus"}
        self.cause_keywords = [self.main.lg_nlp(literal) for literal in self.cause_literals]
        self.cause_indices = []
        # Trait
        self.trait_literals = {"diet"}
        self.trait_keywords = [self.main.lg_nlp(literal) for literal in self.trait_literals]
        self.trait_indices = []
        # Update
        self.update()

    def update(self):
        """Recompute every index list; no-op when ``main.sp_doc`` is falsy.

        Order matters: quantitative detection reads ``unit_indices`` and cause
        detection reads ``change_indices``.
        """
        if not self.main.sp_doc:
            return
        self.unit_indices = self.load_unit_indices()
        self.quantitative_indices = self.load_quantitative_indices()
        self.change_indices = self.load_change_indices()
        self.cause_indices = self.load_cause_indices()
        self.trait_indices = self.load_trait_indices()

    def _is_similar(self, lemma_text, keywords):
        """Return True when ``lemma_text`` is similar enough to any keyword doc."""
        lemma = self.main.lg_nlp(lemma_text)
        return any(
            keyword.similarity(lemma) > self.SIMILARITY_THRESHOLD
            for keyword in keywords
        )

    @staticmethod
    def _any_in(tokens, indices):
        """Return True when any token's index is contained in ``indices``."""
        return any(token.i in indices for token in tokens)

    def is_unit(self, token):
        """Return True when ``token`` was classified as a unit."""
        return token.i in self.unit_indices

    def has_unit(self, tokens):
        """Return True when any of ``tokens`` was classified as a unit."""
        return self._any_in(tokens, self.unit_indices)

    def load_unit_indices(self):
        """Return indices of NOUN tokens that are, or resemble, a unit literal."""
        indices = []
        for token in self.main.sp_doc:
            if token.pos_ not in ["NOUN"]:
                continue
            # Fast Check, then similarity; either way the index is added once.
            if token.lemma_ in self.unit_literals or self._is_similar(token.lemma_, self.unit_keywords):
                indices.append(token.i)
        return indices

    def is_change(self, token):
        """Return True when ``token`` was classified as a change word."""
        return token.i in self.change_indices

    def has_change(self, tokens):
        """Return True when any of ``tokens`` was classified as a change word."""
        return self._any_in(tokens, self.change_indices)

    def load_change_indices(self):
        """Return indices of NOUN/VERB/ADP tokens that express a change."""
        indices = []
        for token in self.main.sp_doc:
            if token.pos_ not in ["NOUN", "VERB", "ADP"]:
                continue
            # Fast Check
            if token.lemma_ in self.change_literals:
                indices.append(token.i)
                continue
            # Handling Case: "from" ... "to" ...
            # A "to" whose head is already a change word counts only when
            # that head also has a "from" child.
            if token.lower_ == "to" and token.head and token.head.i in indices:
                if any(child.lower_ == "from" for child in token.head.children):
                    indices.append(token.i)
                continue
            # Comparing Similarity
            if self._is_similar(token.lemma_, self.change_keywords):
                indices.append(token.i)
        return indices

    def is_quantitative(self, token):
        """Return True when ``token`` was classified as quantitative."""
        return token.i in self.quantitative_indices

    def has_quantitative(self, tokens):
        """Return True when any of ``tokens`` was classified as quantitative."""
        return self._any_in(tokens, self.quantitative_indices)

    def load_quantitative_indices(self):
        """Return indices of NOUN/NUM tokens that express a quantity."""
        indices = []
        for token in self.main.sp_doc:
            if token.pos_ not in ["NOUN", "NUM"]:
                continue
            # Fast Check
            if token.lemma_ in self.quantitative_literals:
                indices.append(token.i)
                continue
            # Comparing Similarity
            if self._is_similar(token.lemma_, self.quantitative_keywords):
                # We need to make sure that any noun that the
                # quantitative token modifies (the unit) is a unit.
                if token.head and self.is_unit(token.head):
                    indices.append(token.i)
        return indices

    def is_cause(self, token):
        """Return True when ``token`` was classified as a cause marker."""
        return token.i in self.cause_indices

    def has_cause(self, tokens):
        """Return True when any of ``tokens`` was classified as a cause marker."""
        return self._any_in(tokens, self.cause_indices)

    def load_cause_indices(self):
        """Return indices of ADP/SCONJ/PART/PRON/ADV tokens that mark a cause."""
        indices = []
        doc = self.main.sp_doc
        for token in doc:
            if token.pos_ not in ["ADP", "SCONJ", "PART", "PRON", "ADV"]:
                continue
            # Fast Check
            if token.lemma_ in self.cause_literals:
                indices.append(token.i)
            elif token.pos_ == "SCONJ":
                # Every subordinating conjunction is accepted.
                indices.append(token.i)
            elif token.pos_ == "PART":
                # Particle attached to a verb.
                if token.head and token.head.pos_ == "VERB":
                    indices.append(token.i)
            elif token.pos_ == "ADP":
                if token.lower_ == "due" and token.i + 1 < len(doc) and doc[token.i + 1].lower_ == "to":
                    # "due to"
                    indices.append(token.i)
                elif token.lower_ == "in" and token.i + 1 < len(doc) and doc[token.i + 1].lower_ == "response":
                    # "in response"
                    indices.append(token.i)
                elif token.head:
                    if token.head.pos_ == "AUX":
                        # The head is an AUX
                        indices.append(token.i)
                    elif token.head.pos_ == "VERB" and token.head.i < token.i and self.is_change(token.head):
                        # The head is a change-VERB that precedes the token
                        indices.append(token.i)
                    elif token.lower_ != "to" and "AUX" in [child.pos_ for child in token.head.children if child.i < token.i]:
                        # There's an AUX among the head's children before the token
                        indices.append(token.i)
                elif token.ancestors:
                    # NOTE(review): only reached when token.head is falsy;
                    # presumably that never happens for real spaCy tokens - confirm.
                    if "AUX" in [ancestor.pos_ for ancestor in token.ancestors]:
                        indices.append(token.i)
            elif token.pos_ == "PRON":
                if token.head and token.head.pos_ == "VERB" and self.is_change(token.head):
                    indices.append(token.i)
        return indices

    def is_trait(self, token):
        """Return True when ``token`` was classified as a trait."""
        return token.i in self.trait_indices

    def has_trait(self, tokens):
        """Return True when any of ``tokens`` was classified as a trait."""
        return self._any_in(tokens, self.trait_indices)

    def load_trait_indices(self):
        """Return indices of NOUN tokens that are, or resemble, a trait literal."""
        indices = []
        for token in self.main.sp_doc:
            if token.pos_ not in ["NOUN"]:
                continue
            # Fast Check, then similarity; either way the index is added once.
            if token.lemma_ in self.trait_literals or self._is_similar(token.lemma_, self.trait_keywords):
                indices.append(token.i)
        return indices

In [8]:
class Context:
    """Finds the tokens that accompany a token: its noun chunk, comma list and conjunct.

    Attributes:
        chunk_map: Maps a token index to the NOUN/PROPN/ADJ tokens of the noun
            chunk containing it, or None until ``update`` has run on a document.
    """

    def __init__(self, main):
        """Store ``main`` and run ``update`` so the chunk map is ready when a doc exists."""
        self.main = main
        self.chunk_map = None
        self.update()

    def update(self):
        """Rebuild ``chunk_map``; no-op when ``main.sp_doc`` is falsy."""
        if not self.main.sp_doc:
            return
        self.chunk_map = self.load_chunk_map()

    def load_chunk_map(self):
        """Return a dict from token index to the content tokens of its noun chunk."""
        chunk_map = {}
        for chunk in self.main.sp_doc.noun_chunks:
            content = [t for t in chunk if t.pos_ in ["NOUN", "PROPN", "ADJ"]]
            for token in chunk:
                # Each token gets its own list object.
                chunk_map[token.i] = list(content)
        return chunk_map

    def get_chunk(self, token):
        """Return the chunk tokens recorded for ``token``, or None when there are none."""
        if not self.chunk_map or token.i not in self.chunk_map:
            return None
        return self.chunk_map[token.i]

    def get_list(self, token, max_i=None):
        """Return the non-punctuation tokens of a comma-separated list starting at ``token``.

        The scan starts only when the token right after ``token`` is a comma
        and walks forward (up to index ``max_i``, default: last token of the
        doc) over commas, ADJ/NOUN/PROPN/PRON tokens, "-", "/" and
        coordinating conjunctions other than "but". Returns ``[]`` when no
        list is recognised.
        """
        max_i = len(self.main.sp_doc) - 1 if not max_i else max_i

        if not token or token.i >= max_i:
            return []
        if token.nbor().text != ",":
            return []

        and_or_found = False
        conjunction_found = False
        offset_at_last_comma = -1
        number_commas_found = 0

        offset = 1
        while token.i + offset <= max_i:
            nbor = token.nbor(offset)
            if nbor.text == ",":
                # STOP: a comma attached to a head further right ends the list.
                if nbor.head.i > nbor.i:
                    break
                offset_at_last_comma = offset
                number_commas_found += 1
                offset += 1
                continue
            if nbor.pos_ in ["ADJ", "NOUN", "PROPN", "PRON"] or nbor.lower_ in ["-", "/"]:
                offset += 1
                continue
            if nbor.pos_ == "CCONJ" and nbor.lower_ not in ["but"]:
                if nbor.lower_ in ["and", "or"]:
                    if and_or_found:
                        # A second "and"/"or" ends the list.
                        break
                    # NOTE(review): offset_at_last_comma is always smaller than
                    # offset here, so comparing it with offset + 1 looks like it
                    # can never be true (offset - 1 may have been intended) - confirm.
                    elif number_commas_found == 1 and not conjunction_found and offset_at_last_comma == offset + 1:
                        offset = offset_at_last_comma - 1
                        break
                    and_or_found = True
                conjunction_found = True
                offset += 1
                continue
            break
        if offset <= 2:
            return []
        return [t for t in self.main.sp_doc[token.i:token.i + offset] if t.pos_ not in ["PUNCT"]]

    def get_conjunct(self, token):
        """Return the tokens conjoined to ``token`` through "and"/"or".

        Returns the chunk of the first ``conj`` child when one is recorded,
        otherwise that child alone; ``[]`` when ``token`` has no "and"/"or"
        child or no ``conj`` child.
        """
        has_conj = any(
            child.pos_ == "CCONJ" and child.lower_ in ["and", "or"]
            for child in token.children
        )
        if not has_conj:
            return []
        for child in token.children:
            if child.dep_ == "conj":
                # chunk_map may still be None when no document was loaded.
                if self.chunk_map and child.i in self.chunk_map:
                    return self.chunk_map[child.i]
                return [child]
        return []

    def get_conjunct_and_list(self, token, max_i=None):
        """Return ``get_list`` followed by ``get_conjunct`` for ``token``."""
        trail = self.get_list(token, max_i)
        conjuct = self.get_conjunct(token)
        if conjuct:
            trail += conjuct
        return trail

    def get_context(self, token, max_i=None):
        """Return list + conjunct + chunk tokens for ``token``.

        The chunk part excludes ``token`` itself.
        """
        context = self.get_list(token, max_i)
        conjuct = self.get_conjunct(token)
        if conjuct:
            context += conjuct
        chunk = self.get_chunk(token)
        if chunk:
            context += [t for t in chunk if t.i != token.i]
        return context

In [9]:
class Unit:
    """One extracted piece of information: species, trait, cause and change parts."""

    # Attribute names of the four parts, in the order they are inspected.
    _FIELDS = ("species", "trait", "cause", "change")

    def __init__(self, *, species=None, trait=None, change=None, cause=None):
        self.species, self.trait = species, trait
        self.cause, self.change = cause, change

    def empty(self):
        """Return True when none of the four parts holds a truthy value."""
        return not any(getattr(self, field) for field in Unit._FIELDS)

    def not_empty(self):
        """Return the negation of ``empty``."""
        return not self.empty()

    def can_merge(self, unit):
        """Return True when no part is filled in on both ``self`` and ``unit``."""
        overlapping = (
            getattr(self, field) and getattr(unit, field)
            for field in Unit._FIELDS
        )
        return not any(overlapping)

    def merge(self, unit):
        """Copy every truthy part of ``unit`` onto ``self``.

        Parts that ``unit`` leaves empty are kept as they are; a part filled
        on both sides is overwritten by ``unit``'s value.
        """
        for field in Unit._FIELDS:
            incoming = getattr(unit, field)
            if incoming:
                setattr(self, field, incoming)

    def get_score(self):
        """Return how many of the four parts are truthy (0 to 4)."""
        return sum(1 for field in Unit._FIELDS if getattr(self, field))

    def __str__(self):
        return f"Species: {self.species}, Trait: {self.trait}, Cause: ({self.cause}), Change: {self.change}"

In [10]:
class Parser:
    def __init__(self, *, text=None, main=None, species=None, possession=None, references=None, keywords=None, context=None):
        self.main = main if main else Main(text=text)
        self.species = species if species else Species(self.main)
        self.context = context if context else Context(self.main)
        self.keywords = keywords if keywords else Keywords(self.main)
        self.possession = possession if possession else Possession(self.main)
        self.references = references if references else References(self.main, texts=list(filter(lambda t: t is not None,[text])))
            
    def update(self, text):
        self.main.update(text)
        self.species.update()
        self.context.update()
        self.keywords.update()
        self.possession.update()
        self.references.update(texts=[text])

    def parse_species(self, tokens, species, used):
        for token in tokens:
            if token in used:
                continue
            if self.species.is_species(token):
                if token.head:
                    if token.head.pos_ in ["SCONJ", "ADP"] and token.head.lower_ != "of":
                        continue
                ancestors = [t for t in token.ancestors]
                if self.keywords.has_cause(ancestors):
                    # print("!!!")
                    continue
                full_species = self.context.get_context(token)
                used.update([token, *full_species])
                species.update([token, *full_species])
                break

    def parse_change(self, tokens, change, used):
        for token in tokens:
            if token in used:
                continue
            if self.keywords.is_change(token):
                used.add(token)
                change.add(token)
                # I only want one word that represents
                # the change for simplicity
                break

        for token in tokens:
            if token in used:
                continue
            if token.pos_ == "ADP" and token.lower_ != "of":
                end_i = token.i + 1
                contenders = []
                while end_i <= tokens[-1].i and self.main.sp_doc[end_i] not in used:
                    if self.main.sp_doc[end_i].pos_ not in ["NUM", "SYM", "NOUN", "ADP", "DET"]:
                        break
                    contenders.append(self.main.sp_doc[end_i])
                    end_i += 1
                if not self.keywords.has_quantitative(contenders) and not self.keywords.has_change(contenders):
                    continue
                used.update([token, *contenders])
                change.update([token, *contenders])
                break

    def parse_trait(self, tokens, trait, used, species, change):
        if species:
            contenders = list(filter(lambda t: t.i >= tokens[0].i and t.i <= tokens[-1].i and t not in used, self.possession.get_owned(species)))
            for token in contenders:
                for ancestor in token.ancestors:
                    if ancestor in used:
                        continue
                    if self.keywords.is_change(ancestor):
                        used.add(token)
                        trait.add(token)
                        break    
        elif change:
            # The trait could be listed before the change (i.e. "diet shifts from ...")
            prev_i = list(change)[0].i - 1
            prev_token = None if prev_i < 0 else self.main.sp_doc[prev_i]
            if prev_token and prev_token not in used and prev_token.pos_ == "NOUN":
                if self.keywords.is_trait(prev_token):
                    used.add(prev_token)
                    trait.add(prev_token)
            else:
                # Look for "in" (i.e. "increase in ...")
                for ch in change:
                    if ch in used or not self.keywords.is_change(ch):
                        continue
                    for child in ch.children:
                        if child in used:
                            continue
                        if child.pos_ == "ADP" and child.children:
                            children = list(child.children)
                            if children[0] not in used:
                                used.add(children[0])
                                trait.add(children[0])
        else:
            for token in tokens:
                if token in used:
                    continue
                if token.pos_ in ["NOUN"] and token.head and self.keywords.is_change(token.head):
                    used.add(token)
                    trait.add(token)                        
                    break

        if trait:
            full_trait = self.context.get_context(list(trait)[0])
            for t in full_trait:
                if t not in used and t.pos_ == "NOUN":
                    used.add(t)
                    trait.add(t)

    def parse_species_by_trait(self, tokens, species, used, trait):
        species_contenders = list(filter(lambda t: t.i >= tokens[0].i and t.i <= tokens[-1].i and t not in used, self.possession.get_owner(trait)))
        for sp in species_contenders:
            if self.species.is_species(sp):
                full_sp = self.context.get_context(sp)
                used.update(full_sp)
                species.update(full_sp)

    def parse_cause(self, tokens, cause, used):
        for token in tokens:
            if token in used:
                continue
            if token.pos_ in ["SCONJ", "ADV"] and self.keywords.is_cause(token):
                end_i = token.i + 1
                # Expanding (Doesn't Handle Conjunctions)
                while end_i <= tokens[-1].i and self.main.sp_doc[end_i] not in used:
                    if self.main.sp_doc[end_i].pos_ not in ["ADP", "DET", "NOUN", "PROPN", "AUX", "ADV", "PRON", "ADJ"]:
                        break
                    used.add(self.main.sp_doc[end_i])
                    cause.add(self.main.sp_doc[end_i])
                    end_i += 1
                end_i -= 1
                # Expanding (Handles Conjunctions)
                if self.main.sp_doc[end_i].pos_ in ["NOUN", "PROPN"]:
                    context = self.context.get_conjunct_and_list(self.main.sp_doc[end_i])
                    # print(f"Context: {self.main.sp_doc[end_i], context}")
                    for context_token in context:
                        if context_token not in used:
                            used.add(context_token)
                            cause.add(context_token)
                used.add(token)
                cause.add(token)

    def parse_cause_by_ADP(self, tokens, cause, used):
        """Collect cause phrases that are introduced by a causal adposition.

        For every unused ADP token accepted by ``self.keywords.is_cause``, the
        tokens that follow it are gathered for as long as they stay inside
        ``tokens``, are unused, and carry one of the phrase-like parts of
        speech.  When the gathered run ends on a NOUN or PROPN, the unused
        tokens returned by ``self.context.get_conjunct_and_list`` for that
        token are appended too.  The adposition and everything gathered are
        committed to ``cause`` and ``used`` only if the run contained a NOUN,
        PROPN or PRON.  A causal PRON token is only marked as used.

        Args:
            tokens: Ordered tokens of the segment being parsed.
            cause: Set of cause tokens; updated in place.
            used: Set of tokens that are already claimed; updated in place.
        """
        phrase_pos = ("ADP", "DET", "NOUN", "PROPN", "AUX", "ADV", "PRON", "ADJ")
        nominal_pos = ("NOUN", "PROPN", "PRON")

        for token in tokens:
            if token in used:
                continue

            if token.pos_ == "PRON":
                # A causal pronoun is consumed but never reported as a cause.
                if self.keywords.is_cause(token):
                    used.add(token)
                continue

            if token.pos_ != "ADP" or not self.keywords.is_cause(token):
                continue

            doc = self.main.sp_doc
            last_i = tokens[-1].i
            cursor = token.i + 1
            phrase = []
            has_nominal = False

            # Expanding: walk right from the adposition while the phrase continues.
            while cursor <= last_i:
                candidate = doc[cursor]
                if candidate in used or candidate.pos_ not in phrase_pos:
                    break
                has_nominal = has_nominal or candidate.pos_ in nominal_pos
                phrase.append(candidate)
                cursor += 1

            # Expanding (Handles Conjunctions): inspect the last gathered token,
            # or the adposition itself when nothing was gathered.
            tail = doc[cursor - 1]
            if tail.pos_ in ("NOUN", "PROPN"):
                phrase.extend(
                    extra
                    for extra in self.context.get_conjunct_and_list(tail)
                    if extra not in used
                )

            if has_nominal:
                claimed = [token, *phrase]
                used.update(claimed)
                cause.update(claimed)
                
    def parse_segment(self, l_i, r_i):
        """Build one ``Unit`` from the tokens ``self.main.sp_doc[l_i:r_i+1]``.

        The individual parsers are run in a fixed order and all share one
        ``claimed`` set, so each pass can see which tokens an earlier pass
        has already taken.

        Args:
            l_i: Index of the first token of the segment.
            r_i: Index of the last token of the segment (inclusive).

        Returns:
            A ``Unit`` built from the collected species, trait, change and
            cause token sets.
        """
        segment = self.main.sp_doc[l_i:r_i+1]
        claimed = set()
        cause, change, species, trait = set(), set(), set(), set()

        # Parse
        self.parse_cause(segment, cause, claimed)
        self.parse_change(segment, change, claimed)
        self.parse_species(segment, species, claimed)
        self.parse_trait(segment, trait, claimed, species, change)
        if len(species) == 0:
            # No species found directly: fall back to the traits that were found.
            self.parse_species_by_trait(segment, species, claimed, trait)
        # Runs last, after the other passes have claimed their tokens.
        self.parse_cause_by_ADP(segment, cause, claimed)

        return Unit(species=species, trait=trait, change=change, cause=cause)

    def parse_sentence(self, l_i, r_i):
        """Parse the sentence spanning tokens ``[l_i, r_i]`` into a single unit.

        The sentence is cut at every VERB token.  Each verb-free stretch
        (possibly empty) is handed to ``parse_segment``; the resulting
        sequence ``unit, verb, unit, verb, ..., unit`` is then folded from
        left to right, one ``(verb, right unit)`` pair at a time.

        Both phases are iterative.  The earlier recursive formulation nested
        one call per verb, so a very long "sentence" (for example badly
        segmented PDF text) could hit Python's recursion limit.

        Args:
            l_i: Index of the sentence's first token in ``self.main.sp_doc``.
            r_i: Index of the sentence's last token (inclusive).

        Returns:
            The single unit that remains after all segments are merged.
        """
        def merge_pair(l_unit, verb, r_unit):
            # Combine the unit left of `verb` with the unit right of it.
            verb_is_change = self.keywords.is_change(verb)

            if l_unit.empty() and r_unit.empty():
                # Nothing on either side: the verb alone carries no unit.
                return Unit()

            if l_unit.not_empty() and r_unit.empty():
                kept = l_unit
            elif r_unit.not_empty() and l_unit.empty():
                kept = r_unit
            elif l_unit.can_merge(r_unit):
                l_unit.merge(r_unit)
                kept = l_unit
            elif verb_is_change:
                # The left unit becomes the cause of the right unit.
                r_unit.cause = l_unit
                kept = r_unit
            else:
                # Unrelated units joined by a non-change verb: keep the
                # higher-scoring one (left wins ties); the verb is dropped.
                if l_unit.get_score() >= r_unit.get_score():
                    return l_unit
                return r_unit

            if verb_is_change:
                kept.change.add(verb)
            return kept

        # Split: the verb is used to divide the "parsing" space, which makes
        # the work simpler.  Every stretch between verbs is parsed on its own.
        units = []
        segment_start = l_i
        for token in self.main.sp_doc[l_i:r_i+1]:
            if token.pos_ != "VERB":
                continue
            units.append(self.parse_segment(segment_start, token.i - 1))
            units.append(token)
            segment_start = token.i + 1
        units.append(self.parse_segment(segment_start, r_i))

        # Merge: put the pieces back together.  `units` always has odd length
        # (unit, verb, unit, ...), so stepping by two visits every verb.
        merged = units[0]
        for verb_pos in range(1, len(units) - 1, 2):
            merged = merge_pair(merged, units[verb_pos], units[verb_pos + 1])
        return merged

    def parse(self):
        """Parse every sentence of the loaded document.

        Returns:
            A list with the result of ``parse_sentence`` for each sentence in
            ``self.main.sp_doc.sents``, in iteration order.
        """
        # parse_sentence takes an inclusive right index, hence ``end - 1``.
        return [
            self.parse_sentence(sentence.start, sentence.end - 1)
            for sentence in self.main.sp_doc.sents
        ]

In [11]:
# Sample sentences for trying the parser by hand. Each one goes through
# Main.clean_text, which removes parenthesised/bracketed spans, collapses
# whitespace, and removes whitespace before punctuation.
text00 = Main.clean_text("Acridoidea exhibited significant diet shifts from grass to herbs (Kruskal-Wallis test, P 0.01, df 3) when they were in the presence of the comparatively sedentary species (the smaller Pisaurina and the larger Hogna) compared to controls without spiders (Fig. 2).")
text01 = Main.clean_text("Our results show that phototrophs can indirectly decrease the population density of heterotrophic bacteria by modification of the nature of bacterial interactions with predators.")
text02 = Main.clean_text("Our results show that Selachii can indirectly decrease the population density of Selachimorpha by modification of the nature of bacterial interactions with predators.")
text03 = Main.clean_text("All predators inflicted significant mortality on the prey at each prey density compared to the predator-free control for that density")
text04 = Main.clean_text("Our results show that an increase in sediment organic matter content is associated to a decline in the abundance of Loripes lucinalis (lucinid bivalve) in the Cymodocea nodosa meadows studied, which potentially may weaken the mutualism between the two species.")
text05 = Main.clean_text("The abundance of lucinids showed a negative correlation with the organic matter content in vegetated sediments (Fig. 3a), but showed no correlation in bare ones (Fig. 3b).")
text06 = Main.clean_text("The MANOVA on the cattle tank experiment showed that the presence of Tramea, nonlethal Anax, and large bullfrog tadpoles all had significant effects on both small tadpole species (Table 1).")
text07 = Main.clean_text("Thus the presence of predators, both nonlethal Anax and lethal Tramea, modified the tank environment in a way that facilitated invasion by Nematocera, but only in the absence of large bullfrogs.")
text08 = Main.clean_text("We hypothesized that the presence of Anax would decrease foraging activity of small tadpoles, which in turn would decrease predation by Tramea on the small tadpoles.")
# Multi-sentence abstract; triple-quoted because the text itself contains double quotes.
text09 = Main.clean_text('''Only a fraction of the individuals in a given prey population are likely to be killed and consumed by predators. In contrast, nearly all individuals experience the chronic effects of predation risk. When threatened by predators, prey adopt defensive tactics whole costs can lead to reduced growth, maturation rates, survivorship, fecundity, or population density. This nonconsumptive impact of predation risk on prey is known as a "trait-mediated interaction" (TMI) because it results from changes in prey traits such as behavior or physiology. Ecological theory suggests that the strength of TMI effects will reflect a balance between the conflicting demands of reproduction vs. predator avoidance. Competitor density and resource availability are expected to alter the balance between these conflicting forces. We conducted a meta-analysis of experimental studies that measured TMI effect size while varying competitor and/or resource density. The threat of predation had an overall negative effect on prey performance, but the strength of this effect varied with the level of competition. High competition exacerbated the negative effect of intimidation on prey density but moderated the negative effect of intimidation on prey life history and growth. We discuss these results in light of previously published theoretical expectations. Our results highlight the variable and context-dependent nature of interspecific interactions.''')
text10 = Main.clean_text("Current theory on trophic interactions in food webs assumes that ecologically similar species can be treated collectively as a single functional unit such as a guild or trophic level. This theory implies that all species within that unit transmit identical direct and indirect effects throughout the community. We evaluated this assumption by conducting experiments to compare the direct and indirect effects of three top-predator species, belonging to the same hunting spider guild, on the same species of grasshopper and on old-field grasses and herbs. Observations under field conditions revealed that each spider species exhibited different hunting behavior (i.e., sit-and-wait, sit-and-pursue, and active hunting) and occupied different locations within the vegetation canopy. These differences resulted in different direct effects on grasshopper prey. Grasshoppers demonstrated significant behavioral (diet) shifts in the presence of sit-and-wait and sit-and-pursue species but not when faced with actively hunting species. Grasshopper density was significantly reduced by spider species that occupied lower parts of the vegetation canopy (sit-and-pursue and actively hunting species), but it was not significantly reduced by the sit-and-wait spider species that occupied the upper parts of the canopy. These direct effects manifested themselves differently in the plant trophic level. The sit-and-wait spider caused indirect effects on plants by changing grasshopper foraging behavior (a trait-mediated effect). The sit-and-pursue spider caused indirect effects by reducing grasshopper density (density-mediated effects); the effects of changes in grasshopper behavior were thus not reflected in the plant trophic level. The actively hunting spiders had strictly density-mediated indirect effects on plants. The study offers mechanistic insight into how predator species within the same guild can have very different trophic effects in food webs. 
Thus classical modeling approaches that treat all predator species as a single functional unit may not adequately capture biologically relevant details that influence community dynamics.")
# Abstract about macrophage activation (immunology rather than ecology).
# NOTE(review): presumably kept as an out-of-domain sample — confirm intent.
text11 = Main.clean_text("Diversity and plasticity are hallmarks of cells of the monocyte-macrophage lineage. In response to IFNs, Toll-like receptor engagement, or IL-4/IL-13 signaling, macrophages undergo M1 (classical) or M2 (alternative) activation, which represent extremes of a continuum in a universe of activation states. Progress has now been made in defining the signaling pathways, transcriptional networks, and epigenetic mechanisms underlying M1-M2 or M2-like polarized activation. Functional skewing of mononuclear phagocytes occurs in vivo under physiological conditions (e.g., ontogenesis and pregnancy) and in pathology (allergic and chronic inflammation, tissue repair, infection, and cancer). However, in selected preclinical and clinical conditions, coexistence of cells in different activation states and unique or mixed phenotypes have been observed, a reflection of dynamic changes and complex tissue-derived signals. The identification of mechanisms and molecules associated with macrophage plasticity and polarized activation provides a basis for macrophage-centered diagnostic and therapeutic strategies.")
text12 = Main.clean_text("This investigation examines the role of trait-mediated indirect interactions in a simple aquatic food web. We conducted the experiments in cattle watering tanks in order to establish whether competitive and predator-prey interactions between two species are affected by other species in the system; i.e., are pairwise interaction strengths affected by the background species assemblage? We examined the survival and growth response of small bullfrog (Rana catesbeiana) and small green frog (Rana clamitans) tadpoles in the presence and absence of a competitor (large bullfrogs), the lethal presence of the larval odonate predator Tramea lacerata,and the nonlethal (caged) presence of the larval odonate predators Anax junius and Anax longipes. We demonstrate that large bullfrog competitors and caged Anax affect traits (foraging activity level) of small bullfrog and small green frog tadpoles and that these changes in traits, in turn, affect interactions of the small tadpole species with each other and with the other species. In particular, the following four trait- mediated indirect interactions were evident: (1) Presence of large bullfrog competitors increased the predation rate of Trameaon small green frogs and small bullfrogs. (2) Presence of nonlethal Anax reduced the predation rate of Tramea on small green frogs. (3) Presence of nonlethal Anax increased the competitive advantage of bullfrogs over green frogs. (4) Presence of nonlethal Anax facilitated midge invasion of the experimental units. The pro- posed mechanisms (changes in small tadpole activity) involved in these trait-mediated indirect interactions were supported by observational data on tadpole activity and resource levels in the experimental units, and in laboratory experiments examining tadpole activity responses to predators. 
The occurrence of strong trait-mediated indirect interactions in this simple food web underscores the potential importance of such interactions in animal com- munities.")
# text13-text16 are the four enumerated findings of the text12 abstract,
# each as a stand-alone sentence.
text13 = Main.clean_text("Presence of large bullfrog competitors increased the predation rate of Trameaon small green frogs and small bullfrogs.")
text14 = Main.clean_text("Presence of nonlethal Anax reduced the predation rate of Tramea on small green frogs.")
text15 = Main.clean_text("Presence of nonlethal Anax increased the competitive advantage of bullfrogs over green frogs.")
text16 = Main.clean_text("Presence of nonlethal Anax facilitated midge invasion of the experimental units.") # Not 100% sure about this one.
# text17 and text18 are kept raw: they are NOT passed through Main.clean_text,
# so the parenthesised span in text17 is preserved.
text17 = "Pea aphids (Acyrthosiphon pisum, Harris) have been shown to produce winged dispersal morphs in response to the presence of ladybirds or parasitoid natural enemies."
text18 = "The results presented here clearly demonstrate that the presence of both lacewing larvae and hoverfly larvae can induce Lucinida to produce a higher proportion of winged offspring."

In [12]:
# Build the shared pipeline objects once. Main loads the NLP models; every
# helper receives the same Main instance. Passing text=None / texts=[] skips
# the initial update() call, so no document is processed at this point.
main = Main(text=None)
possession = Possession(main=main)
keywords = Keywords(main=main)
species = Species(main=main)
references = References(main=main, texts=[])
context = Context(main=main)

  model.load_state_dict(torch.load(filelike, map_location=device))
05/05/2025 22:18:04 - INFO - 	 missing_keys: []
05/05/2025 22:18:04 - INFO - 	 unexpected_keys: []
05/05/2025 22:18:04 - INFO - 	 mismatched_keys: []
05/05/2025 22:18:04 - INFO - 	 error_msgs: []
05/05/2025 22:18:04 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


In [13]:
# Wire all helpers into the parser. text=None: documents are supplied later
# through parser.update(text).
parser = Parser(main=main, possession=possession, keywords=keywords, species=species, references=references, context=context, text=None)

# for text in [text18]:
#     # Update
#     parser.update(text)
#     # Parse
#     units = parser.parse()

In [14]:
# PDF to Text: PyMuPDF
# PyMuPDF is the library chosen for this portion of the
# project: it looks capable and its documentation is
# thorough. It is used below to extract the text of a
# paper's PDF.
import pymupdf

def pdf_to_text(url):
    """Return the text of the PDF at ``url``, or ``""`` if anything fails.

    The function is deliberately best-effort: a failed download or an
    unreadable PDF must not abort the collection loop, so every exception is
    turned into an empty string. The failure is recorded at DEBUG level
    instead of being discarded without a trace.

    Args:
        url: Location of the PDF; passed unchanged to ``pdf_bytes``.

    Returns:
        The concatenated ``get_text()`` output of every page, or ``""``.
    """
    import logging

    try:
        # pdf_bytes is defined elsewhere in this notebook.
        # NOTE(review): presumably it returns the raw PDF bytes — confirm.
        pdf_data = pdf_bytes(url)
        # The context manager closes the document even if a page fails.
        with pymupdf.open(stream=pdf_data) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as exc:
        logging.getLogger(__name__).debug("pdf_to_text failed for %s: %r", url, exc)
        return ""

# OpenAlex: Finding Papers
# Given a set or list of keywords, we can use OpenAlex
# to find a large number of papers that, in some way,
# match those keywords. This is an example of how that
# could work. Furthermore, Veronica mentioned that there
# are other characteristics you may want to filter on,
# such as how far back in time the search for papers
# should go.
from pyalex import Works

# URLs of every PDF that was found; kept so that the PDF-to-text tools can be
# tried on them later, if needed.
urls = []
number_urls = 0  # currently never updated

# Named `search_keywords` rather than `keywords` so that this cell does not
# overwrite the `keywords` helper object that was created for the parser;
# rebinding that name breaks any later re-run of the parser-construction cell.
search_keywords = ["higher-order interactions", "trait-mediated interaction modification", "trait-mediated interaction", "polymorphism", "apparent competition", "resource competition", "keystone predation", "intraguild predation", "intransitive competition", "trophic chains", "competition chains", "mutual competition"]
# Stop requesting further result pages once this many works have been kept.
MAX_WORKS = 10000

# NOTE: only the first search keyword is queried at present.
pager = Works().search_filter(title=search_keywords[0]).paginate(per_page=200)
number_works = 0
number_unfiltered_works = 0

# (title, text) pairs, where text is the full text when a PDF could be read
# and the abstract otherwise.
texts = []

for page in pager:
    for work in page:
        number_unfiltered_works += 1

        title = work['title']
        abstract = work['abstract']

        # Find Full Text
        url = None
        if work["primary_location"]:
            url = work["primary_location"]["pdf_url"]
            if url:
                urls.append(url)
        full_text = pdf_to_text(url) if url else ""

        # Prefer the full text; fall back to the abstract; skip works with neither.
        body_text = full_text or abstract
        if not body_text:
            continue
        texts.append((title, body_text))
        number_works += 1
    # The limit is only checked between pages, so the last page is always
    # processed in full.
    if number_works >= MAX_WORKS:
        break

print(len(texts), number_works, number_unfiltered_works)

889 889 1379


In [None]:
import random

# Fixed seed: the processing order, and therefore the order of `valid_texts`,
# is the same on every run.
SHUFFLE_SEED = 42
# Score a unit must reach for its paper to be kept.
# NOTE(review): presumably 4 means species, trait, cause and change are all
# present — confirm against Unit.get_score.
FULL_SCORE = 4

# Shuffle a copy instead of `texts` itself, so that re-running this cell
# starts from the same input every time.
shuffled_texts = random.Random(SHUFFLE_SEED).sample(texts, k=len(texts))

valid_texts = []

text_i = 0
number_texts = len(shuffled_texts)

for title, text in shuffled_texts:
    try:
        text_i += 1
        print(f"{text_i}/{number_texts}")

        parser.update(text)
        units = parser.parse()

        # Keep the paper as soon as one unit reaches the full score.
        for unit in units:
            if unit.get_score() == FULL_SCORE:
                valid_texts.append((title, unit))
                break
    except Exception as e:
        # Best-effort batch run: one unparsable paper must not stop the loop.
        # repr() keeps the exception type visible even when it has no message.
        print(f"Skipped {title!r}: {e!r}")
        continue

In [16]:
# Number of papers for which at least one parsed unit scored 4.
len(valid_texts)

19

In [35]:
# Show each accepted paper's title, then the unit that qualified it, then a
# blank separator line.
for title, unit in valid_texts:
    print(title, unit, "", sep="\n")

Direct and higher‐order interactions in plant communities under increasing weather persistence
Species: {species, interactions}, Trait: {role}, Cause: (Species: set(), Trait: {conditions}, Cause: (set()), Change: set()), Change: {destabilize}

HOM (higher order mode) losses at the IR (interaction region) of the B-factory
Species: {s}, Trait: {heating}, Cause: ({beam, the, wall, in, pipe}), Change: {producing}

Multitrophic higher-order interactions modulate species persistence
Species: {plant, species}, Trait: {interactions}, Cause: (Species: set(), Trait: {HOIs}, Cause: (set()), Change: set()), Change: {produced}

Bias-Variance Trade-Off in Hierarchical Probabilistic Models Using Higher-Order Feature Interactions
Species: {variance, trade, off, bias}, Trait: {complexity}, Cause: (Species: set(), Trait: {number}, Cause: ({that}), Change: {increasing}), Change: {increases}

Competition Between Aquatic Insects and Vertebrates: Interaction Strength and Higher Order Interactions
Species: {

In [26]:
# Sanity check: how many texts the batch loop has visited so far.
print(text_i)

889


In [31]:
import pickle

In [34]:
# with open('texts.pickle', 'wb') as file:
#     pickle.dump(texts, file)

# with open('valid_texts.pickle', 'wb') as file:
#     pickle.dump([t[0] for t in valid_texts], file)

In [None]:
# Reload and display a previously saved pickle.
# NOTE(review): presumably written by the commented-out dump of the valid
# titles above — confirm. Only unpickle files you created yourself, since
# pickle can execute arbitrary code while loading.
with open("valid_texts.pickle", "rb") as file:
    data = pickle.load(file)
print(data)