In [None]:
import re
import csv
import sys
import time
import spacy
import numpy as np
import pandas as pd
import random
import pickle
from fastcoref import FCoref, LingMessCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher
from spacy.language import Language
from IPython.display import clear_output
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
%run -i "../utils.py"

In [None]:
# Global verbosity threshold for debug printing. The methods in this
# notebook section that accept a `verbose` flag only print when `verbose`
# is truthy and this value is at least 1.
VERBOSE_LEVEL = 4

In [None]:
class Base:
    """Shared text and token helpers.

    The methods of this class clean up whitespace, split text by
    parentheticals, convert nouns between singular and plural, and expand
    or contract token ranges. Instances hold a `main` object; the methods
    read `main.sp_doc` and call `main.token_at_char`.
    """
    # There is no regular conversion rule for these words.
    # This is the default table of irregular nouns. It maps the
    # singular version to the plural version (SP).
    IRREGULAR_NOUNS_SP = {
        "ox": "oxen",
        "goose": "geese",
        "mouse": "mice",
        "bacterium": "bacteria"
    }

    # This is the reversed version of the dictionary above, meaning
    # that the plural version is mapped to the singular version
    # (PS).
    IRREGULAR_NOUNS_PS = {v: k for k, v in IRREGULAR_NOUNS_SP.items()}
    
    # The singular and plural versions of these words are the same.
    # This is the default list of zero plural nouns.
    ZERO_PLURAL_NOUNS = [
        "species", 
        "deer", 
        "fish", 
        "moose", 
        "sheep", 
        "swine", 
        "buffalo", 
        "trout", 
        "cattle"
    ]

    # These pairs of characters define symbols that enclose other
    # information in a text. Each key is the opening character and
    # each value is the matching closing character.
    ENCLOSURES = {
        "(": ")",
        "[": "]",
        "{": "}"
    }

    # Same pairs as ENCLOSURES plus the em dash, which acts as both
    # the opening and the closing character of its pair. This is the
    # default `enclosures` argument of find_unit_context.
    LAX_ENCLOSURES = {
        "(": ")",
        "[": "]",
        "{": "}",
        "—": "—"
    }


    
    def __init__(self, main, irregular_nouns_sp=IRREGULAR_NOUNS_SP, irregular_nouns_ps=IRREGULAR_NOUNS_PS, zero_plural_nouns=ZERO_PLURAL_NOUNS):
        self.main = main
        self.zero_plural_nouns = zero_plural_nouns
        self.irregular_nouns_sp = irregular_nouns_sp
        self.irregular_nouns_ps = irregular_nouns_ps
        self.irregular_plural_nouns = list(self.irregular_nouns_sp.values())
        self.irregular_singular_nouns = list(self.irregular_nouns_sp.keys())



    def delete_extra_whitespace(self, string):
        # Duplicate spaces, spaces before punctuation marks,
        # and outside spaces are removed.
        string = re.sub(r"\s+", " ", string)
        string = re.sub(r"\s+([?.!,])", r"\1", string)
        string = string.strip()
        return string



    def delete_outer_non_alnum(self, string):
        while string:
            start_len = len(string)
            # Remove Leading Non-Alphanumeric Character
            if string and not string[0].isalnum():
                string = string[1:]
            # Remove Trailing Non-Alphanumeric Character
            if string and not string[-1].isalnum():
                string = string[:-1]
            # No Changes Made
            if start_len == len(string):
                break
        return string



    def get_parentheticals(self, text, enclosures=ENCLOSURES, flatten=False):
        # The parenthetical would be the content inside of a pair
        # of matching parentheses, brackets, or braces.
        parentheticals = []
        
        # This contains the text that's not inside of any
        # enclosure.
        base_text = []
        
        # This is used for building groups, which often has a 
        # nested structure.
        stacks = []
        
        # These are the pairs of characters that we recognize
        # as defining the parenthetical.
        openers = list(enclosures.keys())
        closers = list(enclosures.values())
        
        # This contains the opening characters of the groups 
        # that are currently open (e.g. '(', '['). We use it 
        # so that we know whether to open or close a group.
        opened = []
        
        for i, char in enumerate(text):
            # Open Group
            if char in openers:
                stacks.append([])
                opened.append(char)
            # Close Group
            elif opened and char == enclosures.get(opened[-1], ""):
                parentheticals.append(stacks.pop())
                opened.pop()
            # Add to Group
            elif opened:
                stacks[-1].append(i)
            # Add to Base Text
            else:
                base_text.append(i)
        
        # We close the remaining groups that have not
        # been closed.
        while stacks:
            parentheticals.append(stacks.pop())
            
        # Cluster Groups' Indices
        # A list in the lists of indices (where each list represents a group of text) could have 
        # an interruption (e.g. [0, 1, 2, 10 15]) because of a parenthetical. So, we cluster the
        # indices in each list to make the output more useful (e.g. [(0, 3), (10, 16)]).
        lists_of_indices = [*parentheticals, base_text]        
        lists_of_clustered_indices = []

        for list_of_indices in lists_of_indices:
            if not list_of_indices:
                continue

            # We start off with a single cluster that is made up of the
            # first index. If the next index follows the first index, 
            # we continue the cluster. If it doesn't, we create a new cluster.
            clustered_indices = [[list_of_indices[0], list_of_indices[0] + 1]]
            
            for index in list_of_indices[1:]:
                if clustered_indices[-1][1] == index:
                    clustered_indices[-1][1] = index + 1
                else:
                    clustered_indices.append([index, index + 1])

            # Add Clustered Indices
            lists_of_clustered_indices.append(clustered_indices)
            
        if flatten:
            flattened_clusters = []
            # We are placing each cluster of indices into one list.
            # This removes the context of the larger parenthetical,
            # but the context may be cumbersome instead of useful.
            for list_of_clustered_indices in lists_of_clustered_indices:
                for clustered_indices in list_of_clustered_indices:
                    flattened_clusters.append(clustered_indices)
            lists_of_clustered_indices = flattened_clusters
        
        return lists_of_clustered_indices



    def separate_span_by_parenthetical(self, span):
        span_parentheticals = []
        
        # The clusters of the span represented with tuples of char indices
        # (e.g. [(0, 1), (1, 5), (5, 10)]. This is a list of clustered
        # indices (like above).
        text_clusters = self.get_parentheticals(span.text, flatten=True)
        
        for cluster in text_clusters:
            if span.text[cluster[0]:cluster[1]].isspace():
                continue

            l_char_index = span[0].idx + cluster[0]
            r_char_index = span[0].idx + cluster[1] - 1

            # Instead of having a tuple dictating the start and end of a cluster,
            # we can use a span -- it's much simpler.
            cluster_as_span = self.get_span_at_indices(l_char_index, r_char_index)
            if not cluster_as_span:
                continue
            
            span_parentheticals.append(cluster_as_span)

        return span_parentheticals



    def separate_spans_by_parenthetical(self, spans):
        all_span_parentheticals = []
        for span in spans:
            all_span_parentheticals.extend(self.separate_span_by_parenthetical(span))
        return all_span_parentheticals

    
 
    def singularize(self, string):
        string = string.lower()
        
        # The string to singularize should not have any
        # non-alphanumeric characters at the end, or else
        # the algorithm will not work.
        words = re.split(r" ", string)

        if not words:
            return [string]

        # If the last word in the string is a zero plural
        # or a singular irregular noun, there's no changes
        # to make. For example, "red sheep" and "ox" are 
        # already singular.
        if (
            words[-1] in self.zero_plural_nouns or 
            words[-1] in self.irregular_singular_nouns
        ):
            return [string]

        # If the last word in the string is an irregular
        # plural noun, we rely on a dictionary with the
        # corresponding mapping.
        if words[-1] in self.irregular_plural_nouns:
            words[-1] = self.irregular_nouns_ps[words[-1]]
            singulars = [self.delete_extra_whitespace(" ".join(words))]
            return singulars
        
        # We take the singular form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to uncertainty), we need to
        # include all possible versions.
        singulars = []
        singular_endings = self.get_singular(words[-1])

        if not singular_endings:
            return [string]
        
        for singular_ending in singular_endings:
            singular = self.delete_extra_whitespace(" ".join([*words[:-1], singular_ending]))
            singulars.append(singular)
            
        return singulars



    def get_singular(self, string):
        versions = []

        # Replace -ies with -y
        if re.fullmatch(r".*ies$", string):
            versions.append(f'{string[:-3]}y')
            return versions

        # Replace -ves with -f and -fe
        if re.fullmatch(r".*ves$", string):
            versions.append(f'{string[:-3]}f')
            versions.append(f'{string[:-3]}fe')
            return versions

        # Delete -es 
        if re.fullmatch(r".*es$", string):
            versions.append(f'{string[:-2]}')
            return versions

        # Replace -i with -us
        if re.fullmatch(r".*i$", string):
            versions.append(f'{string[:-1]}us')
            return versions

        # Delete -s
        if re.fullmatch(r".*s$", string):
            versions.append(f'{string[:-1]}')
            return versions

        return versions


    
    def pluralize(self, string):
        string = string.lower()
        
        # The string to pluralize should not have any
        # non-alphanumeric characters at the end, or else
        # the algorithm will not work.
        words = re.split(r" ", string)

        if not words:
            return [string]

        # If the last word in the string is a zero plural
        # or a plural irregular noun, there's no changes
        # to make. For example, "red sheep" and "oxen" are 
        # already singular.
        if (
            words[-1] in self.zero_plural_nouns or 
            words[-1] in self.irregular_plural_nouns
        ):
            return [string]

        # If the last word in the string is an irregular
        # singular noun, we rely on a dictionary with the
        # corresponding mapping.
        if words[-1] in self.irregular_singular_nouns:
            words[-1] = self.irregular_nouns_sp[words[-1]]
            return [self.delete_extra_whitespace(" ".join(words))]
        
        # We take the singular form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to error), we need to
        # handle them all.
        plurals = []
        plural_endings = self.get_plural(words[-1])

        if not plural_endings:
            return [string]
            
        for plural_ending in plural_endings:
            plural = self.delete_extra_whitespace(" ".join([*words[:-1], plural_ending]))
            plurals.append(plural)
            
        return plurals

    
  
    def get_plural(self, string):
        versions = []

        # Words that end with -us often have
        # two different plural versions: -es and -i.
        # For example, the plural version of cactus 
        # can be cactuses or cacti.
        if re.fullmatch(r".*us$", string):
            versions.append(f'{string}es')
            versions.append(f'{string[:-2]}i')
            return versions

        # The -es ending is added to the words below.
        if re.fullmatch(r".*([^l]s|sh|ch|x|z)$", string):
            versions.append(f'{string}es')
            return versions

        # Words that end with a consonant followed by 'y'
        # are made plural by replacing the 'y' with -ies.
        # For example, the plural version of canary is
        # canaries.
        if re.fullmatch(r".*([^aeiou])(y)$", string):
            versions.append(f'{string[:-1]}ies')
            return versions
            
        # The plural version of words ending with -f
        # and -fe aren't clear. To be safe, I will add
        # both versions.
        if (re.fullmatch(r".*(f)(e?)$", string) and not re.fullmatch(r".*ff$", string)):
            last_clean = re.sub(r"(f)(e?)$", "", string)
            versions.append(f'{last_clean}fs')
            versions.append(f'{last_clean}ves')
            return versions

        # People add -s or -es to words that end with 'o'.
        # To be safe, both versions are added.
        if re.fullmatch(r".*([^aeiou])o$", string):
            versions.append(f'{string}s')
            versions.append(f'{string}es')
            return versions

        # If there's no -s at the end of the string and
        # the other cases didn't run, we add an -s.
        if re.fullmatch(r".*[^s]$", string):
            versions.append(f'{string}s')
        
        return versions


 
    def expand_unit(self, *, il_unit, ir_unit, il_boundary, ir_boundary, speech=[], literals=[], include=True, direction='BOTH', verbose=False):
        UNIT = self.main.sp_doc[il_unit:ir_unit+1]
        
        if il_unit > ir_unit:
            print(f"Error: il_unit of {il_unit} greater than ir_unit of {ir_unit}")
            return None
        
        if direction in ['BOTH', 'LEFT'] and il_boundary > il_unit:
            print(f"Error: il_unit of {il_unit} less than il_boundary of {il_boundary}")
            return None
        
        if direction in ['BOTH', 'RIGHT'] and ir_boundary < ir_unit:
            print(f"Error: ir_unit of {ir_unit} greater than ir_boundary of {ir_boundary}")
            return None
        
        # Move Left
        if direction in ['BOTH', 'LEFT']:
            # The indices are inclusive, therefore, when 
            # the condition fails, il_unit will be equal
            # to il_boundary.
            while il_unit > il_boundary:
                # We assume that the current token is allowed,
                # and look to the token to the left.
                l_token = self.main.sp_doc[il_unit-1]

                # If the token is invalid, we stop expanding.
                in_set = l_token.pos_ in speech or l_token.lower_ in literals

                # Case 1: include=False, in_set=True
                # If we're not meant to include the defined tokens, and the
                # current token is in that set, we stop expanding.
                # Case 2: include=True, in_set=False
                # If we're meant to include the defined tokens, and the current
                # token is not in that set, we stop expanding.
                # Case 3: include=in_set
                # If we're meant to include the defined tokens, and the current
                # token is in that set, we continue expanding. If we're not meant
                # to include the defined tokens, and the current token is not
                # in that set, we continue expanding.
                if include ^ in_set:
                    break
                
                # Else, the left token is valid, and
                # we continue to expand.
                il_unit -= 1

        # Move Right
        if direction in ['BOTH', 'RIGHT']:
            # Likewise, when the condition fails,
            # ir_unit will be equal to the ir_boundary.
            # The ir_boundary is also inclusive.
            while ir_unit < ir_boundary:
                # Assuming that the current token is valid,
                # we look to the right to see if we can
                # expand.
                r_token = self.main.sp_doc[ir_unit+1]

                # If the token is invalid, we stop expanding.
                in_set = r_token.pos_ in speech or r_token.lower_ in literals
                if include ^ in_set:
                    break

                # Else, the token is valid and
                # we continue.
                ir_unit += 1

        assert il_unit >= il_boundary and ir_unit <= ir_boundary
        
        expanded_unit = self.main.sp_doc[il_unit:ir_unit+1]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Expanded Unit of '{UNIT}': {expanded_unit}")
        
        return expanded_unit


    
    def contract_unit(self, *, il_unit, ir_unit, speech=[], literals=[], include=True, direction='BOTH', verbose=False):
        UNIT = self.main.sp_doc[il_unit:ir_unit+1]
        
        if il_unit > ir_unit:
            print(f"Error: il_unit of {il_unit} greater than ir_unit of {ir_unit}")
            return None
        
        # Move Right
        if direction in ['BOTH', 'LEFT']:
            while il_unit < ir_unit:
                # We must check if the current token is not allowed. If it's
                # not allowed, we contract (remove).
                token = self.main.sp_doc[il_unit]

                # include = True means that we want the tokens that match
                # the speech and/or literals in the contracted unit.
                
                # include = False means that we don't want the tokens that
                # match the speech and/or literals in the contracted unit.
                
                # Case 1: include = True, in_set = True
                # We have a token that's meant to be included in the set.
                # However, we're contracting, which means we would end up
                # removing the token if we continue. Therefore, we break.
                
                # Case 2: include = False, in_set = False
                # We have a token that's not in the set which defines the
                # tokens that aren't meant to be included. Therefore, we 
                # have a token that is meant to be included. If we continue,
                # we would end up removing this token. Therefore, we break.
                
                # Default:
                # If we have a token that's in the set (in_set=True) of
                # tokens we're not supposed to include in the contracted 
                # unit (include=False), we need to remove it. Likewise, if
                # we have a token that's not in the set (in_set=False) of
                # tokens to include in the contracted unit (include=True),
                # we need to remove it.
                
                in_set = token.pos_ in speech or token.lower_ in literals
                if include == in_set:
                    break

                # The token is valid, thus we continue.
                il_unit += 1

        # Move Left      
        if direction in ['BOTH', 'RIGHT']:
            while ir_unit > il_unit:
                token = self.main.sp_doc[ir_unit]

                # The token is invalid and we
                # stop contracting.
                in_set = token.pos_ in speech or token.lower_ in literals
                if include == in_set:
                    break

                # The token is valid and we continue.
                ir_unit -= 1

        assert il_unit <= ir_unit
        
        contracted_unit = self.main.sp_doc[il_unit:ir_unit+1]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Contracted Unit of '{UNIT}': {contracted_unit}")
        
        return contracted_unit


    
    def find_unit_context(self, *, il_unit, ir_unit, il_boundary, ir_boundary, speech=["ADJ", "NOUN", "ADP", "ADV", "PART", "PROPN", "VERB", "PRON", "DET", "AUX", "PART", "SCONJ"], literals=[], include=True, enclosures=LAX_ENCLOSURES, comma_encloses=False, verbose=False):
        UNIT = self.main.sp_doc[il_unit:ir_unit+1]
        
        if il_unit > ir_unit:
            print(f"Error: il_unit of {il_unit} greater than ir_unit of {ir_unit}")
            return None
        
        if il_boundary > il_unit:
            print(f"Error: il_unit of {il_unit} less than il_boundary of {il_boundary}")
            return None
        
        if ir_boundary < ir_unit:
            print(f"Error: ir_unit of {ir_unit} greater than ir_boundary of {ir_boundary}")
            return None
        
        # Caveat: Parentheticals
        # The context of a unit inside a set of enclosures should
        # not go farther than the boundaries of those enclosures.
        # However, we need to manually determine whether the unit
        # is in parentheses (or any set of the matching symbols
        # below).
        openers = list(enclosures.keys())
        closers = list(enclosures.values())
        enclosing_chars = [*closers, *openers]

        # Look for Group Punctuation on the Left
        i = il_unit
        opener = None
        while i > il_boundary:
            token = self.main.sp_doc[i]
            if token.lower_ in enclosing_chars and token.lower_ != ",":
                opener = token
                break
            i -= 1

        # Look for Group Punctuation on the Right
        i = ir_unit
        closer = None
        while i < ir_boundary:
            token = self.main.sp_doc[i]
            if token.lower_ in enclosing_chars and token.lower_ != ",":
                closer = token
                break
            i += 1

        # If there's a group punctuation on the left
        # and right, and they match each other (e.g. '(' and ')'),
        # we return the text between the punctuations.
        parenthetical = opener and closer and enclosures.get(opener.lower_) == closer.text
        if parenthetical:
            context = [t for t in self.main.sp_doc[opener.i:closer.i+1]]
            
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"Parenthetical - Unit Context of '{UNIT}': {context}")
            
            return context

        # We can also check whether the unit it enclosed
        # in a comma or two, only if a comma can enclose.
        if comma_encloses:
            i = il_unit
            while i > il_boundary:
                i_token = self.main.sp_doc[i]
                if i_token.lower_ in [",", ";", "—"]:
                    break
                i -= 1

            j = ir_unit
            while j < ir_boundary:
                j_token = self.main.sp_doc[j]
                if j_token.lower_ in [",", ";", "—"]:
                    break
                j += 1

            if i_token.lower_ == "," or j_token.lower_ == ",":
                context = [t for t in self.main.sp_doc[i:j+1]]
            
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Comma - Unit Context of '{UNIT}': {context}")
                    
                return context
            
        # As the unit is not a parenthetical, we will expand
        # outwards until we run into a stopping token. The exclude
        # list contains tokens that should be excluded from the
        # context. Currently, it will contain any parentheticals
        # that we run into.
        exclude = []

        # We can modify the enclosures after handling the parenthetical
        # situation to make the code easier.
        if comma_encloses:
            enclosures[","] : ","
        
        # Expand Left
        while il_unit > il_boundary:
            # Assuming that the current token is fine,
            # we look to the left.
            l_token = self.main.sp_doc[il_unit-1]

            if l_token.lower_ not in closers:
                in_set = l_token.pos_ in speech or l_token.lower_ in literals
                if in_set ^ include:
                    break
                il_unit -= 1
            # If it's a closing enclosure (e.g. ')', ']'),
            # we need to skip over whatever is contained in
            # that punctuation.
            else:
                i = il_unit - 1
                
                token = self.main.sp_doc[i]
                exclude.append(token)

                # We continue until we reach the boundary or
                # we find the matching opening character.
                closed = []
                
                while i > il_boundary:
                    token = self.main.sp_doc[i]
                    # Found Closer
                    if token.lower_ in closers:
                        exclude.append(token)
                        closed.append(token.lower_)
                    # Currently Closed
                    elif closed:
                        exclude.append(token)
                        # Found Opener
                        if token.lower_ == enclosures.get(closed[-1]):
                            closed.pop()
                    else:
                        break
                    i -= 1
                
                il_unit = i

        # Expand Right
        while ir_unit < ir_boundary:
            # We're checking the token to the right
            # to see if we can expand or not.
            r_token = self.main.sp_doc[ir_unit+1]

            if r_token.lower_ not in openers:
                in_set = r_token.pos_ in speech or r_token.lower_ in literals
                if in_set ^ include:
                    break
                ir_unit += 1
            # If the token to the right is an opener (e.g. '(', '['), we must skip
            # it, the parenthetical inside, and the closer.
            else:
                i = ir_unit + 1
                
                token = self.main.sp_doc[i]
                exclude.append(token)

                # We continue until we reach the boundary or
                # we find all the closers for the openers.
                opened = []
                
                while i < ir_boundary:
                    token = self.main.sp_doc[i]
                    # Found Opener
                    if token.lower_ in openers:
                        exclude.append(token)
                        opened.append(token.lower_)
                    # Currently Opened
                    elif opened:
                        exclude.append(token)
                        # Found Closer
                        if token.lower_ == enclosures.get(opened[-1]):
                            opened.pop()
                    else:
                        break
                    i += 1
                
                ir_unit = i
        
        # We remove the excluded tokens and return the context.
        context = [t for t in self.main.sp_doc[il_unit:ir_unit+1] if t not in exclude]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Unit Context of '{UNIT}': {context}")
        
        return context


    
    def get_span_at_indices(self, l_index, r_index):
        text = self.main.sp_doc.text.lower()

        while text[l_index].isspace():
            l_index += 1

        while text[r_index].isspace():
            r_index -= 1

        if l_index > r_index:
            print(f"Error: l_index of {l_index} greater than r_index of {r_index}")
            return None
            
        l_token_i = self.main.token_at_char(l_index).i
        r_token_i = self.main.token_at_char(r_index).i
        
        return self.main.sp_doc[l_token_i:r_token_i+1]


    
    def get_base_nouns(self, span, return_tokens=False, immediate_stop=False):
        ending_nouns = []
        
        reversed_span = [t for t in span]
        reversed_span.reverse()
        
        for token in reversed_span:
            if token.pos_ in ["NOUN", "PROPN"]:
                ending_nouns.append(token if return_tokens else self.main.sp_doc[token.i:token.i+1])
                if immediate_stop:
                    break
            else:
                break

        return ending_nouns



    def flatten(self, arr):
        flat_arr = []

        if not isinstance(arr, list):
            return [arr]

        for element in arr:
            flat_arr.extend(self.flatten(element))

        return flat_arr

In [42]:
class Species:
    def __init__(self, main):
        """
        Sets up the TaxoNERD pipeline and the (initially empty) results.

        Every result attribute stays None until update() is called.
        """
        # Tools
        self.main = main
        self.tn_nlp = TaxoNERD(prefer_gpu=False).load(
            model="en_ner_eco_biobert",
            exclude=["tagger", "parser", "attribute_ruler"]
        )
        self.tn_doc = None

        # Spans that have been identified as a species, and the
        # first token of each of those spans.
        self.spans = None
        self.span_starts = None

        # Tokens that have been identified as a species or as
        # being a part of a species.
        self.tokens = None

        # Quick lookup of the span that a token belongs to.
        self.token_to_span = None

        # Maps a (lowercase) name to the list of names that have
        # been identified as alternate names for it.
        self.alternate_names = None

        # Words that (1) are to be identified as species; and
        # (2) are sometimes not identified as species, more or less.
        self.dictionary = [
            "juvenile",
            "juveniles",
            "adult",
            "prey",
            "predator",
            "predators",
            "species",
            "tree",
            "cat",
            "dog",
            "fly",
            "flies",
            "plant",
            "plants"
        ]



    def update(self, text, verbose=False):
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        self.tn_doc = self.tn_nlp(text)
        self.spans, self.tokens, self.token_to_span, self.span_starts = self.load_species(verbose=verbose)
        self.alternate_names = self.load_alternate_names(self.spans)



    def convert_tn_spans_to_sp_spans(self, tn_spans):
        sp_spans = []

        for tn_span in tn_spans:
            l_char_index = self.tn_doc[tn_span.start].idx
            r_char_index = l_char_index + len(tn_span.text) - 1

            try:
                l_sp_token_i = self.main.token_at_char(l_char_index).i
                r_sp_token_i = self.main.token_at_char(r_char_index).i
            except Exception as e:
                print(f"Error: Couldn't find token at character index of {l_char_index} and token index of {l_sp_token_i}.")
                print(f"Error: Couldn't find token at character index of {r_char_index} and token index of {r_sp_token_i}.")
                print(e)
                continue

            sp_span = self.main.sp_doc[l_sp_token_i:r_sp_token_i+1]
            if sp_span.text != tn_span:
                print(f"Error: SpaCy span does not match TaxoNerd span.")
                continue
            
            sp_spans.append(sp_span)

        return sp_spans



    def load_search_strings(self, verbose=False):
        search_strings = [*self.dictionary]
        
        # Creating a Broad Set of Species
        spans = self.convert_tn_spans_to_sp_spans(self.tn_doc.ents)
        spans = self.main.separate_spans_by_parenthetical(spans)

        # Add Ending Nouns to Set
        all_nouns = []
        for span in spans:
            nouns = self.main.get_base_nouns(span)
            if nouns:
                all_nouns.extend(nouns)
        spans.extend(all_nouns)

        # Adding Plural and Singular Versions of Spans
        for span in spans:
            text = span.text.lower()
            text = self.main.delete_extra_whitespace(self.main.delete_outer_non_alnum(text))

            # Blank Text or No Letters
            if not text or not [c for c in text if c.isalpha()]:
                continue

            search_strings.append(text)

            # Add Plural Version
            singular = span[-1].pos_ == "NOUN" and span[-1].tag_ == "NN"
            if singular:
                plural_version = self.main.pluralize(text)
                search_strings.extend(plural_version)

            # Add Singular Version
            plural = span[-1].pos_ == "NOUN" and span[-1].tag_ == "NNS"
            if plural:
                singular_version = self.main.singularize(text)
                search_strings.extend(singular_version)

        # Remove Duplicates
        search_strings = list(set(search_strings))

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Search Strings: {search_strings}")
        
        return search_strings



    def load_alternate_names(self, spans, verbose=False):
        spans.sort(key=lambda span: span.start)

        # It's useful to know if a different name refers to a
        # species we have already seen. For example, in
        # "predatory crab (Carcinus maenas)", "predatory crab"
        # is an alternative name for "Carcinus maenas" and
        # vice versa. This is used so that the species can be
        # properly tracked and redundant points are less
        # likely to be given.
        alternate_names = {}
        
        # Finding and Storing Alternative Names
        for i, species_span in enumerate(spans):
            # There's not a next species to
            # evaluate.
            if i + 1 >= len(spans):
                break
            
            next_species_span = spans[i+1]
            
            # If there's one token between the species and the next species,
            # we check if the next species is surrounded by punctuation.
            if next_species_span.start - species_span.end == 1:
                # Token Before and After the Next Species
                before_next = self.main.sp_doc[next_species_span.start-1]
                after_next = self.main.sp_doc[next_species_span.end]

                if before_next.pos_ in ["PUNCT", "SYM"] and after_next.pos_ in ["PUNCT", "SYM"]:
                    sp_1_text = species_span.text.lower()
                    sp_2_text = next_species_span.text.lower()
                    
                    if sp_1_text not in alternate_names:
                        alternate_names[sp_1_text] = []
                    
                    if sp_2_text not in alternate_names:
                        alternate_names[sp_2_text] = []
                    
                    alternate_names[sp_1_text].append(sp_2_text)
                    alternate_names[sp_2_text].append(sp_1_text)
            # If there's no token between the species and the next,
            # species we assume that they refer to the same species.
            elif next_species_span.start - species_span.end == 0:
                sp_1_text = species_span.text.lower()
                sp_2_text = next_species_span.text.lower()
                
                if sp_1_text not in alternate_names:
                    alternate_names[sp_1_text] = []
                
                if sp_2_text not in alternate_names:
                    alternate_names[sp_2_text] = []

                alternate_names[sp_1_text].append(sp_2_text)
                alternate_names[sp_2_text].append(sp_1_text)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Alternate Names: {alternate_names}")

        return alternate_names



    def load_species(self, verbose=False):
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        # Load Search Strings from Species Spans
        search_strings = self.load_search_strings(verbose=verbose)

        # Search for Species
        # The results are stored in different 
        # forms below.
        spans = []
        tokens = []
        token_to_span = {}

        # Where we're searching for species.
        text = self.main.sp_doc.text.lower()

        for string in search_strings:
            matches = re.finditer(re.escape(string), text, re.IGNORECASE)

            for l_char_index, r_char_index, matched_text in [(match.start(), match.end(), match.group()) for match in matches]:    
                # The full word must match, not just a substring inside of it.
                # So, if the species we're looking for is "ant", only "ant"
                # will match -- not "pants" or "antebellum". Therefore, the
                # characters to the left and right of the matched string cannot
                # be letters.
                l_char_is_letter = l_char_index > 0 and text[l_char_index-1].isalpha()
                r_char_is_letter = r_char_index < len(text) and text[r_char_index].isalpha()
                
                if l_char_is_letter or r_char_is_letter or not matched_text:
                    continue

                try:
                    l_token_i = self.main.token_at_char(l_char_index).i
                    r_token_i = self.main.token_at_char(r_char_index-1).i
                except Exception as e:
                    print(f"Error: Unable to find token at index of {l_char_index}.")
                    print(f"Error: Unable to find token at index of {r_char_index}.")
                    print(f"\tMatched: '{matched_text}'")
                    print(e)
                    continue

                # This is the matched substring (which would be
                # a species) as a span in the parent document.
                span = self.main.sp_doc[l_token_i:r_token_i+1]
                
                # Expand Species
                # Let's say there's a word like "squirrel". That's a bit ambiguous. 
                # Is it a brown squirrel, a bonobo? If the species is possibly missing
                # information (like an adjective to the left of it), we should expand
                # in order to get a full picture of the species.
                unclear_1 = len(span) == 1 and span[0].pos_ == "NOUN"
                unclear_2 = span.start > 0 and self.main.sp_doc[span.start-1].pos_ in ["ADJ"]
                
                if unclear_1 or unclear_2:
                    span = self.main.expand_unit(
                        il_unit=span.start, 
                        ir_unit=span.end-1,
                        il_boundary=0,
                        ir_boundary=len(self.main.sp_doc),
                        speech=["ADJ", "PROPN"],
                        literals=["-"],
                        include=True,
                        direction="LEFT",
                        verbose=verbose
                    )
                
                # Remove Outer Symbols
                # There are times where a species is identified with a parenthesis
                # nearby. Here, we remove that parenthesis (and any other symbols).
                span = self.main.contract_unit(
                    il_unit=span.start, 
                    ir_unit=span.end-1, 
                    speech=["PUNCT", "SYM", "DET", "PART"],
                    include=False,
                    verbose=verbose
                )

                if not span:
                    print(f"Error: Span does not exist; left character index {l_char_index}.")
                    print(f"\tMatched: '{matched_text}'")
                    continue
            
                # A species must have a noun or a
                # proper noun. This may help discard
                # bad results.
                letter_found = False
                for token in span:
                    if token.pos_ in ["NOUN", "PROPN"] or token.lower_ in self.dictionary:
                        letter_found = True
                        break

                if not letter_found:
                    continue

                # Adding Species
                spans.append(span)
                for token in span:
                    if token in tokens or token.pos_ in ["PUNCT", "SYM", "DET", "PART"]:
                        continue
                    tokens.append(token)
                    token_to_span[token] = span
        
        spans = list({span.start: span for span in spans}.values())
        spans.sort(key=lambda span: span.start)
        
        span_starts = [span[0] for span in spans]

        if verbose and VERBOSE_LEVEL >= 1:
            print("Output of load_species:")
            print(f"Spans: {spans}")
            print(f"Tokens: {tokens}")
            print(f"Mapped Tokens: {token_to_span}")
            print(f"Span Starts: {span_starts}")
        
        return (spans, tokens, token_to_span, span_starts)



    def is_alternate(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_a_text = sp_a.text.lower()
            
        # Species B is an alternate name for Species A
        if sp_b_text in self.alternate_names.get(sp_a_text, []):
            return True
        
        # Species A is an alternate name for Species B
        if sp_a_text in self.alternate_names.get(sp_b_text, []):
            return True

        return False



    def is_same_text(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_a_text = sp_a.text.lower()

        if sp_a_text == sp_b_text:
            return True
            
        sp_a_singular_texts = [sp_a_text] if sp_a[-1].tag_ in ["NN", "NNP"] else self.main.singularize(sp_a_text)
        sp_b_singular_texts = [sp_b_text] if sp_b[-1].tag_ in ["NN", "NNP"] else self.main.singularize(sp_b_text)

        if set(sp_a_singular_texts).intersection(sp_b_singular_texts):
            return True
        return False



    def has_same_base_nouns(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_b_0_text = sp_b[0].lower_
        sp_b_0_is_noun = sp_b[0].pos_ in ["NOUN", "PROPN"]
        
        sp_b_nouns = []
        sp_b_num_adjectives = 0
        
        for token in sp_b:
            if not sp_b_nouns and token.pos_ == "ADJ":
                sp_b_num_adjectives += 1
            elif token.pos_ in ["PROPN", "NOUN"]:
                sp_b_nouns.append(token)

        if not sp_b_nouns:
            return False

        sp_b_nouns_text = [noun.lower_ for noun in sp_b_nouns]
        sp_b_singular_texts = [" ".join(sp_b_nouns_text)] if sp_b_nouns[-1].tag_ in ["NN", "NNP"] else self.main.singularize(" ".join(sp_b_nouns_text))

        sp_a_text = sp_a.text.lower()
        sp_a_0_text = sp_a[0].lower_
        sp_a_0_is_noun = sp_a[0].pos_ in ["NOUN", "PROPN"]

        # Case Example: 'Hyla' v. 'Hyla tadpoles'
        if sp_a_0_text == sp_b_0_text and (sp_a_0_is_noun or sp_b_0_is_noun):
            if sp_a_text in sp_b_text or sp_b_text in sp_a_text:
                return True
        
        # Case Example: 'dogs' v. 'red dogs'
        sp_a_nouns = []
        sp_a_num_adjectives = 0
        for token in sp_a:
            if not sp_a_nouns and token.pos_ == "ADJ":
                sp_a_num_adjectives += 1
            elif token.pos_ in ["PROPN", "NOUN"]:
                sp_a_nouns.append(token)
        
        if not sp_a_nouns:
            return False
        
        sp_a_nouns_text = [noun.lower_ for noun in sp_a_nouns]
        
        if sp_a_nouns and sp_b_nouns and (
            (sp_a_num_adjectives == 1 and sp_b_num_adjectives == 0) or 
            (sp_b_num_adjectives == 1 and sp_a_num_adjectives == 0)
        ):
            sp_a_singular_texts = [" ".join(sp_a_nouns_text)] if sp_a_nouns[-1].tag_ in ["NN", "NNP"] else self.main.singularize(" ".join(sp_a_nouns_text))
            if set(sp_a_singular_texts).intersection(sp_b_singular_texts):
                return True

        return False



    def find_same_species(self, sp_A, sp_b, verbose=False):
        # METHOD 1: Check for Literal Matches
        for sp_a in sp_A:
            if self.is_same_text(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 1: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        # METHOD 2: Check Alternate Names
        for sp_a in sp_A:
            if self.is_alternate(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 2: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a
        
        # METHOD 3: Check Nouns
        # This is used if one or none of the species being compared
        # has 1 adjective.
        for sp_a in sp_A:
            if self.has_same_base_nouns(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 3: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        # METHOD 4: Last Ditch Effort
        # If there's been no matches, we just look for one string inside of
        # another.
        for sp_a in sp_A:
            sp_a_text = sp_a.text.lower()
            sp_b_text = sp_b.text.lower()
            
            r_sp_a_text = re.compile(f"(\s|^){sp_a_text}(\s|$)", re.IGNORECASE)
            r_sp_b_text = re.compile(f"(\s|^){sp_b_text}(\s|$)", re.IGNORECASE)
            
            if re.match(r_sp_a_text, sp_b_text) or re.match(r_sp_b_text, sp_a_text):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 4: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"No Matches Between {sp_A} and {sp_b}")
        
        return None



    def span_at_token(self, token):
        if token in self.token_to_span:
            return self.token_to_span[token]
        return None



    def is_species(self, token):
        return token in self.tokens



    def has_species(self, tokens, verbose=False):
        for token in tokens:
            if token in self.tokens:
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tToken '{token}' is Species")
                return True
        return False

In [43]:
class Keywords:
    REGEX = "regex"
    VOCAB = "vocab"
    RULES = "rules"



    def __init__(self, main, *, regexes=[], vocab=[], patterns=[], def_pos=[], def_tag=[], def_threshold=0.7, def_weight=1.0):
        self.main = main

        # Constraints
        self.def_threshold = def_threshold
        self.def_tag = def_tag
        self.def_pos = def_pos
        self.def_weight = def_weight
        
        # Three Types of Matching
        self.vocab, self.vocab_data = self.load_vocab(vocab)
        self.regex, self.regex_data = self.load_regex(regexes)
        self.rules, self.rules_data = self.load_rules(patterns)

        # Quick Lookup
        self.match_type_to_data= {
            Keywords.REGEX: self.regex_data,
            Keywords.VOCAB: self.vocab_data,
            Keywords.RULES: self.rules_data
        }

    

    def update(self, verbose=False):
        # SpaCy Doc DNE or Indexing Map DNE
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        # Matched Tokens in Different Forms
        self.token_data, self.mapped_token_data, self.tokens = self.match_tokens(verbose=verbose)



    def load_regex(self, regexes):
        r = []
        r_data = {}

        for unit in regexes:
            if isinstance(unit, str):
                r.append(unit)
            else:
                regex = unit["regex"]
                r.append(regex)
                r_data[regex] = {
                    "types": unit.get("types", []),
                    "weight": unit.get("weight", self.def_weight)
                }

        return r, r_data



    def load_vocab(self, vocab):
        v = []
        v_data = {}
        
        for unit in vocab:
            if isinstance(unit, str):
                doc = self.main.sp_nlp(unit)
                v.append({
                    "doc": doc,
                    "lemma": " ".join([t.lemma_ for t in doc])
                })
            else:
                doc = self.main.sp_nlp(unit["word"])
                v.append({
                    "doc": doc,
                    "tag": unit.get("tag", self.def_tag),
                    "pos": unit.get("pos", self.def_pos),
                    "threshold": unit.get("threshold", self.def_threshold),
                    "lemma": " ".join([t.lemma_ for t in doc])
                })
                v_data[unit["word"]] = {
                    "types": unit.get("types") or [],
                    "weight": unit.get("weight", self.def_weight),
                }
        
        return v, v_data



    def load_rules(self, patterns):
        """
        Creates a Matcher from the given patterns.

        A list is added to the matcher under its position in patterns
        (as a string). A dictionary must have "name" and "pattern" keys;
        its "types" and "weight" are stored under the name.

        Returns (matcher, data by name).
        """
        matcher = Matcher(self.main.sp_nlp.vocab)
        rule_data = {}

        for index, unit in enumerate(patterns):
            # Unnamed Pattern
            if isinstance(unit, list):
                matcher.add(f"{index}", unit)
                continue

            # Named Pattern
            name = unit["name"]
            matcher.add(name, unit["pattern"])
            rule_data[name] = {
                "types": unit.get("types") or [],
                "weight": unit.get("weight", self.def_weight),
            }

        return matcher, rule_data



    def get_match_data(self, token, match_id, match_type):
        match_type_data = self.match_type_to_data[match_type]
        
        if match_id in match_type_data:
            return {
                "token": token,
                "types": match_type_data[match_id].get("types", []),
                "weight": match_type_data[match_id].get("weight", self.def_weight)
            }
        else:
            return {
                "token": token,
                "types": [],
                "weight": self.def_weight
            }



    def bad_pos(self, pos):
        return self.def_pos and pos not in self.def_pos



    def bad_tag(self, tag):
        return self.def_tag and tag not in self.def_tag



    def bad_token(self, token):
        return self.bad_pos(token.pos_) or self.bad_tag(token.tag_)



    def match_tokens(self, verbose=False):
        # SpaCy Doc DNE or Indexing Map DNE
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        
        matched_data = []
        matched_tokens = []

        # Match by Regex
        text = self.main.sp_doc.text.lower()
        
        for regex in self.regex:
            matches = [(match.start(), match.end()) for match in re.finditer(regex, text, re.IGNORECASE)]
            
            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\t'{regex}' Regex Matches: {matches}")
            
            for l_char_index, r_char_index in matches:
                span = self.main.get_span_at_indices(l_char_index, r_char_index - 1)

                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tSpan Matched: {span}")

                for token in span:
                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tPossible Regex Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")
                        
                    if self.bad_token(token):
                        continue
                    
                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tRegex Matched Token '{token}'")
                        
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, regex, Keywords.REGEX))

        # Match by Rules
        matches = self.rules(self.main.sp_doc)

        if verbose and VERBOSE_LEVEL >= 2:
            print(f"\tRule Matches: {matches}")
        
        for match_id, start, end in matches:
            span = self.main.sp_doc[start:end]
            name = self.main.sp_nlp.vocab.strings[match_id]

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tPattern '{name}' Matched Span: {span}")
            
            for token in span:
                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tPossible Rule Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")
                    
                if self.bad_token(token):
                    continue
                
                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tRule Matched Token '{token}'")

                matched_tokens.append(token)
                matched_data.append(self.get_match_data(token, name, Keywords.RULES))

        # Match by Vocab
        for token in self.main.sp_doc:
            if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tPossible Vocab Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")
                    
            if self.bad_token(token) or token in matched_tokens:
                continue

            token_doc = self.main.sp_nlp(token.lower_)
            token_lemma = " ".join([t.lemma_ for t in token_doc])
            
            for vocab_word in self.vocab:
                # Ensure Correct Tag
                if vocab_word.get("tag"):
                    if not [t for t in token_doc if t.tag_ in vocab_word.get("tag")]:
                        if verbose and VERBOSE_LEVEL >= 4:
                            print(f"\t\t\tToken '{token_doc}' not in Vocab Word '{vocab_word['doc']}' Tags ({vocab_word.get('tag')})")
                        continue
                
                # Ensure Correct PoS
                if vocab_word.get("pos"):
                    if not [t for t in token_doc if t.pos_ in vocab_word.get("pos")]:
                        if verbose and VERBOSE_LEVEL >= 4:
                            print(f"\t\t\tToken '{token_doc}' not in Vocab Word '{vocab_word['doc']}' Speech ({vocab_word.get('pos')})")
                        continue

                # Check Lemma
                if verbose and VERBOSE_LEVEL >= 4:
                    print(f"\t\t\t{token_doc} Lemma ({token_lemma}) and {vocab_word['doc']} Lemma ({vocab_word['lemma']})")
                    
                if token_lemma == vocab_word["lemma"]:
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, vocab_word["doc"].text, Keywords.VOCAB))

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tVocab (Lemma) Matched Token '{token}'")
                    
                    break

                # Check Similarity
                similarity = vocab_word["doc"].similarity(token_doc)

                if verbose and VERBOSE_LEVEL >= 4:
                    print(f"\t\t\t{token_doc} and {vocab_word['doc']} Similarity: {similarity}")
                    
                if similarity >= vocab_word.get("threshold", self.def_threshold):
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, vocab_word["doc"].text, Keywords.VOCAB))

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tVocab Matched Token '{token}'")
                        
                    break

        # Mapping Match(ed Token) Data
        mapped_matched_data = {}
        for matched_token_data in matched_data:
            mapped_matched_data[matched_token_data["token"]] = matched_token_data

        if verbose and VERBOSE_LEVEL >= 1:
            print("Output of match_tokens")
            print(f"Token Data: {matched_data}")
            print(f"Mapped Token Data: {mapped_matched_data}")
            print(f"Token: {matched_tokens}")
        
        return matched_data, mapped_matched_data, matched_tokens

In [44]:
class ExperimentKeywords(Keywords):
    """
    Keywords that suggest that a text describes an experiment or a
    study (e.g., "hypothesis", "measure" or "treatment").

    Only verbs, nouns and adjectives are matched, with a similarity
    threshold of 0.8.
    """

    def __init__(self, main):
        super().__init__(
            main, 
            vocab=[
                "study", 
                "hypothesis", 
                "experiment", 
                "found", 
                "discover", 
                "compare", 
                "finding", 
                "result", 
                "test", 
                "examine", 
                "model",
                "measure",
                "manipulate",
                "assess",
                "conduct",
                "data",
                "analyze",
                "sample",
                "observe",
                "observation",
                "predict",
                "suggest",
                "method",
                "investigation",
                "trial",
                "experimental",
                "evidence",
                "demonstrate",
                "analysis",
                "show",
                "comparable",
                "control group", 
                "independent",
                "dependent",
                "applied",
                "treatment",
                "survey",
                "evaluate",
            ],
            def_pos=["VERB", "NOUN", "ADJ"], 
            def_threshold=0.8
        )

In [45]:
class NegativeExperimentKeywords(Keywords):
    """
    Keywords that suggest that a text is not reporting an experiment
    of its own (e.g., "theory" or "review").

    Only verbs, nouns and adjectives are matched, with a similarity
    threshold of 0.8.
    """

    def __init__(self, main):
        non_experiment_vocab = [
            "theory",
            "review",
            "analysis",
            "meta-analysis"
        ]
        allowed_pos = ["VERB", "NOUN", "ADJ"]

        super().__init__(
            main,
            vocab=non_experiment_vocab,
            def_pos=allowed_pos,
            def_threshold=0.8
        )

In [46]:
class NegativeTopicKeywords(Keywords):
    """
    Keywords for the topics matched by the regexes below
    ("evolution" and "co-evolution"/"coevolution").

    Only verbs, nouns and adjectives are matched.
    """

    def __init__(self, main):
        topic_regexes = [
            r"co-?evolution",
            r"evolution",
        ]
        allowed_pos = ["VERB", "NOUN", "ADJ"]

        super().__init__(
            main,
            regexes=topic_regexes,
            def_pos=allowed_pos,
            def_threshold=0.8
        )

In [47]:
class CauseKeywords(Keywords):
    """
    Keywords that suggest that something causes (or is caused by)
    something else (e.g., "increase", "trigger" or "because").

    Only verbs, subordinating conjunctions and nouns are matched, with
    a similarity threshold of 0.8. After each update, self.tokens is
    passed through filter_tokens.
    """

    def __init__(self, main):
        cause_vocab = [
            "increase", 
            "decrease", 
            "change", 
            "shift", 
            "cause", 
            "produce", 
            "trigger", 
            "suppress", 
            "inhibit",
            "encourage",
            "allow",
            "influence",
            "affect",
            "alter",
            "induce",
            "produce",
            "result in",
            # "associated with",
            # "correlated with",
            "contribute",
            "impact",
            "deter",
            "depressed",
            "when",
            "because",
            # "reduce",
            # "killed",
            # "supported"
        ]

        # Other options that were tried:
        # def_tag=["VB", "VBD", "WRB", "IN", "VBG"]
        # def_threshold=0.75
        super().__init__(
            main,
            vocab=cause_vocab,
            def_pos=["VERB", "SCONJ", "NOUN"],
            def_threshold=0.8
        )



    def update(self, verbose=False):
        """
        Matches the tokens like Keywords.update does, and then filters
        self.tokens (the other results are left as they are).
        """
        super().update(verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)



    def filter_tokens(self, tokens, verbose=False):
        """
        Returns the given tokens without those whose lemma is "kill".

        Raises an Exception ("DNE") when the main object has no SpaCy
        document or no index map.
        """
        main = self.main
        if not (main.sp_doc and main.index_map):
            raise Exception("DNE")

        # It's not clear yet which cause words should be filtered out.
        # "kill" is, because it's usually not reflective of the changes
        # that we're looking for (although it sometimes is). A writer
        # would likely use clearer language, like "decrease".
        excluded_lemmas = ["kill"]

        return [token for token in tokens if token.lemma_ not in excluded_lemmas]

In [48]:
class ChangeKeywords(Keywords):
    """
    Keywords that suggest a change in quantity or degree (e.g., "more",
    "decline" or "twice as likely").

    Only nouns, adjectives and adverbs are matched, with a similarity
    threshold of 0.75. After each update, the comparative adjectives of
    the document (e.g., "bigger") are added to self.tokens.
    """

    def __init__(self, main):
        super().__init__(
            main, 
            vocab=[
                "few", 
                "more", 
                "increase", 
                "decrease", 
                "less", 
                "short", 
                "long", 
                "greater",
                "shift",
                "fluctuate",
                "adapt",
                "grow",
                "rise",
                "surge",
                "intensify",
                "amplify",
                "multiply",
                "decline",
                "reduce",
                "drop",
                "diminish",
                "fall",
                "lessen",
                "doubled",
                "tripled",
                "lower",
            ],
            regexes=[
                # Match Examples:
                # 1. "one... as..."
                # 2. "2x than..."
                # The dot of a decimal number (e.g., "2.5x") is escaped,
                # so that it only matches a literal dot.
                r"(one|two|three|four|five|six|seven|eight|nine|ten|twice|thrice|([0-9]+|[0-9]+\.[0-9]+)(x|%))[\s-]+[^\s]*[\s-]+(as|more|than|likely)([\s-]+|$)"
            ],
            def_pos=["NOUN", "ADJ", "ADV"],
            def_threshold=0.75
        )


    
    def update(self, verbose=False):
        """
        Matches the tokens like Keywords.update does, and then replaces
        self.tokens with the result of filter_tokens.
        """
        Keywords.update(self, verbose=verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)


    
    def filter_tokens(self, tokens, verbose=False):
        """
        Returns, in document order, every token of the document that is
        either in the given tokens or a comparative adjective (ADJ
        tagged JJR).

        Raises an Exception ("DNE") when the main object has no SpaCy
        document or no index map.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        filtered = []
        for token in self.main.sp_doc:
            # Already Matched
            if token in tokens:
                filtered.append(token)
            
            # Comparative Adjective
            # Looking for words like "bigger" and "better".
            elif token.pos_ == "ADJ" and token.tag_ == "JJR":
                filtered.append(token)
            
        return filtered

In [64]:
class TraitKeywords(Keywords):
    """Keyword parser for nouns that name a trait (size, rate, diet, ...).

    The regexes are forwarded to ``Keywords.__init__``. Some entries are
    dictionaries that attach "types" to a regex; ``Main`` later reads these
    types from ``mapped_token_data``. After the base class has matched tokens,
    ``filter_tokens`` keeps only those whose surrounding unit contains a
    species.
    """

    # Type labels that can be attached to a regex entry.
    FOOD = "Food"
    NOT_APPLICABLE = "N/A"
    
    def __init__(self, main):
        """Register the trait regexes with the base parser.

        Parameters:
            main: Owner object; this class reads ``main.sp_doc``,
                ``main.index_map``, ``main.expand_unit`` and ``main.species``.
        """
        super().__init__(
            main, 
            # Every entry must end with a comma: Python silently joins
            # adjacent string literals into one string, even when comment
            # lines sit between them.
            regexes=[
                r"behaviou?r", 
                r"[^A-Za-z]+rate", 
                "colou?r",
                "biomass",
                r"[^A-Za-z]+mass", 
                r"[^A-Za-z]+size",
                "number",
                "length", 
                "pattern", 
                "weight",
                "shape", 
                "efficiency", 
                "trait",
                "phenotype",
                "demography",
                "scent",
                "population (structure|mechanic)s?",
                "ability", 
                "capacity", 
                "height", 
                "width", 
                "[A-Za-z]+span",
                {"regex": "diet", "types": [TraitKeywords.FOOD]},
                {"regex": "food", "types": [TraitKeywords.FOOD, TraitKeywords.NOT_APPLICABLE]},
                {"regex": "feeding", "types": [TraitKeywords.FOOD]},
                "nest",
                "substrate",
                "breeding",
                r"[^A-Za-z]+age[^A-Za-z]+",
                "lifespan",
                "development",
                "output",
                "time",
                "period",
                # "mating",
                # "[^A-Za-z]+fur",
                # "feathers",
                # "scales",
                # "skin",
                # "limb",
                "level",
                "configuration",
                "dimorphism",
                "capability",
                # "appendages",
                # "blood",
                "regulation",
                "excretion",
                "luminescence",
                r"[^A-Za-z]+role",
                # "reproduction",
                # "courtship",
                # "pollination",
                # "mechanism",
                "sensitivity",
                "resistance",
                r"(un|(^|\s)[A-Za-z]*-)infected",
                "temperature",
                # "fecundity",
                "density"
            ],
            def_pos=["NOUN"]
        )


    
    def update(self, verbose=False):
        """Run the base-class update, then post-filter the matched tokens."""
        Keywords.update(self, verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)


    
    def filter_tokens(self, tokens, verbose=False):
        """Keep only the trait tokens whose surrounding unit mentions a species.

        For each token, ``main.expand_unit`` is called with the token's
        sentence as the boundary and ``speech=["PUNCT"]``, ``include=False``;
        the token is kept when ``main.species.has_species`` accepts the result.
        NOTE(review): presumably this grows the unit outwards until
        punctuation is hit - confirm against ``expand_unit``.

        Parameters:
            tokens: Tokens already matched by the base class.
            verbose: When true (and ``VERBOSE_LEVEL >= 1``) the token lists
                before and after filtering are printed; also forwarded to
                the helpers.

        Returns:
            list: The kept tokens, in the order they appear in ``tokens``.

        Raises:
            Exception: "DNE" when ``main.sp_doc`` or ``main.index_map`` is
                missing or empty.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Unfiltered Trait Tokens: {tokens}")
        
        filtered = []
        for token in tokens:
            expanded_token = self.main.expand_unit(
                il_unit=token.i, 
                ir_unit=token.i, 
                il_boundary=token.sent.start, 
                ir_boundary=token.sent.end-1, 
                speech=["PUNCT"],
                include=False,
                verbose=verbose
            )

            if self.main.species.has_species(expanded_token, verbose=verbose):
                filtered.append(token)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Filtered Trait Tokens: {filtered}")
        
        return filtered

In [65]:
class TestKeywords(Keywords):
    """Keyword parser for words that announce a test or comparison.

    Only configures the base ``Keywords`` parser; it adds no filtering of
    its own.
    """

    def __init__(self, main):
        """Forward the test vocabulary and matching defaults to ``Keywords``."""
        test_vocab = ["compare", "examine", "evaluate", "assess"]
        allowed_pos = ["VERB", "NOUN"]

        super().__init__(main, vocab=test_vocab, def_pos=allowed_pos, def_threshold=0.8)

In [66]:
class VariabilityKeywords(Keywords):
    """Keyword parser for wording that suggests conditions are being varied.

    Combines three kinds of matchers, all forwarded to ``Keywords.__init__``:
    vocabulary entries with per-word parts of speech, regexes, and SpaCy
    token patterns.
    """

    def __init__(self, main):
        """Register the variability vocabulary, regexes and token patterns."""
        super().__init__(
            main,
            vocab=[
                {"word": "different", "pos": ["ADJ", "NOUN"]},
                {"word": "vary", "pos": ["VERB", "NOUN"]},
                {"word": "varied", "pos": ["VERB", "NOUN"]}
            ],
            regexes=[
                r"between",
                r"against",
                r"independen(t|ts|tly|cy)",
                r"dependen(t|ts|tly|cy)",
                r"treatments?",
                r"effect",
                r"control",
                r"(with|without)[A-Za-z]*(with|without)",
                # A negated word followed later by the same word, and the
                # reverse order, e.g. "un-X ... X" / "X ... not X".
                # The separator set is "-" or " "; inside [...] a "|" would
                # be a literal pipe character, not alternation.
                r"(^| )(un|not)[- ]?([A-Za-z]+) [^!;?.\n]* \3",
                r"([A-Za-z]+) [^!;?.\n]* (un|not)[- ]?\1( |$)",
                
            ],
            patterns=[
                # "neither/either/both ... or/and" without crossing a "." tag.
                [[{"LOWER": {"IN": ["neither", "either", "both"]}}, {"OP": "*", "TAG": {"NOT_IN": ["."]}}, {"LOWER": {"IN": ["or", "and"]}}]],
                # "with/without ... with/without" without crossing a "." tag.
                [[{"LOWER": {"IN": ["with", "without"]}}, {"OP": "*", "TAG": {"NOT_IN": ["."]}}, {"LOWER": {"IN": ["with", "without"]}}]],
                # "at" followed by a number.
                [[{"LOWER": {"IN": ["at"]}}, {"POS": "NUM"}]],
                # "at" followed by a quantity word.
                [[{"LOWER": {"IN": ["at"]}}, {"LOWER": {"IN": ["several", "unique", "multiple", "different"]}}]],
            ],
        )

In [72]:
class Main(Base):
    def __init__(self):
        """Load the NLP tools and create one parser per scoring category.

        No document is attached yet: ``sp_doc`` and ``index_map`` stay
        ``None`` until ``update_doc`` or ``update_text`` is called.
        """
        # Tools
        self.sp_nlp = spacy.load("en_core_web_lg")
        self.fcoref = FCoref(enable_progress_bar=False, device='cpu')
        self.sp_doc = None

        # Maps Character Position to Token in Document
        # Used to handle differences between different
        # pipelines and tools.
        self.index_map = None
    
        # Parsers
        # Each parser receives this instance as its `main` argument.
        self.species = Species(self)
        self.trait = TraitKeywords(self)
        self.cause = CauseKeywords(self)
        self.change = ChangeKeywords(self)
        self.experiment = ExperimentKeywords(self)
        self.not_experiment = NegativeExperimentKeywords(self)
        self.not_topic = NegativeTopicKeywords(self)
        self.variability = VariabilityKeywords(self)
        self.test = TestKeywords(self)

        # Helper
        # Base.__init__ stores `main` (here: this same instance) and the
        # default noun tables. It runs after the parsers are constructed.
        super().__init__(self)


    
    def update_doc(self, doc, verbose=False):
        """Attach an already-parsed SpaCy doc and refresh every parser.

        The character index map is rebuilt first. The species parser is then
        updated with the doc's text, followed by the keyword parsers in a
        fixed order. Only the trait parser receives the caller's ``verbose``
        flag; all other parsers are always updated with ``verbose=False``.
        """
        self.sp_doc = doc
        self.index_map = self.load_index_map()

        self.species.update(doc.text, verbose=False)
        self.trait.update(verbose=verbose)

        # Remaining parsers, in update order. Each is looked up right before
        # it is updated.
        quiet_parsers = (
            "cause",
            "change",
            "experiment",
            "not_experiment",
            "not_topic",
            "variability",
            "test",
        )
        for parser_name in quiet_parsers:
            getattr(self, parser_name).update(verbose=False)


        
    def update_text(self, text, verbose=False):
        """Parse raw text with the SpaCy pipeline, then refresh every parser."""
        parsed = self.sp_nlp(text)
        self.sp_doc = parsed
        self.update_doc(parsed, verbose=verbose)


        
    def token_at_char(self, char_index):
        """Return the token that covers the given character position.

        Raises:
            Exception: "DNE" when no doc or index map is loaded (or either is
                empty); "Token at Index ... Not Found" when the position is
                not covered by any token.
        """
        # SpaCy Doc or Indexing Map Not Found
        if not (self.sp_doc and self.index_map):
            raise Exception("DNE")

        if char_index not in self.index_map:
            raise Exception(f"Token at Index {char_index} Not Found")

        return self.index_map[char_index]


        
    def load_index_map(self):
        """Build a dictionary from character position to the covering token.

        Every position from ``token.idx`` up to, but not including,
        ``token.idx + len(token)`` maps to that token. Positions between
        tokens get no entry.

        Raises:
            Exception: "DNE" when no doc is loaded.
        """
        # SpaCy Doc Not Found
        if self.sp_doc is None:
            raise Exception("DNE")

        # Map Character Index to Token
        return {
            char_pos: tok
            for tok in self.sp_doc
            for char_pos in range(tok.idx, tok.idx + len(tok))
        }


        
    def valid_trait_token(self, token, sent_cause_tokens, sent_change_tokens, verbose=False):
        """Return the scale factor to apply to a trait token's points.

        Returns:
            0 when the token is not a trait token or carries the
            ``NOT_APPLICABLE`` type; 1.0 when a cause or change token of the
            sentence lies in the token's context; 0.25 otherwise.
        """
        if token not in self.trait.tokens:
            return 0

        trait_types = self.trait.mapped_token_data[token]["types"]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Token '{token}' Types: {trait_types}")

        if TraitKeywords.NOT_APPLICABLE in trait_types:
            return 0

        # Tokens that find_unit_context reports around this token, bounded
        # by the token's own sentence.
        sentence = token.sent
        neighbours = set(self.find_unit_context(
            il_unit=token.i,
            ir_unit=token.i,
            il_boundary=sentence.start,
            ir_boundary=sentence.end-1,
            verbose=verbose
        ))

        near_cause = not neighbours.isdisjoint(set(sent_cause_tokens))
        near_change = not neighbours.isdisjoint(set(sent_change_tokens))

        return 1.0 if (near_cause or near_change) else 0.25


    
    def valid_species_token(self, token, sent_cause_tokens, sent_change_tokens, verbose=False):
        """Return the scale factor to apply to a species token's points.

        Returns:
            0 when the token is not a species token; 1 when a cause or change
            token of the sentence lies in the token's context; 0.25 otherwise.
        """
        if token not in self.species.tokens:
            return 0

        # Tokens that find_unit_context reports around this token, bounded
        # by the token's own sentence.
        sentence = token.sent
        neighbours = set(self.find_unit_context(
            il_unit=token.i,
            ir_unit=token.i,
            il_boundary=sentence.start,
            ir_boundary=sentence.end-1,
            verbose=verbose
        ))

        near_cause = not neighbours.isdisjoint(set(sent_cause_tokens))
        near_change = not neighbours.isdisjoint(set(sent_change_tokens))

        return 1 if (near_cause or near_change) else 0.25


    
    def update_seen_species(self, token, seen_species, sent_seen_species, sent_num_unique_species, verbose=False):
        """Record the species span at ``token`` in the text- and sentence-level tallies.

        ``seen_species`` and ``sent_seen_species`` are modified in place and
        also returned.

        Parameters:
            token: Token used to look up the species span via
                ``species.span_at_token``.
            seen_species: Dict mapping a species span to the number of times
                that species has been seen in the text so far.
            sent_seen_species: List of species spans seen in the current
                sentence so far.
            sent_num_unique_species: Number of distinct species seen in the
                current sentence so far.
            verbose: Forwarded to ``species.find_same_species``.

        Returns:
            tuple: ``(seen_species, sent_seen_species, seen_in_sent,
            sent_num_unique_species)`` where ``seen_in_sent`` tells whether
            the same species had already been seen in this sentence.
        """
        span = self.species.span_at_token(token)

        # Update Seen Species in Text
        # Count the visit under the earlier reference to the same species if
        # there is one, otherwise start a new entry for this span.
        prev_ref = self.species.find_same_species(seen_species.keys(), span, verbose=verbose)
        
        if prev_ref:
            seen_species[prev_ref] += 1
        else:
            seen_species[span] = 1

        # Check Seen Species in Sentence
        # The caller only awards points for a species that has not been seen
        # in the sentence yet, to avoid redundant points. The check is made
        # before the span is appended, so a span never matches itself.
        seen_in_sent = bool(self.species.find_same_species(sent_seen_species, span, verbose=verbose))        
        sent_seen_species.append(span)

        # Update Number of Unique Species in Sentence
        if not seen_in_sent:
            sent_num_unique_species += 1

        return seen_species, sent_seen_species, seen_in_sent, sent_num_unique_species


    
    def valid_trait_variation(self, verbose=False):
        """Return the best "trait variation" score over all sentence pairs (i, j), j >= i.

        Sentence I must contain a test or experiment token and a trait token.
        Sentence J (I itself or a later sentence) must contain a species
        token and a cause or change token. Each qualifying pair gets
        ``i_scale * (points_i + j_scale * points_j)``, optionally multiplied
        by 0.6375; the maximum over all pairs is returned. 0 is returned when
        no pair qualifies.

        Parameters:
            verbose: When true, progress is printed according to
                ``VERBOSE_LEVEL``; also forwarded to ``find_unit_context``.

        Returns:
            The maximum pair score. The assertions below bound both point
            terms by 0.5 and both scales by 1.
        """
        max_trait_variation_points = 0
        
        sentences = list(self.sp_doc.sents)
        num_sentences = len(sentences)

        for i in range(num_sentences):
            sent_i = sentences[i]
            sent_i_tokens = set([token for token in sent_i])

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tSentence I: {sent_i}")
            
            sent_i_test_tokens = sent_i_tokens.intersection(self.test.tokens)
            sent_i_experiment_tokens = sent_i_tokens.intersection(self.experiment.tokens)

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tSentence I Test Tokens: {sent_i_test_tokens}")
                print(f"\tSentence I Experiment Tokens: {sent_i_experiment_tokens}")

            # Sentence I needs at least one test or experiment token.
            if not sent_i_test_tokens and not sent_i_experiment_tokens:
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tNo Experiment or Test Tokens in Sentence I")
                continue

            trait_variation_points_i = 0

            # Base points for sentence I: 0.10 for an experiment token,
            # overwritten with 0.25 when a test token is present.
            if sent_i_experiment_tokens:
                trait_variation_points_i = 0.10
            
            if sent_i_test_tokens:
                trait_variation_points_i = 0.25

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tTrait Variation Points for I: {trait_variation_points_i}")

            sent_i_trait_tokens = sent_i_tokens.intersection(self.trait.tokens)

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tSentence I Trait Tokens: {sent_i_trait_tokens}")

            # Sentence I also needs at least one trait token.
            if not sent_i_trait_tokens:
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tNo Trait Tokens in Sentence I")
                continue

            # "Variables" are the trait tokens found in the context of a
            # variability token in sentence I.
            variables = []
            sent_i_variability_tokens = sent_i_tokens.intersection(self.variability.tokens)

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tSentence I Variability Tokens: {sent_i_variability_tokens}")

            # The final score is reduced only when sentence I has no
            # variability token at all.
            deduct_points = not sent_i_variability_tokens
            
            if sent_i_variability_tokens:
                for token in sent_i_variability_tokens:
                    trait_in_context = set(self.find_unit_context(
                        il_unit=token.i, 
                        ir_unit=token.i, 
                        il_boundary=token.sent.start,
                        ir_boundary=token.sent.end-1, 
                        speech=["NOUN", "ADJ", "ADV", "PROPN", "ADP", "CCONJ"],
                        include=True,
                        comma_encloses=True,
                        verbose=verbose
                    )).intersection(self.trait.tokens)

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tVariability Token '{token}' Traits in Context: {trait_in_context}") 

                    if not trait_in_context:
                        if verbose and VERBOSE_LEVEL >= 3:
                            print(f"\t\tNo Traits in Variability Token '{token}' Context")
                        continue

                    # NOTE(review): deduct_points is already False whenever
                    # this loop runs, so this assignment never changes it.
                    # Confirm whether the deduction was also meant to apply
                    # when no variability token has a trait in its context.
                    deduct_points = False
                    variables.extend(trait_in_context)

            # Remove Duplicate Variables
            variables = list(set(variables))

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tVariables: {variables}")
            
            # A sentence with identified variables earns more than one with
            # trait tokens only.
            if variables:
                trait_variation_points_i += 0.25
            else:
                trait_variation_points_i += 0.15

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tTrait Variation Points for I: {trait_variation_points_i}")

            assert trait_variation_points_i <= 0.5

            # Pair sentence I with itself and with every later sentence.
            for j in range(i, num_sentences):
                sent_j = sentences[j]
                sent_j_tokens = set([token for token in sent_j])

                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tSentence J: {sent_j}")

                sent_j_cause_tokens = sent_j_tokens.intersection(self.cause.tokens)
                sent_j_change_tokens = sent_j_tokens.intersection(self.change.tokens)
                sent_j_species_tokens = sent_j_tokens.intersection(self.species.span_starts)
                sent_j_trait_tokens = sent_j_tokens.intersection(self.trait.tokens)

                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tSentence J Cause Tokens: {sent_j_cause_tokens}")
                    print(f"\tSentence J Change Tokens: {sent_j_change_tokens}")
                    print(f"\tSentence J Species Tokens: {sent_j_species_tokens}")
                    print(f"\tSentence J Trait Tokens: {sent_j_trait_tokens}")
                
                # Sentence J needs a species and a cause or change token.
                if not sent_j_species_tokens or (not sent_j_cause_tokens and not sent_j_change_tokens):
                    if verbose and VERBOSE_LEVEL >= 2:
                        print(f"\tUnsatisfied Conditions for Sentence J")
                    continue

                trait_variation_points_j = 0
                
                if not sent_j_trait_tokens or not variables:
                    trait_variation_points_j += 0.25
                elif i != j:
                    # Check if Variable Referenced Again via Types
                    variable_types = set(self.flatten([self.trait.mapped_token_data[token]["types"] for token in variables]))
                    sent_j_trait_types = set(self.flatten([self.trait.mapped_token_data[token]["types"] for token in sent_j_trait_tokens]))

                    if verbose and VERBOSE_LEVEL >= 2:
                        print(f"\tVariable Types: {variable_types}")
                        print(f"\tTrait Types in Sentence J: {sent_j_trait_types}")
                        
                    # Check if Variable Referenced Again via Literals
                    variable_strings = set([token.lower_ for token in variables])
                    sent_j_trait_strings = set([token.lower_ for token in sent_j_trait_tokens])

                    if verbose and VERBOSE_LEVEL >= 2:
                        print(f"\tVariable Trait (as Strings): {variable_strings}")
                        print(f"\tTrait (as Strings) in Sentence J: {sent_j_trait_strings}")
                    
                    variable_referenced = bool(variable_types & sent_j_trait_types) or bool(variable_strings & sent_j_trait_strings)

                    if verbose and VERBOSE_LEVEL >= 2:
                        print(f"\tVariable Referenced? {variable_referenced}")
                    
                    # A later sentence that refers back to a variable earns
                    # the full 0.50; otherwise it earns 0.25.
                    if variable_referenced:
                        trait_variation_points_j += 0.50
                    else:
                        trait_variation_points_j += 0.25
                elif i == j:
                    trait_variation_points_j += 0.25

                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tTrait Variation Points for J: {trait_variation_points_j}")

                assert trait_variation_points_j <= 0.5

                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\ti: {i}")
                    print(f"\tj: {j}")

                # Scale by Distance In-Between
                # A same-sentence pair gets a fixed 0.5. Otherwise the scale
                # shrinks linearly with the number of sentences between I
                # and J. j != i implies at least two sentences, so the
                # divisor is never zero.
                if j == i:
                    j_scale = 0.5
                else:
                    j_scale = (1 - ((j - i) / (num_sentences - 1)))
                
                assert j_scale <= 1

                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tScale for Sentence J Points: {j_scale}")
                    
                trait_variation_points = trait_variation_points_i + j_scale * trait_variation_points_j

                # Scale by Distance from Top
                # The first sentence gets scale 1 and the last sentence gets
                # scale 0.
                # NOTE(review): i_scale depends only on i and num_sentences,
                # so it could be computed once per outer iteration.
                if num_sentences == 1:
                    i_scale = 1
                else:
                    i_scale = (1 - i/(num_sentences - 1))
                    
                assert i_scale <= 1

                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tScale for Sentence I Points: {i_scale}")
                
                trait_variation_points *= i_scale

                # Penalty for a sentence I without any variability token.
                if deduct_points:
                    trait_variation_points *= 0.6375
                
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tTrait Variation Points: {trait_variation_points}")
                
                max_trait_variation_points = max(max_trait_variation_points, trait_variation_points)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Max Trait Variation Points: {max_trait_variation_points}")
            
        return max_trait_variation_points


        
    def score(self, verbose=False):
        """Score the current document and return ``(score, points)``.

        Five categories (trait, species, experiment, interaction, not-topic)
        collect points sentence by sentence, capped per sentence by the MPC.
        Each total is then turned into a fraction of the maximum reachable
        points; the not-topic fraction is inverted. The sixth category,
        trait variation, comes from ``valid_trait_variation``. The score is
        the weighted sum of the fractions, each clamped to [0, 1].

        Parameters:
            verbose: When true, progress is printed according to
                ``VERBOSE_LEVEL``.

        Returns:
            tuple: ``(score, points)``. ``points`` is the list of per-category
            fractions (not clamped), indexed by the category constants below.
            ``score`` is 0 when fewer than three distinct species were seen
            in the text.

        NOTE(review): a document without sentences reaches a division by
        zero in the fraction calculation when the leniency is 0; callers are
        expected to skip empty documents.
        """
        NUM_CATEGORIES = 6

        TRAIT = 0
        SPECIES = 1
        EXPERIMENT = 2
        INTERACTION = 3
        NOT_TOPIC = 4
        TRAIT_VARIATION = 5

        # Max # of Points of Category per Sentence (MPC)
        # A category can collect points from each sentence. However,
        # there's a maximum number of points it can collect. This is
        # determined by the MPC.
        MPC = [1] * NUM_CATEGORIES
    
        # Points per Instance of Category (PIC)
        # Each token is evaluated to check whether a category
        # can be given points. The number of points given, if
        # the token is determined to be satisfactory, is the PIC.
        # The PIC is less than or equal to the MPC for the corresponding
        # category. The idea behind the PIC and MPC is similar to how
        # sets work in tennis: you're not immediately awarded the full points
        # for the set (MPC) if your opponent fails to return the ball,
        # instead you're given a smaller # of points (PIC) that allow you to
        # incrementally win the set (category).
        PIC = [0] * NUM_CATEGORIES
        PIC[TRAIT] = MPC[TRAIT]*1.0
        PIC[SPECIES] = MPC[SPECIES]/3.0
        PIC[EXPERIMENT] = MPC[EXPERIMENT]*0.625
        PIC[INTERACTION] = MPC[INTERACTION]/3.0
        PIC[NOT_TOPIC] = MPC[NOT_TOPIC]*1.0

        for i in range(NUM_CATEGORIES):
            assert 0 <= PIC[i] <= MPC[i]

        # Category Weights (CW)
        # It may be helpful to weigh a certain category's fraction of total points
        # more or less than another's. Thus, at the end, we'll take a
        # weighted average of the category's FTP. The weights must add up to 1.
        CW = [0] * NUM_CATEGORIES
        CW[TRAIT] = 0.3
        CW[SPECIES] = 0.1
        CW[EXPERIMENT] = 0.1
        CW[INTERACTION] = 0.1
        CW[NOT_TOPIC] = 0.1
        CW[TRAIT_VARIATION] = 0.3

        # Compare with a floating-point tolerance. Rounding the sum to an
        # integer would also accept sums such as 0.6 or 1.4.
        assert np.isclose(np.sum(CW), 1.0)

        # Leniency
        # There are certain categories that aren't going to be as frequent as others.
        # For example, the trait category. You could try and decrease the influence
        # of said category by lowering its MPC and/or increasing the PIC (so that it's
        # easier to achieve the FTP). However, this could make it harder to meaningfully
        # represent the category. The idea of leniency is to remove (some) sentences that had 0
        # points from the scoring. This increases the FTP as, for example, instead of comparing
        # 0.5 points to a total of 2.5 points, you can compare 0.5 to 2.0 points, and so on.
        # A leniency of 1 means that all sentences that received 0 points will be removed from
        # the scoring. A leniency of 0 means that all the sentences are included in the scoring.
        LEN = [0] * NUM_CATEGORIES
        LEN[TRAIT] = 0
        LEN[SPECIES] = 0
        LEN[EXPERIMENT] = 0
        LEN[INTERACTION] = 0
        LEN[NOT_TOPIC] = 0
        
        # Points
        points = [0] * NUM_CATEGORIES
        num_zero_pt_sents = [0] * NUM_CATEGORIES
        seen_species = {}

        if verbose and VERBOSE_LEVEL >= 1:
            print("Extracted Information")
            print(f"Cause Tokens: {self.cause.tokens}")
            print(f"Change Tokens: {self.change.tokens}")
            print(f"Trait Tokens: {self.trait.tokens}")
            print(f"Species Tokens: {self.species.tokens}")
            print(f"Experiment Tokens: {self.experiment.tokens}")
            print(f"Not-Experiment Tokens: {self.not_experiment.tokens}")
            print(f"Not-Topic Tokens: {self.not_topic.tokens}")
            print(f"Variability Tokens: {self.variability.tokens}")
            print(f"Test Tokens: {self.test.tokens}")
            
        for sent in self.sp_doc.sents:
            # Current Points in Sentence
            curr_points = [0] * NUM_CATEGORIES

            # Sentence Local Info
            sent_tokens = [token for token in sent]
            sent_cause_tokens = set(sent_tokens).intersection(self.cause.tokens)
            sent_change_tokens = set(sent_tokens).intersection(self.change.tokens)
            sent_seen_species = []
            sent_num_unique_species = 0

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tSentence: {sent}")
                print(f"\tSentence Cause Tokens: {sent_cause_tokens}")
                print(f"\tSentence Change Tokens: {sent_change_tokens}")

            
            for token in sent_tokens:
                # If each category has reached their maximum number of points,
                # we can end the loop early. Trait variation is not scored
                # per token, so it is left out of the check.
                all_maxed = all(
                    curr_points[i] >= MPC[i]
                    for i in range(NUM_CATEGORIES)
                    if i != TRAIT_VARIATION
                )

                if all_maxed:
                    break

                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tToken in Sentence: {token}")
                
                # Not Topic Points
                if curr_points[NOT_TOPIC] < MPC[NOT_TOPIC]:
                    if token in self.not_topic.tokens:
                        curr_points[NOT_TOPIC] += PIC[NOT_TOPIC]

                        if verbose and VERBOSE_LEVEL >= 3:
                            print(f"\t\t+ Points for Not-Topic")

                        
                # Trait Points
                if curr_points[TRAIT] < MPC[TRAIT]:
                    if token in self.trait.tokens:
                        scale = self.valid_trait_token(token, sent_cause_tokens, sent_change_tokens, verbose=verbose)
                        curr_points[TRAIT] += scale * PIC[TRAIT]

                        if verbose and VERBOSE_LEVEL >= 3 and scale:
                            print(f"\t\t+ Points for Trait")

                        
                # Not Experiment Points
                # A not-experiment token costs twice what an experiment
                # token earns.
                if token in self.not_experiment.tokens:
                    curr_points[EXPERIMENT] -= 2 * PIC[EXPERIMENT]

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\t- Points for Experiment")

                
                # Experiment Points
                elif curr_points[EXPERIMENT] < MPC[EXPERIMENT]:
                    if token in self.experiment.tokens:
                        curr_points[EXPERIMENT] += PIC[EXPERIMENT]

                        if verbose and VERBOSE_LEVEL >= 3:
                            print(f"\t\t+ Points for Experiment")

                        
                # Species and/or Interaction Points
                if token in self.species.span_starts:
                    # Update Species
                    seen_species, sent_seen_species, seen_in_sent, sent_num_unique_species = self.update_seen_species(
                        token, 
                        seen_species, 
                        sent_seen_species, 
                        sent_num_unique_species,
                        verbose=verbose
                    )
                    
                    # A species repeated within the sentence earns nothing.
                    if seen_in_sent:
                        if verbose and VERBOSE_LEVEL >= 3:
                            print(f"\t\tAlready Seen Species '{token}' in Sentence")
                        continue

                    
                    # Interaction Points
                    # The second unique species sets the points to two
                    # instances at once; each further unique species adds
                    # one instance.
                    if curr_points[INTERACTION] < MPC[INTERACTION]:
                        if sent_num_unique_species == 2:
                            curr_points[INTERACTION] = 2.0 * PIC[INTERACTION]

                            if verbose and VERBOSE_LEVEL >= 3:
                                print(f"\t\t+ Points for Interaction")
                            
                        elif sent_num_unique_species > 2:
                            curr_points[INTERACTION] += PIC[INTERACTION]

                            if verbose and VERBOSE_LEVEL >= 3:
                                print(f"\t\t+ Points for Interaction")


                    # Species Points
                    if curr_points[SPECIES] < MPC[SPECIES]:
                        scale = self.valid_species_token(token, sent_cause_tokens, sent_change_tokens)
                        curr_points[SPECIES] += scale * PIC[SPECIES]

                        if verbose and VERBOSE_LEVEL >= 3 and scale:
                            print(f"\t\t+ Points for Species")

            
            # Add Sentence Points to Total Points
            # Sentence points are clamped to [0, MPC] before being added.
            for i in range(NUM_CATEGORIES):
                if curr_points[i] <= 0:
                    num_zero_pt_sents[i] += 1
                points[i] += max(0, min(curr_points[i], MPC[i]))

        
        # Trait Variation Points
        points[TRAIT_VARIATION] = self.valid_trait_variation(verbose=verbose)

        
        # Calculating Score            
        NUM_SENTENCES = len(list(self.sp_doc.sents))
        score = 0
        
        for i in range(NUM_CATEGORIES):
            if i != TRAIT_VARIATION:
                num_non_zero_pt_sents = NUM_SENTENCES - num_zero_pt_sents[i]
                lenient_num_sentences = max(num_non_zero_pt_sents, (1 - LEN[i]) * NUM_SENTENCES)
    
                # Calculating FTP
                points[i] = points[i] / (MPC[i] * lenient_num_sentences)
    
                # Take the Inverse for Not-Topic
                if i == NOT_TOPIC:
                    points[i] = 1 - points[i]
    
            # Add onto Score
            score += max(0, min(points[i], 1)) * CW[i]

        # Enforcing 3 or More Species            
        if len(seen_species) < 3:
            return 0, points
            
        assert 0.0 <= score <= 1.0

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Score, Points: {score}, {points}")
    
        return score, points

In [73]:
def score_dataset(name, save_output=False, version=""):
    """Score every abstract of a preprocessed dataset and return the ranked frame.

    Parameters
    ----------
    name : str
        Dataset identifier handed to ``load_preprocessed_dataset``; it is also
        part of the log file name when ``save_output`` is set.
    save_output : bool, default False
        When True, scoring runs verbosely and everything printed while the
        dataset is loaded and scored is written to
        ``./Print{name}[-{version}].txt`` instead of the notebook. When False,
        the cell output is cleared after every document so only the latest
        progress line stays visible.
    version : str, default ""
        Optional suffix appended (after a dash) to the log file name.

    Returns
    -------
    pandas.DataFrame
        The loaded dataset with a "Score" column and one column per points
        category, sorted by "Score" in descending order.
    """
    # Output column for each position of the points sequence returned by
    # main.score(); the list index is the position that gets read.
    point_columns = [
        "Trait Points",
        "Species Points",
        "Experiment Points",
        "Interaction Points",
        "Negative Topic Points",
        "Trait Variation Points",
    ]

    # Redirect Print Statements
    # https://stackoverflow.com/questions/7152762/how-to-redirect-print-output-to-a-file
    initial_stdout = sys.stdout
    log_file = None
    if save_output:
        suffix = f"-{version}" if version else ""
        # Open as UTF-8 up front instead of reconfiguring the stream afterwards.
        log_file = open(f'./Print{name}{suffix}.txt', 'w', encoding='utf-8')
        sys.stdout = log_file

    scores = []
    category_points = {column: [] for column in point_columns}

    try:
        # Load Dataset (inside the redirect so its prints land in the log too)
        data = load_preprocessed_dataset(name)

        # We'll be running the points algorithm on the abstracts of these
        # papers. Missing abstracts become "" so they follow the
        # empty-document branch below instead of reaching the pipeline as NaN.
        texts = data['Abstract'].fillna("").astype(str).tolist()
        titles = data['Title'].tolist()
        total = data.shape[0]

        # Scan and Evaluate Documents
        main = Main()
        for i, doc in enumerate(main.sp_nlp.pipe(texts)):
            print(f"{i+1}/{total} - {titles[i]}\n")
            main.update_doc(doc, verbose=save_output)

            # Empty string literals cause errors, so they are handled here.
            # Every per-document list must still receive an entry, otherwise
            # the columns assigned below would be shorter than the frame.
            if not main.sp_doc or not main.species.tn_doc:
                score, doc_points = 0, [0] * len(point_columns)
            else:
                score, doc_points = main.score(verbose=save_output)

            scores.append(score)
            for position, column in enumerate(point_columns):
                category_points[column].append(doc_points[position])

            if not save_output:
                clear_output(wait=True)
    finally:
        # Reset Standard Output, even when scoring raised part-way through.
        if log_file is not None:
            sys.stdout = initial_stdout
            log_file.close()

    data["Score"] = scores
    for column in point_columns:
        data[column] = category_points[column]

    return data.sort_values(by='Score', ascending=False)

In [75]:
# Score each source dataset in turn and hand the ranked frame to
# store_scored_dataset under its paired output name.
for source_name, stored_name in (
    ("Baseline-1", "Baseline-3.1"),
    ("Examples", "Examples-3.1"),
):
    scored_data = score_dataset(source_name, save_output=False, version='')
    store_scored_dataset(scored_data, stored_name, version='3')

4/4 - Multiple predator effects result in risk reduction for prey across multiple prey densities



In [None]:
# Score the "DFiltered" dataset (no log file, progress shown in the cell) and
# pass the ranked frame to store_scored_dataset under the name "DFiltered-3.1".
scored_data = score_dataset("DFiltered", save_output=False, version='')
store_scored_dataset(scored_data, "DFiltered-3.1", version='3')

In [70]:
# Load the "Baseline-1" dataset into `data` for the manual inspection cells
# that follow.
# NOTE(review): the execution counts of the surrounding cells are out of order,
# so the outputs shown below may come from a different `data` — confirm with
# Restart & Run All.
data = load_preprocessed_dataset("Baseline-1")

Data Shape: (28, 4)


In [56]:
# Show the rows whose title contains "Impact of". na=False makes rows with a
# missing title count as non-matches; without it the mask would contain NaN
# and boolean indexing with .loc would raise.
data.loc[data['Title'].str.contains('Impact of', na=False)]

Unnamed: 0,Title,Abstract,DOI,Score
23,Impact of intraspecific and intraguild predati...,Exotic predators are more likely to replace re...,,0


In [57]:
index = 23

# One positional row lookup, unpacked straight into the two fields of interest.
title, abstract = data.iloc[index][["Title", "Abstract"]]

# Echo both fields, one per line.
print(f"Title: {title}", f"Abstract: {abstract}", sep="\n")

Title: Impact of intraspecific and intraguild predation on predator invasion and coexistence. Can exotic ladybeetles displace native species
Abstract: Exotic predators are more likely to replace related native species when these species not only compete for similar prey species, but also predate on the offspring of the native predators. In several groups of arthropods, however, this intraguild predation (IGP) is not only mutual, but also co-occurs with intraspecific predation (ISP or cannibalism). These different processes may have counteracting effects on species invasion and coexistence. In this study, we derived simple rules that describe under which combinations of IGP and ISP a predator species is able to invade into a stable predator-prey system, and under which conditions an invasion will results in displacement or in coexistence. This theory is then applied to species pairs of exotic and native lady beetles, to test if differences in IGP and ISP may play a role in the establish

In [71]:
# Run the pipeline on the single abstract selected above with verbose tracing
# switched on, then print the (score, points) pair returned by main.score().
main = Main()
# str() guards against a non-string cell value (e.g. a missing abstract).
main.update_text(str(abstract), verbose=True)
print(main.score(verbose=True))

06/30/2025 21:43:38 - INFO - 	 missing_keys: []
06/30/2025 21:43:38 - INFO - 	 unexpected_keys: []
06/30/2025 21:43:38 - INFO - 	 mismatched_keys: []
06/30/2025 21:43:38 - INFO - 	 error_msgs: []
06/30/2025 21:43:38 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


	'behaviou?r' Regex Matches: []
	'[^A-Za-z]+rate' Regex Matches: []
	'colou?r' Regex Matches: []
	'biomass' Regex Matches: []
	'[^A-Za-z]+mass' Regex Matches: []
	'[^A-Za-z]+size' Regex Matches: [(1223, 1228)]
		Span Matched: size
		Possible Regex Match for Token 'size' (Position: NOUN and Tag: NN)
		Regex Matched Token 'size'
	'number' Regex Matches: []
	'length' Regex Matches: []
	'pattern' Regex Matches: [(3032, 3039)]
		Span Matched: pattern
		Possible Regex Match for Token 'pattern' (Position: NOUN and Tag: NN)
		Regex Matched Token 'pattern'
	'weight' Regex Matches: []
	'shape' Regex Matches: []
	'efficiency' Regex Matches: []
	'trait' Regex Matches: []
	'phenotype' Regex Matches: []
	'demography' Regex Matches: []
	'scent' Regex Matches: []
	'population (structure|mechanic)s?' Regex Matches: []
	'ability' Regex Matches: [(1511, 1518), (4651, 4658)]
		Span Matched: ability
		Possible Regex Match for Token 'ability' (Position: NOUN and Tag: NN)
		Regex Matched Token 'ability'
		Sp