In [None]:
# Trait-Mediated Interaction Modification
# Empirical:
# You could look for words like "hypothesis", "experiment", "found", and "discovered". That may point
# towards there being an experiment in the paper. There are also words like "control group", "compared",
# "findings", "results", "study", and more.
# Qualitative vs. Quantitative:
# To infer whether something is quantitative, you could look for numeric tokens and units.
# However, you can only do so much with the abstract. Therefore, this is likely not good enough.
# Yet, you could still take advantage of words like "fewer" and "increased" to show that there is a change.
# However, this would be more suited for the above category.
# Traits:
# There is no NLP tool for traits that I can use or create so I think that I could instead use keywords.
# For example, "snail feeding rates" is a trait. You may be able to spot this by looking for a word like
# "rate". You'd expand that word to include "snail feeding rates". As "snail" is a species you can infer
# that "rates" is a trait. I would be more decisive and use a dependency parser to ensure that the trait
# is a property of the species (like before). However, with all the cases that may exist, I think checking
# to see whether a species can be found by traveling back and/or forward without finding certain tokens could
# work well enough.
# 3 Species or More:
# This is simple. However, I think using a dictionary and TaxoNerd would be beneficial (for higher accuracy).
# To handle the potential differences in tokenization, character offsets should be used.
# Standardization:
# There is a lot of variance in the scores. To squash this issue, I think that we could assign each sentence
# a value from 0 to 1. We would add these values and divide by the number of sentences. This would result in
# a number that is also from 0 to 1. However, there are categories that we would like to inspect. So, we must
# create an overall score in the interval [0, 1] while also scoring each category. To do so, for each sentence
# we could add a point for each category that is observed. The sentence would receive said score divided by the
# number of categories. At the end, we add up all the sentence scores and divide by the number of sentences.
# The aggregate score for each category would also be divided by the number of sentences.

In [None]:
import re
import csv
import sys
import time
import spacy
import numpy as np
import pandas as pd
import random
import pickle
from fastcoref import FCoref, LingMessCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher
from spacy.language import Language
from IPython.display import clear_output
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
%run -i "../utils.py"

In [None]:
class Help:
    """Text and token-span helpers shared by the other classes in this notebook.

    The string helpers (whitespace cleanup, singular/plural candidates) work on
    plain ``str`` values. The span helpers (``expand_unit``, ``contract_unit``
    and ``find_unit_context``) index into ``self.main.sp_doc``, which they treat
    as an indexable, sliceable sequence of tokens exposing ``pos_``, ``lower_``
    and ``text``.
    """

    def __init__(self, main):
        # Owner object. Only ``main.sp_doc`` is read by this class.
        self.main = main
        # Zero Plurals
        # The singular and plural versions of the words below are the same.
        self.zero_plurals = ["species", "deer", "fish", "moose", "sheep", "swine", "buffalo"]

    def remove_extra_spaces(self, string):
        """Collapse whitespace runs, drop whitespace before ``? . ! ,`` and trim the ends."""
        # Remove Duplicate Spaces
        string = re.sub(r"\s+", " ", string)
        # Remove Spaces Before Punctuation
        string = re.sub(r"\s+([?.!,])", r"\1", string)
        # Remove Outside Spaces
        return string.strip()

    def remove_outer_non_alnum(self, string):
        """Strip every leading and trailing non-alphanumeric character.

        Inner characters are left untouched. Returns an empty string when
        ``string`` has no alphanumeric character at all.
        """
        start, end = 0, len(string)
        # Skip Leading Non-Alphanumeric Characters
        while start < end and not string[start].isalnum():
            start += 1
        # Skip Trailing Non-Alphanumeric Characters
        while end > start and not string[end-1].isalnum():
            end -= 1
        return string[start:end]

    def singularize(self, string):
        """Return the candidate singular versions of ``string`` as a list.

        Only the last space-separated word is changed. The string to
        singularize should not have any non-alphanumeric characters at the
        end, or else the suffix rules will not apply.

        A list is always returned (also for zero plurals and for words without
        a recognized plural ending, which are returned unchanged) so callers
        can safely ``extend`` with, or build a ``set`` from, the result.
        """
        words = string.split(" ")

        # If the last word in the string is a zero plural,
        # there's no changes to make. For example, "red
        # sheep" is already singular.
        if words[-1] in self.zero_plurals:
            return [string]

        # We take the singular form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to error), we need to
        # handle them all. If no plural ending is recognized,
        # the word is kept as it is.
        singular_forms = self.singular_form(words[-1]) or [words[-1]]
        return [
            self.remove_extra_spaces(" ".join([*words[:-1], singular_form]))
            for singular_form in singular_forms
        ]

    def singular_form(self, string):
        """Return the candidate singular forms of a single word.

        The list is empty when no plural ending is recognized. Several
        candidates may be returned because the suffix rules are ambiguous.
        """
        versions = []

        # Change -ies to -y
        if string.endswith("ies"):
            versions = [f'{string[:-3]}y']
        # Change -ves to -f and -fe
        elif string.endswith("ves"):
            versions = [f'{string[:-3]}f', f'{string[:-3]}fe']
        # Remove -es or -s
        # "foxes" needs -es removed while "snakes" only needs
        # -s removed. To be safe, both versions are added.
        elif string.endswith("es"):
            versions = [string[:-2], string[:-1]]
        # Change -i to -us
        elif string.endswith("i"):
            versions = [f'{string[:-1]}us']
        # Words ending with -ss or -us (e.g. "grass", "cactus")
        # are treated as already singular.
        elif string.endswith(("ss", "us")):
            versions = []
        # Remove -s
        elif string.endswith("s"):
            versions = [string[:-1]]

        # An empty candidate (e.g. from the word "s") is useless to callers.
        return [version for version in versions if version]

    def pluralize(self, string):
        """Return the candidate plural versions of ``string`` as a list.

        Only the last space-separated word is changed. The string to
        pluralize should not have any non-alphanumeric characters at the
        end, or else the suffix rules will not apply. Zero plurals are
        returned unchanged, as a one-element list.
        """
        words = string.split(" ")

        if words[-1] in self.zero_plurals:
            return [string]

        # We take the plural form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to error), we need to
        # handle them all.
        return [
            self.remove_extra_spaces(" ".join([*words[:-1], plural_form]))
            for plural_form in self.plural_form(words[-1])
        ]

    def plural_form(self, string):
        """Return the candidate plural forms of a single word (never empty)."""
        # Words that end with -us often have
        # two different plural versions: -es and -i.
        # For example, the plural version of cactus
        # can be cactuses or cacti.
        if string.endswith("us"):
            return [f'{string}es', f'{string[:-2]}i']

        # The -es ending is added to the words below.
        if string.endswith(("s", "sh", "ch", "x", "z")):
            return [f'{string}es']

        # Words that end with a consonant followed by 'y'
        # are made plural by replacing the 'y' with -ies.
        # For example, the plural version of canary is
        # canaries.
        if len(string) >= 2 and string[-1] == "y" and string[-2] not in "aeiou":
            return [f'{string[:-1]}ies']

        # The plural version of words ending with -f
        # and -fe aren't clear. To be safe, I will add
        # both versions ("knifes" and "knives"). Words ending
        # with -ff or -ffe (e.g. "cliff", "giraffe") only take -s.
        if string.endswith(("f", "fe")) and not string.endswith(("ff", "ffe")):
            stem = string[:-2] if string.endswith("fe") else string[:-1]
            return [f'{string}s', f'{stem}ves']

        # People add -s or -es to words that end with 'o'.
        # To be safe, both versions are added.
        if len(string) >= 2 and string[-1] == "o" and string[-2] not in "aeiou":
            return [f'{string}s', f'{string}es']

        # Default
        return [f'{string}s']

    def _is_allowed(self, token, speech, literals, include):
        """Tell whether ``token`` may belong to a unit.

        A token is "in the set" when its ``pos_`` is in ``speech`` or its
        ``lower_`` is in ``literals``. With ``include=True`` the set lists the
        allowed tokens; with ``include=False`` it lists the disallowed ones.
        """
        in_set = token.pos_ in speech or token.lower_ in literals
        return in_set if include else not in_set

    def expand_unit(self, *, il_unit, ir_unit, il_boundary, ir_boundary, speech=[], literals=[], include=False, direction='BOTH', verbose=False):
        """Grow the unit ``[il_unit, ir_unit]`` outwards over allowed tokens.

        Parameters:
            il_unit, ir_unit: Inclusive token indices of the unit.
            il_boundary, ir_boundary: Inclusive indices the unit may not pass.
            speech, literals, include: See ``_is_allowed``.
            direction: 'LEFT', 'RIGHT' or 'BOTH'.
            verbose: Currently unused.

        Returns:
            ``self.main.sp_doc[il_unit:ir_unit+1]`` after expanding.
        """
        doc = self.main.sp_doc

        # The boundaries are inclusive indices, so they are clamped to
        # the document to avoid reading outside of it (index -1 would
        # silently wrap around to the last token).
        lowest = max(il_boundary, 0)
        highest = min(ir_boundary, len(doc) - 1)

        # Move Left
        if direction in ['BOTH', 'LEFT']:
            # We assume that the current token is allowed and look at
            # the token to the left. We stop on the first invalid token.
            while il_unit > lowest and self._is_allowed(doc[il_unit-1], speech, literals, include):
                il_unit -= 1

        # Move Right
        if direction in ['BOTH', 'RIGHT']:
            # Likewise, we look at the token to the right to see
            # whether we can expand.
            while ir_unit < highest and self._is_allowed(doc[ir_unit+1], speech, literals, include):
                ir_unit += 1

        assert il_unit >= il_boundary and ir_unit <= ir_boundary
        expanded_unit = doc[il_unit:ir_unit+1]
        return expanded_unit

    def contract_unit(self, *, il_unit, ir_unit, speech=[], literals=[], include=False, direction='BOTH', verbose=False):
        """Shrink the unit ``[il_unit, ir_unit]`` by dropping disallowed edge tokens.

        Tokens are removed from the left edge ('LEFT'), the right edge
        ('RIGHT') or both ('BOTH') until an allowed token is found. At
        least one token is always kept.

        Parameters:
            il_unit, ir_unit: Inclusive token indices of the unit.
            speech, literals, include: See ``_is_allowed``.
            direction: 'LEFT', 'RIGHT' or 'BOTH'.
            verbose: Currently unused.

        Returns:
            ``self.main.sp_doc[il_unit:ir_unit+1]`` after contracting.
        """
        doc = self.main.sp_doc

        # Contract the Left Edge (Move Right)
        if direction in ['BOTH', 'LEFT']:
            # If the current token is not allowed, we contract
            # (remove it). We stop at the first allowed token.
            while il_unit < ir_unit and not self._is_allowed(doc[il_unit], speech, literals, include):
                il_unit += 1

        # Contract the Right Edge (Move Left)
        if direction in ['BOTH', 'RIGHT']:
            while ir_unit > il_unit and not self._is_allowed(doc[ir_unit], speech, literals, include):
                ir_unit -= 1

        assert il_unit <= ir_unit
        contracted_unit = doc[il_unit:ir_unit+1]
        return contracted_unit

    def find_unit_context(self, *, il_unit, ir_unit, il_boundary, ir_boundary, verbose=False):
        """Find the tokens around the unit ``[il_unit, ir_unit]`` that form its context.

        Parameters:
            il_unit, ir_unit: Inclusive token indices of the unit.
            il_boundary, ir_boundary: Inclusive indices the context may not pass.
            verbose: Currently unused.

        Returns:
            If the unit sits between matching group punctuation, the slice of
            the document from the opening to the closing symbol (inclusive).
            Otherwise, a list of tokens: the unit expanded in both directions,
            without any parenthetical that was skipped on the way.
        """
        doc = self.main.sp_doc

        # The boundaries are inclusive, so they are clamped to the document.
        il_boundary = max(il_boundary, 0)
        ir_boundary = min(ir_boundary, len(doc) - 1)

        # Caveat: Parentheticals
        # The context of a unit inside of parentheses should not
        # go farther than the boundaries of those parentheses.
        # However, we need to manually determine whether the unit
        # is in parentheses (or any set of the matching symbols
        # below).
        matching_puncts = {
            "[": "]",
            "(": ")",
            "-": "-",
            "--": "--",
            "{": "}"
        }

        # The opening symbols for group punctuation.
        opening_puncts = list(matching_puncts.keys())

        # The closing symbols for group punctuation.
        closing_puncts = list(matching_puncts.values())

        # Both the opening and closing symbols above.
        puncts = [*closing_puncts, *opening_puncts]

        # Look for Group Punctuation on the Left
        # Indices are tracked (rather than tokens) so that a symbol
        # at index 0 is not mistaken for "nothing found".
        l_punct_i = None
        for i in range(il_unit, il_boundary - 1, -1):
            if doc[i].text in puncts:
                l_punct_i = i
                break

        # Look for Group Punctuation on the Right
        r_punct_i = None
        for i in range(ir_unit, ir_boundary + 1):
            if doc[i].text in puncts:
                r_punct_i = i
                break

        # If there's a group punctuation on the left
        # and right, and they match each other (e.g. '(' and ')'),
        # we return the text between the punctuations.
        if l_punct_i is not None and r_punct_i is not None:
            if matching_puncts.get(doc[l_punct_i].lower_, '') == doc[r_punct_i].text:
                return doc[l_punct_i:r_punct_i+1]

        # As the unit is not a parenthetical, we will expand
        # outwards until we run into a stopping token. The excluded
        # set contains the indices of tokens that should be left out
        # of the context. Currently, it will contain any parentheticals
        # that we run into.
        excluded = set()

        # If a token's POS falls into these categories, we will
        # continue. If not, we stop expanding.
        speech = ["ADJ", "NOUN", "ADP", "ADV", "PART", "PROPN", "VERB", "PRON"]

        # Expand Left
        while il_unit > il_boundary:
            # Assuming that the current token is fine,
            # we look to the left.
            l_token = doc[il_unit-1]

            # If it's a closing punctuation (e.g. ')', ']'),
            # we need to skip over whatever is contained in
            # that punctuation.
            if l_token.lower_ in closing_puncts:
                close_i = il_unit - 1
                # The search starts one token before the closing symbol,
                # otherwise a symbol like '-' would match itself.
                open_i = close_i - 1
                while open_i >= il_boundary and matching_puncts.get(doc[open_i].lower_, '') != l_token.lower_:
                    open_i -= 1

                # No matching opening symbol: the closing symbol
                # is treated as a stopping token.
                if open_i < il_boundary:
                    break

                # After we've gone past the parenthetical,
                # we can jump to the next position.
                excluded.update(range(open_i, close_i + 1))
                il_unit = open_i
                continue

            # If it's not a closing punctuation, we check
            # whether it's a stopping token.
            if l_token.pos_ not in speech:
                break
            il_unit -= 1

        # Expand Right
        while ir_unit < ir_boundary:
            # We're checking the token to the right
            # to see if we can expand or not.
            r_token = doc[ir_unit+1]

            # If the token to the right is an opening
            # punctuation (e.g. '(', '['), we must skip
            # it, the parenthetical inside, and the
            # closing punctuation.
            if r_token.lower_ in opening_puncts:
                open_i = ir_unit + 1
                close_i = open_i + 1
                wanted = matching_puncts.get(r_token.lower_, '')
                while close_i <= ir_boundary and doc[close_i].lower_ != wanted:
                    close_i += 1

                # No matching closing symbol: stop expanding.
                if close_i > ir_boundary:
                    break

                # Skip
                excluded.update(range(open_i, close_i + 1))
                ir_unit = close_i
                continue

            # If it's not an opening punctuation, we check
            # whether we can continue expanding.
            if r_token.pos_ not in speech:
                break
            ir_unit += 1

        # We remove the excluded tokens
        # and return the context.
        context = [doc[i] for i in range(il_unit, ir_unit + 1) if i not in excluded]
        return context

In [None]:
# Used for the Dictionary
@Language.component("lower_case_lemmas")
def lower_case_lemmas(doc):
    """Pipeline component that lower-cases the lemma of every token in ``doc``.

    The document is modified in place and returned, so that lemma-based
    patterns can be matched regardless of letter case.
    """
    for tok in doc:
        lowered = tok.lemma_.lower()
        tok.lemma_ = lowered
    return doc

class Species:
    """Finds and tracks the species mentioned in the main document.

    TaxoNERD (extended with a small dictionary of generic words such as
    "prey" and "predator") recognizes species in the text. The recognized
    names are then searched for across the whole of ``main.sp_doc`` so that
    every mention becomes a span of that document.
    """

    def __init__(self, main):
        # Tools
        self.main = main
        self.tn_nlp = TaxoNERD(prefer_gpu=False).load(model="en_ner_eco_biobert", exclude=["tagger", "parser", "attribute_ruler"])
        self.tn_nlp.add_pipe("lower_case_lemmas", after="lemmatizer")
        self.tn_doc = None

        # Contains any spans that have been identified
        # as a species.
        self.spans = None

        # Contains any tokens that have been identified
        # as a species or being a part of a species.
        self.tokens = None

        # Used to quickly access the span that a token
        # belongs to.
        self.token_to_span = None

        # Maps a string to an array of strings wherein
        # the strings involved in the key-value pair
        # have been identified as an alternate name of each other.
        self.alternate_names = None

        # Used to increase TaxoNERD's accuracy.
        self.dictionary = None
        self.load_dictionary()

    def load_dictionary(self):
        """Register the dictionary words as "LIVB" entity patterns (matched by lemma)."""
        self.dictionary = ["juvenile", "adult", "prey", "predator", "species"]
        # df = pd.read_csv("VernacularNames.csv")
        # self.dictionary += df.VernacularName.to_list()

        patterns = []
        for name in self.dictionary:
            doc = self.tn_nlp(name)
            patterns.append({"label": "LIVB", "pattern": [{"LEMMA": token.lemma_} for token in doc]})
        ruler = self.tn_nlp.add_pipe("entity_ruler")
        ruler.add_patterns(patterns)

    def update(self, text, verbose=False):
        """Run TaxoNERD on ``text`` and refresh every species attribute.

        Raises:
            Exception: If ``main.sp_doc`` or ``main.index_map`` is missing or empty.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        self.tn_doc = self.tn_nlp(text)
        self.spans, self.tokens, self.token_to_span, self.alternate_names = self.load_species(verbose=verbose)

    @staticmethod
    def _as_list(value):
        """Wrap a lone string in a list; turn any other iterable into a list."""
        return [value] if isinstance(value, str) else list(value)

    @staticmethod
    def _link_alternate_names(alternate_names, text_1, text_2):
        """Record ``text_1`` and ``text_2`` as alternate names of each other."""
        alternate_names.setdefault(text_1, []).append(text_2)
        alternate_names.setdefault(text_2, []).append(text_1)

    def load_species(self, verbose=False):
        """Locate every species mention in ``main.sp_doc``.

        Returns:
            A tuple ``(spans, tokens, token_to_span, alternate_names)``:
            the species spans sorted by start (one per start index), the
            non-punctuation tokens of those spans, a token -> span mapping,
            and a mapping from a lower-cased name to its alternate names.

        Raises:
            Exception: If ``main.sp_doc`` or ``main.index_map`` is missing or empty.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        sp_doc = self.main.sp_doc
        helper = self.main.help
        symbols = ["PUNCT", "SYM"]

        # These three contain the species that have been
        # identified in the text. Punctuation and symbols
        # are left out of the tokens.
        spans = []
        tokens = []
        token_to_span = {}

        # It's useful to know if a different name refers to a
        # species we have already seen. For example, in
        # "predatory crab (Carcinus maenas)", "predatory crab"
        # is an alternative name for "Carcinus maenas" and
        # vice versa. This is used so that the species can be
        # properly tracked and redundant points are less
        # likely to be given.
        alternate_names = {}

        # We convert the spans that TaxoNerd has recognized
        # to spans under a different parent document. This is
        # because we're largely using said parent document and
        # there is more functionality in that parent document.
        species_spans = []
        for tn_species_span in self.tn_doc.ents:
            # Character offsets are used because the two documents
            # may not be tokenized in the same way.
            char_i0 = self.tn_doc[tn_species_span.start].idx
            char_i1 = self.tn_doc[tn_species_span.end-1].idx

            sp_token_i0 = self.main.token_at_char(char_i0).i
            sp_token_i1 = self.main.token_at_char(char_i1).i

            sp_species_span = sp_doc[sp_token_i0:sp_token_i1+1]

            # Although they have different parent documents,
            # they should still have the same text.
            assert sp_species_span.text.lower() == tn_species_span.text.lower()
            species_spans.append(sp_species_span)

        # TaxoNerd sometimes recognizes one instance of a species
        # and fails to recognize it elsewhere. To fix this, I'll
        # search the text for all the species that TaxoNerd sees.
        # To make this more robust, I'll include the singular and
        # plural versions of the recognized species. The species
        # being used to search for other instances of species in
        # the text are called search_species.
        search_species = set()

        for species_span in species_spans:
            species_text = helper.remove_outer_non_alnum(species_span.text.lower())

            # An empty search string would match at every position.
            if not species_text:
                continue

            search_species.add(species_text)

            # Add Singular and/or Plural Version
            if species_span[-1].pos_ == "NOUN":
                # The species is plural, so we add its singular versions.
                if species_span[-1].tag_ == "NNS":
                    search_species.update(self._as_list(helper.singularize(species_text)))
                # The species is singular, so we add its plural versions.
                if species_span[-1].tag_ == "NN":
                    search_species.update(self._as_list(helper.pluralize(species_text)))

        search_species.discard("")

        # Now, we have the species to search for in the text. The
        # original text is searched case-insensitively (instead of a
        # lower-cased copy) so that the character offsets are always
        # valid for the document. Sorting keeps the result repeatable.
        text = sp_doc.text

        for species in sorted(search_species):
            for match in re.finditer(re.escape(species), text, flags=re.IGNORECASE):
                char_i0, char_i1 = match.start(), match.end()

                # The full word must match, not just a substring inside of it.
                # So, if the species we're looking for is "ant", only "ant"
                # will match -- not "pants" or "antebellum". Therefore, the
                # characters to the left and right of the matched string must
                # not be letters. Note that char_i1 is exclusive: it already
                # points to the character after the match.
                l_char_is_letter = char_i0 > 0 and text[char_i0-1].isalpha()
                r_char_is_letter = char_i1 < len(text) and text[char_i1].isalpha()

                if l_char_is_letter or r_char_is_letter:
                    continue

                sp_li = self.main.token_at_char(char_i0).i
                sp_ri = self.main.token_at_char(char_i1-1).i

                # This is the matched substring (which would be
                # a species) as a span in the parent document.
                species_span = sp_doc[sp_li:sp_ri+1]

                # Expand Species
                # Let's say there's a word like "squirrel". That's a bit ambiguous.
                # Is it a brown squirrel or a red squirrel? If the species is possibly
                # missing information (like an adjective to the left of it), we should
                # expand in order to get a full picture of the species.
                unclear_1 = len(species_span) == 1 and species_span[0].pos_ == "NOUN"
                unclear_2 = species_span.start > 0 and sp_doc[species_span.start-1].pos_ in ["ADJ"]

                if unclear_1 or unclear_2:
                    # include=True: the tokens listed below are the ones
                    # that the species may expand over.
                    species_span = helper.expand_unit(
                        il_unit=species_span.start,
                        ir_unit=species_span.end-1,
                        il_boundary=0,
                        ir_boundary=len(sp_doc)-1,
                        speech=["ADJ", "PROPN"],
                        literals=["-"],
                        include=True,
                        direction="LEFT",
                        verbose=verbose
                    )

                # Remove Outer Symbols
                # There are times where a species is identified with a parenthesis
                # nearby. Here, we remove that parenthesis (and any other symbols).
                species_span = helper.contract_unit(
                    il_unit=species_span.start,
                    ir_unit=species_span.end-1,
                    speech=symbols,
                    include=False,
                    verbose=verbose
                )

                # Assuming that a species has letters in it,
                # any species identified that only consists of
                # punctuation and symbols will be removed.
                if all(token.pos_ in symbols for token in species_span):
                    continue

                # Adding Species
                spans.append(species_span)

        # Removing Duplicates and Sorting
        # One span is kept per start index. The longest one is
        # preferred as it carries the most information.
        longest_by_start = {}
        for span in spans:
            kept = longest_by_start.get(span.start)
            if kept is None or span.end > kept.end:
                longest_by_start[span.start] = span
        spans = sorted(longest_by_start.values(), key=lambda span: span.start)

        # The tokens are collected from the final spans, so that
        # a token never points to a span that was removed above.
        # If spans overlap, a token belongs to the earliest span.
        for span in spans:
            for token in span:
                if token.pos_ in symbols or token in token_to_span:
                    continue
                tokens.append(token)
                token_to_span[token] = span

        # Finding and Storing Alternative Names
        for species_span, next_species_span in zip(spans, spans[1:]):
            gap = next_species_span.start - species_span.end

            # If there's one token between the species and the next species,
            # we check if the next species is surrounded by punctuation.
            if gap == 1:
                # The next species ends the document, so there is
                # nothing after it that could close a parenthetical.
                if next_species_span.end >= len(sp_doc):
                    continue

                # Token Before and After the Next Species
                before_next = sp_doc[next_species_span.start-1]
                after_next = sp_doc[next_species_span.end]

                if before_next.pos_ in symbols and after_next.pos_ in symbols:
                    self._link_alternate_names(
                        alternate_names,
                        species_span.text.lower(),
                        next_species_span.text.lower()
                    )
            # If there's no token between the species and the next
            # species, we assume that they refer to the same species.
            elif gap == 0:
                self._link_alternate_names(
                    alternate_names,
                    species_span.text.lower(),
                    next_species_span.text.lower()
                )

        return (spans, tokens, token_to_span, alternate_names)

    def span_at_token(self, token):
        """Return the species span that ``token`` belongs to, or None."""
        if token in self.token_to_span:
            return self.token_to_span[token]
        return None

    def is_species(self, token):
        """Tell whether ``token`` is (a part of) a species."""
        return token in self.tokens

    def has_species(self, tokens, verbose=False):
        """Tell whether at least one of ``tokens`` is (a part of) a species."""
        return any(token in self.tokens for token in tokens)

    def _singular_texts(self, text, tag):
        """Return the singular candidates of ``text`` as a list.

        ``tag`` is the fine-grained tag of the last token of the text. If
        it marks a singular noun ("NN", "NNP"), the text is kept as it is.
        """
        if tag in ["NN", "NNP"]:
            return [text]
        return self._as_list(self.main.help.singularize(text))

    @staticmethod
    def _adjectives_and_nouns(span):
        """Return the nouns of ``span`` and its number of adjectives before the first noun."""
        nouns = []
        num_adjectives = 0
        for token in span:
            if not nouns and token.pos_ == "ADJ":
                num_adjectives += 1
            elif token.pos_ in ["PROPN", "NOUN"]:
                nouns.append(token)
        return nouns, num_adjectives

    def find_same_species(self, species_A, species_b, verbose=False):
        """Find the species in ``species_A`` that is the same as ``species_b``.

        Parameters:
            species_A: Iterable of species spans to search in.
            species_b: The species span to look for.
            verbose: Currently unused.

        Returns:
            The first matching span of ``species_A``, or False if none matches.
        """
        sp_b = species_b
        sp_b_text = sp_b.text.lower()

        # METHOD 1: Check for Literal Matches
        sp_b_singular_texts = set(self._singular_texts(sp_b_text, sp_b[-1].tag_))

        for sp_a in species_A:
            # Verbatim Text
            sp_a_text = sp_a.text.lower()

            if sp_a_text == sp_b_text:
                return sp_a

            # Singularized Text
            # Lists of whole names are compared. Building a set out
            # of a plain string would compare single characters.
            sp_a_singular_texts = self._singular_texts(sp_a_text, sp_a[-1].tag_)

            if sp_b_singular_texts.intersection(sp_a_singular_texts):
                return sp_a

        # METHOD 2: Check Alternate Names
        for sp_a in species_A:
            sp_a_text = sp_a.text.lower()

            # Species B is an alternate name for Species A
            if sp_b_text in self.alternate_names.get(sp_a_text, []):
                return sp_a
            # Species A is an alternate name for Species B
            if sp_a_text in self.alternate_names.get(sp_b_text, []):
                return sp_a

        # METHOD 3: Check Nouns
        # This is used if one or none of the species being compared
        # has 1 adjective.
        sp_b_0_text = sp_b[0].lower_
        sp_b_is_noun = sp_b[0].pos_ in ["NOUN", "PROPN"]
        sp_b_nouns, sp_b_num_adjectives = self._adjectives_and_nouns(sp_b)

        for sp_a in species_A:
            sp_a_text = sp_a.text.lower()
            sp_a_0_text = sp_a[0].lower_
            sp_a_is_noun = sp_a[0].pos_ in ["NOUN", "PROPN"]

            if sp_a_0_text == sp_b_0_text and (sp_a_is_noun or sp_b_is_noun):
                if sp_a_text in sp_b_text or sp_b_text in sp_a_text:
                    return sp_a
            else:
                sp_a_nouns, sp_a_num_adjectives = self._adjectives_and_nouns(sp_a)

                if sp_a_nouns and sp_b_nouns and (
                    (sp_a_num_adjectives == 1 and sp_b_num_adjectives == 0) or
                    (sp_b_num_adjectives == 1 and sp_a_num_adjectives == 0)
                ):
                    # Only the nouns are compared (the adjective is ignored).
                    sp_a_noun_text = " ".join(token.lower_ for token in sp_a_nouns)
                    sp_b_noun_text = " ".join(token.lower_ for token in sp_b_nouns)

                    sp_a_singular_texts = self._singular_texts(sp_a_noun_text, sp_a_nouns[-1].tag_)
                    sp_b_noun_singular_texts = self._singular_texts(sp_b_noun_text, sp_b_nouns[-1].tag_)

                    if set(sp_a_singular_texts).intersection(sp_b_noun_singular_texts):
                        return sp_a

        return False

In [None]:
class Keywords:
    """Collects the tokens of the current document that match a keyword list.

    A token whose part of speech is listed in ``speech`` matches when:

    1. its lowercase text or lowercase lemma equals a base word; or
    2. ``include_substring`` is set and its lowercase text or lemma contains
       a base word (e.g. "biomass" contains "mass"); or
    3. ``main.sp_nlp`` rates it at least ``threshold`` similar to a base word.

    Independently of the part of speech, the token found at the first
    character of every occurrence of a literal in the lowercased document
    text is also matched.

    Args:
        main: Object providing ``sp_nlp``, ``sp_doc``, ``index_map`` and
            ``token_at_char``.
        base: Base words; stored lowercased.
        speech: Part-of-speech labels a token must have; stored uppercased.
        literals: Phrases searched for verbatim in the lowercased document
            text; stored lowercased.
        threshold: Minimum similarity for a token to count as equivalent to
            a base word.
        include_substring: Whether a token that merely contains a base word
            also matches.
    """

    def __init__(self, main, *, base=(), speech=(), literals=(), threshold=0.7, include_substring=False):
        self.main = main
        # For a token to count towards a base word, it must be the same word.
        self.base = [b.lower() for b in base]
        self.speech = [s.upper() for s in speech]
        self.literals = [l.lower() for l in literals]
        # When comparing two words, SpaCy returns a value
        # from 0 to 1, representing how similar the two
        # embeddings are. The threshold below determines
        # the minimum similarity before two words
        # are considered as being equivalent.
        self.threshold = threshold
        # One parsed object per base word, reused for every similarity check.
        self.vocab = [self.main.sp_nlp(word) for word in self.base]
        # If this is True, then we will also check
        # if the token contains a base word.
        self.include_substring = include_substring
        # This list contains the matched tokens.
        self.tokens = []

    def update(self, verbose=False):
        """Recomputes ``self.tokens`` for the document currently held by ``main``.

        Raises:
            Exception: ("DNE") if ``main`` has no document or no index map.
        """
        # SpaCy Doc DNE or Indexing Map DNE
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        self.tokens = self.match_tokens(verbose=verbose)

    def match_tokens(self, verbose=False):
        """Returns the matched tokens of the current document.

        Word matches come first, in document order, followed by literal
        matches in the order the literals were given. A token may appear
        more than once if several literals (or a word rule and a literal)
        hit it.

        Raises:
            Exception: ("DNE") if ``main`` has no document or no index map.
        """
        # SpaCy Doc DNE or Indexing Map DNE
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        matched_tokens = []

        # Check Words
        for token in self.main.sp_doc:
            if token.pos_ not in self.speech:
                continue

            token_lower = token.lower_
            token_lemma_lower = token.lemma_.lower()

            # Look for Base Word
            if token_lemma_lower in self.base or token_lower in self.base:
                matched_tokens.append(token)
                continue

            # Look for Base Word in Token
            # For example, a word like "biomass" would match
            # if "mass" is a base word.
            if self.include_substring and any(
                base_word in token_lemma_lower or base_word in token_lower
                for base_word in self.base
            ):
                matched_tokens.append(token)
                continue

            # Comparing Similarity
            token_doc = self.main.sp_nlp(token_lower)
            if any(word.similarity(token_doc) >= self.threshold for word in self.vocab):
                matched_tokens.append(token)

        # Check Literals
        text = self.main.sp_doc.text.lower()
        for literal in self.literals:
            # An empty literal would "occur" at every character position.
            if not literal:
                continue
            # Literals are plain phrases, so regex metacharacters are escaped.
            for match in re.finditer(re.escape(literal), text):
                matched_tokens.append(self.main.token_at_char(match.start()))

        return matched_tokens

In [None]:
class ExperimentKeywords(Keywords):
    """Keyword matcher for words suggesting that a study or experiment was carried out.

    Only verbs and nouns are considered for the base words; the literals are
    searched for verbatim in the document text.
    """

    def __init__(self, main):
        super().__init__(
            main, 
            base=[
                "study", 
                "hypothesis", 
                "experiment", 
                "found", 
                "discover", 
                "compare", 
                "finding", 
                "result", 
                "test", 
                "examine", 
                "model",
                "measure",
                "manipulate",
                "assess",
                "conduct",
                "data",
                "analyze",
                "sample",
                # The comma after "observe" matters: without it Python joins
                # the adjacent string literals into "observepredict".
                "observe",
                "predict",
            ],
            literals=[
                "control group", 
                "independent", 
                "dependent"
            ],
            speech=["VERB", "NOUN"], 
            threshold=0.8
        )

In [None]:
class CauseKeywords(Keywords):
    """Keyword matcher for words indicating that one thing causes or affects another.

    Only verbs and nouns are considered.
    """

    def __init__(self, main):
        super().__init__(
            main, 
            base=[
                "increase", 
                "decrease", 
                "change", 
                "shift", 
                "cause", 
                "produce", 
                "trigger", 
                "suppress", 
                "inhibit",
                "encourage",
                "allow",
                "influence",
                "affect",
                "alter",
                "induce",
                # NOTE(review): the three entries below contain a space, so
                # presumably no single token's text or lemma can equal them;
                # they would only take part in the similarity comparison.
                # Confirm whether they were meant to be `literals` instead.
                "result in",
                "associated with",
                "correlated with",
                "contribute",
                "impact"
            ],
            speech=["VERB", "NOUN"], 
            threshold=0.8
        )

In [None]:
class ChangeKeywords(Keywords):
    """Keyword matcher for words indicating that a quantity changed.

    Nouns, adjectives and adverbs are considered for the base words. In
    addition, every comparative adjective in the document (tag ``JJR``,
    e.g. "bigger") is treated as a change word.
    """

    def __init__(self, main):
        super().__init__(
            main, 
            base=[
                "few", 
                "more", 
                "increase", 
                "decrease", 
                "less", 
                "short", 
                "long", 
                # The commas after "greater" and "rise" matter: without them
                # Python joins the adjacent string literals into
                # "greatershift" and "risesurge".
                "greater",
                "shift",
                "fluctuate",
                "adapt",
                "grow",
                "rise",
                "surge",
                "intensify",
                "amplify",
                "multiply",
                "decline",
                "reduce",
                "drop",
                "diminish",
                "fall",
                "lessen"
            ],
            speech=["NOUN", "ADJ", "ADV"], 
            threshold=0.8
        )

    def update(self, verbose=False):
        """Runs the base matching, then adds comparative adjectives."""
        super().update(verbose=verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Returns the change tokens of the current document, in document order.

        A token is kept if it is already in ``tokens`` or if it is a
        comparative adjective (``pos_ == "ADJ"`` and ``tag_ == "JJR"``).
        Each document token appears at most once in the result.

        Raises:
            Exception: ("DNE") if ``main`` has no document or no index map.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        return [
            token
            for token in self.main.sp_doc
            # Already matched, or a comparative adjective such as
            # "bigger" or "better".
            if token in tokens or (token.pos_ == "ADJ" and token.tag_ == "JJR")
        ]

In [None]:
class TraitKeywords(Keywords):
    """Keyword matcher for trait words (e.g. "rate" in "snail feeding rates").

    Nouns and adjectives are considered, and a token that merely contains a
    base word also counts. A candidate is only kept when the phrase obtained
    by expanding around it is reported to contain a species.
    """

    def __init__(self, main):
        trait_words = [
            "behavior", "rate", "color", "mass", "size", "length",
            "pattern", "weight", "shape", "efficiency", "trait",
            "ability", "capacity", "height", "width", "span",
            "diet", "feeding", "nest", "substrate", "breeding",
            "age", "lifespan", "development", "time", "mating",
            "fur", "feathers", "scales", "skin", "limb",
            "configuration", "dimorphism", "capability", "appendages",
            "blood", "regulation", "excretion", "luminescence", "role",
            "reproduction", "courtship", "pollination", "mechanism",
            "sensitivity", "resistance",
        ]
        super().__init__(
            main,
            base=trait_words,
            speech=["NOUN", "ADJ"],
            threshold=0.8,
            include_substring=True,
        )

    def update(self, verbose=False):
        """Runs the base matching, then drops candidates not tied to a species."""
        super().update(verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def _expand(self, token, verbose):
        # Grow the unit around a single token, allowing it to reach anywhere
        # between the first and the last token of the document.
        return self.main.help.expand_unit(
            il_unit=token.i,
            ir_unit=token.i,
            il_boundary=0,
            ir_boundary=len(self.main.sp_doc) - 1,
            speech=["ADJ", "NOUN", "ADP", "PART", "DET", "PROPN",],
            literals=["-", ","],
            verbose=verbose
        )

    def filter_tokens(self, tokens, verbose=False):
        """Returns the tokens of ``tokens`` whose expanded unit has a species.

        Order and duplicates of ``tokens`` are preserved.

        Raises:
            Exception: ("DNE") if ``main`` has no document or no index map.
        """
        doc_missing = not self.main.sp_doc
        if doc_missing or not self.main.index_map:
            raise Exception("DNE")

        return [
            candidate
            for candidate in tokens
            if self.main.species.has_species(self._expand(candidate, verbose))
        ]

In [None]:
class Main:
    """Owns the NLP tools and the per-document parsers, and scores a document.

    Typical use: call ``update_text`` (or ``update_doc``) to load a document
    and refresh every parser, then call ``score``.
    """

    def __init__(self):
        # Tools
        self.sp_nlp = spacy.load("en_core_web_lg")
        self.fcoref = FCoref(enable_progress_bar=False, device='cpu')
        self.sp_doc = None

        # Maps Character Position to Token in Document
        # Used to handle differences between different
        # pipelines and tools.
        self.index_map = None
    
        # NOTE(review): other code in this notebook reads `main.help`
        # (e.g. TraitKeywords.filter_tokens calls main.help.expand_unit),
        # but no `help` attribute is assigned here -- confirm where it is set.

        # Parsers
        self.species = Species(self)
        self.traits = TraitKeywords(self)
        self.causes = CauseKeywords(self)
        self.changes = ChangeKeywords(self)
        self.experiment = ExperimentKeywords(self)

    def update_doc(self, doc, verbose=False):
        """Makes ``doc`` the current document and refreshes every parser.

        Args:
            doc: Parsed document; its tokens must expose ``idx`` and ``len``.
            verbose: Forwarded to every parser's ``update``.
        """
        self.sp_doc = doc
        self.index_map = self.load_index_map()
        self.species.update(doc.text, verbose=verbose)
        self.traits.update(verbose=verbose)
        self.causes.update(verbose=verbose)
        self.changes.update(verbose=verbose)
        self.experiment.update(verbose=verbose)

    def update_text(self, text, verbose=False):
        """Parses ``text`` with ``sp_nlp`` and loads the result via ``update_doc``."""
        self.update_doc(self.sp_nlp(text), verbose=verbose)
        
    def token_at_char(self, char_index):
        """Returns the token covering the character at ``char_index``.

        Raises:
            Exception: ("DNE") if there is no document or index map;
                ("Token Not Found") if no token covers that character,
                e.g. it is whitespace between tokens.
        """
        # SpaCy Doc or Indexing Map Not Found
        if not self.sp_doc or not self.index_map:
            raise Exception("DNE")

        if char_index in self.index_map:
            return self.index_map[char_index]

        raise Exception("Token Not Found")
        
    def load_index_map(self):
        """Builds a dict mapping every character index inside a token to that token.

        Raises:
            Exception: ("DNE") if there is no current document.
        """
        # SpaCy Doc Not Found
        if self.sp_doc is None:
            raise Exception("DNE")

        # Map Character Index to Token
        index_map = {}
        for token in self.sp_doc:
            # char_i0 is the index of the token's starting character.
            # char_i1 is the index of the character after the token's ending character.
            char_i0 = token.idx
            char_i1 = token.idx + len(token)
        
            for i in range(char_i0, char_i1):
                index_map[i] = token
            
        return index_map

    def score(self, verbose=False):
        """Scores the current document on a scale from 0 to 1.

        Every sentence earns points in four categories (trait, species,
        experiment, interaction), each capped per sentence. Per category, the
        earned points are divided by the maximum attainable over all
        sentences, and the four fractions are combined with the category
        weights.

        Returns:
            0 if fewer than three distinct species were seen in the document;
            otherwise a float in [0, 1].
        """
        NUM_CATEGORIES = 4

        # Requires the mention of a trait and a cause or change word.
        # The cause or change word indicates some variation.
        # Index 0 in Array
        TRAIT = 0

        # Requires the mention of a species and a cause or change word.
        # The cause or change word indicates that the species is being
        # affected or is affecting something else.
        # Index 1 in Array
        SPECIES = 1

        # Requires a word that has been defined as "experiment"-related.
        # Index 2 in Array
        EXPERIMENT = 2

        # Requires the mention of several species (more or less).
        # Index 3 in Array
        INTERACTION = 3

        # Max # of Points of Category per Sentence (MPC)
        # A sentence collects points from its categories.
        # For example, a sentence could get a maximum of 2 points from one category
        # and a maximum of 1 point from another. The MPC determines the maximum number
        # of points a category could contribute to a sentence. To have a range of [0, 1]
        # the maximum number of points, across categories, when added should be 1.
        MPC = [0] * NUM_CATEGORIES
        MPC[TRAIT] = 0.1
        MPC[SPECIES] = 0.3
        MPC[EXPERIMENT] = 0.3
        MPC[INTERACTION] = 0.3

        # Compared with a tolerance: a sum of floats is not guaranteed to
        # be exactly 1 even when the decimal values add up to 1.
        assert np.isclose(np.sum(MPC), 1.0)
        
        # Points per Instance of Category (PIC)
        # Each token is evaluated to check whether a category
        # can be given points. The number of points given, if
        # the token is determined to be satisfactory, is the PIC.
        # The PIC is less than or equal to the MPC for the corresponding
        # category. The idea behind the PIC and MPC is similar to how
        # sets work in tennis: you're not immediately awarded the full points
        # for the set (MPC) if your opponent fails to return the ball,
        # instead you're given a smaller # of points (PIC) that allow you to
        # incrementally win the set (category).
        PIC = [0] * NUM_CATEGORIES
        PIC[TRAIT] = MPC[TRAIT]*1.0
        PIC[SPECIES] = MPC[SPECIES]/3.0
        PIC[EXPERIMENT] = MPC[EXPERIMENT]/1.0
        PIC[INTERACTION] = MPC[INTERACTION]/3.0

        for i in range(NUM_CATEGORIES):
            assert PIC[i] <= MPC[i]

        # Category Weights (CW)
        # It may be helpful to weigh a certain category's fraction of total points
        # more or less than another's. Thus, at the end, we'll take a
        # weighted average of the category's FTP. The weights must add up to 1.
        CW = [0] * NUM_CATEGORIES
        CW[TRAIT] = 0.25
        CW[SPECIES] = 0.25
        CW[EXPERIMENT] = 0.25
        CW[INTERACTION] = 0.25

        assert np.isclose(np.sum(CW), 1.0)

        # Points
        points = [0] * NUM_CATEGORIES

        # Extracted Information
        # Sets are used where only membership is tested.
        cause_tokens = self.causes.tokens
        change_tokens = self.changes.tokens
        trait_tokens = set(self.traits.tokens)
        species_tokens = {self.sp_doc[span.start] for span in self.species.spans}
        experiment_tokens = set(self.experiment.tokens)
         
        # This is used to ensure that at least three species
        # are mentioned. It maps the first span seen for a species
        # to the number of times that species was visited.
        seen_species = {}

        for sent in self.sp_doc.sents:
            # This contains the number of points
            # each category has accumulated in the sentence.
            curr_points = [0] * NUM_CATEGORIES

            # Contains the tokens in the sentence.
            sent_tokens = [token for token in sent]

            # This is used for the species (must have a nearby cause and/or
            # change word).
            sent_cause_tokens = set(sent_tokens).intersection(cause_tokens)
            sent_change_tokens = set(sent_tokens).intersection(change_tokens)

            # We don't want to visit the same species more than once
            # in the same sentence as to avoid redundant points.
            sent_seen_species = []
            
            for token in sent_tokens:
                # If each category has reached its maximum number of points,
                # we can end the loop early.
                if all(curr_points[i] >= MPC[i] for i in range(NUM_CATEGORIES)):
                    break

                # TRAIT CATEGORY
                if curr_points[TRAIT] < MPC[TRAIT] and token in trait_tokens:
                    # To get points in the trait category, there must 
                    # be (1) a trait; and (2) a change or cause in the token's
                    # context.
                    # NOTE(review): find_unit_context is not defined on this
                    # class in the visible code -- confirm where it lives.
                    token_context = set(self.find_unit_context(
                        il_unit=token.i, 
                        ir_unit=token.i, 
                        il_boundary=token.sent.start, 
                        ir_boundary=token.sent.end-1, 
                        verbose=verbose)
                    )
                    cause_tokens_in_context = set(sent_cause_tokens).intersection(token_context)
                    change_tokens_in_context = set(sent_change_tokens).intersection(token_context)

                    if cause_tokens_in_context or change_tokens_in_context:
                        curr_points[TRAIT] += PIC[TRAIT]

                # EXPERIMENT CATEGORY
                if curr_points[EXPERIMENT] < MPC[EXPERIMENT] and token in experiment_tokens:
                    curr_points[EXPERIMENT] += PIC[EXPERIMENT]

                # SPECIES CATEGORY
                if token in species_tokens:
                    # Find Species Span
                    species_span = self.species.span_at_token(token)           

                    # Updating Seen Species (in Entire Text)
                    past_visits = 0
                    for seen_species_span in seen_species.keys():
                        if self.species.same_species(species_span, seen_species_span, verbose=verbose):
                            past_visits = seen_species[seen_species_span]
                            seen_species[seen_species_span] += 1
                            break

                    if not past_visits:
                        seen_species[species_span] = 1

                    # Prevents Unneeded Function Call
                    is_new_species = past_visits == 0
          
                    # Checking Seen Species (in Sentence)
                    # We only add points if it's a species that has not been seen
                    # in the sentence. This is to avoid redundant points. 
                    # Also, if a species has not been seen at all (is_new_species),
                    # then it cannot be a redundant species (we couldn't have seen it in the sentence
                    # either).
                    redundant_species = False

                    if not is_new_species:
                        for seen_species_span in sent_seen_species:
                            if self.species.same_species(species_span, seen_species_span, verbose=verbose):
                                redundant_species = True
                                break
                    
                    sent_seen_species.append(species_span)
                    if redundant_species:
                        continue

                    # INTERACTION CATEGORY
                    # It is helpful to have this category here because (if we've reached here)
                    # we're dealing with a new species in the sentence.
                    if curr_points[INTERACTION] < MPC[INTERACTION]:
                        curr_points[INTERACTION] += PIC[INTERACTION]
                        
                    if curr_points[SPECIES] < MPC[SPECIES]:
                        # To get points in the species category, there must be 
                        # (1) a species; and (2) a change or cause in the phrase
                        # (or clause) that the token is a part of.
                        token_context = set(self.find_unit_context(
                            il_unit=token.i, 
                            ir_unit=token.i, 
                            il_boundary=token.sent.start, 
                            ir_boundary=token.sent.end-1, 
                            verbose=verbose)
                        )
                        cause_tokens_in_context = set(sent_cause_tokens).intersection(token_context)
                        change_tokens_in_context = set(sent_change_tokens).intersection(token_context)
                        
                        if cause_tokens_in_context or change_tokens_in_context:
                            curr_points[SPECIES] += PIC[SPECIES]
         
            # SENTENCE DONE
            # Add Sentence Points to Total Points
            # The cap is applied here because repeated float additions of
            # the PIC may land slightly above the MPC.
            for category in [TRAIT, SPECIES, EXPERIMENT, INTERACTION]:
                points[category] += min(curr_points[category], MPC[category])

        # Enforcing 3 or More Species            
        if len(seen_species) < 3:
            return 0
        
        # Calculating Score            
        # At least one sentence exists here, since species were found.
        NUM_SENTENCES = len(list(self.sp_doc.sents))

        score = 0
        for i in range(NUM_CATEGORIES):
            # Fraction of Total Points (FTP) earned in the category.
            FTP = points[i] / (MPC[i] * NUM_SENTENCES)
            score += FTP * CW[i]
            
        # Floating-point accumulation may overshoot the bounds by a tiny
        # amount; anything beyond that indicates a logic error.
        TOLERANCE = 1e-9
        assert -TOLERANCE <= score <= 1.0 + TOLERANCE

        return min(1.0, max(0.0, score))

In [None]:
# Load the dataset from a path relative to this notebook; the scoring cell
# reads the abstracts from its `Abstract` column.
df = pd.read_csv("../../Datasets/Baseline-1.csv")

In [None]:
# Build the pipeline and score only the abstract at position 1 of the
# `Abstract` column, printing the abstract followed by its score.
main = Main()
for abstract in [df.Abstract.to_list()[1]]:
    print(abstract)
    main.update_text(abstract, verbose=True)
    score = main.score(verbose=True)
    print(score)
    print()

In [17]:
# Scratch check: parse a sentence that contains a comparative adjective.
nlp = spacy.load("en_core_web_lg")
doc = nlp("You're brighter than this.")

In [18]:
# Print each token with its coarse part of speech and fine-grained tag;
# the recorded output shows "brighter" as ADJ / JJR.
for token in doc:
    print(token, token.pos_, token.tag_)

You PRON PRP
're AUX VBP
brighter ADJ JJR
than ADP IN
this PRON DT
. PUNCT .
