In [None]:
# Trait-Mediated Interaction Modification
# Empirical:
# You could look for words like "hypothesis", "experiment", "found", and "discovered". That may point
# towards there being an experiment in the paper. There are also words like "control group", "compared",
# "findings", "results", "study", and more.
# Qualitative vs. Quantitative:
# To infer whether something is quantitative, you could look for numeric tokens and units.
# However, you can only do so much with the abstract. Therefore, this is likely not good enough.
# Yet, you could still take advantage of words like "fewer" and "increased" to show that there is a change.
# However, this would be more suited for the above category.
# Traits:
# There is no NLP tool for traits that I can use or create so I think that I could instead use keywords.
# For example, "snail feeding rates" is a trait. You may be able to spot this by looking for a word like
# "rate". You'd expand that word to include "snail feeding rates". As "snail" is a species you can infer
# that "rates" is a trait. A stricter approach would use a dependency parser to ensure that the trait
# is a property of the species (like before). However, given how many cases may exist, I think it is enough
# to check whether a species can be reached by scanning backward and/or forward from the keyword without
# crossing certain tokens.
# 3 Species or More:
# This is simple. However, I think using a dictionary and TaxoNerd would be beneficial (for higher accuracy).
# To handle the potential differences in tokenization, character offsets should be used.
# Standardization:
# There is a lot of variance in the scores. To squash this issue, I think that we could assign each sentence
# a value from 0 to 1. We would add these values and divide by the number of sentences. This would result in
# a number that is also from 0 to 1. However, there are categories that we would like to inspect. So, we must
# create an overall score in the interval [0, 1] while also scoring each category. For each sentence,
# we could add a point for each category that is observed. The sentence would receive said score divided by the
# number of categories. At the end, we add up all the sentence scores and divide by the number of sentences.
# The aggregate score for each category would also be divided by the number of sentences.

In [None]:
import re
import csv
import sys
import time
import spacy
import numpy as np
import pandas as pd
import random
import pickle
from fastcoref import FCoref, LingMessCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher
from spacy.language import Language
from IPython.display import clear_output
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
%run -i "../utils.py"

In [None]:
class Help:
    def __init__(self, main):
        self.main = main
        # Zero Plurals
        # The singular and plural versions of the words below are the same.
        self.zero_plurals = [
            "species", 
            "deer", 
            "fish", 
            "moose", 
            "sheep", 
            "swine", 
            "buffalo", 
            "trout", 
            "cattle"
        ]
        # Irregular Nouns
        # There's not a defined conversion method.
        self.irregular_nouns = {
            "ox": "oxen",
            "goose": "geese",
            "mouse": "mice",
            "bacterium": "bacteria"
        }
        self.irregular_nouns_rev = {v: k for k, v in self.irregular_nouns.items()}
        self.irregular_singular_nouns = self.irregular_nouns.keys()
        self.irregular_plural_nouns = self.irregular_nouns.values()

    def remove_extra_spaces(self, string):
        """Tidy the whitespace of ``string``.

        Every run of whitespace becomes one space, whitespace directly in
        front of ``?``, ``.``, ``!`` or ``,`` is dropped, and the result is
        stripped at both ends.

        Args:
            string: Text to clean.

        Returns:
            The cleaned text.
        """
        single_spaced = re.sub(r"\s+", " ", string)
        punctuation_attached = re.sub(r"\s+([?.!,])", r"\1", single_spaced)
        return punctuation_attached.strip()

    def remove_outer_non_alnum(self, string):
        """Trim non-alphanumeric characters from both ends of ``string``.

        Characters in the middle of the text are left alone.

        Args:
            string: Text to trim.

        Returns:
            The text between the first and the last alphanumeric character
            (both included); an empty string when there is none. A falsy
            input is returned as it is.
        """
        if not string:
            return string

        first = 0
        last = len(string)

        # Walk inwards from the left until an alphanumeric character is met.
        while first < last and not string[first].isalnum():
            first += 1

        # Walk inwards from the right in the same way.
        while last > first and not string[last - 1].isalnum():
            last -= 1

        return string[first:last]

    def group_text(self, text, flatten=False):
        """Split ``text`` into bracketed groups and the text outside them.

        A group is the content between a matching pair of ``()``, ``[]`` or
        ``{}``. The bracket characters themselves belong to no group. Groups
        may be nested; the characters of an inner group are not part of the
        outer one. A closing character that does not match the innermost open
        group is treated as an ordinary character.

        Args:
            text: String to analyse.
            flatten: When true, return one flat list of ranges instead of one
                list of ranges per group.

        Returns:
            Character ranges written as ``[start, end]`` lists with an
            exclusive end. Groups appear in the order in which they were
            closed (groups left open are closed innermost first at the end of
            the text), and the text outside every group comes last. Groups
            without any character are omitted.
        """
        closer_for = {
            "(": ")",
            "[": "]",
            "{": "}"
        }

        # Character indices of groups that are already closed.
        finished = []
        # Groups still open, innermost last, as (opening character, indices).
        pending = []
        # Character indices that are outside every group.
        outside = []

        for position, symbol in enumerate(text):
            if symbol in closer_for:
                # A new (possibly nested) group starts here.
                pending.append((symbol, []))
            elif pending and symbol == closer_for[pending[-1][0]]:
                # The innermost open group ends here.
                finished.append(pending.pop()[1])
            elif pending:
                pending[-1][1].append(position)
            else:
                outside.append(position)

        # Groups that were never closed are closed at the end of the text.
        while pending:
            finished.append(pending.pop()[1])

        # Turn each list of indices into runs of consecutive positions.
        ranges_per_group = []
        for indices in (*finished, outside):
            if not indices:
                continue

            runs = []
            for position in indices:
                if runs and runs[-1][1] == position:
                    runs[-1][1] = position + 1
                else:
                    runs.append([position, position + 1])
            ranges_per_group.append(runs)

        if flatten:
            return [run for runs in ranges_per_group for run in runs]

        return ranges_per_group

    def singularize(self, string):
        string = string.lower()
        
        # The string to singularize should not have any
        # non-alphanumeric characters at the end, or else
        # the algorithm will not work.
        words = re.split(r" ", string)

        if not words:
            return [string]

        # If the last word in the string is a zero plural
        # or a singular irregular noun, there's no changes
        # to make. For example, "red sheep" and "ox" are 
        # already singular.
        if (
            words[-1] in self.zero_plurals or 
            words[-1] in self.irregular_singular_nouns
        ):
            return [string]

        # If the last word in the string is an irregular
        # plural noun, we rely on a dictionary with the
        # corresponding mapping.
        if words[-1] in self.irregular_plural_nouns:
            words[-1] = self.irregular_nouns_rev[words[-1]]
            return [self.remove_extra_spaces(" ".join(words))]
        
        singulars = []

        # We take the singular form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to error), we need to
        # handle them all.
        singular_forms = self.singular_form(words[-1])

        if not singular_forms:
            return [string]
        
        for singular_form in singular_forms:
            singular = self.remove_extra_spaces(" ".join([*words[:-1], singular_form]))
            singulars.append(singular)
            
        return singulars

    def singular_form(self, string):
        """Return candidate singular spellings for one plural word.

        The rules are spelling heuristics, so more than one candidate may be
        returned and some of them may not be real words:

        * "-ies"  -> "-y"            ("canaries" -> "canary")
        * "-ves"  -> "-f" and "-fe"  ("wolves" -> "wolf", "wolfe")
        * "-es"   -> ending removed  ("foxes" -> "fox")
        * "-i"    -> "-us"           ("cacti" -> "cactus")
        * "-s"    -> ending removed  ("crabs" -> "crab")

        For the first three endings the word with only its final "s" removed
        is added as well, because many plurals are just a word ending in "e"
        plus "s" ("snakes" -> "snake", "mangroves" -> "mangrove",
        "magpies" -> "magpie"). Without it those singulars were never
        produced.

        Args:
            string: A single lower-case word.

        Returns:
            A list of candidate singular forms; empty when no rule applies.
        """
        versions = []

        if re.fullmatch(r".*ies$", string):
            # Change -ies to -y
            versions.append(f'{string[:-3]}y')
        elif re.fullmatch(r".*ves$", string):
            # Change -ves to -f and -fe
            versions.append(f'{string[:-3]}f')
            versions.append(f'{string[:-3]}fe')
        elif re.fullmatch(r".*es$", string):
            # Remove -es
            versions.append(f'{string[:-2]}')
        elif re.fullmatch(r".*i$", string):
            # Change -i to -us
            versions.append(f'{string[:-1]}us')
            return versions
        elif not re.fullmatch(r".*s$", string):
            # No plural ending recognised.
            return versions

        # Every remaining case ends in "s": the word may simply be the
        # singular plus "s", so that form is always offered too.
        without_s = string[:-1]
        if without_s not in versions:
            versions.append(without_s)

        return versions

    def pluralize(self, string):
        string = string.lower()
        
        # The string to pluralize should not have any
        # non-alphanumeric characters at the end, or else
        # the algorithm will not work.
        words = re.split(r" ", string)

        if not words:
            return [string]

        # If the last word in the string is a zero plural
        # or a plural irregular noun, there's no changes
        # to make. For example, "red sheep" and "oxen" are 
        # already singular.
        if (
            words[-1] in self.zero_plurals or 
            words[-1] in self.irregular_plural_nouns
        ):
            return [string]

        # If the last word in the string is an irregular
        # singular noun, we rely on a dictionary with the
        # corresponding mapping.
        if words[-1] in self.irregular_singular_nouns:
            words[-1] = self.irregular_nouns[words[-1]]
            return [self.remove_extra_spaces(" ".join(words))]
        
        plurals = []
        
        # We take the singular form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to error), we need to
        # handle them all.
        plural_forms = self.plural_form(words[-1])

        if not plural_forms:
            return [string]
            
        for plural_form in plural_forms:
            plural = self.remove_extra_spaces(" ".join([*words[:-1], plural_form]))
            plurals.append(plural)
            
        return plurals
        
    def plural_form(self, string):
        versions = []

        # Words that end with -us often have
        # two different plural versions: -es and -i.
        # For example, the plural version of cactus 
        # can be cactuses or cacti.
        if re.fullmatch(r".*us$", string):
            versions.append(f'{string}es')
            versions.append(f'{string[:-2]}i')
            return versions

        # The -es ending is added to the words below.
        if re.fullmatch(r".*([^l]s|sh|ch|x|z)$", string):
            versions.append(f'{string}es')
            return versions

        # Words that end with a consonant followed by 'y'
        # are made plural by replacing the 'y' with -ies.
        # For example, the plural version of canary is
        # canaries.
        if re.fullmatch(r".*([^aeiou])(y)$", string):
            versions.append(f'{string[:-1]}ies')
            return versions
            
        # The plural version of words ending with -f
        # and -fe aren't clear. To be safe, I will add
        # both versions.
        if (re.fullmatch(r".*(f)(e?)$", string) and not re.fullmatch(r".*ff$", string)):
            last_clean = re.sub(r"(f)(e?)$", "", string)
            versions.append(f'{last_clean}fs')
            versions.append(f'{last_clean}ves')
            return versions

        # People add -s or -es to words that end with 'o'.
        # To be safe, both versions are added.
        if re.fullmatch(r".*([^aeiou])o$", string):
            versions.append(f'{string}s')
            versions.append(f'{string}es')
            return versions

        # If there's no -s at the end of the string and
        # the other cases didn't run, we add an -s.
        if re.fullmatch(r".*[^s]$", string):
            versions.append(f'{string}s')
        
        return versions

    def expand_unit(self, *, il_unit, ir_unit, il_boundary, ir_boundary, speech=[], literals=[], include=True, direction='BOTH', verbose=False):
        assert il_unit <= ir_unit
        if direction in ['BOTH', 'LEFT']:
            assert il_boundary <= il_unit
        if direction in ['BOTH', 'RIGHT']:
            assert ir_boundary >= ir_unit
        
        # Move Left
        if direction in ['BOTH', 'LEFT']:
            # The indices are inclusive, therefore, when 
            # the condition fails, il_unit will be equal
            # to il_boundary.
            while il_unit > il_boundary:
                # We assume that the current token is allowed,
                # and look to the token to the left.
                l_token = self.main.sp_doc[il_unit-1]

                # If the token is invalid, we stop expanding.
                in_set = l_token.pos_ in speech or l_token.lower_ in literals

                # Case 1: include=False, in_set=True
                # If we're not meant to include the defined tokens, and the
                # current token is in that set, we stop expanding.
                # Case 2: include=True, in_set=False
                # If we're meant to include the defined tokens, and the current
                # token is not in that set, we stop expanding.
                # Case 3: include=in_set
                # If we're meant to include the defined tokens, and the current
                # token is in that set, we continue expanding. If we're not meant
                # to include the defined tokens, and the current token is not
                # in that set, we continue expanding.
                if include ^ in_set:
                    break
                
                # Else, the left token is valid, and
                # we continue to expand.
                il_unit -= 1

        # Move Right
        if direction in ['BOTH', 'RIGHT']:
            # Likewise, when the condition fails,
            # ir_unit will be equal to the ir_boundary.
            # The ir_boundary is also inclusive.
            while ir_unit < ir_boundary:
                # Assuming that the current token is valid,
                # we look to the right to see if we can
                # expand.
                r_token = self.main.sp_doc[ir_unit+1]

                # If the token is invalid, we stop expanding.
                in_set = r_token.pos_ in speech or r_token.lower_ in literals
                if include ^ in_set:
                    break

                # Else, the token is valid and
                # we continue.
                ir_unit += 1

        assert il_unit >= il_boundary and ir_unit <= ir_boundary
        expanded_unit = self.main.sp_doc[il_unit:ir_unit+1]
        return expanded_unit

    def contract_unit(self, *, il_unit, ir_unit, speech=[], literals=[], include=True, direction='BOTH', verbose=False):
        assert il_unit <= ir_unit
        
        # Move Right
        if direction in ['BOTH', 'LEFT']:
            while il_unit < ir_unit:
                # We must check if the current token
                # is not allowed. If it's not allowed,
                # we contract (remove).
                token = self.main.sp_doc[il_unit]

                # The token is invalid, thus we stop
                # contracting.
                # include = True means that we want the tokens that match
                # the speech and/or literals in the contracted unit.
                # include = False means that we don't want the tokens that
                # match the speech and/or literals in the contracted unit.
                # Case 1: include = True, in_set = True
                # We have a token that's meant to be included in the set.
                # However, we're contracting, which means we would end up
                # removing the token if we continue. Therefore, we break.
                # Case 2: include = False, in_set = False
                # We have a token that's not in the set which defines the
                # tokens that aren't meant to be included. Therefore, we 
                # have a token that is meant to be included. If we continue,
                # we would end up removing this token. Therefore, we break.
                # Default:
                # If we have a token that's in the set (in_set=True) of
                # tokens we're not supposed to include in the contracted 
                # unit (include=False), we need to remove it. Likewise, if
                # we have a token that's not in the set (in_set=False) of
                # tokens to include in the contracted unit (include=True),
                # we need to remove it.
                in_set = token.pos_ in speech or token.lower_ in literals
                if include == in_set:
                    break

                # The token is valid, thus we continue.
                il_unit += 1

        # Move Left      
        if direction in ['BOTH', 'RIGHT']:
            while ir_unit > il_unit:
                token = self.main.sp_doc[ir_unit]

                # The token is invalid and we
                # stop contracting.
                in_set = token.pos_ in speech or token.lower_ in literals
                if include == in_set:
                    break

                # The token is valid and we continue.
                ir_unit -= 1

        assert il_unit <= ir_unit
        contracted_unit = self.main.sp_doc[il_unit:ir_unit+1]
        return contracted_unit

    def find_unit_context(self, *, il_unit, ir_unit, il_boundary, ir_boundary, verbose=False):
        assert il_unit <= ir_unit
        assert il_boundary <= il_unit
        assert ir_boundary >= ir_unit
        
        # Caveat: Parentheticals
        # The context of a unit inside of parentheses should not
        # go farther than the boundaries of those parentheses.
        # However, we need to manually determine whether the unit
        # is in parentheses (or any set of the matching symbols
        # below).
        matching_puncts = {
            "[": "]", 
            "(": ")", 
            "-": "-", 
            "--": "--",
            "{": "}"
        }
        
        # The opening symbols for group punctuation.
        opening_puncts = list(matching_puncts.keys())

        # The closing symbols for group punctuation.
        closing_puncts = list(matching_puncts.values())

        # Both the opening and closing symbols above.
        puncts = [*closing_puncts, *opening_puncts]

        # Look for Group Punctuation on the Left
        i = il_unit
        l_punct = None
        while i >= il_boundary:
            token = self.main.sp_doc[i]
            if token.lower_ in puncts:
                l_punct = token
                break
            i -= 1

        # Look for Group Punctuation on the Right
        i = ir_unit + 1 if l_punct and il_unit == ir_unit else ir_unit
        r_punct = None
        while i <= ir_boundary:
            token = self.main.sp_doc[i]
            if token.lower_ in puncts:
                r_punct = token
                break
            i += 1

        # If there's a group punctuation on the left
        # and right, and they match each other (e.g. '(' and ')'),
        # we return the text between the punctuations.
        parenthetical = l_punct and r_punct and matching_puncts.get(l_punct.lower_, '') == r_punct.text
        if parenthetical:
            return self.main.sp_doc[l_punct.i:r_punct.i+1]

        # As the unit is not a parenthetical, we will expand
        # outwards until we run into a stopping token. The exclude
        # list contains tokens that should be excluded from the
        # context. Currently, it will contain any parentheticals
        # that we run into.
        exclude = []

        # If a token's POS falls into these categories, we will
        # continue. If not, we stop expanding.
        speech = ["ADJ", "NOUN", "ADP", "ADV", "PART", "PROPN", "VERB", "PRON", "DET", "AUX", "PART"]
        
        # Expand Left
        while il_unit > il_boundary:
            # Assuming that the current token is fine,
            # we look to the left.
            l_token = self.main.sp_doc[il_unit-1]

            # If it's a closing punctuation (e.g. ')', ']'),
            # we need to skip over whatever is contained in
            # that punctuation.
            if l_token.lower_ in closing_puncts:
                i = il_unit - 1
                # We continue until we reach the boundary or we
                # find the matching opening punctuation.
                token = self.main.sp_doc[i]
                while i >= il_boundary and matching_puncts.get(token.lower_, '') != l_token.lower_:
                    token = self.main.sp_doc[i]
                    exclude.append(token)
                    i -= 1
                exclude.append(token)

                # After we've gone past the parenthetical,
                # we can jump to the next position.
                il_unit = i
                continue
            # If it's not a closing punctuation, we check
            # whether it's a stopping token
            else:
                if l_token.pos_ not in speech:
                    break
                else:
                    il_unit -= 1

        # Expand Right
        while ir_unit < ir_boundary:
            # We're checking the token to the right
            # to see if we can expand or not.
            r_token = self.main.sp_doc[ir_unit+1]
            
            # If the token to the right is an opening
            # punctuation (e.g. '(', '['), we must skip
            # it, the parenthetical inside, and the
            # closing punctuation.
            if r_token.lower_ in opening_puncts:
                i = ir_unit + 1
                token = self.main.sp_doc[i]
                while i <= ir_boundary and token.lower_ != matching_puncts.get(r_token.lower_, ''):
                    token = self.main.sp_doc[i]
                    exclude.append(token)
                    i += 1
                exclude.append(token)

                # Skip
                ir_unit = i
                continue
            # If it's not an opening punctuation, we check
            # whether we can continue expanding.
            else:
                if r_token.pos_ not in speech:
                    break
                else:
                    ir_unit += 1

        # We remove the excluded tokens
        # and return the context.
        context = [t for t in self.main.sp_doc[il_unit:ir_unit+1] if t not in exclude]
        return context

In [None]:
# Used for the Dictionary
@Language.component("lower_case_lemmas")
def lower_case_lemmas(doc):
    """Pipeline component that lower-cases the lemma of every token.

    Registered under the name "lower_case_lemmas" so that it can be added to
    a pipeline by that name.

    Args:
        doc: The document being processed; its tokens are changed in place.

    Returns:
        The same ``doc`` object.
    """
    for tok in doc:
        lowered = tok.lemma_.lower()
        tok.lemma_ = lowered
    return doc

class Species:
    def __init__(self, main):
        """Load the TaxoNERD pipeline and prepare empty per-text state.

        Args:
            main: Object stored as ``self.main``; other methods of this class
                read ``sp_doc`` and ``index_map`` from it.
        """
        self.main = main

        # State that is filled in once a text has been processed.
        # The TaxoNERD document of the current text.
        self.tn_doc = None
        # Spans that have been identified as a species.
        self.spans = None
        # Tokens that have been identified as a species or as part of one.
        self.tokens = None
        # Quick access from a token to the span it belongs to.
        self.token_to_span = None
        # Maps a string to an array of strings that have been identified as
        # alternate names of it (and the other way round).
        self.alternate_names = None
        # Extra names used to increase TaxoNERD's accuracy.
        self.dictionary = None

        # Tools
        self.tn_nlp = TaxoNERD(prefer_gpu=False).load(
            model="en_ner_eco_biobert",
            exclude=["tagger", "parser", "attribute_ruler"],
        )
        self.tn_nlp.add_pipe("lower_case_lemmas", after="lemmatizer")

        self.load_dictionary()

    def load_dictionary(self):
        """Register the dictionary names as "LIVB" entity patterns.

        Each name is run through ``self.tn_nlp`` and turned into a pattern
        that matches the lemmas of its tokens. The patterns are then handed
        to a newly added "entity_ruler" pipe.
        """
        self.dictionary = ["juvenile", "adult", "prey", "predator", "species", "tree", "cat", "dog"]
        # Disabled: extend the list with the names from a CSV file.
        # df = pd.read_csv("VernacularNames.csv")
        # self.dictionary += df.VernacularName.to_list()

        # The names are processed before the ruler is added to the pipeline.
        patterns = [
            {
                "label": "LIVB",
                "pattern": [{"LEMMA": token.lemma_} for token in self.tn_nlp(name)],
            }
            for name in self.dictionary
        ]

        self.tn_nlp.add_pipe("entity_ruler").add_patterns(patterns)
        
    def update(self, text, verbose=False):
        """Process a new text and refresh the species information.

        Args:
            text: The text to run through ``self.tn_nlp``.
            verbose: Passed on to ``load_species``.

        Raises:
            Exception: With the message "DNE" when ``self.main.sp_doc`` or
                ``self.main.index_map`` is missing or empty.
        """
        # Both the parent document and the index map must exist already.
        if not (self.main.sp_doc and self.main.index_map):
            raise Exception("DNE")

        self.tn_doc = self.tn_nlp(text)

        spans, tokens, token_to_span, alternate_names = self.load_species(verbose=verbose)
        self.spans = spans
        self.tokens = tokens
        self.token_to_span = token_to_span
        self.alternate_names = alternate_names

    def load_species(self, verbose=False):
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        # We'll search for species in the text.
        text = self.main.sp_doc.text.lower()

        # These three contain the species that have been
        # identified in the text. Tokens that aren't adjectives,
        # nouns, or proper nouns will be stripped.
        spans = []
        tokens = []
        token_to_span = {}

        # It's useful to know if a different name refers to a
        # species we have already seen. For example, in
        # "predatory crab (Carcinus maenas)", "predatory crab"
        # is an alternative name for "Carcinus maenas" and
        # vice versa. This is used so that the species can be
        # properly tracked and redundant points are less
        # likely to be given.
        alternate_names = {}

        # We convert the spans that TaxoNerd has recognized
        # to spans under a different parent document. This is
        # because we're largely using said parent document and
        # there is more functionality in that parent document.
        species_spans = []
        for tn_species_span in self.tn_doc.ents:
            char_i0 = self.tn_doc[tn_species_span.start].idx
            char_i1 = char_i0 + len(tn_species_span.text) - 1

            sp_token_i0 = self.main.token_at_char(char_i0).i
            sp_token_i1 = self.main.token_at_char(char_i1).i

            sp_species_span = self.main.sp_doc[sp_token_i0:sp_token_i1+1]
            
            # Although they have different parent documents,
            # they should still have the same text.
            if sp_species_span.text.lower() != tn_species_span.text.lower():
                print(sp_species_span.text.lower(), tn_species_span.text.lower())
            assert sp_species_span.text.lower() == tn_species_span.text.lower()

            # Sometimes, TaxoNerd recognizes two names of a species in one span.
            # If they're separated with parentheses, we can handle the case here.
            # The naming is difficult, so I'll just call it species_tuples.
            species_tuples = self.main.help.group_text(sp_species_span.text, flatten=True)

            species_span_chunks = []
            for species_tuple in species_tuples:
                species_span_chunk_text = sp_species_span.text[species_tuple[0]:species_tuple[1]]
                if species_span_chunk_text.isspace():
                    continue
                
                group_char_i0 = char_i0 + species_tuple[0]
                group_char_i1 = char_i0 + species_tuple[1] - 1

                # Update L Index to Exclude Whitespace Characters
                while text[group_char_i0].isspace():
                    group_char_i0 += 1

                # Update R Index to Exclude Whitespace Characters
                while text[group_char_i1].isspace():
                    group_char_i1 -= 1

                group_token_i0 = self.main.token_at_char(group_char_i0).i
                group_token_i1 = self.main.token_at_char(group_char_i1).i

                species_span_chunks.append(self.main.sp_doc[group_token_i0:group_token_i1+1])

            for species_span_chunk in species_span_chunks:
                species_spans.append(species_span_chunk)
    
                # TaxoNERD will recognize the full species (i.e. "brown squirrels"),
                # and we can use this to find more instances of a species in the text
                # by extracting the last noun or proper noun from that span 
                # (i.e. "squirrels"). Now, we can find "brown squirrels" and 
                # "squirrels".
                reversed_span = [t for t in species_span_chunk]
                reversed_span.reverse()
                for token in reversed_span:
                    if token.pos_ in ["NOUN", "PROPN"]:
                        species_spans.append(self.main.sp_doc[token.i:token.i+1])
                        break

        # TaxoNerd sometimes recognizes one instance of a species
        # and fails to recognize it elsewhere. To fix this, I'll
        # search the text for all the species that TaxoNerd sees.
        # This should resolve that issue. To make this more robust,
        # I'll include the singular and plural versions of the
        # recognized species. Furthermore, the species being used
        # to search for other instances of species in the text will
        # be called search_species. Using a database I downloaded,
        # I've initialized search_species with a set of english
        # vernacular names (e.g., "dog", "cat"). I'm removing it for
        # now because there's seeminly a lot of bogus values.
        # df = pd.read_csv("EnglishVernacularNames-2.csv")
        # search_species = df.Name.to_list()
        search_species = ["juvenile", "adult", "prey", "predator", "predators", "species", "tree", "cat", "dog", "flies", "plants", "plant", "fly"]

        for species_span in species_spans:
            species_text = species_span.text.lower()
            species_text = self.main.help.remove_outer_non_alnum(species_text)

            search_species.append(species_text)

            # Add Singular and/or Plural Version
            if species_span[-1].pos_ == "NOUN":
                # Plural
                if species_span[-1].tag_ == "NNS":
                    singular_species = self.main.help.singularize(species_text)
                    search_species.extend(singular_species)
                # Singular
                if species_span[-1].tag_ == "NN":
                    plural_species = self.main.help.pluralize(species_text)
                    search_species.extend(plural_species)

        # Now, we have the species to search for in the text.
        search_species = list(set(search_species))
        
        for species in search_species:
            matches = re.finditer(re.escape(species), text)
            
            for char_i0, char_i1 in [(match.start(), match.end()) for match in matches]:
                # The full word must match, not just a substring inside of it.
                # So, if the species we're looking for is "ant", only "ant"
                # will match -- not "pants" or "antebellum". Therefore, the
                # characters to the left and right of the matched string must be
                # non-alphanumeric. 
                l_char_is_letter = char_i0 > 0 and text[char_i0-1].isalpha()
                r_char_is_letter = char_i1 < len(text) and text[char_i1].isalpha()
                
                if l_char_is_letter or r_char_is_letter:
                    continue
                    
                sp_li = self.main.token_at_char(char_i0).i
                sp_ri = self.main.token_at_char(char_i1-1).i

                # This is the matched substring (which would be
                # a species) as a span in the parent document.
                species_span = self.main.sp_doc[sp_li:sp_ri+1]
                
                # Expand Species
                # Let's say there's a word like "squirrel". That's a bit ambiguous. 
                # Is it a brown squirrel, a bonobo? If the species is possibly missing
                # information (like an adjective to the left of it), we should expand
                # in order to get a full picture of the species.
                unclear_1 = len(species_span) == 1 and species_span[0].pos_ == "NOUN"
                unclear_2 = species_span.start > 0 and self.main.sp_doc[species_span.start-1].pos_ in ["ADJ"]
                
                if unclear_1 or unclear_2:
                    species_span = self.main.help.expand_unit(
                        il_unit=species_span.start, 
                        ir_unit=species_span.end-1,
                        il_boundary=0,
                        ir_boundary=len(self.main.sp_doc),
                        speech=["ADJ", "PROPN"],
                        literals=["-"],
                        include=True,
                        direction="LEFT",
                        verbose=verbose
                    )
                
                # Remove Outer Symbols
                # There are times where a species is identified with a parenthesis
                # nearby. Here, we remove that parenthesis (and any other symbols).
                species_span = self.main.help.contract_unit(
                    il_unit=species_span.start, 
                    ir_unit=species_span.end-1, 
                    speech=["PUNCT", "SYM", "DET", "PART"],
                    include=False,
                    verbose=verbose
                )
            
                # A species must have a noun or a
                # proper noun. This may help discard
                # bogus results.
                letter_found = False
                for token in species_span:
                    if token.pos_ in ["NOUN", "PROPN"]:
                        letter_found = True
                        break

                if not letter_found:
                    continue

                # Adding Species
                spans.append(species_span)
                for token in species_span:
                    if token in tokens or token.pos_ in ["PUNCT", "SYM", "DET", "PART"]:
                        continue
                    tokens.append(token)
                    token_to_span[token] = species_span

        # Removing Duplicates and Sorting 
        spans = list({span.start: span for span in spans}.values())
        spans.sort(key=lambda span: span.start)
        
        # Finding and Storing Alternative Names
        for i, species_span in enumerate(spans):
            # There's not a next species to
            # evaluate.
            if i + 1 >= len(spans):
                break
            
            next_species_span = spans[i+1]
            
            # If there's one token between the species and the next species,
            # we check if the next species is surrounded by punctuation.
            if next_species_span.start - species_span.end == 1:
                # Token Before and After the Next Species
                before_next = self.main.sp_doc[next_species_span.start-1]
                after_next = self.main.sp_doc[next_species_span.end]

                if before_next.pos_ in ["PUNCT", "SYM"] and after_next.pos_ in ["PUNCT", "SYM"]:
                    sp_1_text = species_span.text.lower()
                    sp_2_text = next_species_span.text.lower()
                    
                    if sp_1_text not in alternate_names:
                        alternate_names[sp_1_text] = []
                    
                    if sp_2_text not in alternate_names:
                        alternate_names[sp_2_text] = []
                    
                    alternate_names[sp_1_text].append(sp_2_text)
                    alternate_names[sp_2_text].append(sp_1_text)
            # If there's no token between the species and the next
            # species, we assume that they refer to the same species.
            elif next_species_span.start - species_span.end == 0:
                sp_1_text = species_span.text.lower()
                sp_2_text = next_species_span.text.lower()
                
                if sp_1_text not in alternate_names:
                    alternate_names[sp_1_text] = []
                
                if sp_2_text not in alternate_names:
                    alternate_names[sp_2_text] = []

                alternate_names[sp_1_text].append(sp_2_text)
                alternate_names[sp_2_text].append(sp_1_text)
       
        return (spans, tokens, token_to_span, alternate_names)

    def span_at_token(self, token):
        """Return the species span recorded for ``token``.

        Returns ``None`` when the token has no entry in ``self.token_to_span``.
        """
        return self.token_to_span[token] if token in self.token_to_span else None
    
    def is_species(self, token):
        """Tell whether ``token`` is among the tokens stored in ``self.tokens``."""
        recorded_tokens = self.tokens
        return token in recorded_tokens
        
    def has_species(self, tokens, verbose=False):
        """Tell whether at least one of ``tokens`` is stored in ``self.tokens``.

        ``verbose`` is accepted for signature parity with the other helpers; it
        is not used here.
        """
        return any(candidate in self.tokens for candidate in tokens)

    def find_same_species(self, sp_A, sp_b, verbose=False):
        """Find the span in ``sp_A`` that refers to the same species as ``sp_b``.

        Four strategies are tried in order; the first candidate accepted by a
        strategy is returned:

        1. identical lower-cased text, or a shared singular form;
        2. the two names are recorded as alternates in ``self.alternate_names``;
        3. same first word with one name contained in the other, or the same
           nouns where exactly one of the two names has a single leading
           adjective (e.g. 'dogs' v. 'red dogs');
        4. one lower-cased name is a substring of the other.

        Parameters:
            sp_A: iterable of candidate spans (it may be a one-shot iterable; it
                is materialised once).
            sp_b: the span to look up.
            verbose: unused; kept for signature compatibility.

        Returns:
            The matching span from ``sp_A``, or ``None`` if there is no match or
            ``sp_A`` is empty.
        """
        def singular_forms(text, last_token):
            # A name whose last token is already singular is its own singular
            # form. It is wrapped in a set so that names are compared as whole
            # strings; building a set straight from the string would compare
            # individual characters and match unrelated names.
            if last_token.tag_ in ["NN", "NNP"]:
                return {text}
            forms = self.main.help.singularize(text)
            # singularize() is defined elsewhere; other code in this class
            # extends a list with its result, so a collection of strings is
            # expected. A bare string is tolerated as well.
            return {forms} if isinstance(forms, str) else set(forms)

        def split_adjectives_and_nouns(span):
            # Counts the adjectives that appear before the first noun and
            # collects every noun / proper noun in the span.
            nouns = []
            num_adjectives = 0
            for token in span:
                if not nouns and token.pos_ == "ADJ":
                    num_adjectives += 1
                elif token.pos_ in ["PROPN", "NOUN"]:
                    nouns.append(token)
            return num_adjectives, nouns

        def singular_noun_forms(nouns):
            return singular_forms(" ".join(noun.lower_ for noun in nouns), nouns[-1])

        # Each candidate is paired with its own lower-cased text so that every
        # strategy below compares against the candidate it is actually looking
        # at (rather than text left over from an earlier loop).
        candidates = [(sp_a, sp_a.text.lower()) for sp_a in sp_A]
        if not candidates:
            return None

        sp_b_text = sp_b.text.lower()

        # METHOD 1: Check for Literal Matches
        sp_b_singular = singular_forms(sp_b_text, sp_b[-1])
        for sp_a, sp_a_text in candidates:
            # Verbatim Text
            if sp_a_text == sp_b_text:
                return sp_a
            # Singularized Text
            if singular_forms(sp_a_text, sp_a[-1]) & sp_b_singular:
                return sp_a

        # METHOD 2: Check Alternate Names
        for sp_a, sp_a_text in candidates:
            # Species B is an alternate name for Species A
            if sp_b_text in self.alternate_names.get(sp_a_text, []):
                return sp_a
            # Species A is an alternate name for Species B
            if sp_a_text in self.alternate_names.get(sp_b_text, []):
                return sp_a

        # METHOD 3: Check Nouns
        # This is used if one or none of the species being compared
        # has 1 adjective.
        sp_b_0_text = sp_b[0].lower_
        sp_b_is_noun = sp_b[0].pos_ in ["NOUN", "PROPN"]
        sp_b_num_adjectives, sp_b_nouns = split_adjectives_and_nouns(sp_b)
        # Only singularize when there is a noun to singularize; a span without
        # nouns simply cannot match through this strategy.
        sp_b_noun_forms = singular_noun_forms(sp_b_nouns) if sp_b_nouns else set()

        for sp_a, sp_a_text in candidates:
            sp_a_0_text = sp_a[0].lower_
            sp_a_is_noun = sp_a[0].pos_ in ["NOUN", "PROPN"]

            # Case Example: 'Hyla' v. 'Hyla tadpoles'
            if sp_a_0_text == sp_b_0_text and (sp_a_is_noun or sp_b_is_noun):
                if sp_a_text in sp_b_text or sp_b_text in sp_a_text:
                    return sp_a
            # Case Example: 'dogs' v. 'red dogs'
            else:
                sp_a_num_adjectives, sp_a_nouns = split_adjectives_and_nouns(sp_a)
                one_extra_adjective = (
                    (sp_a_num_adjectives == 1 and sp_b_num_adjectives == 0) or
                    (sp_b_num_adjectives == 1 and sp_a_num_adjectives == 0)
                )

                if sp_a_nouns and sp_b_nouns and one_extra_adjective:
                    if singular_noun_forms(sp_a_nouns) & sp_b_noun_forms:
                        return sp_a

        # METHOD 4: Last Ditch Effort
        # If there's been no matches, we just look for one string inside of
        # another.
        for sp_a, sp_a_text in candidates:
            if sp_b_text in sp_a_text or sp_a_text in sp_b_text:
                return sp_a

        return None

In [None]:
class Keywords:
    """Finds the tokens of the current document that match a keyword list.

    A token of ``main.sp_doc`` is matched when its part of speech is listed in
    ``speech`` and it (or its lemma) equals a base word, contains a base word
    (only with ``include_substring``), or is at least ``threshold``-similar to a
    base word according to ``similarity`` on the ``main.sp_nlp`` outputs.
    Independently of part of speech, every regex match of a ``literals`` pattern
    in the lower-cased document text contributes the token at the start of the
    matched word.

    Parameters:
        main: owner object providing ``sp_nlp``, ``sp_doc``, ``index_map`` and
            ``token_at_char``.
        base: words to match; stored lower-cased.
        speech: part-of-speech tags a token must have; stored upper-cased.
        literals: regular expressions searched in the lower-cased text. They
            are lower-cased too, so avoid case-sensitive escapes (``\\S``
            would turn into ``\\s``).
        threshold: minimum similarity for a token to count as a base word.
        include_substring: also match tokens that merely contain a base word
            (e.g. "biomass" for the base word "mass").
    """

    def __init__(self, main, *, base=(), speech=(), literals=(), threshold=0.7, include_substring=False):
        self.main = main
        # For a token to count towards a base word, it must be the same word.
        self.base = [b.lower() for b in base]
        self.speech = [s.upper() for s in speech]
        self.literals = [l.lower() for l in literals]
        # When comparing two words, SpaCy returns a value
        # from 0 to 1, representing how similar the two
        # embeddings are. The threshold below determines
        # the minimum similarity before two words
        # are considered as being equivalent.
        self.threshold = threshold
        self.vocab = [self.main.sp_nlp(word) for word in self.base]
        # If this is True, then we will also check
        # if the token contains a base word.
        self.include_substring = include_substring
        # This list contains the matched tokens.
        self.tokens = []

    def _require_document(self):
        """Raise ``Exception("DNE")`` when no document has been loaded yet.

        The check is ``is None`` rather than truthiness: a document without
        tokens (and its empty index map) is a loaded document that simply has
        no matches.
        """
        if self.main.sp_doc is None or self.main.index_map is None:
            raise Exception("DNE")

    @staticmethod
    def _anchor_char(text, start, end):
        """Return the index in ``[start, end)`` at which the matched word begins.

        Patterns such as ``[^A-Za-z]+rate`` match the separator in front of the
        keyword, so the first letter of the match is preferred. A match without
        letters falls back to its first non-space character; ``None`` means the
        match has nothing to anchor on.
        """
        for index in range(start, end):
            if text[index].isalpha():
                return index
        for index in range(start, end):
            if not text[index].isspace():
                return index
        return None

    def update(self, verbose=False):
        """Recompute ``self.tokens`` for the document currently held by ``main``.

        Raises:
            Exception: ("DNE") if no document or index map has been loaded.
        """
        self._require_document()
        self.tokens = self.match_tokens(verbose=verbose)

    def match_tokens(self, verbose=False):
        """Return the matching tokens of the current document.

        Word matches come first (in document order), followed by literal
        matches (grouped by pattern). A token may appear more than once when it
        is matched by a word rule and by one or more literals.

        Raises:
            Exception: ("DNE") if no document or index map has been loaded.
        """
        self._require_document()

        matched_tokens = []

        # Check Words
        for token in self.main.sp_doc:
            if token.pos_ not in self.speech:
                continue

            token_lower = token.lower_
            token_lemma_lower = token.lemma_.lower()

            # Look for Base Word
            if token_lemma_lower in self.base or token_lower in self.base:
                matched_tokens.append(token)
                continue

            # Look for Base Word in Token
            # For example, a word like "biomass" would match
            # if "mass" is a base word.
            if self.include_substring and any(
                base_word in token_lemma_lower or base_word in token_lower
                for base_word in self.base
            ):
                matched_tokens.append(token)
                continue

            # Comparing Similarity
            # Without a vocabulary there is nothing to compare against, so the
            # per-token embedding lookup is skipped.
            if not self.vocab:
                continue

            token_doc = self.main.sp_nlp(token_lower)
            if any(word.similarity(token_doc) >= self.threshold for word in self.vocab):
                matched_tokens.append(token)

        # Check Literals
        text = self.main.sp_doc.text.lower()
        for literal in self.literals:
            for match in re.finditer(literal, text):
                char_index = self._anchor_char(text, match.start(), match.end())
                if char_index is None:
                    continue
                matched_tokens.append(self.main.token_at_char(char_index))

        return matched_tokens

In [None]:
class ExperimentKeywords(Keywords):
    """Keywords suggesting that a text reports an experiment or study.

    Verbs, nouns and adjectives are compared against the base words below; the
    literals are searched as regular expressions in the document text.
    """

    def __init__(self, main):
        super().__init__(
            main,
            base=[
                "study",
                "hypothesis",
                "experiment",
                "found",
                "discover",
                "compare",
                "finding",
                "result",
                "test",
                "examine",
                "model",
                "measure",
                "manipulate",
                "assess",
                "conduct",
                "data",
                "analyze",
                "sample",
                "observe",
                "predict",
                "suggest",
                "method",
                "investigation",
                "trial",
                "experimental",
                "evidence",
                "demonstrate",
                "analysis",
                "show",
                # "compare" is listed once, above; a repeated entry only adds a
                # redundant similarity comparison per token.
                "comparable"
            ],
            literals=[
                "control group",
                "independent",
                "dependent"
            ],
            speech=["VERB", "NOUN", "ADJ"],
            threshold=0.7
        )

In [None]:
class CauseKeywords(Keywords):
    """Keywords suggesting that one thing causes or affects another.

    Verbs and nouns are compared against the base words below. The multi-word
    entries ("result in", "associated with", "correlated with") can never equal
    a single token, so they only take part in the similarity comparison.
    """

    def __init__(self, main):
        super().__init__(
            main,
            base=[
                "increase",
                "decrease",
                "change",
                "shift",
                "cause",
                "produce",
                "trigger",
                "suppress",
                "inhibit",
                "encourage",
                "allow",
                "influence",
                "affect",
                "alter",
                "induce",
                # "produce" is listed once, above; a repeated entry only adds a
                # redundant similarity comparison per token.
                "result in",
                "associated with",
                "correlated with",
                "contribute",
                "impact",
                "deter",
                "depressed"
            ],
            speech=["VERB", "NOUN"],
            threshold=0.7
        )

In [None]:
class ChangeKeywords(Keywords):
    """Keywords suggesting that a quantity changed (e.g. "fewer", "decline").

    Nouns, adjectives and adverbs are compared against the base words below.
    On top of the inherited matching, every comparative adjective (tag "JJR",
    such as "bigger") is kept as a change word.
    """

    def __init__(self, main):
        super().__init__(
            main,
            # Every entry needs its trailing comma: adjacent string literals are
            # concatenated by Python, which would silently fuse two keywords
            # into one word that never matches.
            base=[
                "few",
                "more",
                "increase",
                "decrease",
                "less",
                "short",
                "long",
                "greater",
                "shift",
                "fluctuate",
                "adapt",
                "grow",
                "rise",
                "surge",
                "intensify",
                "amplify",
                "multiply",
                "decline",
                "reduce",
                "drop",
                "diminish",
                "fall",
                "lessen",
            ],
            speech=["NOUN", "ADJ", "ADV"],
            threshold=0.7
        )

    def update(self, verbose=False):
        """Recompute ``self.tokens``, then add the comparative adjectives.

        Raises:
            Exception: ("DNE") if no document or index map has been loaded.
        """
        super().update(verbose=verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Return the document's tokens that are in ``tokens`` or are comparative adjectives.

        The result follows document order and holds each token at most once.
        ``verbose`` is unused.

        Raises:
            Exception: ("DNE") if no document or index map has been loaded.
        """
        if self.main.sp_doc is None or self.main.index_map is None:
            raise Exception("DNE")

        return [
            token
            for token in self.main.sp_doc
            # Already matched, or a comparative adjective such as "bigger" or
            # "better".
            if token in tokens or (token.pos_ == "ADJ" and token.tag_ == "JJR")
        ]

In [None]:
class TraitKeywords(Keywords):
    """Keywords naming a trait (e.g. "feeding", "size", "behavior").

    No base words are supplied, so matching relies on the literals below, which
    are searched as regular expressions in the document text. A match is only
    kept as a trait when a species is found in the unit expanded around it (see
    ``filter_tokens``).
    """

    def __init__(self, main):
        super().__init__(
            main,
            literals=[
                "behavior",
                r"[^A-Za-z]+rate",
                "color",
                r"[^A-Za-z]+mass",
                "size",
                "length",
                "pattern",
                "weight",
                "shape",
                "efficiency",
                "trait",
                "ability",
                "capacity",
                "height",
                "width",
                "span",
                "diet",
                "feeding",
                "nest",
                "substrate",
                "breeding",
                r"[^A-Za-z]+age[^A-Za-z]+",
                "lifespan",
                "development",
                "time",
                "mating",
                "fur",
                "feathers",
                "scales",
                "skin",
                "limb",
                "configuration",
                "dimorphism",
                "capability",
                "appendages",
                "blood",
                "regulation",
                "excretion",
                "luminescence",
                r"[^A-Za-z]+role",
                "reproduction",
                "courtship",
                "pollination",
                "mechanism",
                "sensitivity",
                "resistance"
            ],
            include_substring=True,
            speech=["NOUN", "ADJ"],
            threshold=0.8
        )

    def update(self, verbose=False):
        """Recompute ``self.tokens``, keeping only traits that accompany a species.

        Raises:
            Exception: ("DNE") if no document or index map has been loaded.
        """
        super().update(verbose=verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Keep the tokens whose expanded unit contains a species token.

        Each token is expanded with ``help.expand_unit`` (bounded by the whole
        document; presumably stopping at punctuation, see that helper) and is
        kept when ``species.has_species`` accepts the expanded unit.
        Diagnostics are printed only when ``verbose`` is true.

        Raises:
            Exception: ("DNE") if no document or index map has been loaded.
        """
        if self.main.sp_doc is None or self.main.index_map is None:
            raise Exception("DNE")

        if verbose:
            print("filter_tokens")
            print(tokens)
            print()

        filtered = []
        for token in tokens:
            expanded_token = self.main.help.expand_unit(
                il_unit=token.i,
                ir_unit=token.i,
                il_boundary=0,
                ir_boundary=len(self.main.sp_doc) - 1,
                speech=["PUNCT"],
                include=False,
                verbose=verbose
            )
            near_species = self.main.species.has_species(expanded_token)

            if verbose:
                print(token)
                print(expanded_token)
                print(near_species)
                print()

            if near_species:
                filtered.append(token)

        return filtered

In [None]:
class Main:
    """Holds the NLP tools, the current document and the parsers, and scores a text.

    Typical use: ``update_text`` (or ``update_doc``) to load a document and
    refresh every parser, then ``score`` to rate it.
    """

    def __init__(self):
        # Tools
        self.sp_nlp = spacy.load("en_core_web_lg")
        self.fcoref = FCoref(enable_progress_bar=False, device='cpu')
        self.sp_doc = None

        # Maps Character Position to Token in Document
        # Used to handle differences between different
        # pipelines and tools.
        self.index_map = None

        # Parsers
        self.species = Species(self)
        self.traits = TraitKeywords(self)
        self.causes = CauseKeywords(self)
        self.changes = ChangeKeywords(self)
        self.experiment = ExperimentKeywords(self)

        # Helper
        self.help = Help(self)

    def update_doc(self, doc, verbose=False):
        """Make ``doc`` the current document and refresh every parser.

        ``verbose`` is forwarded to each parser's ``update``.
        """
        self.sp_doc = doc
        self.index_map = self.load_index_map()
        self.species.update(doc.text, verbose=verbose)
        self.traits.update(verbose=verbose)
        self.causes.update(verbose=verbose)
        self.changes.update(verbose=verbose)
        self.experiment.update(verbose=verbose)

    def update_text(self, text, verbose=False):
        """Parse ``text`` with ``sp_nlp`` and load the result via ``update_doc``."""
        self.update_doc(self.sp_nlp(text), verbose=verbose)

    def token_at_char(self, char_index):
        """Return the token that covers character ``char_index`` of the document.

        Raises:
            Exception: ("DNE") if no document or index map has been loaded;
                ("Token Not Found") if no token covers that character (for
                example, whitespace between tokens).
        """
        # SpaCy Doc or Indexing Map Not Found
        if self.sp_doc is None or self.index_map is None:
            raise Exception("DNE")

        if char_index in self.index_map:
            return self.index_map[char_index]

        raise Exception("Token Not Found")

    def load_index_map(self):
        """Build a map from every character index inside a token to that token.

        Raises:
            Exception: ("DNE") if no document has been loaded.
        """
        # SpaCy Doc Not Found
        if self.sp_doc is None:
            raise Exception("DNE")

        # token.idx is the index of the token's starting character, and
        # token.idx + len(token) is the index just past its ending character.
        return {
            char_index: token
            for token in self.sp_doc
            for char_index in range(token.idx, token.idx + len(token))
        }

    def _has_cause_or_change_in_context(self, token, sent_cause_tokens, sent_change_tokens, verbose=False):
        """Tell whether a cause or change word occurs in ``token``'s context.

        The context is whatever ``help.find_unit_context`` returns for the
        token, bounded by the token's sentence.
        """
        token_context = set(self.help.find_unit_context(
            il_unit=token.i,
            ir_unit=token.i,
            il_boundary=token.sent.start,
            ir_boundary=token.sent.end-1,
            verbose=verbose)
        )
        cause_tokens_in_context = set(sent_cause_tokens).intersection(token_context)
        change_tokens_in_context = set(sent_change_tokens).intersection(token_context)

        if verbose:
            print(f"Token ({token}) Context: {token_context}")
            print(f"Cause Tokens in Context: {cause_tokens_in_context}")
            print(f"Change Tokens in Context: {change_tokens_in_context}")

        return bool(cause_tokens_in_context or change_tokens_in_context)

    def score(self, verbose=False):
        """Score the current document.

        Each sentence collects points in four categories (trait, species,
        experiment, interaction), capped per category. The per-category totals
        are normalised by the maximum attainable over all sentences and then
        combined with the category weights.

        Parameters:
            verbose: print the intermediate decisions and forward the flag to
                the context helper.

        Returns:
            ``(score, points)`` where ``points`` lists the normalised total of
            each category (each in [0, 1]) and ``score`` is their weighted sum.
            ``score`` is 0 when fewer than three distinct species are mentioned
            or when the document has no sentences.

        Raises:
            Exception: ("DNE") if no document has been loaded.
        """
        if self.sp_doc is None:
            raise Exception("DNE")

        log = print if verbose else (lambda *args, **kwargs: None)

        NUM_CATEGORIES = 4

        # Requires the mention of a trait and a cause or change word.
        # The cause or change word indicates some variation.
        # Index 0 in Array
        TRAIT = 0

        # Requires the mention of a species and a cause or change word.
        # The cause or change word indicates that the species is being
        # affected or is affecting something else.
        # Index 1 in Array
        SPECIES = 1

        # Requires a word that has been defined as "experiment"-related.
        # Index 2 in Array
        EXPERIMENT = 2

        # Requires the mention of several species (more or less).
        # Index 3 in Array
        INTERACTION = 3

        # Max # of Points of Category per Sentence (MPC)
        # A sentence collects points from its categories.
        # For example, a sentence could get a maximum of 2 points from one category
        # and a maximum of 1 point from another. The MPC determines the maximum number
        # of points a category could contribute to a sentence. To have a range of [0, 1]
        # the maximum number of points, across categories, when added should be 1.
        MPC = [0] * NUM_CATEGORIES
        MPC[TRAIT] = 0.1
        MPC[SPECIES] = 0.3
        MPC[EXPERIMENT] = 0.3
        MPC[INTERACTION] = 0.3

        # Compared with a tolerance: sums of decimal fractions are generally
        # not exact in floating point.
        assert np.isclose(np.sum(MPC), 1)

        # Points per Instance of Category (PIC)
        # Each token is evaluated to check whether a category
        # can be given points. The number of points given, if
        # the token is determined to be satisfactory, is the PIC.
        # The PIC is less than or equal to the MPC for the corresponding
        # category. The idea behind the PIC and MPC is similar to how
        # sets work in tennis: you're not immediately awarded the full points
        # for the set (MPC) if your opponent fails to return the ball,
        # instead you're given a smaller # of points (PIC) that allow you to
        # incrementally win the set (category).
        PIC = [0] * NUM_CATEGORIES
        PIC[TRAIT] = MPC[TRAIT]*1.0
        PIC[SPECIES] = MPC[SPECIES]/3.0
        PIC[EXPERIMENT] = MPC[EXPERIMENT]/1.0
        PIC[INTERACTION] = MPC[INTERACTION]/3.0

        for i in range(NUM_CATEGORIES):
            assert PIC[i] <= MPC[i]

        # Category Weights (CW)
        # It may be helpful to weigh a certain category's fraction of total points
        # more or less than another's. Thus, at the end, we'll take a
        # weighted average of the category's FTP. The weights must add up to 1.
        CW = [0] * NUM_CATEGORIES
        CW[TRAIT] = 0.7
        CW[SPECIES] = 0.1
        CW[EXPERIMENT] = 0.1
        CW[INTERACTION] = 0.1

        assert np.isclose(np.sum(CW), 1)

        # Points
        points = [0] * NUM_CATEGORIES

        # Without sentences there is nothing to score (and nothing to divide by).
        sentences = list(self.sp_doc.sents)
        if not sentences:
            return 0, points

        # Extracted Information
        cause_tokens = self.causes.tokens
        change_tokens = self.changes.tokens
        trait_tokens = self.traits.tokens
        species_tokens = [self.sp_doc[span.start] for span in self.species.spans]
        experiment_tokens = self.experiment.tokens

        log(f"Cause Tokens: {cause_tokens}")
        log(f"Change Tokens: {change_tokens}")
        log(f"Experiment Tokens: {experiment_tokens}")
        log(f"Trait Tokens: {trait_tokens}")
        log(f"Species Tokens: {species_tokens}")

        # This is used to ensure that at least three species
        # are mentioned.
        seen_species = {}

        for sent in sentences:
            # This contains the number of points
            # each category has accumulated in the sentence.
            curr_points = [0] * NUM_CATEGORIES

            # Contains the tokens in the sentence.
            sent_tokens = [token for token in sent]

            # This is used for the species (must have a nearby cause and/or
            # change word).
            sent_cause_tokens = set(sent_tokens).intersection(cause_tokens)
            sent_change_tokens = set(sent_tokens).intersection(change_tokens)

            # We don't want to visit the same species more than once
            # in the same sentence as to avoid redundant points.
            sent_seen_species = []

            log(f"Sentence Tokens: {sent_tokens}")
            log(f"Sentence Cause Tokens: {sent_cause_tokens}")
            log(f"Sentence Change Tokens: {sent_change_tokens}")

            for token in sent_tokens:
                # If each category has reached their maximum number of points,
                # we can end the loop early.
                if all(curr_points[i] >= MPC[i] for i in range(NUM_CATEGORIES)):
                    break

                # TRAIT CATEGORY
                if curr_points[TRAIT] < MPC[TRAIT] and token in trait_tokens:
                    log("TRAIT CATEGORY")
                    # To get points in the trait category, there must
                    # be (1) a trait; and (2) a change or cause in the token's
                    # context.
                    if self._has_cause_or_change_in_context(
                        token, sent_cause_tokens, sent_change_tokens, verbose=verbose
                    ):
                        curr_points[TRAIT] += PIC[TRAIT]

                        log(f"Added Points for Trait via Token '{token}'")

                    log()

                # EXPERIMENT CATEGORY
                if curr_points[EXPERIMENT] < MPC[EXPERIMENT] and token in experiment_tokens:
                    curr_points[EXPERIMENT] += PIC[EXPERIMENT]

                    log(f"Added Points for Experiment via Token '{token}'\n")

                # SPECIES CATEGORY
                if token not in species_tokens:
                    continue

                # Find Species Span
                species_span = self.species.span_at_token(token)
                if species_span is None:
                    # The token opens a span but was not recorded for it, so
                    # there is nothing to compare or count.
                    continue

                log("Seen Species Updated")
                log(seen_species)
                log()

                # Updating Seen Species (in Entire Text)
                # Find Previous Instance of Species (if Any)
                past_visits = 0
                seen_species_span = self.species.find_same_species(seen_species.keys(), species_span)
                if seen_species_span:
                    past_visits = seen_species[seen_species_span]
                    seen_species[seen_species_span] += 1

                if not past_visits:
                    seen_species[species_span] = 1

                log("Seen Species Updated")
                log(seen_species)
                log()

                # Checking Seen Species (in Sentence)
                # We only add points if it's a species that has not been seen
                # in the sentence. This is to avoid redundant points.
                # A species that has never been seen before cannot have been
                # seen in this sentence either, so the sentence is only
                # searched for species with past visits.
                redundant_species = bool(
                    past_visits
                    and self.species.find_same_species(sent_seen_species, species_span)
                )
                sent_seen_species.append(species_span)

                log("Seen Species in Sentence")
                log(sent_seen_species)
                log()

                if redundant_species:
                    continue

                # INTERACTION CATEGORY
                # It is helpful to have this category here because (if we've reached here)
                # we're dealing with a new species in the sentence.
                if curr_points[INTERACTION] < MPC[INTERACTION]:
                    curr_points[INTERACTION] += PIC[INTERACTION]

                    log(f"Added Points for Interaction via Token '{token}'\n")

                if curr_points[SPECIES] < MPC[SPECIES]:
                    # To get points in the species category, there must be
                    # (1) a species; and (2) a change or cause in the phrase
                    # (or clause) that the token is a part of.
                    if self._has_cause_or_change_in_context(
                        token, sent_cause_tokens, sent_change_tokens, verbose=verbose
                    ):
                        curr_points[SPECIES] += PIC[SPECIES]

                        log(f"Added Points for Species via Token '{token}'")

                    log()

            # SENTENCE DONE
            # Add Sentence Points to Total Points
            for category in [TRAIT, SPECIES, EXPERIMENT, INTERACTION]:
                points[category] += min(curr_points[category], MPC[category])

        # Calculating Score
        NUM_SENTENCES = len(sentences)

        score = 0
        for i in range(NUM_CATEGORIES):
            # Capped at 1.0 so that accumulated rounding error cannot push a
            # fully-scored category (and with it the score) above the range.
            points[i] = min(points[i] / (MPC[i] * NUM_SENTENCES), 1.0)
            score += points[i] * CW[i]
        score = min(score, 1.0)

        # Enforcing 3 or More Species
        if len(seen_species) < 3:
            return 0, points

        assert 0.0 <= score <= 1.0

        return score, points

In [None]:
def score_dataset(name, save_output=False, version=""):
    """Score every abstract of a preprocessed dataset and return the ranked table.

    Parameters
    ----------
    name : str
        Dataset name handed to ``load_preprocessed_dataset``; also used in the
        name of the log file when ``save_output`` is set.
    save_output : bool, default False
        When True, everything printed while scoring is written to
        ``./Print{name}[-{version}].txt`` instead of the notebook, and the
        scorer is called with ``verbose=True``. When False, the cell output is
        cleared after each document.
    version : str, default ""
        Optional suffix appended to the log file name.

    Returns
    -------
    The loaded dataset with the added columns "Score", "Trait Points",
    "Species Points", "Experiment Points" and "Interaction Points", sorted by
    "Score" in descending order. Documents that cannot be scored (empty text)
    receive 0 in every added column.
    """
    # Redirect Print Statements
    # https://stackoverflow.com/questions/7152762/how-to-redirect-print-output-to-a-file
    initial_stdout = sys.stdout
    log_file = None
    if save_output:
        suffix = f"-{version}" if version else ""
        # Opening with an explicit encoding replaces the former reconfigure() call.
        log_file = open(f'./Print{name}{suffix}.txt', 'w', encoding='utf-8')
        sys.stdout = log_file

    # One output column per category, in the index order used by the
    # points returned from Main.score (0..3).
    category_columns = [
        "Trait Points",
        "Species Points",
        "Experiment Points",
        "Interaction Points",
    ]

    try:
        # Load Dataset
        data = load_preprocessed_dataset(name)

        # We'll be running the points algorithm
        # on the abstracts of these papers.
        texts = list(data['Abstract'].to_numpy())

        # Exactly one entry per row is appended to each of these lists so
        # that they can be assigned as columns afterwards.
        scores = []
        category_points = [[] for _ in category_columns]

        # Scan and Evaluate Documents
        main = Main()
        for i, doc in enumerate(main.sp_nlp.pipe(texts)):
            print(f"{i+1}/{data.shape[0]} - {data.iloc[i]['Title']}\n")
            main.update_doc(doc, verbose=save_output)

            # Empty string literals cause errors, so they are given a zero
            # score (and zero points) instead of being scored.
            if not main.sp_doc or not main.species.tn_doc:
                score, doc_points = 0, [0] * len(category_columns)
            else:
                score, doc_points = main.score(verbose=save_output)

            scores.append(score)
            for category_index, column_values in enumerate(category_points):
                column_values.append(doc_points[category_index])

            if not save_output:
                clear_output(wait=True)
    finally:
        # Reset Standard Output, even when scoring raised, so the notebook
        # does not stay redirected and the log file is not leaked.
        if log_file is not None:
            sys.stdout = initial_stdout
            log_file.close()

    data["Score"] = scores
    for column_name, column_values in zip(category_columns, category_points):
        data[column_name] = column_values

    return data.sort_values(by='Score', ascending=False)

In [None]:
# Score Datasets
# Runs score_dataset on each listed dataset and stores the scored result.
# The full list of dataset names that have been used is kept here for reference:
# dataset_names = ["Examples", "Baseline-1", "SubA", "SubAFiltered", "SubB", "SubBFiltered", "C", "CFiltered", "D", "DFiltered"]
# NOTE(review): the output name "Baseline-2-2" is fixed, so looping over more
# than one dataset would store every result under the same name — confirm
# whether `name` should be used instead before extending the list.
for name in ["Baseline-1"]:
    scored_data = score_dataset(name, save_output=False, version='')
    store_scored_dataset(scored_data, "Baseline-2-2", version='')

# Inspecting

In [None]:
# NOTE(review): the inspection cells below read a global `data`. If no earlier
# cell defines it, they fail on a fresh kernel (Restart & Run All) — re-enable
# the line below or point them at `scored_data`. TODO confirm.
# data = load_preprocessed_dataset(name)

In [None]:
# Fix: To find traits, substring matching used to be allowed: if a trait keyword
# was 'rate', any word containing 'rate' was recognized as a trait, and with a
# species nearby (which is common) it was counted. As a result the word 'pirate'
# was incorrectly counted as a trait. Regexes are now used (where needed) to
# match the whole word.
# This cell re-scores the abstract at row 14 to check that fix.
text = data.iloc[14].Abstract
main = Main()
main.update_text(str(text))
print(main.score())
print(text)

In [121]:
# Prints the abstract at row 3; the lookup by title below was used to find it.
# data.loc[data['Title'] == 'Competition Between Aquatic Insects and Vertebrates: Interaction Strength and Higher Order Interactions']
text = data.iloc[3].Abstract
# NOTE(review): `main` is created but not used while the two calls below stay
# commented out; constructing Main() appears to load a model each time.
main = Main()
# main.update_text(str(text))
print(text)
# print(main.score())

06/18/2025 12:24:50 - INFO - 	 missing_keys: []
06/18/2025 12:24:50 - INFO - 	 unexpected_keys: []
06/18/2025 12:24:50 - INFO - 	 mismatched_keys: []
06/18/2025 12:24:50 - INFO - 	 error_msgs: []
06/18/2025 12:24:50 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


Replicated experiments in artificial ponds demonstrated that an assemblage of aquatic insects competed with tadpoles of the frogs Hyla andersonii and Bufo woodhousei fowleri. We independently manipulated the presence or absence of aquatic insects, and the abundance of an anuran competitor (O or 150 Bufo w. fowleri per experimental pond), using a completely crossed design for two—factor variance analysis, and observed the responses of initially similar cohorts of Hyla andersonii tadpoles to neither, either, or both insect and anuran competitors. Insects and Bufo significantly depressed the mean individual mass at metamorphosis of Hyla froglets and the cumulative biomass of anurans leaving the ponds at metamorphosis. Neither insects nor Bufo affected the survival or larval period of Hyla. Insects also significantly reduced the mean mass of Bufo, showing that both anurans responded to competition from insects. The intensity of competition between natural densities of insects and Hyla tadp

In [None]:
# Fix: The boundaries within which a species can be found (in order for a trait
# to count) were expanded in both the class and the find_unit_context function.
# This cell prints and re-scores the abstract at row 20 to check that fix; the
# commented lookup below was used to locate the paper.
# data.loc[data['Title'].str.contains('weevil')]
text = data.iloc[20].Abstract
main = Main()
main.update_text(str(text))
print(text)
print(main.score())

# Testing

In [None]:
# Test fixtures: four abstracts, each loaded into its own Main instance.
# The token indices hard-coded in the test cells below refer to these exact
# strings, so editing a text requires updating those indices as well.

# Fixture 1: ant semiochemicals deterring the fruit fly Ceratitis capitata.
text_1 = "The use of chemical pesticides in agriculture is a critical threat to the environment. Implementing the use of biological control practices is an increasing worldwide challenge to cope with this matter. The exploitation of trait-mediated indirect interactions (TMIIs), which is an avoidance behaviour of pests when detecting possible risk, is a new and interesting pathway to follow. Ants, which are predators of many insect pests, are commonly active on plants and release several different chemical traces in the substrate, making them potential candidates for TMII-based management approaches. We tested whether semiochemicals released by two Mediterranean ants, Crematogaster scutellaris and Tapinoma nigerrimum, are able to deter the occurrence of a strongly harmful pest of tree crops, the tephritid Ceratitis capitata, which lays eggs within fruits. Using binary choice tests between a plum previously visited by ants and another used as control, we actually observed an avoidance behaviour by females of C. capitata, which results in a lower amount of progeny production, suggesting that flies can detect the chemical compounds released by ants. This study suggests that scents triggering this deterrence effect are conserved across ant subfamilies and encourages improving this research to achieve a new low-impacting control method against agricultural pests."
main_1 = Main()
main_1.update_text(text_1)

# Fixture 2: Monochamus carolinensis larvae preying on bark beetle larvae.
text_2 = "Larvae of the Carolina sawyer Monochamus carolinensis (Olivier) (Cerambycidae) and bark beetle larvae (Scolytidae) often simultaneously feed in phloem of recently killed pine trees. Our investigations reveal that M. carolinensis larvae may act as facultative intraguild predators of bark beetle larvae. Phloem sandwiches were used in four experiments to examine inter- and intraspecific interactions. We discovered that all sizes of M. carolinensis larvae killed bark beetle larvae. Seventy-six percent of the killed bark beetle larvae were consumed by M. carolinensis, including 58% that were entirely ingested. Cannibalism in M. carolinensis occurred in every experimental trial. Based on this evidence, M. carolinensis, and possibly related cerambycid species associated with bark beetles, are facultative intraguild predators of larvae of other phloem inhabiting species. The consequences of this behavior may have important implications for bark beetle population dynamics."
main_2 = Main()
main_2.update_text(text_2)

# Fixture 3: crab predation risk, snails, barnacles and algae (TMIIs in a food web).
text_3 = "In simple, linear food chains, top predators can have positive indirect effects on basal resources by causing changes in the traits (e.g. behaviour, feeding rates) of intermediate consumers. Although less is known about trait-mediated indirect interactions (TMIIs) in more complex food webs, it has been suggested that such complexity dampens trophic cascades. We examined TMIIs between a predatory crab (Carcinus maenas) and two ecologically important basal resources, fucoid algae (Ascophyllum nodosum) and barnacles (Semibalanus balanoides), which are consumed by herbivorous (Littorina littorea) and carnivorous (Nucella lapillus) snails, respectively. Because crab predation risk suppresses snail feeding rates, we hypothesized that crabs would also shape direct and indirect interactions among the multiple consumers and resources. We found that the magnitude of TMIIs between the crab and each resource depended on the suite of intermediate consumers present in the food web. Carnivorous snails (Nucella) transmitted TMIIs between crabs and barnacles. However, crab–algae TMIIs were transmitted by both herbivorous (Littorina) and carnivorous (Nucella) snails, and these TMIIs were additive. By causing Nucella to consume fewer barnacles, crab predation risk allowed fucoids that had settled on or between barnacles to remain in the community. Hence, positive interactions between barnacles and algae caused crab–algae TMIIs to be strongest when both consumers were present. Studies of TMIIs in more realistic, reticulate food webs will be necessary for a more complete understanding of how predation risk shapes community dynamics."
main_3 = Main()
main_3.update_text(text_3)

# Fixture 4: competition between aquatic insects and Hyla / Bufo tadpoles.
text_4 = "Replicated experiments in artificial ponds demonstrated that an assemblage of aquatic insects competed with tadpoles of the frogs Hyla andersonii and Bufo woodhousei fowleri. We independently manipulated the presence or absence of aquatic insects, and the abundance of an anuran competitor (O or 150 Bufo w. fowleri per experimental pond), using a completely crossed design for two—factor variance analysis, and observed the responses of initially similar cohorts of Hyla andersonii tadpoles to neither, either, or both insect and anuran competitors. Insects and Bufo significantly depressed the mean individual mass at metamorphosis of Hyla froglets and the cumulative biomass of anurans leaving the ponds at metamorphosis. Neither insects nor Bufo affected the survival or larval period of Hyla. Insects also significantly reduced the mean mass of Bufo, showing that both anurans responded to competition from insects. The intensity of competition between natural densities of insects and Hyla tadpoles was comparable to the intensity of competition between Bufo and Hyla, as a density of 150 Bufo/1000 L."
main_4 = Main()
main_4.update_text(text_4)

## 1. Remove Extra Spaces

In [None]:
# Each case pairs a raw string with the text expected back from
# remove_extra_spaces. A failing case reports its input and the actual
# result instead of a bare AssertionError.
remove_extra_spaces_cases = [
    ("     \n\tHello, world! ", "Hello, world!"),
    ("Hello, world!", "Hello, world!"),
    (" H  e\nl l\to,\rw\no\tr\rl\nd! ", "H e l l o, w o r l d!"),
    ("     \n\t\r   ", ""),
    ("HelloWorld", "HelloWorld"),
]

for raw_text, expected_text in remove_extra_spaces_cases:
    cleaned_text = main_1.help.remove_extra_spaces(raw_text)
    assert cleaned_text == expected_text, (
        f"remove_extra_spaces({raw_text!r}) returned {cleaned_text!r}, "
        f"expected {expected_text!r}"
    )

## 2. Remove Outer Non-Alphanumeric Characters

In [None]:
# Each case pairs a raw string with the text expected back from
# remove_outer_non_alnum. A failing case reports its input and the actual
# result instead of a bare AssertionError.
remove_outer_non_alnum_cases = [
    ("HELLO", "HELLO"),
    ("Hello, world!", "Hello, world"),
    (" H\ne\nl\nl\no,\nw\no\nr\nl\nd! ", "H\ne\nl\nl\no,\nw\no\nr\nl\nd"),
    ("?!~(!+(@*", ""),
    ("123ABC123", "123ABC123"),
    ("123456789", "123456789"),
    ("123@()(##@456789!@*(#@!", "123@()(##@456789"),
    ("\\HeLLo, WorLd\\100", "HeLLo, WorLd\\100"),
]

for raw_text, expected_text in remove_outer_non_alnum_cases:
    stripped_text = main_1.help.remove_outer_non_alnum(raw_text)
    assert stripped_text == expected_text, (
        f"remove_outer_non_alnum({raw_text!r}) returned {stripped_text!r}, "
        f"expected {expected_text!r}"
    )

## 3. Singularize

In [None]:
# Each case pairs a phrase with the list of singular forms expected back from
# singularize. A failing case reports its input and the actual result instead
# of a bare AssertionError.
singularize_cases = [
    ("Dogs", ["dog"]),
    ("SpEcIeS", ["species"]),
    ("Black Cats", ["black cat"]),
    ("red bird", ["red bird"]),
    ("wolves", ["wolf", "wolfe"]),
    ("bonoboes", ["bonobo"]),
    ("bonobos", ["bonobo"]),
    ("canaries", ["canary"]),
    ("finches", ["finch"]),
    ("cactuses", ["cactus"]),
    ("octopi", ["octopus"]),
    ("goose", ["goose"]),
    (" ", [" "]),
]

for phrase, expected_forms in singularize_cases:
    singular_forms = main_1.help.singularize(phrase)
    assert singular_forms == expected_forms, (
        f"singularize({phrase!r}) returned {singular_forms!r}, "
        f"expected {expected_forms!r}"
    )

## 4. Pluralize

In [None]:
# Each case pairs a phrase with the list of plural forms expected back from
# pluralize. A failing case reports its input and the actual result instead
# of a bare AssertionError.
pluralize_cases = [
    ("Blue bird", ["blue birds"]),
    ("brown-tailed squirrels", ["brown-tailed squirrels"]),
    ("wolf", ["wolfs", "wolves"]),
    ("staff", ["staffs"]),
    ("butterfly", ["butterflies"]),
    ("octopus", ["octopuses", "octopi"]),
    ("monkey", ["monkeys"]),
    ("ox", ["oxen"]),
    ("mice", ["mice"]),
    ("goose", ["geese"]),
    (" ", [" "]),
]

for phrase, expected_forms in pluralize_cases:
    plural_forms = main_1.help.pluralize(phrase)
    assert plural_forms == expected_forms, (
        f"pluralize({phrase!r}) returned {plural_forms!r}, "
        f"expected {expected_forms!r}"
    )

## 5. Expand Unit

In [None]:
# Every case records the unit being expanded (for display only), the keyword
# arguments handed to expand_unit, the text expected back, and the Main
# instance whose helper is exercised.
tests = [
    dict(
        unit=main_1.sp_doc[4],
        il_unit=4,
        ir_unit=4,
        il_boundary=0,
        ir_boundary=len(main_1.sp_doc),
        include=True,
        speech=["NOUN", "ADP", "ADJ", "VERB"],
        literals=[],
        result_text="use of chemical pesticides in agriculture",
        main=main_1,
    ),
    dict(
        unit=main_1.sp_doc[19],
        il_unit=19,
        ir_unit=19,
        il_boundary=0,
        ir_boundary=20,
        include=True,
        speech=["NOUN", "ADJ", "VERB"],
        literals=[],
        result_text="biological control",
        main=main_1,
    ),
    dict(
        unit=main_2.sp_doc[0:5],
        il_unit=0,
        ir_unit=4,
        il_boundary=0,
        ir_boundary=10,
        include=False,
        speech=["PUNCT", "SYM"],
        literals=[],
        result_text="Larvae of the Carolina sawyer Monochamus carolinensis",
        main=main_2,
    ),
    dict(
        unit=main_3.sp_doc[0],
        il_unit=0,
        ir_unit=0,
        il_boundary=0,
        ir_boundary=22,
        include=False,
        speech=[],
        literals=[],
        result_text="In simple, linear food chains, top predators can have positive indirect effects on basal resources by causing changes in the traits",
        main=main_3,
    ),
]

for i, test in enumerate(tests):
    unit = test["unit"]

    # Forward exactly the entries that expand_unit takes as keyword arguments.
    expand_kwargs = {
        key: test[key]
        for key in ("il_unit", "ir_unit", "il_boundary", "ir_boundary", "speech", "literals", "include")
    }
    expanded_unit = test["main"].help.expand_unit(**expand_kwargs)

    print(
        f"TEST {i+1}:",
        f"Unit: '{unit}'",
        f"Expanded Unit: '{expanded_unit}'",
        "",
        sep="\n",
    )

    assert expanded_unit.text == test["result_text"]

## 6. Contract Unit

In [None]:
# Every case records the unit being contracted (for display only), the keyword
# arguments handed to contract_unit, the text expected back, and the Main
# instance whose helper is exercised.
tests = [
    dict(
        unit=main_4.sp_doc[2:6],
        il_unit=2,
        ir_unit=5,
        include=False,
        speech=["CCONJ", "DET", "ADJ", "ADP", "VERB", "ADV", "SCONJ"],
        literals=[],
        result_text="ponds",
        main=main_4,
    ),
    dict(
        unit=main_4.sp_doc[2:6],
        il_unit=2,
        ir_unit=5,
        include=True,
        speech=["CCONJ", "DET", "ADJ", "ADP", "VERB", "ADV", "SCONJ"],
        literals=[],
        result_text="in artificial ponds demonstrated",
        main=main_4,
    ),
    dict(
        unit=main_3.sp_doc[23:30],
        il_unit=23,
        ir_unit=29,
        include=False,
        speech=["X", "PUNCT", "SYM"],
        literals=[],
        result_text="behaviour, feeding rates",
        main=main_3,
    ),
]

for i, test in enumerate(tests):
    unit = test["unit"]

    # Forward exactly the entries that contract_unit takes as keyword arguments.
    contract_kwargs = {
        key: test[key]
        for key in ("il_unit", "ir_unit", "speech", "literals", "include")
    }
    contracted_unit = test["main"].help.contract_unit(**contract_kwargs)

    print(
        f"TEST {i+1}:",
        f"Unit: '{unit}'",
        f"Contracted Unit: '{contracted_unit}'",
        "",
        sep="\n",
    )

    assert contracted_unit.text == test["result_text"]

## 7. Find Unit Context

In [None]:
# Every case records the unit whose context is looked up (for display only),
# the keyword arguments handed to find_unit_context, the expected context and
# the Main instance whose helper is exercised. The expected value is a string
# when the result is compared through `.text`, and a list of token texts when
# the result is a list.
tests = [
    dict(
        unit=main_3.sp_doc[23:30],
        il_unit=23,
        ir_unit=29,
        il_boundary=0,
        ir_boundary=len(main_3.sp_doc),
        result_text="(e.g. behaviour, feeding rates)",
        main=main_3,
    ),
    dict(
        unit=main_3.sp_doc[23],
        il_unit=23,
        ir_unit=23,
        il_boundary=0,
        ir_boundary=len(main_3.sp_doc),
        result_text="(e.g. behaviour, feeding rates)",
        main=main_3,
    ),
    dict(
        unit=main_3.sp_doc[25],
        il_unit=25,
        ir_unit=25,
        il_boundary=0,
        ir_boundary=len(main_3.sp_doc),
        result_text="(e.g. behaviour, feeding rates)",
        main=main_3,
    ),
    dict(
        unit=main_1.sp_doc[48],
        il_unit=48,
        ir_unit=48,
        il_boundary=0,
        ir_boundary=len(main_1.sp_doc),
        result_text=["avoidance", "behaviour", "of", "pests"],
        main=main_1,
    ),
    dict(
        unit=main_2.sp_doc[15],
        il_unit=15,
        ir_unit=15,
        il_boundary=0,
        ir_boundary=len(main_2.sp_doc),
        result_text=["bark", "beetle", "larvae", "often", "simultaneously", "feed", "in", "phloem", "of", "recently", "killed", "pine", "trees"],
        main=main_2,
    ),
    dict(
        unit=main_2.sp_doc[20],
        il_unit=20,
        ir_unit=20,
        il_boundary=0,
        ir_boundary=len(main_2.sp_doc),
        result_text=["bark", "beetle", "larvae", "often", "simultaneously", "feed", "in", "phloem", "of", "recently", "killed", "pine", "trees"],
        main=main_2,
    ),
]

for i, test in enumerate(tests):
    unit = test["unit"]

    # Forward exactly the entries that find_unit_context takes as keyword arguments.
    context_kwargs = {
        key: test[key]
        for key in ("il_unit", "ir_unit", "il_boundary", "ir_boundary")
    }
    unit_context = test["main"].help.find_unit_context(**context_kwargs)

    print(
        f"TEST {i+1}:",
        f"Unit: {unit}",
        f"Unit Context Unit: {unit_context}",
        "",
        sep="\n",
    )

    # A list result is compared token by token; anything else through `.text`.
    if not isinstance(unit_context, list):
        assert unit_context.text == test["result_text"]
    else:
        assert [t.text for t in unit_context] == test["result_text"]

## 8. Load Species

In [None]:
# Expected species token indices per fixture. They were collected by hand
# from a listing of each document's NOUN / PROPN tokens, e.g.:
#     for token in main_1.sp_doc:
#         if token.pos_ in ["NOUN", "PROPN"]:
#             print(token.i, token)
# The check is one-sided: every expected index must be detected, while extra
# detections are allowed.
tests = [
    {
        "expected_species": [51, 66, 70, 73, 74, 80, 110, 112, 113, 115, 116, 128, 130, 134, 135, 136, 154, 170, 171, 185, 193, 207, 225],
        "main": main_1
    },
    {
        "expected_species": [0, 3, 4, 5, 6, 8, 11, 14, 15, 16, 18, 29, 35, 36, 37, 43, 46, 47, 69, 70, 71, 74, 75, 84, 85, 86, 90, 91, 104, 116, 117, 127, 132, 134, 152],
        "main": main_2
    },
    {
        "expected_species": [8, 70, 72, 73, 83, 85, 86, 89, 91, 92, 101, 102, 107, 108, 110, 115, 119, 126, 150, 168, 170, 175, 177, 181, 183, 191, 196, 198, 208, 212, 214, 225, 237, 239, 241, 243],
        "main": main_3
    },
    {
        "expected_species": [11, 14, 17, 18, 19, 21, 22, 23, 34, 47, 48, 49, 76, 78, 91, 93, 103, 104, 110, 118, 120, 128, 130, 138, 143, 148, 158, 160, 170, 172, 179],
        "main": main_4
    }
]

for i, test in enumerate(tests):
    expected_species = sorted(test["expected_species"])
    actual_species = sorted(t.i for t in test["main"].species.tokens)
    missing_species = set(expected_species) - set(actual_species)

    print(f"TEST {i+1}:")
    print(f"You've Identified: {expected_species}")
    print(f"Identified Species: {actual_species}")
    print(f"Difference: {missing_species}")
    print()
    assert not missing_species, (
        f"TEST {i+1}: expected species token indices were not detected: "
        f"{sorted(missing_species)}"
    )

## 9. Find Same Species

In [None]:
# Test 1
A = main_1.species.spans[0:5]
b = main_1.species.spans[6]

print("TEST 1")
print(f"A: {A}")
print(f"b: {b}")

result = main_1.species.find_same_species(A, b)
print(f"Result: {result}\n")
assert result == A[1]

# Test 2
A = main_1.species.spans[0:10]
b = main_1.species.spans[6]

print("TEST 2")
print(f"A: {A}")
print(f"b: {b}")

result = main_1.species.find_same_species(A, b)
print(f"Result: {result}\n")
assert result == A[6]

# Test 3
A = main_1.species.spans[0:3]
b = main_1.species.spans[3]

print("TEST 3")
print(f"A: {A}")
print(f"b: {b}")

result = main_1.species.find_same_species(A, b)
print(f"Result: {result}\n")
assert result == A[0]

# Test 4
A = main_1.species.spans[14:20]
b = main_1.species.spans[13]

print("TEST 4")
print(f"A: {A}")
print(f"b: {b}")

result = main_1.species.find_same_species(A, b)
print(f"Result: {result}\n")
assert result == A[0]

## 10. Experiment Keywords

In [None]:
# Expected experiment-keyword token indices per fixture. They were collected
# by hand from a listing of each document's tokens, e.g.:
#     for token in main_1.sp_doc:
#         if token.pos_ not in ["PART", "ADP", "DET"]:
#             print(token.i, token)
# The check is one-sided: every expected index must be detected, while extra
# detections are allowed.
tests = [
    {
        "expected_keywords": [103, 147, 163, 174, 183, 196, 222],
        "main": main_1
    },
    {
        "expected_keywords": [32, 55, 57, 64, 109, 114],
        "main": main_2
    },
    {
        "expected_keywords": [56, 65, 124, 142, 254],
        "main": main_3
    },
    {
        "expected_keywords": [1, 5, 27, 51, 65, 68, 140, 163],
        "main": main_4
    }
]

for i, test in enumerate(tests):
    expected_keywords = sorted(test["expected_keywords"])
    actual_keywords = sorted(t.i for t in test["main"].experiment.tokens)
    missing_keywords = set(expected_keywords) - set(actual_keywords)

    print(f"TEST {i+1}:")
    print(f"You've Identified: {expected_keywords}")
    print(f"Identified Experimental Words: {actual_keywords}")
    print(f"Identified Experimental Words (Text): {test['main'].experiment.tokens}")
    print(f"Difference: {missing_keywords}")
    print()
    assert not missing_keywords, (
        f"TEST {i+1}: expected experiment token indices were not detected: "
        f"{sorted(missing_keywords)}"
    )

## 11. Cause Keywords

In [None]:
# Expected cause-keyword token indices for each fixture. The check is
# one-sided: extra detections are allowed, missing ones are not.
tests = [
    dict(expected_keywords=[121, 203, 220], main=main_1),
    dict(expected_keywords=[], main=main_2),
    dict(expected_keywords=[13, 18, 19, 118, 207, 240], main=main_3),
    dict(expected_keywords=[95, 121, 133], main=main_4),
]

for i, test in enumerate(tests):
    expected_keywords = test["expected_keywords"]
    actual_keywords = sorted(t.i for t in test["main"].causes.tokens)
    missing_keywords = set(expected_keywords) - set(actual_keywords)

    print(
        f"TEST {i+1}:",
        f"You've Identified: {expected_keywords}",
        f"Identified Cause Words: {actual_keywords}",
        f"Identified Cause Words (Text): {test['main'].causes.tokens}",
        f"Difference: {missing_keywords}",
        "",
        sep="\n",
    )
    assert set(actual_keywords).issuperset(expected_keywords)

## 12. Change Keywords

In [None]:
# Expected change-keyword token indices for each fixture. The check is
# one-sided: extra detections are allowed, missing ones are not.
tests = [
    dict(expected_keywords=[177], main=main_1),
    dict(expected_keywords=[], main=main_2),
    dict(expected_keywords=[35, 48, 211, 269], main=main_3),
    dict(expected_keywords=[], main=main_4),
]

for i, test in enumerate(tests):
    expected_keywords = test["expected_keywords"]
    actual_keywords = sorted(t.i for t in test["main"].changes.tokens)
    missing_keywords = set(expected_keywords) - set(actual_keywords)

    print(
        f"TEST {i+1}:",
        f"You've Identified: {expected_keywords}",
        f"Identified Change Words: {actual_keywords}",
        f"Identified Change Words (Text): {test['main'].changes.tokens}",
        f"Difference: {missing_keywords}",
        "",
        sep="\n",
    )
    assert set(actual_keywords).issuperset(expected_keywords)

## 13. Trait Keywords

In [None]:
# Expected trait-keyword token indices for each fixture. The check is
# one-sided: extra detections are allowed, missing ones are not.
tests = [
    dict(expected_keywords=[49, 166], main=main_1),
    dict(expected_keywords=[67], main=main_2),
    dict(expected_keywords=[121], main=main_3),
    dict(expected_keywords=[99, 108, 136], main=main_4),
]

for i, test in enumerate(tests):
    expected_keywords = test["expected_keywords"]
    actual_keywords = sorted(t.i for t in test["main"].traits.tokens)
    missing_keywords = set(expected_keywords) - set(actual_keywords)

    print(
        f"TEST {i+1}:",
        f"You've Identified: {expected_keywords}",
        f"Identified Trait Words: {actual_keywords}",
        f"Identified Trait Words (Text): {test['main'].traits.tokens}",
        f"Difference: {missing_keywords}",
        "",
        sep="\n",
    )
    assert set(actual_keywords).issuperset(expected_keywords)