In [1]:
# Trait-Mediated Interaction Modification
# Empirical:
# You could look for words like "hypothesis", "experiment", "found", and "discovered". That may point
# towards there being an experiment in the paper. There are also words like "control group", "compared",
# "findings", "results", "study", and more.
# Qualitative vs. Quantitative:
# To infer whether something is quantitative, you could look for numeric tokens and units.
# However, you can only do so much with the abstract. Therefore, this is likely not good enough.
# Yet, you could still take advantage of words like "fewer" and "increased" to show that there is a change.
# However, this would be more suited for the above category.
# Traits:
# There is no NLP tool for traits that I can use or create so I think that I could instead use keywords.
# For example, "snail feeding rates" is a trait. You may be able to spot this by looking for a word like
# "rate". You'd expand that word to include "snail feeding rates". As "snail" is a species you can infer
# that "rates" is a trait. I would be more decisive and use a dependency parser to ensure that the trait
# is a property of the species (like before). However, with all the cases that may exist, I think checking
# to see whether a species can be found by traveling back and/or forward without finding certain tokens could
# work well enough.
# 3 Species or More:
# This is simple. However, I think using a dictionary and TaxoNerd would be beneficial (for higher accuracy).
# To handle the potential differences in tokenization, character offsets should be used.
# Standardization:
# There is a lot of variance in the scores. To squash this issue, I think that we could assign each sentence
# a value from 0 to 1. We would add these values and divide by the number of sentences. This would result in
# a number that is also from 0 to 1. However, there are categories that we would like to inspect. So, we must
# create an overall score in the interval [0, 1] while also scoring each category. For each sentence,
# we could add a point for each category that is observed. The sentence would receive said score divided by the
# number of categories. At the end, we add up all the sentence scores and divide by the number of sentences.
# The aggregate score for each category would also be divided by the number of sentences.

In [2]:
import re
import csv
import sys
import time
import spacy
import numpy as np
import pandas as pd
import random
import pickle
from fastcoref import FCoref, LingMessCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher
from spacy.language import Language
from IPython.display import clear_output
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
%run -i "../utils.py"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Helper Functions
class Help:
    def __init__(self, main):
        self.main = main
        self.ignore = ["species", "deer", "fish", "moose", "sheep", "swine", "buffalo"]

    def remove_spaces(self, string):
        """Normalize whitespace in `string`.

        Every run of whitespace is collapsed to a single space, whitespace
        directly before `?`, `.`, `!` or `,` is dropped, and the result is
        stripped of leading/trailing whitespace.

        Args:
            string: The text to clean.

        Returns:
            The cleaned string.
        """
        # Raw strings throughout: "\s" in a normal literal is an invalid
        # escape sequence and triggers a warning on current Python versions.
        collapsed = re.sub(r"\s+", " ", string)  # Remove Duplicate Spaces
        tightened = re.sub(r"\s+([?.!,])", r"\1", collapsed)  # Remove Spaces Before Punctuation
        return tightened.strip()

    def remove_non_alnum(self, string):
        """Trim non-alphanumeric characters from both ends of `string`.

        Characters inside the string are never touched; a string without any
        alphanumeric character becomes the empty string.
        """
        first = 0
        last = len(string)

        # Advance past leading symbols.
        while first < last and not string[first].isalnum():
            first += 1

        # Retreat past trailing symbols.
        while last > first and not string[last - 1].isalnum():
            last -= 1

        return string[first:last]
    
    def singularize(self, string):
        parts = re.split(r" ", string)

        if parts[-1] in self.ignore:
            return string

        singulars = []
        for singular_end in self.singularize_string(parts[-1]):
            singular = self.remove_spaces(" ".join([*parts[:-1], singular_end]))
            singulars.append(singular)
            
        return singulars
        
    def singularize_string(self, string):
        """Return candidate plural spellings of the single word `string`.

        NOTE: despite its name, this helper ADDS plural endings (for example
        'canary' -> 'canaries'). Where the English rule is ambiguous, several
        candidates are returned, some of which may be misspellings.
        """
        # '-us': English and Latin plural ('cactus' -> 'cactuses', 'cacti').
        if re.fullmatch(r".*us$", string):
            return [f'{string}es', f'{string[:-2]}i']

        # Sibilant endings take '-es' ('beach' -> 'beaches').
        if re.fullmatch(r".*(s|sh|ch|x|z)$", string):
            return [f'{string}es']

        # Consonant + 'y' ('canary' -> 'canaries').
        if re.fullmatch(r".*([^aeiou])(y)$", string):
            return [f'{string[:-1]}ies']

        # The rules are murky for '-f'/'-fe' (but not '-ff'), so both
        # spellings are produced even though one of them is incorrect.
        single_f_ending = (
            re.fullmatch(r".*(f)(e?)$", string) and
            not re.fullmatch(r".*ff$", string)
        )
        if single_f_ending:
            stem = re.sub(r"(f)(e?)$", "", string)
            return [f'{stem}fs', f'{stem}ves']

        # Consonant + 'o': some writers add '-s', some '-es' ('bonobo').
        if re.fullmatch(r".*([^aeiou])o$", string):
            return [f'{string}s', f'{string}es']

        # Regular plural.
        return [f'{string}s']

    def pluralize(self, string):
        parts = re.split(r" ", string)

        if parts[-1] in self.ignore:
            return string

        plurals = []
        for plural_end in self.singularize_string(parts[-1]):
            plural = self.remove_spaces(" ".join([*parts[:-1], plural_end]))
            plurals.append(plural)
            
        return plurals

    def pluralize_string(self, string):
        """Return candidate singular spellings of the single word `string`.

        NOTE: despite its name, this helper STRIPS plural endings (for
        example 'butterflies' -> 'butterfly'). Where the ending is ambiguous,
        several candidates are returned, some of which may be misspellings.

        Returns:
            A list of candidates; empty when no plural ending is recognized.
        """
        versions = []

        if re.fullmatch(r".*ies$", string):
            # 'butterflies' -> 'butterfly'
            versions.append(f'{string[:-3]}y')
        elif re.fullmatch(r".*ves$", string):
            # 'loaves' -> 'loaf' and 'lives' -> 'life'
            versions.append(f'{string[:-3]}f')
            versions.append(f'{string[:-3]}fe')
        elif re.fullmatch(r".*es$", string):
            # The ending may be '-es' ('beaches' -> 'beach') or a plain '-s'
            # on a word that already ends in 'e' ('snakes' -> 'snake').
            # Stripping only '-es' would turn 'snakes' into 'snak', so both
            # candidates are produced.
            versions.append(f'{string[:-2]}')
            versions.append(f'{string[:-1]}')
        elif re.fullmatch(r".*i$", string):
            # 'cacti' -> 'cactus'
            versions.append(f'{string[:-1]}us')
        elif re.fullmatch(r".*s$", string):
            # 'crabs' -> 'crab'
            versions.append(f'{string[:-1]}')

        return versions

    def expand_unit(self, *, il_unit, ir_unit, il_boundary, ir_boundary, allowed_speech=[], allowed_literals=[], disallowed_literals=[], direction='BOTH', verbose=False):
        # Move Left
        if direction in ['BOTH', 'LEFT']:
            while il_unit > il_boundary:
                prev_token = self.sp_doc[il_unit-1]
                
                if verbose:
                    print(f"il_unit: {il_unit}, il_boundary: {il_boundary}, prev_token: {prev_token}, prev_token.pos_: {prev_token.pos_}")

                in_disallowed = prev_token.lower_ in disallowed_literals
                not_in_allowed = prev_token.pos_ not in allowed_speech and prev_token.lower_ not in allowed_literals
                
                if in_disallowed or not_in_allowed:
                    break
                
                il_unit -= 1

        # Move Right
        if direction in ['BOTH', 'RIGHT']:
            while ir_unit < ir_boundary:
                next_token = self.sp_doc[ir_unit+1]
                
                if verbose:
                    print(f"ir_unit: {ir_unit}, ir_boundary: {ir_boundary}, next_token: {next_token}, next_token.pos_: {next_token.pos_}")
                
                in_disallowed = prev_token.lower_ in disallowed_literals
                not_in_allowed = next_token.pos_ not in allowed_speech and next_token.lower_ not in allowed_literals
                
                if in_disallowed or not_in_allowed:
                    break
                
                ir_unit += 1

        assert il_unit >= il_boundary and ir_unit <= ir_boundary
        
        # Expanded Unit
        expanded_unit = self.sp_doc[il_unit:ir_unit+1]
        
        if verbose:
            print(f"Expanded Unit: {expanded_unit}")
        
        return expanded_unit

    def contract_unit(self, *, il_unit, ir_unit, speech=[], literals=[], must_not_be_in_set=False, direction='BOTH', verbose=False):
        # Move Left
        if verbose:
            print("LEFT")
            
        if direction in ['BOTH', 'LEFT']:
            while il_unit < ir_unit:
                curr_token = self.sp_doc[il_unit]
                
                if verbose:
                    print(f"il_unit: {il_unit}, ir_unit: {ir_unit}, curr_token: {curr_token}, curr_token.pos_: {curr_token.pos_}")

                in_set = curr_token.pos_ in speech or curr_token.lower_ in literals
                if (in_set and must_not_be_in_set) or (not in_set and not must_not_be_in_set):
                    break
            
                il_unit += 1

        # Move Right
        if verbose:
            print("RIGHT")
        
        if direction in ['BOTH', 'RIGHT']:
            while ir_unit > il_unit:
                curr_token = self.sp_doc[ir_unit]
                
                if verbose:
                    print(f"il_unit: {il_unit}, ir_unit: {ir_unit}, curr_token: {curr_token}, curr_token.pos_: {curr_token.pos_}")
                
                if curr_token.pos_ in allowed_speech or curr_token.lower_ in allowed_literals:
                    break
                
                ir_unit -= 1

        contracted_unit = self.sp_doc[il_unit:ir_unit+1]
        
        if verbose:
            print(f"Contracted Unit: {contracted_unit}")
        
        return contracted_unit

    def find_unit_context(self, *, il_unit, ir_unit, il_boundary, ir_boundary, verbose=False):
        # Check if in Punctuation (e.g. (...))
        matching_puncts = {"[": "]", "(": ")", "-": "-", "--": "--"}
        
        opening_puncts = list(matching_puncts.keys())
        closing_puncts = list(matching_puncts.values())
        
        puncts = [*closing_puncts, *opening_puncts]
        
        i = il_unit
        l_punct = None
        while i >= il_boundary:
            i_token = self.main.sp_doc[i]
            if i_token.text in puncts:
                l_punct = i_token
                if verbose:
                    print(f"{i_token} in {puncts}")
                break
            i -= 1

        i = ir_unit
        r_punct = None
        while i <= ir_boundary:
            i_token = self.main.sp_doc[i]
            if i_token.text in puncts:
                r_punct = i_token
                if verbose:
                    print(f"{i_token} in {puncts}")
                break
            i += 1

        # Return Text inside Punctuation
        if verbose:
            print(f"L Punct: {l_punct} v. R Punct: {r_punct}")
        
        in_punct = l_punct and r_punct and matching_puncts.get(l_punct.lower_, '') == r_punct.text
        if in_punct:
            if verbose:
                print(f"Matching Punctuation, Return Inner Text")
            return self.main.sp_doc[l_punct.i:r_punct.i+1]

        exclude = []
        
        # Expand Left
        while il_unit > il_boundary:
            prev_token = self.main.sp_doc[il_unit-1]
            if verbose:
                print(f"Expand L:\nToken: {prev_token} and Position: {prev_token.pos_}")

            # Closing Punctuation
            if prev_token.lower_ in closing_puncts:
                i = il_unit-1
                while i >= il_boundary and matching_puncts.get(self.main.sp_doc[i].lower_, '') != prev_token.lower_:
                    exclude.append(self.main.sp_doc[i])
                    i -= 1
                exclude.append(self.main.sp_doc[i])

                if verbose:
                    print(f"Skipped to Token {i}")
                
                il_unit = i
                continue
            else:
                if prev_token.pos_ not in ["ADJ", "NOUN", "ADP", "ADV", "PART", "PROPN", "VERB", "PRON"]:
                    if verbose:
                        print(f"Break")
                    break
                else:
                    il_unit -= 1

        # Expand Right
        while ir_unit < ir_boundary:
            next_token = self.main.sp_doc[ir_unit+1]
            if verbose:
                print(f"Expand R:\nToken: {next_token} and Position: {next_token.pos_}")

            # Opening Punctuation
            if next_token.lower_ in opening_puncts:
                if verbose:
                    print(f"Next Token is an Opening Punctuation")
                
                i = ir_unit + 1
                if verbose:
                    print(f"i: {i}, ir_boundary: {ir_boundary}, self.sp_doc[i]: '{self.main.sp_doc[i].lower_}', matching punctuation: '{matching_puncts.get(next_token.lower_, '')}'")
                    print(f"{i <= ir_boundary} and {self.main.sp_doc[i].lower_ != matching_puncts.get(next_token.lower_, '')}")
                    
                while i <= ir_boundary and self.main.sp_doc[i].lower_ != matching_puncts.get(next_token.lower_, ''):
                    if verbose:
                        print(f"\t'{self.main.sp_doc[i]}' != '{matching_puncts.get(next_token.lower_, '')}'")
                    exclude.append(self.main.sp_doc[i])
                    i += 1
                exclude.append(self.main.sp_doc[i])

                if verbose:
                    print(f"Skipped to Token {i}")
                
                ir_unit = i
                continue
            else:
                if next_token.pos_ not in ["ADJ", "NOUN", "ADP", "ADV", "PART", "PROPN", "VERB", "PRON"]:
                    if verbose:
                        print(f"Break")
                    break
                else:
                    ir_unit += 1

        if verbose:
            print(f"Exclude: {exclude}")

        context = [t for t in self.main.sp_doc[il_unit:ir_unit+1] if t not in exclude]
        if verbose:
            print(f"Context: {context}")
        
        return context

In [3]:
# Used for the Dictionary
@Language.component("lower_case_lemmas")
def lower_case_lemmas(doc):
    """Pipeline component: lower-case the lemma of every token in `doc`.

    The lemmas are rewritten in place and the same `doc` is returned.
    """
    for tok in doc:
        lowered = tok.lemma_.lower()
        tok.lemma_ = lowered
    return doc

class Species:
    def __init__(self, main):
        """Set up the TaxoNERD pipeline and the (still empty) species state.

        Attributes:
            main: The owning object; this class reads `main.sp_doc` and
                `main.index_map` from it.
            tn_nlp: TaxoNERD pipeline ("en_ner_eco_biobert") with the
                "lower_case_lemmas" component added after the lemmatizer.
            tn_doc: Result of running `tn_nlp` on the current text.
            spans: Spans that have been identified as a species.
            tokens: Tokens that are a species or part of one.
            token_to_span: Maps a token to the span it belongs to.
            alternate_names: Maps a name to the list of names identified as
                alternates of it (and vice versa).
            dictionary: Extra names used to increase TaxoNERD's accuracy.
        """
        # Tools
        self.main = main

        pipeline = TaxoNERD(prefer_gpu=False).load(
            model="en_ner_eco_biobert",
            exclude=["tagger", "parser", "attribute_ruler"]
        )
        pipeline.add_pipe("lower_case_lemmas", after="lemmatizer")
        self.tn_nlp = pipeline
        self.tn_doc = None

        # Per-document results; filled in by update().
        self.spans = self.tokens = self.token_to_span = self.alternate_names = None

        # Used to increase TaxoNERD's accuracy.
        self.dictionary = None
        self.load_dictionary()

    def load_dictionary(self):
        """Register the dictionary names as "LIVB" entity patterns.

        Each name is run through `self.tn_nlp` and turned into a lemma-based
        pattern; the patterns are then given to a newly added "entity_ruler"
        pipe.
        """
        self.dictionary = ["juvenile", "adult", "prey", "predator", "species"]
        # df = pd.read_csv("VernacularNames.csv")
        # self.dictionary += df.VernacularName.to_list()

        # The names are analyzed before the ruler is added to the pipeline.
        patterns = [
            {"label": "LIVB", "pattern": [{"LEMMA": token.lemma_} for token in self.tn_nlp(name)]}
            for name in self.dictionary
        ]

        self.tn_nlp.add_pipe("entity_ruler").add_patterns(patterns)
        
    def update(self, text, verbose=False):
        """Re-run species detection for `text`.

        Runs the TaxoNERD pipeline on `text` and refreshes `spans`, `tokens`,
        `token_to_span` and `alternate_names` from load_species().

        Raises:
            Exception: "DNE" when `main.sp_doc` or `main.index_map` is falsy.
        """
        ready = self.main.sp_doc and self.main.index_map
        if not ready:
            raise Exception("DNE")

        self.tn_doc = self.tn_nlp(text)

        results = self.load_species(verbose=verbose)
        self.spans, self.tokens, self.token_to_span, self.alternate_names = results

    def load_species(self, verbose=False):
        """Find every species mention in `self.main.sp_doc`.

        Steps:
          1. Each TaxoNERD entity in `self.tn_doc` is mapped onto a span of
             `self.main.sp_doc` through character offsets.
          2. A set of lower-cased names is built from those spans: the
             verbatim text, the text without outer symbols and, when the last
             token is a noun, its plural (tag NN) or singular (tag NNS)
             spellings.
          3. The lower-cased document text is searched for whole-word
             occurrences of each name, so that mentions TaxoNERD missed are
             found as well. Matches may be expanded to the left over
             adjectives and proper nouns.
          4. Spans are de-duplicated by start index and sorted.
          5. When two consecutive spans are one token apart and the second is
             surrounded by punctuation/symbols, e.g.
             "predatory crab (Carcinus maenas)", the two texts are recorded
             as alternate names of each other.

        Args:
            verbose: Print a trace of every step.

        Returns:
            A tuple `(spans, tokens, token_to_span, alternate_names)`.
            `tokens` holds each non-punctuation, non-symbol token of a
            species span once; `token_to_span` maps such a token to the most
            recently found span that contains it; `alternate_names` maps a
            lower-cased name to a list of lower-cased alternate names.

        Raises:
            Exception: "DNE" when `main.sp_doc` or `main.index_map` is falsy.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        sp_doc = self.main.sp_doc

        # These three contain the species that have been identified in the
        # text.
        spans = []
        tokens = []
        token_to_span = {}

        # It's useful to know if a different name refers to a species we have
        # already seen. For example, in "predatory crab (Carcinus maenas)",
        # "predatory crab" is an alternative name for "Carcinus maenas" and
        # vice versa. This is used so that the species can be properly
        # tracked and redundant points are less likely to be given.
        alternate_names = {}

        # TaxoNerd
        if verbose:
            print(f"TN Entities ({len(self.tn_doc.ents)}): {self.tn_doc.ents}\n")

        # The TaxoNERD pipeline does not appear to provide .pos_ or .tag_,
        # both of which are read from the sp_doc tokens below, so every
        # entity is converted into the equivalent sp_doc span right away.
        # NOTE(review): assumes tn_doc and sp_doc were built from the same
        # text and that main.token_at_char() returns the sp_doc token at a
        # character offset -- confirm against the owner class.
        sp_doc_species = []
        for span in self.tn_doc.ents:
            char_i0 = self.tn_doc[span.start].idx
            char_i1 = self.tn_doc[span.end-1].idx

            sp_token_i0 = self.main.token_at_char(char_i0).i
            sp_token_i1 = self.main.token_at_char(char_i1).i

            sp_doc_species.append(sp_doc[sp_token_i0:sp_token_i1+1])

        if verbose:
            print(f"SP Entities ({len(sp_doc_species)}): {sp_doc_species}")

        # Fix: Adding Singular and Plural Versions
        # Let's say you're looking for words that contain "ant" in the text.
        # "Pants" and "antebellum" would match. To prevent this, the entire
        # word must match; but then "ants" wouldn't match either. So the
        # singular and plural versions of the recognized species are added:
        # "ant" and "ants" are matched, "Pants" and "antebellum" aren't.

        # This contains strings (not span objects) as we're looking
        # throughout the text for matches based on content, not position.
        base_species = set()

        for span in sp_doc_species:
            if len(span) == 0:
                continue

            # Default: the verbatim text.
            verbatim = span.text.lower()
            base_species.add(verbatim)

            # Remove Non-Alphanumeric Characters on Outside
            string = self._strip_outer_symbols(verbatim)
            if not string:
                # Nothing but symbols: there is no name to inflect, and an
                # empty search string would match everywhere.
                continue
            base_species.add(string)

            if span[-1].pos_ == "NOUN":
                base_species.update(self._inflected_forms(string, span[-1].tag_))

        # Now, we'll find all the words in the text that have been recognized
        # as a species (and a singular or plural version of such).
        text = sp_doc.text.lower()

        # Sorted so that the results do not depend on set iteration order.
        base_species = sorted(base_species)

        if verbose:
            print(f"Base Species: {base_species}")

        for species in base_species:
            if verbose:
                print(f"Searching '{species}'")
                print(f"Search Species w/o Symbols: {species}")

            # Fix: Bootstrapping
            # TaxoNERD sometimes identifies one instance of a species and not
            # the others. For example, "ant" may be labeled three times and
            # missed another four. To fix this, the recognized species are
            # used to look for all matching instances in the document.
            for match in re.finditer(re.escape(species), text):
                # Start (inclusive) and end (exclusive) character offsets.
                char_i0, char_i1 = match.start(), match.end()

                if verbose:
                    print(f"Match ({char_i0}-{char_i1}): '{text[char_i0:char_i1]}'")

                # The full word must match, not just a substring inside of
                # it. So, if the species is "ant", only "ant" will match --
                # not "pants", "ants" or "antebellum". `char_i1` is already
                # the index of the first character after the match.
                letter_to_l = char_i0 > 0 and text[char_i0-1].isalpha()
                letter_to_r = char_i1 < len(text) and text[char_i1].isalpha()

                if letter_to_l or letter_to_r:
                    if verbose:
                        print(f"Letter on L/R -> Skip")
                    continue

                sp_li = self.main.token_at_char(char_i0).i
                sp_ri = self.main.token_at_char(char_i1-1).i

                if verbose:
                    print(f"sp_li: {sp_li}")
                    print(f"sp_ri: {sp_ri}")

                species_span = sp_doc[sp_li:sp_ri+1]

                if verbose:
                    print(f"Species Span: {species_span}")

                # Expand Species
                # A word like "squirrel" is a bit ambiguous: is it a brown
                # squirrel, a red squirrel? If the species is possibly
                # missing information (like an adjective to the left of it),
                # it is expanded to get a full picture of the species.
                ambiguous = len(species_span) == 1 and species_span[0].pos_ == "NOUN"
                missing_info = species_span.start > 0 and sp_doc[species_span.start-1].pos_ in ["ADJ"]

                if ambiguous or missing_info:
                    species_span = self.main.expand_unit(
                        il_unit=species_span.start,
                        ir_unit=species_span.end-1,
                        il_boundary=0,
                        ir_boundary=len(sp_doc),
                        direction='LEFT',
                        allowed_speech=["ADJ", "PROPN"],
                        allowed_literals=["-"],
                        verbose=verbose
                    )

                    if verbose:
                        print(f"Expanded Species Span: {species_span}")

                # Nothing Burger Species
                # If each token in the span is punctuation or a symbol, it is
                # not a species, and adding it to the spans would cause an
                # error later on.
                if all(token.pos_ in ["PUNCT", "SYM"] for token in species_span):
                    continue

                spans.append(species_span)
                for token in species_span:
                    if token.pos_ in ["PUNCT", "SYM"]:
                        continue
                    # The same mention can be reached through several names;
                    # list each token once.
                    if token not in token_to_span:
                        tokens.append(token)
                    token_to_span[token] = species_span

        # Removing Duplicates and Sorting (for Next Part)
        if verbose:
            print(f"Spans Before Removing Duplicates and Sorting: {spans}")

        # One span per start index; the span found last wins.
        mapped_spans = {}
        for span in spans:
            mapped_spans[span.start] = span

        spans = sorted(mapped_spans.values(), key=lambda span: span.start)

        if verbose:
            print(f"Spans After Removing Duplicates and Sorting: {spans}")

        # Finding and Storing Alternative Names
        if verbose:
            print("Finding Alternate Names")

        for species_span, next_species_span in zip(spans, spans[1:]):
            if verbose:
                print(f"SPECIES 1: {species_span}")
                print(f"SPECIES 2: {next_species_span}")
                print(f"DIST == 1: {next_species_span.start - species_span.end == 1}")

            # If there's one token between the species and the next species,
            # we check if the next species is surrounded by punctuation.
            if next_species_span.start - species_span.end != 1:
                continue

            # A species that ends the document has no token after it and so
            # cannot be enclosed.
            if next_species_span.end >= len(sp_doc):
                continue

            # Token Before and After the Next Species
            before_next = sp_doc[next_species_span.start-1]
            after_next = sp_doc[next_species_span.end]

            if verbose:
                print(f"Token Before SPECIES 2: {before_next} and Token After SPECIES 2: {after_next}")

            # Adding K-V Pair for Names
            if before_next.pos_ in ["PUNCT", "SYM"] and after_next.pos_ in ["PUNCT", "SYM"]:
                # Instead of using the span objects, the text (string
                # literals) is used. This is because we're focusing on the
                # content (the name) rather than where it appears.
                sp_1_text = species_span.text.lower()
                sp_2_text = next_species_span.text.lower()

                names_of_1 = alternate_names.setdefault(sp_1_text, [])
                names_of_2 = alternate_names.setdefault(sp_2_text, [])

                if sp_2_text not in names_of_1:
                    names_of_1.append(sp_2_text)
                if sp_1_text not in names_of_2:
                    names_of_2.append(sp_1_text)

        if verbose:
            print(f"Spans: {spans}")
            print(f"Tokens: {tokens}")
            print(f"Alternate Spans: {alternate_names}")

        return (spans, tokens, token_to_span, alternate_names)

    @staticmethod
    def _strip_outer_symbols(string):
        """Trim non-alphanumeric characters from both ends of `string`."""
        first, last = 0, len(string)
        while first < last and not string[first].isalnum():
            first += 1
        while last > first and not string[last - 1].isalnum():
            last -= 1
        return string[first:last]

    @staticmethod
    def _inflected_forms(string, tag):
        """Return plural spellings for tag "NN" and singular ones for "NNS".

        Where the English rule is ambiguous, several candidates are returned,
        some of which are misspellings. That is harmless here because the
        candidates are only used as whole-word search strings. Any other tag
        yields an empty list.
        """
        forms = []

        # Singular Noun => Add Plural Versions
        if tag == "NN":
            if re.fullmatch(r".*us$", string):
                # 'octopus' -> 'octopuses'
                forms.append(f'{string}es')
                # 'cactus' -> 'cacti'
                forms.append(f'{string[:-2]}i')
            elif re.fullmatch(r".*(s|sh|ch|x|z)$", string):
                # 'beach' -> 'beaches'
                forms.append(f'{string}es')
            elif re.fullmatch(r".*([^aeiou])(y)$", string):
                # 'canary' -> 'canaries'
                forms.append(f'{string[:-1]}ies')
            # The rules are murky where there's -f.
            # To be safe, both versions are added (one of which is incorrect).
            elif re.fullmatch(r".*(f)(e?)$", string) and not re.fullmatch(r".*ff$", string):
                # 'chief' -> 'chiefs' and 'loaf' -> 'loaves'
                last_clean = re.sub(r"(f)(e?)$", "", string)
                forms.append(f'{last_clean}fs')
                forms.append(f'{last_clean}ves')
            # Some people would just add -s to 'bonobo', some would add
            # -es. To be safe, both versions are added.
            elif re.fullmatch(r".*([^aeiou])o$", string):
                forms.append(f'{string}s')
                forms.append(f'{string}es')
            else:
                forms.append(f'{string}s')
        # Plural Noun => Add Singular Versions
        elif tag == "NNS":
            if re.fullmatch(r".*ies$", string):
                # 'butterflies' -> 'butterfly'
                forms.append(f'{string[:-3]}y')
            elif re.fullmatch(r".*ves$", string):
                # 'loaves' -> 'loaf' and 'lives' -> 'life'
                forms.append(f'{string[:-3]}f')
                forms.append(f'{string[:-3]}fe')
            elif re.fullmatch(r".*es$", string):
                # 'beaches' -> 'beach', but 'snakes' -> 'snake'
                forms.append(f'{string[:-2]}')
                forms.append(f'{string[:-1]}')
            elif re.fullmatch(r".*i$", string):
                # 'cacti' -> 'cactus'
                forms.append(f'{string[:-1]}us')
            elif re.fullmatch(r".*s$", string):
                # 'crabs' -> 'crab'
                forms.append(f'{string[:-1]}')

        return forms

    def span_at_token(self, token):
        """Return the species span recorded for `token`, or None if there is none."""
        lookup = self.token_to_span
        return lookup[token] if token in lookup else None
    
    def is_species(self, token):
        """Report whether `token` is one of the tokens recorded in `self.tokens`."""
        known_tokens = self.tokens
        return token in known_tokens
        
    def has_species(self, tokens, verbose=False):
        """Report whether at least one of `tokens` is recorded in `self.tokens`.

        `verbose` is accepted but not used.
        """
        return any(candidate in self.tokens for candidate in tokens)

    def find_same_species(self, species_A, species_b, verbose=False):
        # METHOD 1: Check for Literal Matches
        sp_b_text = sp_b.text.lower()
        
        for sp_a in sp_A:
            # Verbatim Text
            sp_a_text = sp_a.text.lower()

            if sp_a_text == sp_b_text:
                return sp_a

            # Singularized Text
            sp_a_singular_texts = self.main.help.singularize(sp_a_text)
            sp_b_singular_texts = self.main.help.singularize(sp_b_text)

            if set(sp_a_singular_texts).intersection(sp_b_singular_texts):
                return sp_a

        # METHOD 2: Check Alternate Names
        for sp_a in sp_A:
            if sp_b_text in self.alternate_names.get(sp_a_text, []):
                return sp_a
            if sp_a_text in self.alternate_names
        
        
        if verbose:
            print("Comparing Species")
            print(f"SPECIES 1: {sp_1}")
            print(f"SPECIES 2: {sp_2}")

        sp_1_text = sp_1.text.lower()
        sp_2_text = sp_2.text.lower()

        if verbose:
            print(f"SPECIES 1 TEXT: {sp_1_text}")
            print(f"SPECIES 2 TEXT: {sp_2_text}")
        
        # METHOD 1: Check if Texts are Equivalent
        equivalent = sp_1.text.lower() == sp_2.text.lower()
        
        if equivalent:
            return True

        # METHOD 2: Check Alternate Names
        if verbose:
            print("Check Alternate Names")
        
        if sp_1_text in self.alternate_names:
            if verbose:
                print(f"SPECIES 1 Alternate Names: {self.alternate_names[sp_1_text]}")
            if sp_2_text in self.alternate_names[sp_1_text]:
                return True
        
        if sp_2_text in self.alternate_names:
            if verbose:
                print(f"SPECIES 2 Alternate Names: {self.alternate_names[sp_2_text]}")
            if sp_1_text in self.alternate_names[sp_2_text]:
                return True

        # Singular Version of Phrase (e.g. "fewer crabs" becomes "fewer crab")
        singular_version = lambda tokens : " ".join([*[token.text for token in tokens[:-1]], tokens[-1].lemma_]).lower()

        # METHOD 3: Check Substrings (More or Less)
        # Via this method, pairs like (1) "dog" and "dog red"; and
        # (2) "red dog" and "dog" should match.

        # Common Name at Start
        if sp_1[0].lower_ == sp_2[0].lower_ and (sp_1[0].pos_ in ["NOUN", "PROPN"] or sp_2[0].pos_ in ["NOUN", "PROPN"]):
            if sp_1_text in sp_2_text or sp_2_text in sp_1_text:
                if verbose:
                    print(f"{sp_1} and {sp_2} are the same species.")
                return True
        # Common Name at End
        else:
            # Only used when there's 1 adjective in one of the species and
            # no adjectives in the other (e.g. "fewer crabs" v. "crabs").
            sp_1_nouns = []
            sp_1_num_adjectives = 0
            for token in sp_1:
                if not sp_1_nouns and token.pos_ == "ADJ":
                    sp_1_num_adjectives += 1
                elif token.pos_ in ["PROPN", "NOUN"]:
                    sp_1_nouns.append(token)
            
            sp_2_nouns = []
            sp_2_num_adjectives = 0
            for token in sp_2:
                if not sp_2_nouns and token.pos_ == "ADJ":
                    sp_2_num_adjectives += 1
                elif token.pos_ in ["PROPN", "NOUN"]:
                    sp_2_nouns.append(token)
    
            if verbose:
                print(f"Number of Adjectives in 1: {sp_1_num_adjectives}")
                print(f"Number of Adjectives in 2: {sp_2_num_adjectives}")
    
            if sp_1_nouns and sp_2_nouns and (
                (sp_1_num_adjectives == 1 and sp_2_num_adjectives == 0) or 
                (sp_2_num_adjectives == 1 and sp_1_num_adjectives == 0)
            ):
                sp_singular_nouns_1 = singular_version(sp_1_nouns)
                sp_singular_nouns_2 = singular_version(sp_2_nouns)
    
                if verbose:
                    print(f"Comparing Singular Nouns: '{sp_singular_nouns_1}' == '{sp_singular_nouns_2}'")
                
                return sp_singular_nouns_1 == sp_singular_nouns_2

        # METHOD 4: Check Singular Version
        # This method targets spans like "predatory crab" and "predatory crabs".
        sp_singular_1 = singular_version(sp_1)
        sp_singular_2 = singular_version(sp_2)

        if verbose:
            print(f"Comparing Singular Spans: '{sp_singular_1}' == '{sp_singular_2}'")
        
        if sp_singular_1 == sp_singular_2:
            return True

        # At this point, I don't see 
        return False

In [4]:
class Keywords:
    """Collects the tokens of the current document that match a keyword vocabulary.

    A token whose part of speech is listed in ``speech`` is matched when it
    (1) equals a ``base`` word (by lowercase text or lowercase lemma),
    (2) contains a ``sub_base`` word (in its lowercase text or lemma), or
    (3) reaches ``threshold`` similarity with one of the ``base`` words.
    Independently of part of speech, the token located at the first character
    of every occurrence of a ``phrases`` entry is matched as well.

    Parameters:
        main: Owner object. This class reads ``main.sp_nlp`` (callable that turns
            text into an object exposing ``similarity``), ``main.sp_doc``,
            ``main.index_map`` and calls ``main.token_at_char``.
        base: Words a token must equal to match. Lowercased on construction.
        phrases: Literal phrases searched for in the lowercased document text.
            Matching is by substring, so "dependent" is also found inside
            "independent".
        speech: Part-of-speech tags a token must carry. Uppercased on construction.
        threshold: Minimum similarity for the similarity-based match.
        sub_base: Words a token must contain to match. Lowercased on construction.
    """

    def __init__(self, main, base=None, phrases=None, speech=None, threshold=0.7, sub_base=None):
        # None (rather than a shared list object) is used for the defaults so
        # that no mutable default is ever shared between instances.
        self.main = main
        # For a token to count towards a base word, it must be the same word.
        # For a token to count towards a sub_base word, it must contain the word.
        self.base = [b.lower() for b in (base or [])]
        self.sub_base = [b.lower() for b in (sub_base or [])]
        self.speech = [s.upper() for s in (speech or [])]
        self.phrases = [p.lower() for p in (phrases or [])]
        self.threshold = threshold
        # Each base word is processed once up front so that similarity checks
        # do not have to re-process the vocabulary for every token.
        self.vocab = [self.main.sp_nlp(word) for word in self.base]
        self.tokens = []

    def _require_document(self):
        """Raise ``Exception("DNE")`` when the document or its index map is missing/empty."""
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

    def update(self, verbose=False):
        """Recompute ``self.tokens`` from the owner's current document.

        Raises:
            Exception: "DNE" when the document or index map does not exist.
        """
        self._require_document()
        self.tokens = self.match_tokens(verbose=verbose)

    def _matches_text(self, token_lower, token_lemma_lower):
        """Return True when the token equals a base word or contains a sub_base word."""
        # Comparing Literal Text
        if token_lemma_lower in self.base or token_lower in self.base:
            return True
        # Comparing Substrings
        return any(
            sub_base_word in token_lemma_lower or sub_base_word in token_lower
            for sub_base_word in self.sub_base
        )

    def _matches_similarity(self, token_lower, verbose=False):
        """Return True when the token is similar enough to any base word."""
        token_doc = self.main.sp_nlp(token_lower)
        for word in self.vocab:
            similarity = word.similarity(token_doc)
            if verbose:
                print(f"{token_doc} and {word} Similarity: {similarity}")
            if similarity >= self.threshold:
                return True
        return False

    def match_tokens(self, verbose=False):
        """Return the document tokens that match the vocabulary or the phrases.

        Word matches come first (in document order), followed by one token per
        phrase occurrence. A token may appear more than once in the result.

        Raises:
            Exception: "DNE" when the document or index map does not exist.
        """
        self._require_document()

        matched_tokens = []

        # Check Words
        for token in self.main.sp_doc:
            if verbose:
                print(f"Potential Keyword: {token, token.pos_} v. Speech: {self.speech}")
            if token.pos_ not in self.speech:
                continue

            token_lower = token.lower_
            token_lemma_lower = token.lemma_.lower()

            # The (more expensive) similarity check only runs when the cheap
            # text comparisons fail.
            if self._matches_text(token_lower, token_lemma_lower) or self._matches_similarity(token_lower, verbose):
                matched_tokens.append(token)

        # Check Phrases
        text = self.main.sp_doc.text.lower()
        for phrase in self.phrases:
            # An empty phrase would "match" at every offset (including
            # whitespace, which has no token), so it is ignored.
            if not phrase:
                continue
            # Phrases are literal text, not patterns: escape them so characters
            # such as "." or "(" are not interpreted as regex syntax.
            for match in re.finditer(re.escape(phrase), text):
                matched_tokens.append(self.main.token_at_char(match.start()))

        return matched_tokens

class ExperimentKeywords(Keywords):
    """Keyword matcher configured with vocabulary that hints at an experiment or study.

    Only verbs and nouns are considered for the word list; the phrases are
    searched for in the text regardless of part of speech.
    """

    def __init__(self, main):
        experiment_words = [
            "study", "hypothesis", "experiment", "found", "discover", "compare",
            "finding", "result", "test", "examine", "model",
        ]
        experiment_phrases = ["control group", "independent", "dependent"]

        super().__init__(
            main,
            base=experiment_words,
            phrases=experiment_phrases,
            speech=["VERB", "NOUN"],
            threshold=0.8,
        )

class CauseKeywords(Keywords):
    """Keyword matcher configured with verbs/nouns that express causing a change."""

    def __init__(self, main):
        cause_words = [
            "increase", "decrease", "change", "shift", "cause", "produce",
            "trigger", "suppress", "inhibit", "encourage", "allow",
        ]

        super().__init__(
            main,
            base=cause_words,
            speech=["VERB", "NOUN"],
            threshold=0.8,
        )

class ChangeKeywords(Keywords):
    """Keyword matcher configured with nouns/adjectives/adverbs that express a difference in amount."""

    def __init__(self, main):
        change_words = ["few", "more", "increase", "decrease", "less", "short", "long", "greater"]

        super().__init__(
            main,
            base=change_words,
            speech=["NOUN", "ADJ", "ADV"],
            threshold=0.8,
        )

In [5]:
class TraitKeywords(Keywords):
    """Keyword matcher for trait words that are mentioned together with a species.

    The keyword match itself is inherited from ``Keywords``; afterwards every
    matched token is kept only if the unit it expands to contains a species.
    """

    def __init__(self, main):
        trait_words = [
            "behavior", "rate", "color", "mass", "size", "length", "pattern", "weight",
            "shape", "efficiency", "trait", "ability", "capacity", "height", "width", "span",
        ]
        trait_fragments = ["mass", "span", "length", "color", "rate"]

        super().__init__(
            main,
            base=trait_words,
            sub_base=trait_fragments,
            speech=["NOUN", "ADJ"],
            threshold=0.8,
        )

    def update(self, verbose=False):
        """Refresh ``self.tokens``: run the inherited keyword match, then filter by species."""
        super().update(verbose)
        candidates = self.tokens
        if verbose:
            print(f"Unfiltered Tokens: {candidates}")

        self.tokens = self.filter_tokens(candidates, verbose)
        if verbose:
            print(f"Filtered Tokens: {self.tokens}")

    def filter_tokens(self, tokens, verbose=False):
        """Return the tokens whose expanded unit contains a species.

        Each token is expanded via ``main.expand_unit`` with the whole document
        as the boundary (presumably growing the span over the allowed parts of
        speech and literals until a disallowed one is hit; confirm against
        ``expand_unit``). The token is kept when ``main.species.has_species``
        accepts the expanded result.

        Raises:
            Exception: "DNE" when the document or index map does not exist.
        """
        doc = self.main.sp_doc
        if not doc or not self.main.index_map:
            raise Exception("DNE")

        kept = []
        for candidate in tokens:
            expanded_token = self.main.expand_unit(
                il_unit=candidate.i,
                ir_unit=candidate.i,
                il_boundary=0,
                ir_boundary=len(self.main.sp_doc) - 1,
                allowed_speech=["ADJ", "NOUN", "ADP", "PART", "DET", "PROPN"],
                allowed_literals=["-", ","],
                disallowed_literals=["!", ".", "?"],
                verbose=verbose,
            )

            if verbose:
                print(f"Token: {candidate}")
                print(f"Expanded Token: {expanded_token}")

            # Tokens without a species in their expanded unit are dropped.
            if not self.main.species.has_species(expanded_token):
                continue

            if verbose:
                print(f"\tContains Species")
            kept.append(candidate)

        return kept

In [6]:
class Main:
    """Owns the NLP tools, the current document and the parsers, and scores a document.

    Typical use: ``update_text(text)`` (or ``update_doc(doc)``) followed by
    ``score()``.

    NOTE(review): ``find_unit_context`` (called by ``score``) and ``expand_unit``
    (called by ``TraitKeywords``) are not defined in this class body; confirm
    that they are provided elsewhere before relying on this class.
    """

    def __init__(self):
        # Tools
        self.sp_nlp = spacy.load("en_core_web_lg")
        self.fcoref = FCoref(enable_progress_bar=False, device='cpu')
        self.sp_doc = None

        # Maps Character Position to Token in Document
        # Used to handle differences between different
        # pipelines and tools.
        self.index_map = None

        # Parsers
        # These receive `self`, so the tools above must be set first.
        self.species = Species(self)
        self.traits = TraitKeywords(self)
        self.causes = CauseKeywords(self)
        self.changes = ChangeKeywords(self)
        self.experiment = ExperimentKeywords(self)

    def update_doc(self, doc, verbose=False):
        """Make `doc` the current document and refresh the index map and every parser.

        `verbose` is forwarded to each parser's `update`.
        """
        self.sp_doc = doc
        self.index_map = self.load_index_map()
        # The species parser is updated first; the trait parser queries
        # `self.species` while filtering its tokens.
        self.species.update(doc.text, verbose=verbose)
        self.traits.update(verbose=verbose)
        self.causes.update(verbose=verbose)
        self.changes.update(verbose=verbose)
        self.experiment.update(verbose=verbose)

    def update_text(self, text, verbose=False):
        """Process `text` with the spaCy pipeline and make the result the current document."""
        self.update_doc(self.sp_nlp(text), verbose=verbose)

    def token_at_char(self, char_index):
        """Return the token that covers the character at `char_index`.

        Raises:
            Exception: "DNE" when the document or index map does not exist;
                "Token Not Found" when no token covers `char_index`
                (e.g. the character is whitespace between tokens).
        """
        # SpaCy Doc or Indexing Map Not Found
        if not self.sp_doc or not self.index_map:
            raise Exception("DNE")

        # Every character of every token is in the map, so a direct lookup
        # works even when `char_index` is not a token's first character.
        if char_index in self.index_map:
            return self.index_map[char_index]

        # There must be a token that corresponds to the
        # given character index. If there's not, there's
        # an issue.
        raise Exception("Token Not Found")

    def load_index_map(self):
        """Build a dict mapping each character index covered by a token to that token.

        Raises:
            Exception: "DNE" when there is no current document.
        """
        # SpaCy Doc Not Found
        if self.sp_doc is None:
            raise Exception("DNE")

        # token.idx is the index of the token's starting character;
        # token.idx + len(token) is the index right after its last character.
        return {
            char_index: token
            for token in self.sp_doc
            for char_index in range(token.idx, token.idx + len(token))
        }

    def _context_has_variation(self, token, sent_cause_tokens, sent_change_tokens, verbose=False):
        """Return True when a cause or change token lies in the context of `token`.

        The context is whatever `find_unit_context` returns for the token,
        bounded by the token's sentence.
        """
        token_context = set(self.find_unit_context(
            il_unit=token.i,
            ir_unit=token.i,
            il_boundary=token.sent.start,
            ir_boundary=token.sent.end - 1,
            verbose=verbose,
        ))
        cause_tokens_in_context = set(sent_cause_tokens).intersection(token_context)
        change_tokens_in_context = set(sent_change_tokens).intersection(token_context)

        if verbose:
            print(f"Token Context: {token_context}")
            print(f"Cause Tokens in Context: {cause_tokens_in_context}")
            print(f"Change Tokens in Context: {change_tokens_in_context}")

        return bool(cause_tokens_in_context or change_tokens_in_context)

    def score(self, verbose=False):
        """Score the current document.

        Every sentence collects points in four categories (trait, species,
        experiment, interaction), capped per category. The result is the
        weighted average, over categories, of the fraction of the maximum
        points that was collected.

        Returns:
            0 when fewer than three distinct species were seen; otherwise a
            float in [0, 1].

        Raises:
            Exception: "DNE" when there is no current document.
        """
        # SpaCy Doc Not Found
        if self.sp_doc is None:
            raise Exception("DNE")

        NUM_CATEGORIES = 4

        # Requires the mention of a trait and a cause or change word.
        # The cause or change word indicates some variation.
        # Index 0 in Array
        TRAIT = 0

        # Requires the mention of a species and a cause or change word.
        # The cause or change word indicates that the species is being
        # affected or is affecting something else.
        # Index 1 in Array
        SPECIES = 1

        # Requires a word that has been defined as "experiment"-related.
        # Index 2 in Array
        EXPERIMENT = 2

        # Requires the mention of several species (more or less).
        # Index 3 in Array
        INTERACTION = 3

        # Max # of Points of Category per Sentence (MPC)
        # A sentence collects points from its categories.
        # For example, a sentence could get a maximum of 2 points from one category
        # and a maximum of 1 point from another. The MPC determines the maximum number
        # of points a category could contribute to a sentence. To have a range of [0, 1]
        # the maximum number of points, across categories, when added should be 1.
        MPC = [0] * NUM_CATEGORIES
        MPC[TRAIT] = 0.1
        MPC[SPECIES] = 0.3
        MPC[EXPERIMENT] = 0.3
        MPC[INTERACTION] = 0.3

        # Floats are compared with a tolerance; an exact `== 1` depends on
        # the rounding of the particular values chosen above.
        assert np.isclose(np.sum(MPC), 1.0)

        # Points per Instance of Category (PIC)
        # Each token is evaluated to check whether a category
        # can be given points. The number of points given, if
        # the token is determined to be satisfactory, is the PIC.
        # The PIC is less than or equal to the MPC for the corresponding
        # category. The idea behind the PIC and MPC is similar to how
        # sets work in tennis: you're not immediately awarded the full points
        # for the set (MPC) if your opponent fails to return the ball,
        # instead you're given a smaller # of points (PIC) that allow you to
        # incrementally win the set (category).
        PIC = [0] * NUM_CATEGORIES
        PIC[TRAIT] = MPC[TRAIT]*1.0
        PIC[SPECIES] = MPC[SPECIES]/3.0
        PIC[EXPERIMENT] = MPC[EXPERIMENT]/1.0
        PIC[INTERACTION] = MPC[INTERACTION]/3.0

        for i in range(NUM_CATEGORIES):
            assert PIC[i] <= MPC[i]

        # Category Weights (CW)
        # It may be helpful to weigh a certain category's fraction of total
        # points more or less than another's. Thus, at the end, we'll take a
        # weighted average of the category's FTP. The weights must add up to 1.
        CW = [0] * NUM_CATEGORIES
        CW[TRAIT] = 0.25
        CW[SPECIES] = 0.25
        CW[EXPERIMENT] = 0.25
        CW[INTERACTION] = 0.25

        assert np.isclose(np.sum(CW), 1.0)

        # Points
        points = [0] * NUM_CATEGORIES

        # Extracted Information
        cause_tokens = self.causes.tokens
        change_tokens = self.changes.tokens
        trait_tokens = self.traits.tokens
        # Each species span is represented by its first token.
        species_tokens = [self.sp_doc[span.start] for span in self.species.spans]
        experiment_tokens = self.experiment.tokens

        if verbose:
            print(f"Cause Tokens: {[(t.i, t) for t in self.causes.tokens]}")
            print(f"Change Tokens: {[(t.i, t) for t in self.changes.tokens]}")
            print(f"Trait Tokens: {[(t.i, t) for t in self.traits.tokens]}")
            print(f"Species Tokens: {[(t.i, t) for t in self.species.tokens]}")
            print(f"Reduced Species Tokens: {[(t.i, t) for t in species_tokens]}")
            print(f"Experiment Tokens: {[(t.i, t) for t in self.experiment.tokens]}")

        # This is used to ensure that at least three species
        # are mentioned. Maps a species span to its number of visits.
        seen_species = {}

        for sent in self.sp_doc.sents:
            # This contains the number of points
            # each category has accumulated in the sentence.
            curr_points = [0] * NUM_CATEGORIES

            # Contains the tokens in the sentence.
            sent_tokens = list(sent)

            # This is used for the species (must have a nearby cause and/or
            # change word).
            sent_cause_tokens = set(sent_tokens).intersection(cause_tokens)
            sent_change_tokens = set(sent_tokens).intersection(change_tokens)

            # We don't want to visit the same species more than once
            # in the same sentence as to avoid redundant points.
            sent_seen_species = []

            if verbose:
                print(f"\n\nSentence: {sent}\n")
                print(f"Sentence Tokens: {[(t.i, t) for t in sent]}")
                print(f"Sentence Cause Tokens: {sent_cause_tokens}")
                print(f"Sentence Change Tokens: {sent_change_tokens}\n")

            for token in sent_tokens:
                # If each category has reached their maximum number of points,
                # we can end the loop early.
                if all(curr_points[i] >= MPC[i] for i in range(NUM_CATEGORIES)):
                    break

                if verbose:
                    print(f"Token ({token.pos_}): '{token}'")

                # TRAIT CATEGORY
                # To get points in the trait category, there must
                # be (1) a trait; and (2) a change or cause in the token's
                # context.
                if curr_points[TRAIT] < MPC[TRAIT] and token in trait_tokens:
                    if self._context_has_variation(token, sent_cause_tokens, sent_change_tokens, verbose):
                        curr_points[TRAIT] += PIC[TRAIT]

                        if verbose:
                            print(f"Added {PIC[TRAIT]} Points to Trait")

                # EXPERIMENT CATEGORY
                if curr_points[EXPERIMENT] < MPC[EXPERIMENT] and token in experiment_tokens:
                    curr_points[EXPERIMENT] += PIC[EXPERIMENT]

                    if verbose:
                        print(f"Added {PIC[EXPERIMENT]} Points to Experiment")

                # SPECIES CATEGORY
                # Everything below only applies to tokens that start a species span.
                if token not in species_tokens:
                    continue

                # Find Species Span
                species_span = self.species.span_at_token(token)

                if verbose:
                    print(f"Species Span: {species_span}")
                    print(f"Seen Species:\n{seen_species}")

                # Updating Seen Species (in Entire Text)
                past_visits = 0
                for seen_species_span in seen_species.keys():
                    if self.species.same_species(species_span, seen_species_span, verbose=verbose):
                        past_visits = seen_species[seen_species_span]
                        if verbose:
                            print(f"\t'{species_span}' == '{seen_species_span}'")
                            print(f"\tNumber of Visits: {past_visits}")
                        seen_species[seen_species_span] += 1
                        break

                if not past_visits:
                    seen_species[species_span] = 1
                is_new_species = past_visits == 0

                if verbose:
                    print(f"Is New Species: {is_new_species}")
                    print(f"Seen Species Updated:\n{seen_species}")
                    print("Checking Seen Species in Sentence")

                # Checking Seen Species (in Sentence)
                # We only add points if it's a species that has not been seen
                # in the sentence. This is to avoid redundant points.
                # Also, if the species has not been seen at all (is_new_species),
                # then it cannot be a redundant species (we couldn't have seen it
                # in the sentence either).
                redundant_species = False

                if not is_new_species:
                    for seen_species_span in sent_seen_species:
                        if self.species.same_species(species_span, seen_species_span, verbose=verbose):
                            redundant_species = True
                            break

                sent_seen_species.append(species_span)
                if redundant_species:
                    continue

                # INTERACTION CATEGORY
                # It is helpful to have this category here because (if we've reached here)
                # we're dealing with a new species in the sentence.
                if curr_points[INTERACTION] < MPC[INTERACTION]:
                    curr_points[INTERACTION] += PIC[INTERACTION]

                    if verbose:
                        print(f"Added {PIC[INTERACTION]} Points to Interaction")

                # To get points in the species category, there must be
                # (1) a species; and (2) a change or cause in the phrase
                # (or clause) that the token is a part of.
                if curr_points[SPECIES] < MPC[SPECIES]:
                    if self._context_has_variation(token, sent_cause_tokens, sent_change_tokens, verbose):
                        curr_points[SPECIES] += PIC[SPECIES]

                        if verbose:
                            print(f"Added {PIC[SPECIES]} Points to Species")

            # SENTENCE DONE
            # Add Sentence Points to Total Points
            # The cap is applied here because the last PIC added may push a
            # category slightly past its MPC.
            for category in [TRAIT, SPECIES, EXPERIMENT, INTERACTION]:
                points[category] += min(curr_points[category], MPC[category])

            if verbose:
                print(f"Overall Points: {points}")

        # Enforcing 3 or More Species
        if verbose:
            print(f"Seen Species: {seen_species}")

        if len(seen_species) < 3:
            return 0

        # Calculating Score
        if verbose:
            print(f"Points: {points}")

        # At least three species were seen, so there is at least one sentence
        # and the division below is safe.
        NUM_SENTENCES = len(list(self.sp_doc.sents))

        score = 0
        for i in range(NUM_CATEGORIES):
            # Fraction of Total Points (FTP) collected by the category.
            FTP = points[i] / (MPC[i] * NUM_SENTENCES)
            if verbose:
                print(f"Category {i}'s FTP: {FTP}")
            score += FTP * CW[i]

        # Repeated float additions can drift past the bounds by a rounding
        # error; anything beyond that is a real bug.
        assert -1e-9 <= score <= 1.0 + 1e-9
        score = min(1.0, max(0.0, score))

        if verbose:
            print(f"Score: {score}")

        return score

In [7]:
# Load the baseline dataset and show its first abstract.
df = pd.read_csv("../../Datasets/Baseline-1.csv")
text = df["Abstract"][0]
print(text)

In simple, linear food chains, top predators can have positive indirect effects on basal resources by causing changes in the traits (e.g. behaviour, feeding rates) of intermediate consumers. Although less is known about trait-mediated indirect interactions (TMIIs) in more complex food webs, it has been suggested that such complexity dampens trophic cascades. We examined TMIIs between a predatory crab ( Carcinus maenas ) and two ecologically important basal resources, fucoid algae ( Ascophyllum nodosum ) and barnacles ( Semibalanus balanoides ), which are consumed by herbivorous ( Littorina littorea ) and carnivorous ( Nucella lapillus ) snails, respectively. Because crab predation risk suppresses snail feeding rates, we hypothesized that crabs would also shape direct and indirect interactions among the multiple consumers and resources. We found that the magnitude of TMIIs between the crab and each resource depended on the suite of intermediate consumers present in the food web. Carnivo

In [8]:
# Score a hand-picked abstract (the second one in the dataset) with verbose output.
# The one-element list keeps the loop shape so more abstracts can be added easily.
main = Main()
for abstract in [df["Abstract"].to_list()[1]]:
    print(abstract)
    main.update_text(abstract, verbose=True)
    score = main.score(verbose=True)
    # The score is followed by one blank line.
    print(score, end="\n\n")

06/16/2025 07:36:05 - INFO - 	 missing_keys: []
06/16/2025 07:36:05 - INFO - 	 unexpected_keys: []
06/16/2025 07:36:05 - INFO - 	 mismatched_keys: []
06/16/2025 07:36:05 - INFO - 	 error_msgs: []
06/16/2025 07:36:05 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


Alternative methods to achieve sustainable agricultural production while reducing the use of chemical pesticides, such as biological control, are increasingly needed. The exploitation of trait-mediated indirect interactions (TMIIs), in which pests modify their behavior in response to some cues (e.g., pheromones and other semiochemicals) to avoid predation risk, may be a possible strategy. In this study, we tested the effect of TMIIs of two Mediterranean ant species, Crematogaster scutellaris and Tapinoma nigerrimum, on the oviposition behaviour of Ceratitis capitata (Diptera: Tephritidae), one of the world's most economically damaging pests, which attacks fruits. For each ant species, we performed choice experiments using ant-scented and control plums, counting the time spent by medflies on fruits and the number of pupae emerging from them. Results of both ant species tests showed a significantly shorter time spent by ovipositing medflies on ant-exposed plums and a lower number of pupa

In [9]:
# Print the part of speech assigned to every occurrence of "hypothesized".
for token in main.sp_doc:
    if token.lower_ != "hypothesized":
        continue
    print(token.pos_)

In [10]:
# For every noun, proper noun and adjective, report whether it is one of the species tokens.
for token in main.sp_doc:
    if token.pos_ in ["NOUN", "PROPN", "ADJ"]:
        print(f"Token ({token.pos_}): {token}\n\t{'Species' if token in main.species.tokens else 'Not Species'}")

Token (ADJ): Alternative
	Not Species
Token (NOUN): methods
	Not Species
Token (ADJ): sustainable
	Not Species
Token (ADJ): agricultural
	Not Species
Token (NOUN): production
	Not Species
Token (NOUN): use
	Not Species
Token (ADJ): chemical
	Not Species
Token (NOUN): pesticides
	Not Species
Token (ADJ): such
	Not Species
Token (ADJ): biological
	Not Species
Token (NOUN): control
	Not Species
Token (NOUN): exploitation
	Not Species
Token (NOUN): trait
	Not Species
Token (ADJ): indirect
	Not Species
Token (NOUN): interactions
	Not Species
Token (PROPN): TMIIs
	Not Species
Token (NOUN): pests
	Species
Token (NOUN): behavior
	Not Species
Token (NOUN): response
	Not Species
Token (NOUN): cues
	Not Species
Token (NOUN): pheromones
	Not Species
Token (ADJ): other
	Not Species
Token (NOUN): semiochemicals
	Not Species
Token (NOUN): predation
	Not Species
Token (NOUN): risk
	Not Species
Token (ADJ): possible
	Not Species
Token (NOUN): strategy
	Not Species
Token (NOUN): study
	Not Species
Token

In [11]:
# Collect the first token of every species span, then print each with its part of speech.
species_tokens = []
for span in main.species.spans:
    species_tokens.append(main.sp_doc[span.start])

for span in species_tokens:
    print(span, span.pos_)

pests NOUN
species NOUN
Tapinoma PROPN
Ceratitis PROPN
Diptera PROPN
Tephritidae PROPN
damaging ADJ
species NOUN
pupae NOUN
ants NOUN


In [12]:
# Debugging aid: display the `tag_` of the first token of the first entity in
# `main.species.tn_doc` (presumably the TaxoNerd document; confirm). The recorded
# output of this cell is an empty string.
main.species.tn_doc.ents[0][0].tag_

''