# In [1]:
class Species:
    def __init__(self, main):
        """Create a species tracker bound to ``main``.

        ``main`` is the owning object; the methods of this class read its
        ``sp_doc`` and ``index_map`` attributes and call several of its
        helper methods. All per-document state starts as ``None`` and is
        filled in by ``update``.
        """
        # Tools
        self.main = main
        self.tn_nlp = TaxoNERD(prefer_gpu=False).load(
            model="en_ner_eco_biobert",
            exclude=["tagger", "parser", "attribute_ruler"],
        )
        # Result of running ``tn_nlp`` on the most recent text.
        self.tn_doc = None

        # Spans identified as species, and the tokens that begin them.
        self.spans = None
        self.span_starts = None

        # Tokens identified as a species or as part of one.
        self.tokens = None

        # Token -> span lookup, for quickly finding the span a token
        # belongs to.
        self.token_to_span = None

        # Lower-cased name -> list of lower-cased names that were found
        # to be alternate names of it (stored in both directions).
        self.alternate_names = None

        # Words that (1) are to be identified as species; and
        # (2) are sometimes not identified as species, more or less.
        self.dictionary = [
            "juvenile",
            "juveniles",
            "adult",
            "adults",
            "prey",
            "predator",
            "predators",
            "species",
            "tree",
            "cat",
            "dog",
            "fly",
            "flies",
            "plant",
            "plants",
        ]



    def update(self, text, verbose=False):
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        self.tn_doc = self.tn_nlp(text)
        self.spans, self.tokens, self.token_to_span, self.span_starts = self.load_species(verbose=verbose)
        self.alternate_names = self.load_alternate_names(self.spans)



    def convert_tn_spans_to_sp_spans(self, tn_spans):
        sp_spans = []

        for tn_span in tn_spans:
            l_char_index = self.tn_doc[tn_span.start].idx
            r_char_index = l_char_index + len(tn_span.text) - 1

            try:
                l_sp_token_i = self.main.token_at_char(l_char_index).i
                r_sp_token_i = self.main.token_at_char(r_char_index).i
            except Exception as e:
                print(f"Error: Couldn't find token at character index of {l_char_index} and token index of {l_sp_token_i}.")
                print(f"Error: Couldn't find token at character index of {r_char_index} and token index of {r_sp_token_i}.")
                print(e)
                continue

            sp_span = self.main.sp_doc[l_sp_token_i:r_sp_token_i+1]
            if sp_span.text != tn_span:
                print(f"Error: SpaCy span does not match TaxoNerd span.")
                continue
            
            sp_spans.append(sp_span)

        return sp_spans



    def load_search_strings(self, verbose=False):
        search_strings = [*self.dictionary]
        
        # Creating a Broad Set of Species
        spans = self.convert_tn_spans_to_sp_spans(self.tn_doc.ents)
        spans = self.main.separate_spans_by_parenthetical(spans)

        # Add Ending Nouns to Set
        all_nouns = []
        for span in spans:
            nouns = self.main.get_base_nouns(span, immediate_stop=True)
            # print(f"Base Nouns for '{span}': {nouns}")
            if nouns:
                all_nouns.extend(nouns)
        # print(f"'All Nouns': {all_nouns}")
        spans.extend(all_nouns)

        # Adding Plural and Singular Versions of Spans
        for span in spans:
            text = span.text.lower()
            text = self.main.delete_extra_whitespace(self.main.delete_outer_non_alnum(text))

            # Blank Text or No Letters
            if not text or not [c for c in text if c.isalpha()]:
                continue

            search_strings.append(text)

            # Add Plural Version
            singular = span[-1].pos_ == "NOUN" and span[-1].tag_ == "NN"
            if singular:
                plural_version = self.main.pluralize(text)
                search_strings.extend(plural_version)

            # Add Singular Version
            plural = span[-1].pos_ == "NOUN" and span[-1].tag_ == "NNS"
            if plural:
                singular_version = self.main.singularize(text)
                search_strings.extend(singular_version)

        # Remove Duplicates
        search_strings = list(set(search_strings))

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Search Strings: {search_strings}")
        
        return search_strings



    def load_alternate_names(self, spans, verbose=False):
        spans.sort(key=lambda span: span.start)

        # It's useful to know if a different name refers to a
        # species we have already seen. For example, in
        # "predatory crab (Carcinus maenas)", "predatory crab"
        # is an alternative name for "Carcinus maenas" and
        # vice versa. This is used so that the species can be
        # properly tracked and redundant points are less
        # likely to be given.
        alternate_names = {}
        
        # Finding and Storing Alternative Names
        for i, species_span in enumerate(spans):
            # There's not a next species to
            # evaluate.
            if i + 1 >= len(spans):
                break
            
            next_species_span = spans[i+1]
            
            # If there's one token between the species and the next species,
            # we check if the next species is surrounded by punctuation.
            if next_species_span.start - species_span.end == 1:
                # Token Before and After the Next Species
                before_next = self.main.sp_doc[next_species_span.start-1]

                # Out of Bounds Error
                if next_species_span.end >= len(self.main.sp_doc):
                    continue
                    
                after_next = self.main.sp_doc[next_species_span.end]

                if before_next.lower_ in ["("] and after_next.lower_ in [")"]:
                    sp_1_text = species_span.text.lower()
                    sp_2_text = next_species_span.text.lower()
                    
                    if sp_1_text not in alternate_names:
                        alternate_names[sp_1_text] = []
                    
                    if sp_2_text not in alternate_names:
                        alternate_names[sp_2_text] = []
                    
                    alternate_names[sp_1_text].append(sp_2_text)
                    alternate_names[sp_2_text].append(sp_1_text)
            
            # If there's no token between the species and the next,
            # species we assume that they refer to the same species.
            elif next_species_span.start - species_span.end == 0:
                sp_1_text = species_span.text.lower()
                sp_2_text = next_species_span.text.lower()
                
                if sp_1_text not in alternate_names:
                    alternate_names[sp_1_text] = []
                
                if sp_2_text not in alternate_names:
                    alternate_names[sp_2_text] = []

                alternate_names[sp_1_text].append(sp_2_text)
                alternate_names[sp_2_text].append(sp_1_text)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Alternate Names: {alternate_names}")

        return alternate_names



    # Collapses a list of integers so that every run of consecutive (or
    # repeated) values is represented only by its first value, e.g.
    # [1, 2, 3, 7, 8] -> [1, 7]. load_species applies it to the start
    # indices of the species spans. The name "bar" is a leftover placeholder.
    def bar(self, foo):
        if not foo or len(foo) == 1:
            return foo

        foo.sort()
        b = [foo[0]]
        
        l = 0
        
        for i in range(1, len(foo)):
            if foo[i] - foo[l] <= 1:
                l = i
            else:
                b.append(foo[i])
                l = i
            
        return b


    
    def load_species(self, verbose=False):
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        # Load Search Strings from Species Spans
        search_strings = self.load_search_strings(verbose=verbose)

        # Search for Species
        # The results are stored in different 
        # forms below.
        spans = []
        tokens = []
        token_to_span = {}

        # Where we're searching for species.
        text = self.main.sp_doc.text.lower()

        for string in search_strings:
            matches = re.finditer(re.escape(string), text, re.IGNORECASE)

            for l_char_index, r_char_index, matched_text in [(match.start(), match.end(), match.group()) for match in matches]:    
                # The full word must match, not just a substring inside of it.
                # So, if the species we're looking for is "ant", only "ant"
                # will match -- not "pants" or "antebellum". Therefore, the
                # characters to the left and right of the matched string cannot
                # be letters.
                l_char_is_letter = l_char_index > 0 and text[l_char_index-1].isalpha()
                r_char_is_letter = r_char_index < len(text) and text[r_char_index].isalpha()
                
                if l_char_is_letter or r_char_is_letter or not matched_text:
                    continue

                try:
                    l_token_i = self.main.token_at_char(l_char_index).i
                    r_token_i = self.main.token_at_char(r_char_index-1).i
                except Exception as e:
                    print(f"Error: Unable to find token at index of {l_char_index}.")
                    print(f"Error: Unable to find token at index of {r_char_index}.")
                    print(f"\tMatched: '{matched_text}'")
                    print(e)
                    continue

                # This is the matched substring (which would be
                # a species) as a span in the parent document.
                span = self.main.sp_doc[l_token_i:r_token_i+1]
                
                # Expand Species
                # Let's say there's a word like "squirrel". That's a bit ambiguous. 
                # Is it a brown squirrel, a bonobo? If the species is possibly missing
                # information (like an adjective to the left of it), we should expand
                # in order to get a full picture of the species.
                unclear_1 = len(span) == 1 and span[0].pos_ == "NOUN"
                unclear_2 = span.start > 0 and self.main.sp_doc[span.start-1].pos_ in ["ADJ"]
                
                if unclear_1 or unclear_2:
                    span = self.main.expand_unit(
                        il_unit=span.start, 
                        ir_unit=span.end-1,
                        il_boundary=0,
                        ir_boundary=len(self.main.sp_doc),
                        speech=["ADJ", "PROPN"],
                        literals=["-"],
                        include=True,
                        direction="LEFT",
                        verbose=verbose
                    )
                
                # Remove Outer Symbols
                # There are times where a species is identified with a parenthesis
                # nearby. Here, we remove that parenthesis (and any other symbols).
                span = self.main.contract_unit(
                    il_unit=span.start, 
                    ir_unit=span.end-1, 
                    speech=["PUNCT", "SYM", "DET", "PART"],
                    include=False,
                    verbose=verbose
                )

                if not span:
                    print(f"Error: Span does not exist; left character index {l_char_index}.")
                    print(f"\tMatched: '{matched_text}'")
                    continue
            
                # A species must have a noun or a
                # proper noun. This may help discard
                # bad results.
                letter_found = False
                for token in span:
                    if token.pos_ in ["NOUN", "PROPN"] or token.lower_ in self.dictionary:
                        letter_found = True
                        break

                if not letter_found:
                    continue

                # Adding Species
                spans.append(span)
                for token in span:
                    if token in tokens or token.pos_ in ["PUNCT", "SYM", "DET", "PART"]:
                        continue
                    tokens.append(token)
                    token_to_span[token] = span
        
        spans = list({span.start: span for span in spans}.values())
        spans.sort(key=lambda span: span.start)
        
        # Remove Overlapping Spans
        i = 0
        while i < len(spans):
            j = i + 1
            while j < len(spans):
                if spans[i].start <= spans[j].start <= spans[i].end and spans[i].start <= spans[j].end <= spans[i].end:
                    spans.pop(j)
                else:
                    j += 1
            i += 1
        
        
        
        span_starts = [span[0] for span in spans]
        # If you asked me to either tell you what the bar function is
        # or to jump out of a plane with no parachute, I would choose the
        # plane.
        span_indices = self.bar([span[0].i for span in spans])
        span_starts = [self.main.sp_doc[i] for i in span_indices]

        if verbose and VERBOSE_LEVEL >= 1:
            print("Output of load_species:")
            print(f"Spans: {spans}")
            print(f"Tokens: {tokens}")
            print(f"Mapped Tokens: {token_to_span}")
            print(f"Span Starts: {span_starts}")
        
        return (spans, tokens, token_to_span, span_starts)
    
    
    
    def is_same_scientific_name(self, sp_a, sp_b):
        sp_a_text = sp_a.text.lower().split()[-2:]
        sp_b_text = sp_b.text.lower().split()[-2:]

        if len(sp_a_text) != 2 or len(sp_b_text) != 2:
            return False

        return sp_a_text[0][0] == sp_b_text[0][0] and sp_a_text[1] == sp_b_text[1]


    
    def is_alternate(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_a_text = sp_a.text.lower()
            
        # Species B is an alternate name for Species A
        if sp_b_text in self.alternate_names.get(sp_a_text, []):
            return True
        
        # Species A is an alternate name for Species B
        if sp_a_text in self.alternate_names.get(sp_b_text, []):
            return True

        return False



    def is_same_text(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_a_text = sp_a.text.lower()

        if sp_a_text == sp_b_text:
            return True
            
        sp_a_singular_texts = [sp_a_text] if sp_a[-1].tag_ in ["NN", "NNP"] else self.main.singularize(sp_a_text)
        sp_b_singular_texts = [sp_b_text] if sp_b[-1].tag_ in ["NN", "NNP"] else self.main.singularize(sp_b_text)

        if set(sp_a_singular_texts).intersection(sp_b_singular_texts):
            return True
        return False



    def has_same_base_nouns(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_b_0_text = sp_b[0].lower_
        sp_b_0_is_noun = sp_b[0].pos_ in ["NOUN", "PROPN"]
        
        sp_b_nouns = []
        sp_b_num_adjectives = 0
        
        for token in sp_b:
            if not sp_b_nouns and token.pos_ == "ADJ":
                sp_b_num_adjectives += 1
            elif token.pos_ in ["PROPN", "NOUN"]:
                sp_b_nouns.append(token)

        if not sp_b_nouns:
            return False

        sp_b_nouns_text = [noun.lower_ for noun in sp_b_nouns]
        sp_b_singular_texts = [" ".join(sp_b_nouns_text)] if sp_b_nouns[-1].tag_ in ["NN", "NNP"] else self.main.singularize(" ".join(sp_b_nouns_text))

        sp_a_text = sp_a.text.lower()
        sp_a_0_text = sp_a[0].lower_
        sp_a_0_is_noun = sp_a[0].pos_ in ["NOUN", "PROPN"]

        # Case Example: 'Hyla' v. 'Hyla tadpoles'
        if sp_a_0_text == sp_b_0_text and (sp_a_0_is_noun or sp_b_0_is_noun):
            if sp_a_text in sp_b_text or sp_b_text in sp_a_text:
                return True
        
        # Case Example: 'dogs' v. 'red dogs'
        sp_a_nouns = []
        sp_a_num_adjectives = 0
        for token in sp_a:
            if not sp_a_nouns and token.pos_ == "ADJ":
                sp_a_num_adjectives += 1
            elif token.pos_ in ["PROPN", "NOUN"]:
                sp_a_nouns.append(token)
        
        if not sp_a_nouns:
            return False
        
        sp_a_nouns_text = [noun.lower_ for noun in sp_a_nouns]
        
        if sp_a_nouns and sp_b_nouns and (
            (sp_a_num_adjectives == 1 and sp_b_num_adjectives == 0) or 
            (sp_b_num_adjectives == 1 and sp_a_num_adjectives == 0)
        ):
            sp_a_singular_texts = [" ".join(sp_a_nouns_text)] if sp_a_nouns[-1].tag_ in ["NN", "NNP"] else self.main.singularize(" ".join(sp_a_nouns_text))
            if set(sp_a_singular_texts).intersection(sp_b_singular_texts):
                return True

        return False



    def find_same_species(self, sp_A, sp_b, verbose=False):
        # METHOD 1: Check for Literal Matches
        for sp_a in sp_A:
            if self.is_same_text(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 1: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        # METHOD 1.1: Check for Scientific Names
        for sp_a in sp_A:
            if self.is_same_scientific_name(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 1.1: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        # METHOD 2: Check Alternate Names
        for sp_a in sp_A:
            if self.is_alternate(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 2: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a
        
        # METHOD 3: Check Nouns
        # This is used if one or none of the species being compared
        # has 1 adjective.
        for sp_a in sp_A:
            if self.has_same_base_nouns(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 3: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        # METHOD 4: Last Ditch Effort
        # If there's been no matches, we just look for one string inside of
        # another.
        for sp_a in sp_A:
            sp_a_text = sp_a.text.lower()
            sp_b_text = sp_b.text.lower()
            
            r_sp_a_text = re.compile(f"(\s|^){re.escape(sp_a_text)}(\s|$)", re.IGNORECASE)
            r_sp_b_text = re.compile(f"(\s|^){re.escape(sp_b_text)}(\s|$)", re.IGNORECASE)
            
            if re.match(r_sp_a_text, sp_b_text) or re.match(r_sp_b_text, sp_a_text):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 4: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"No Matches Between {sp_A} and {sp_b}")
        
        return None



    def span_at_token(self, token):
        if token in self.token_to_span:
            return self.token_to_span[token]
        return None



    def is_species(self, token):
        return token in self.tokens



    def has_species(self, tokens, verbose=False):
        for token in tokens:
            if token in self.tokens:
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tToken '{token}' is Species")
                return True
        return False