In [1]:
import re
import csv
import sys
import json
import math
import spacy
import textacy
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from taxonerd import TaxoNERD
from spacy.matcher import Matcher, DependencyMatcher, PhraseMatcher

In [2]:
# Notebook-wide verbosity setting.
# NOTE(review): not referenced by any cell visible here; presumably 0 means
# "no diagnostic output" - confirm before relying on it.
VERBOSE_LEVEL = 0

In [3]:
# Helper Functions
def flatten(arr):
    """Return the leaves of an arbitrarily nested list, in left-to-right order.

    Only ``list`` instances are expanded; any other value (including tuples
    and strings) is treated as a single leaf. A non-list argument yields a
    one-element list containing that argument.
    """
    leaves = []
    # Explicit stack instead of recursion; children are pushed reversed so
    # that they are popped (and emitted) in their original order.
    pending = [arr]
    while pending:
        current = pending.pop()
        if isinstance(current, list):
            pending.extend(reversed(current))
        else:
            leaves.append(current)
    return leaves

def find(arr, foo):
    """Return the first element of ``arr`` for which ``foo(element)`` is truthy.

    Returns ``None`` when no element satisfies the predicate.
    """
    return next((candidate for candidate in arr if foo(candidate)), None)

def find_all(arr, foo):
    """Return a new list with every element of ``arr`` accepted by ``foo``.

    Order is preserved; the result is empty when nothing matches.
    """
    return [candidate for candidate in arr if foo(candidate)]

def find_index(arr, foo):
    """Return the position of the first element accepted by ``foo``.

    Returns ``-1`` when no element of ``arr`` satisfies the predicate.
    """
    return next((pos for pos, candidate in enumerate(arr) if foo(candidate)), -1)

In [204]:
class Entity:
    """A labelled, inclusive token range ``[l, r]`` over a document.

    ``doc`` is indexed with ``doc[i]`` and sliced with ``doc[l:r+1]``; the
    slice must expose ``.text`` (used by :meth:`lower`) and be iterable
    (used by :meth:`tokens`). ``children`` holds nested entities.
    """

    # Labels
    LIST = 1
    ITEM = 2
    QUOTE = 3
    BREAK = 4
    END = 5
    AND_OR_END = 6
    COLON = 7
    COLON_BREAK = 8
    I_CLAUSE = 9
    D_CLAUSE = 10
    P_PHRASE = 11
    BRACKETS = 12
    FRAGMENT = 13

    # Display names returned by label_(); anything else maps to "None".
    _LABEL_NAMES = {
        LIST: "List",
        ITEM: "Item",
        QUOTE: "Quote",
        BREAK: "Break",
        END: "End",
        AND_OR_END: "And or End",
        COLON: "Colon",
        COLON_BREAK: "Colon Break",
        I_CLAUSE: "Independent Clause",
        D_CLAUSE: "Dependent Clause",
        P_PHRASE: "Prepositional Phrase",
        BRACKETS: "Brackets",
        FRAGMENT: "Fragment",
    }

    def __init__(self, doc, label=None, l=None, r=None, children=None):
        """Create an entity covering tokens ``l`` through ``r`` (inclusive) of ``doc``."""
        self.doc = doc
        self.label = label
        self.l = l
        self.r = r
        # A fresh list per instance; never share a default between entities.
        self.children = children or []

    def label_(self):
        """Return the human-readable name of ``self.label`` ("None" if unknown)."""
        return Entity._LABEL_NAMES.get(self.label, "None")

    def size(self):
        """Return the number of tokens covered (both bounds are inclusive)."""
        return self.r - self.l + 1

    def span(self):
        """Return the document slice ``doc[l:r+1]``."""
        return self.doc[self.l:self.r+1]

    def lower(self):
        """Return the lower-cased text of the covered slice."""
        return self.span().text.lower()

    def start(self):
        """Return the first token of the entity."""
        return self.doc[self.l]

    def end(self):
        """Return the last token of the entity."""
        return self.doc[self.r]

    @staticmethod
    def tokens(*, ent=None, ents=None):
        """Return the tokens covered by ``ents`` (sorted by ``token.i``) or by ``ent``.

        ``ents`` takes precedence when it is non-empty. An explicitly supplied
        but empty ``ents`` (with no ``ent``) yields ``[]`` rather than ``None``
        so callers can iterate the result unconditionally. ``None`` is
        returned only when neither argument is given.
        """
        if ents:
            collected = [token for entity in ents for token in entity.span()]
            return sorted(collected, key=lambda token: token.i)
        if ent is not None:
            return list(ent.span())
        return [] if ents is not None else None

    @staticmethod
    def is_conjunction(token):
        """Return True when the token's lower-cased text is "and" or "or"."""
        return token.lower_ in ["and", "or"]

    @staticmethod
    def same_speech(speech_1, speech_2):
        """Return True when two POS tags match, treating all noun-like tags as equal."""
        nouns = ["NOUN", "PRON", "PROPN"]
        if speech_1 in nouns and speech_2 in nouns:
            return True
        return speech_1 == speech_2

In [5]:
class Quotes:
    """Merge each pair of straight double quotes, and everything between, into one QUOTE entity.

    Only the ASCII ``"`` character is recognised. The entity list passed to
    the constructor is modified in place (it is not copied).
    """

    def __init__(self, main, entities):
        self.main = main
        self.entities = entities

    def is_quote(self, i):
        """Return True when index ``i`` is in range and that entity's text is exactly ``"``."""
        return i < len(self.entities) and self.entities[i].lower() == "\""

    def identify(self):
        """Collapse quoted runs and return the entity list.

        For every opening quote the next quote entity is taken as its closer:
        the opener is labelled ``Entity.QUOTE``, its right bound is extended
        to the closer, and every entity after the opener up to and including
        the closer is removed. A quote with no closer is left untouched and
        unlabelled instead of running past the end of the list.
        """
        i = 0

        while i < len(self.entities):
            if not self.is_quote(i):
                i += 1
                continue

            # Locate the matching closer before mutating anything.
            closer = next(
                (j for j in range(i + 1, len(self.entities)) if self.is_quote(j)),
                None,
            )

            if closer is not None:
                opener = self.entities[i]
                opener.label = Entity.QUOTE
                opener.r = self.entities[closer].r
                del self.entities[i + 1:closer + 1]

            # Always advance: an unmatched quote would otherwise be revisited forever.
            i += 1

        return self.entities

In [6]:
class Brackets:
    """Collapse bracketed runs of entities into a single BRACKETS entity.

    Works on a shallow copy of the entity list; the Entity objects themselves
    are mutated (label and right bound).
    """

    # Opening character -> the character that closes it. The em dash is its
    # own closer, so it appears in both OPENING and CLOSING.
    MATCHES = {
        "[": "]", 
        "(": ")",
        "—": "—",
    }

    OPENING = MATCHES.keys()
    CLOSING = MATCHES.values()

    def __init__(self, main, entities):
        self.main = main
        # Indices (into self.entities) of the currently unclosed openers.
        self.stack = []
        self.entities = [*entities]

    def is_opening(self, i):
        """Return True when entity ``i`` exists and its first character is an opener."""
        return i < len(self.entities) and self.entities[i].lower()[0] in Brackets.OPENING

    def is_closing(self, i):
        """Return True when entity ``i`` exists and its first character is a closer."""
        return i < len(self.entities) and self.entities[i].lower()[0] in Brackets.CLOSING

    def closes(self, i):
        """Return True when entity ``i`` is the closer paired with the most recent opener.

        Requires a non-empty stack.
        """
        opener = self.entities[self.stack[-1]].lower()[0]
        closer = self.entities[i].lower()[0]
        return Brackets.MATCHES[opener] == closer
    
    def identify(self):
        """Scan the entities once, merging bracketed content, and return the list.

        The outermost opener is labelled ``Entity.BRACKETS`` and its right
        bound grows as the entities inside the bracket are consumed.
        """
        self.stack = []
        
        i = 0
        while i < len(self.entities):
            # Closing
            # Checked before "Opening" and only with a non-empty stack, which
            # is what lets a self-matching character (the em dash) open on its
            # first occurrence and close on its second.
            if self.is_closing(i) and self.stack:
                # j is the popped opener index, or None when this closer does
                # not pair with the most recent opener (nothing is popped).
                j = None if not self.closes(i) else self.stack.pop()
                
                if not self.stack and j != None:
                    # Outermost bracket closed: absorb the closer. i is not
                    # advanced because the next entity slides into slot i.
                    self.entities[j].r = self.entities[i].r
                    self.entities.pop(i)
                    continue
                else:
                    # Mismatched closer, or a nested bracket closed while an
                    # outer one is still open: the entity stays in the list.
                    # NOTE(review): the outer opener's r already covers this
                    # entity, so the two overlap - confirm this is intended.
                    i += 1

            # Opening
            elif self.is_opening(i):
                # Only the outermost opener is labelled; nested openers are
                # pushed but stay separate, unlabelled entities.
                if not self.stack:
                    self.entities[i].label = Entity.BRACKETS
                self.stack.append(i)
                i += 1

            # Consuming
            elif self.stack:
                # If you're at the end of the possible entities,
                # and the list is unclosed, we must stop.
                if i + 1 >= len(self.entities):
                    break
                # The outermost opener's bound is moved to the entity AFTER
                # the current one, then the current entity is removed; i is
                # left as-is so the following entity is examined next.
                self.entities[self.stack[0]].r = self.entities[i+1].r
                self.entities.pop(i)

            else:
                i += 1

        return self.entities

In [7]:
class Separators:
    """Label ``,`` / ``;`` entities as BREAK, END or AND_OR_END.

    A separator followed by a single-token coordinating conjunction is an
    "end" and absorbs that conjunction; every other separator is a "break".
    """

    def __init__(self, main, entities):
        self.main = main
        self.entities = list(entities)

    def is_break(self, i):
        """Return True when entity ``i`` is a separator not followed by a one-token CCONJ."""
        if i >= len(self.entities) or self.entities[i].lower() not in (";", ","):
            return False

        nxt = i + 1
        if nxt >= len(self.entities):
            return True

        # Breaks cannot have a following conjunction.
        # Else, it would be an end and not a break.
        follower = self.entities[nxt]
        has_conjunction = follower.size() == 1 and follower.span()[0].pos_ in ["CCONJ"]
        return not has_conjunction

    def is_end(self, i):
        """Return True when entity ``i`` is a separator that is not a break."""
        in_range = i < len(self.entities)
        return in_range and self.entities[i].lower() in (";", ",") and not self.is_break(i)

    def identify(self):
        """Label every separator and return the (possibly shortened) entity list."""
        pos = 0

        while pos < len(self.entities):
            current = self.entities[pos]

            if self.is_break(pos):
                current.label = Entity.BREAK
                pos += 1
                continue

            if not self.is_end(pos):
                pos += 1
                continue

            # End: swallow the conjunction that follows the separator. The
            # cursor is not advanced here; the merged entity no longer reads
            # as a bare separator, so the next pass steps over it.
            follower = self.entities.pop(pos + 1)
            if follower.start().lower_ in ["and", "or"]:
                current.label = Entity.AND_OR_END
            else:
                current.label = Entity.END
            current.r += 1

        return self.entities

In [8]:
class Colons:
    """Split the entity list at the first entity whose text ends with a colon."""

    def __init__(self, main, entities):
        self.main = main
        self.entities = list(entities)

    def identify(self):
        """Label the first colon-terminated entity and fold the remainder into one COLON entity.

        The colon entity receives ``Entity.COLON_BREAK`` unless it already has
        a label. If anything follows it, the next entity is labelled
        ``Entity.COLON``, stretched to the right bound of the last entity, and
        the list is cut immediately after it. Only the first colon is handled.
        """
        for idx, entity in enumerate(self.entities):
            if entity.lower()[-1] != ":":
                continue

            if not entity.label:
                entity.label = Entity.COLON_BREAK

            if idx + 1 < len(self.entities):
                remainder = self.entities[idx + 1]
                remainder.label = Entity.COLON
                remainder.r = self.entities[-1].r
                self.entities = self.entities[:idx + 2]

            break

        return self.entities

In [9]:
class Independent_Clauses:
    """Grow entities carrying one of the ``allowed`` labels into I_CLAUSE entities."""

    def __init__(self, main, entities):
        self.main = main
        self.entities = [*entities]
        # Labels that may start a clause; set by identify().
        self.allowed = []

    def end(self, i):
        """Return True when the clause being built must stop before entity ``i``.

        That is the case when ``i`` is out of range, when entity ``i`` itself
        carries an allowed (clause-starting) label, or when entity ``i + 1``
        is a colon part, a clause or a prepositional phrase.
        """
        if i >= len(self.entities):
            return True

        if self.entities[i].label in self.allowed:
            return True

        # Here, we check if the entity after
        # the supposed end is a clause. If it
        # is, then we can end at the current entity.
        return bool(
            i + 1 < len(self.entities) and
            self.entities[i+1].label in [
                Entity.COLON,
                Entity.COLON_BREAK,
                Entity.I_CLAUSE,
                Entity.D_CLAUSE,
                Entity.P_PHRASE
            ]
        )

    def identify(self, allowed):
        """Turn every entity labelled with one of ``allowed`` into an independent clause.

        Each such entity is relabelled ``Entity.I_CLAUSE`` and absorbs the
        entities that follow it until :meth:`end` says stop. Absorbed BRACKETS
        and QUOTE entities are kept as children. Returns the entity list.
        """
        self.allowed = allowed

        i = 0

        while i < len(self.entities):
            if self.entities[i].label not in self.allowed:
                i += 1
                continue

            # Skip Clause
            # Only reachable when ``allowed`` itself contains a clause label.
            # ``i`` indexes self.entities, so step to the next entity; the
            # entity's ``r`` is a token index and must not be used as a cursor.
            if self.entities[i].label in [
                Entity.I_CLAUSE,
                Entity.D_CLAUSE,
                Entity.P_PHRASE
            ]:
                i += 1
                continue

            # Create Clause
            self.entities[i].label = Entity.I_CLAUSE
            while not self.end(i+1):
                self.entities[i].r = self.entities[i+1].r

                # Add Child
                if self.entities[i+1].label in [Entity.BRACKETS, Entity.QUOTE]:
                    self.entities[i].children.append(self.entities[i+1])

                self.entities.pop(i+1)

            i += 1

        return self.entities

In [10]:
class Dependent_Clauses:
    """Grow relative pronouns and subordinating conjunctions into D_CLAUSE entities.

    The entity list passed to the constructor is modified in place (it is not
    copied).
    """

    # Words that open a dependent clause when they form a whole entity.
    RELATIVE_NOUNS = [
        "who",
        "whom",
        "which",
        "what",
        "that",
        "whose",
        "whomever",
        "whoever",
        "whichever",
        "whatever"
    ]

    def __init__(self, main, entities):
        self.main = main
        self.entities = entities
        # Separator character that terminates a clause; set by identify().
        self.separator = None

    def end(self, i):
        """Return True when the clause being built must stop before entity ``i``.

        Stops at the end of the list, when entity ``i + 1`` is a colon part /
        clause / prepositional phrase, or when entity ``i`` starts with the
        separator, is a relative pronoun, or starts with an SCONJ token.
        """
        if i >= len(self.entities):
            return True

        # Here, we check if the entity after
        # is a clause. As we don't combine two
        # clauses, we must end here if that is
        # the case.
        if bool(
            i + 1 < len(self.entities) and
            self.entities[i+1].label in [
                Entity.COLON,
                Entity.COLON_BREAK,
                Entity.I_CLAUSE,
                Entity.D_CLAUSE,
                Entity.P_PHRASE
            ]
        ):
            return True

        return bool(
            self.entities[i].lower()[0] == self.separator or
            self.entities[i].lower() in Dependent_Clauses.RELATIVE_NOUNS or
            self.entities[i].start().pos_ in ["SCONJ"]
        )

    def identify(self, separator):
        """Label dependent clauses and return the entity list.

        ``separator`` is the character (``","`` or ``";"``) that ends a
        clause. Entities already labelled as colon parts, clauses or
        prepositional phrases are left alone. Absorbed BRACKETS and QUOTE
        entities are kept as children of the new clause.
        """
        self.separator = separator

        i = 0

        while i < len(self.entities):
            # Skip
            # ``i`` indexes self.entities, so move to the next entity; the
            # entity's ``r`` is a token index and must not be used as a cursor.
            if self.entities[i].label in [
                Entity.COLON,
                Entity.COLON_BREAK,
                Entity.I_CLAUSE,
                Entity.D_CLAUSE,
                Entity.P_PHRASE
            ]:
                i += 1
                continue

            # Indicators of Dependent Clause
            rel = self.entities[i].lower() in Dependent_Clauses.RELATIVE_NOUNS
            sub = self.entities[i].start().pos_ == "SCONJ"

            if not sub and not rel:
                i += 1
                continue

            # Create Clause
            self.entities[i].label = Entity.D_CLAUSE
            while not self.end(i+1):
                self.entities[i].r = self.entities[i+1].r

                # Add Child
                if self.entities[i+1].label in [Entity.BRACKETS, Entity.QUOTE]:
                    self.entities[i].children.append(self.entities[i+1])

                self.entities.pop(i+1)

            i += 1

        return self.entities

In [11]:
class Prepositional_Phrases:
    """Grow single-token prepositions (ADP) into P_PHRASE entities."""

    def __init__(self, main, entities):
        self.main = main
        self.entities = list(entities)

    # A prepositional phrase is typically ended by a noun.
    # Therefore, when we run into a noun, we end the phrase.
    # We must also check that it is the last of the first noun(s)
    # we encounter.
    def last_noun(self, i):
        """Return True when entity ``i`` starts with a noun-like token that closes a noun run.

        The run is closed when entity ``i`` is the last entity, or when the
        next entity is a single token that is neither noun-like nor a PART.
        """
        noun_tags = ("NOUN", "PROPN", "PRON")

        if i >= len(self.entities):
            return False
        if self.entities[i].start().pos_ not in noun_tags:
            return False

        if i + 1 > len(self.entities) - 1:
            return True

        follower = self.entities[i + 1]
        return follower.size() == 1 and follower.start().pos_ not in noun_tags + ("PART",)

    def end(self, i):
        """Return True when the phrase must not absorb entity ``i``.

        True when ``i`` is the final entity (or beyond), when entity ``i + 1``
        is a colon part / clause / prepositional phrase, or when entity ``i``
        is the last noun of a noun run.
        """
        if i + 1 >= len(self.entities):
            return True

        blocking = [
            Entity.COLON,
            Entity.COLON_BREAK,
            Entity.I_CLAUSE,
            Entity.D_CLAUSE,
            Entity.P_PHRASE
        ]
        if self.entities[i + 1].label in blocking:
            return True

        return self.last_noun(i)

    def identify(self):
        """Label prepositional phrases and return the entity list.

        Each single-token ADP entity becomes ``Entity.P_PHRASE`` and absorbs
        the entities after it until :meth:`end` fires; a closing noun, if
        present, is absorbed as well. BRACKETS and QUOTE entities absorbed by
        the main loop are kept as children.
        """
        idx = 0

        while idx < len(self.entities):
            head = self.entities[idx]

            if head.size() == 1 and head.start().pos_ == "ADP":
                head.label = Entity.P_PHRASE

                while not self.end(idx + 1):
                    absorbed = self.entities.pop(idx + 1)
                    head.r = absorbed.r
                    if absorbed.label in [Entity.BRACKETS, Entity.QUOTE]:
                        head.children.append(absorbed)

                # The noun that stopped the loop still belongs to the phrase.
                if self.last_noun(idx + 1):
                    head.r = self.entities.pop(idx + 1).r

            idx += 1

        return self.entities

In [12]:
class Lists:
    NOUNS = ["NOUN", "PRON", "PROPN"]
    
    def __init__(self, main, entities):
        self.main = main
        self.entities = [*entities]
        self.separator = None

    def is_stop(self, entity):
        """Return True when ``entity`` terminates a list item.

        An item stops at a colon part, a clause or a prepositional phrase,
        and at a BREAK entity that starts with the current separator.
        """
        if entity.label in (
            Entity.I_CLAUSE,
            Entity.D_CLAUSE,
            Entity.P_PHRASE,
            Entity.COLON,
            Entity.COLON_BREAK,
        ):
            return True
        return entity.label == Entity.BREAK and entity.lower()[0] == self.separator

    def find_lists(self, sep):
        """Find candidate lists and pairs among ``self.entities``.

        ``sep`` is the separator character (``","`` or ``";"``) and is stored
        in ``self.separator``. Returns a list of candidates; each candidate is
        a list of ``[l, r]`` items, where ``l`` and ``r`` are inclusive
        indices into ``self.entities``. A candidate with a single item is a
        "pair": one range containing exactly one conjunction. An item whose
        ``l`` exceeds its ``r`` is empty.
        """
        self.separator = sep

        # [[None, None]] marks a list that has not been opened yet.
        lists = [[[None, None]]]

        for i, entity in enumerate(self.entities):
            opened = lists[-1][0] != [None, None]
            remove_list = entity.label in [Entity.COLON, Entity.COLON_BREAK]
            close_list = entity.label in [Entity.AND_OR_END] and entity.lower()[0] == sep
            close_item = entity.label in [Entity.BREAK] and entity.lower() == sep

            # Close List
            if opened and close_list:
                # Invalid List, Remove
                if len(lists[-1]) < 2:
                    lists[-1] = [[None, None]]
                    continue

                # The last item starts right after the ", and" / ", or"
                # entity and runs up to (not including) the next stop entity,
                # or to the end of the entities when there is no stop. If the
                # very next entity is itself a stop, it alone forms the item.
                last_item_l = i + 1
                last_item_r = last_item_l

                length = find_index(self.entities[last_item_l:], lambda e: self.is_stop(e))
                if length > 0:
                    last_item_r += length - 1
                elif length == -1:
                    last_item_r = len(self.entities) - 1

                # Add Last Item
                lists[-1].append([last_item_l, last_item_r])
                lists.append([[None, None]])

            # Close Item
            elif opened and close_item:
                # Start a new, still empty item; it grows as entities follow.
                lists[-1].append([i + 1, i])

            # Remove List
            elif opened and remove_list:
                lists[-1] = [[None, None]]

            # Continue Item
            elif not opened:
                lists[-1][0] = [i, i]
            else:
                lists[-1][-1][1] += 1

        # If we reach the end of the list and the last
        # list is invalid (< 3 items), we remove it. It is also removed when
        # no matching AND_OR_END entity exists from its start onwards.
        if lists:
            last = lists[-1]
            closes_later = len(last) >= 3 and find(
                self.entities[last[0][0]:],
                lambda e: e.label == Entity.AND_OR_END and e.lower()[0] == sep,
            )
            if not closes_later:
                lists.pop()

        # In each item, we look for pairs (e.g. X and Y).
        # We only handle one conjunction.
        # Iterate over a snapshot so the pairs appended here are not re-scanned.
        for lst in list(lists):
            for l, r in lst:
                # An empty item range yields no tokens; never iterate None.
                tokens = Entity.tokens(ents=self.entities[l:r+1]) or []
                conj = find_all(tokens, lambda t: Entity.is_conjunction(t))
                if len(conj) == 1:
                    lists.append([[l, r]])

        # If there's no lists at all, we check if there's a pairing.
        # We should divvy the entities up by any separators, but
        # pairs aren't of too much importance.
        # TODO: ADD ABOVE FUNCTIONALITY
        tokens = Entity.tokens(ents=self.entities) or []
        num_conj = len(find_all(tokens, lambda t: Entity.is_conjunction(t)))
        if not lists and num_conj == 1:
            lists.append([[0, len(self.entities) - 1]])

        # Remove duplicates, keeping the last occurrence of each candidate.
        lists = [lst for k, lst in enumerate(lists) if lst not in lists[k+1:]]

        # Remove Invalid Lists
        # The list contains one item and that item only contains one
        # entity, or the list has two items.
        lists = [
            lst for lst in lists
            if not (
                (len(lst) == 1 and lst[0][0] == lst[0][1]) or
                len(lst) == 2
            )
        ]

        return lists

    def clean_lists(self, lists):
        overlaps = []

        i = 0
        while i + 1 < len(lists):
            a = lists[i]
            b = lists[i+1]
                  
            if a[-1] != b[0]:
                i += 1
                continue

            if len(a) <= 1 or len(b) <= 1:
                i += 1
                continue

            # No Way to Split
            if a[-1][1] - a[-1][0] <= 1:
                overlaps.extend([i, i + 1])
                i += 2
            else:
                a[-1][1] = a[-1][0]
                b[0][0] = b[0][1]
                i += 2
        
        lists = [l for i, l in enumerate(lists) if i not in overlaps]
        return lists

    def expand_noun(self, tokens, start, direction):
        for group in [*self.main.sp_doc.noun_chunks, *self.main.sp_doc.ents]:
            tokens_i = [t.i for t in group]
            if tokens[start].i in tokens_i:
                while start >= 0 and start < len(tokens) and tokens[start].i in tokens_i:
                    start += 1 * direction
                start += 1 * direction * -1
            break
        
        return start
        
    def char_bound_list(self, lst):
        """Try to bound a candidate list by the literal first word of its items.

        ``lst`` is a list of ``[l, r]`` items (inclusive indices into
        ``self.entities``). The second-to-last item is the "base item": its
        first token's lower-cased text is the left bound that the other items
        must start with.

        Returns an ``Entity`` labelled LIST whose children are ITEM entities
        (start item, end item, then the items in between, in that order), or
        ``None`` when the list cannot be bounded. In two situations the work
        is handed to ``bound_list`` on a shortened candidate instead.
        """
        # We bound each item according to characters or a speech.
        # We find these bounds from the "base item", the second to last item.
        base_tokens = Entity.tokens(ents=self.entities[lst[-2][0]:lst[-2][1]+1])
        
        # As we're bounding by characters, primarily, the left bound is just
        # the characters of the first token
        l_bound = base_tokens[0].lower_

        # The right bound is the first tag, of the below set of tags, that we
        # encounter in the base tokens. If there's not such a token, we cannot
        # bound the items.
        # (The base tokens are scanned from the end towards the start.)
        speech = ["NOUN", "PROPN", "PRON", "VERB", "NUM"]
        r_bound = None
        for i in range(len(base_tokens) - 1, -1, -1):
            if base_tokens[i].pos_ in speech:
                r_bound = base_tokens[i]
                break

        # NOTE(review): r_bound is only used for this existence check; the
        # end item below is bounded by the first token with a tag in `speech`.
        if not r_bound:
            return None

        # The inner items are already bounded on the left and right sides.
        # All we need to check is whether the start matches with the left bound.
        # (Excludes the first item, the base item and the last item.)
        inner_items = lst[1:-2]

        for i, item in enumerate(inner_items):
            l = item[0]
            r = item[1]
            
            tokens = Entity.tokens(ents=self.entities[l:r+1])

            # If it doesn't match, we check if the next set of items can be
            # bounded. If not, we cannot bound the list.
            if tokens[0].lower_ != l_bound:
                if len(inner_items) - i - 1 >= 2:
                    # Retry with the items after the mismatching one, using
                    # the part-of-speech based bounding.
                    return self.bound_list(lst[i+2:])
                return None
            
        # Check for L Bound in Starting Item
        # Scan the first item from its end for a token equal to the left bound.
        start_tokens = Entity.tokens(ents=self.entities[lst[0][0]:lst[0][1]+1])
        start_l = len(start_tokens) - 1
        while start_l >= 0 and start_tokens[start_l].lower_ != l_bound:
            start_l -= 1

        # L Bound Not Found
        if start_l < 0:
            # If the list is greater than 4 items, we can
            # cut off the starting item, and try again.
            if len(inner_items) >= 2:
                return self.bound_list(lst[1:])
            return None

        # If the first of the start tokens is a noun, there may be more
        # to include.
        if start_tokens[start_l].pos_ in Lists.NOUNS:
            start_l = self.expand_noun(start_tokens, start_l, -1)
                    
        # Check for R Bound in Ending Item
        # Scan the last item from its start for the first token tagged with
        # one of the `speech` tags.
        end_tokens = Entity.tokens(ents=self.entities[lst[-1][0]:lst[-1][1]+1])
        end_r = 0
        num_end_tokens = len(end_tokens)
        while end_r < num_end_tokens and end_tokens[end_r].pos_ not in speech:
            end_r += 1

        if end_r >= num_end_tokens:
            return None

        # If the last of the end tokens is a noun, there may be more
        # to include.
        if end_tokens[end_r].pos_ in Lists.NOUNS:
            end_r = self.expand_noun(end_tokens, end_r, 1)
        
        # Create List
        # Bounds of the new entities are document token indices (token.i).
        entity_start_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=start_tokens[start_l].i, r=start_tokens[-1].i)
        entity_end_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=end_tokens[0].i, r=end_tokens[end_r].i)
        
        entity_list = Entity(self.main.sp_doc, label=Entity.LIST, l=start_tokens[start_l].i, r=end_tokens[end_r].i)
        entity_list.children.extend([entity_start_item, entity_end_item])
        
        # Every item between the first and the last becomes an ITEM as-is.
        for item in lst[1:-1]:
            tokens = Entity.tokens(ents=self.entities[item[0]:item[1]+1])
            entity_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[0].i, r=tokens[-1].i)
            entity_list.children.append(entity_item)

        return entity_list
            
    def bound_list(self, lst):
        """Try to bound a candidate list by part of speech.

        ``lst`` is a list of ``[l, r]`` items (inclusive indices into
        ``self.entities``). The second-to-last item is the "base item": the
        POS tag of its first and of its last token tagged with one of the
        `speech` tags become the left and right bounds.

        Returns an ``Entity`` labelled LIST whose children are ITEM entities
        (start item, end item, then the items in between, in that order), or
        ``None`` when the list cannot be bounded. The method calls itself on
        a shortened candidate when an early item does not fit.
        """
        # Base Item (2nd to Last Item) Tokens
        base_tokens = Entity.tokens(ents=self.entities[lst[-2][0]:lst[-2][1]+1])
        num_base_tokens = len(base_tokens)

        # Bound
        speech = ["NOUN", "PROPN", "PRON", "VERB", "NUM"]

        # Find L Bound
        # First base token (left to right) carrying one of the tags.
        l_bound = None
        for i in range(0, num_base_tokens):
            if base_tokens[i].pos_ in speech:
                l_bound = base_tokens[i]
                break

        if not l_bound:
            return None
        
        # Find R Bound
        # Last base token (right to left) carrying one of the tags.
        r_bound = None
        for i in range(num_base_tokens - 1, -1, -1):
            if base_tokens[i].pos_ in speech:
                r_bound = base_tokens[i]
                break

        if not r_bound:
            return None

        # Check Inner Items
        # The inner items must have the left bound,
        # the right bound isn't as important.
        # (Here the inner items include the base item itself.)
        inner_items = lst[1:-1]

        verb_seen = False
        for i, item in enumerate(inner_items):
            l = item[0]
            r = item[1]
            
            item_tokens = Entity.tokens(ents=self.entities[l:r+1])
            item_speech = [token.pos_ for token in item_tokens]

            # Must be Homogeneous
            # Once an item with a verb has been seen, a later item without
            # one breaks the list; retry without the first item if possible.
            if "VERB" not in item_speech and verb_seen:
                if len(inner_items) >= 2:
                    return self.bound_list(lst[1:])  
                else:
                    return None
            elif "VERB" in item_speech:
                verb_seen = True

            # Not Found
            if l_bound.pos_ not in item_speech:
                # We check if the list starting at the next
                # item has a chance. If it does, that becomes
                # the list.
                # NOTE(review): char_bound_list uses `- i - 1` in its
                # analogous check; with `- i + 1` this condition holds for
                # every valid i - confirm which form is intended.
                if len(inner_items) - i + 1 >= 2:
                    return self.bound_list(lst[i+2:])
                return None
        
        # Check Starting Item
        # Scan the first item from its end for a token whose tag matches the
        # left bound (all noun-like tags count as equal).
        start_tokens = Entity.tokens(ents=self.entities[lst[0][0]:lst[0][1]+1])
        start_l = len(start_tokens) - 1
        
        while start_l >= 0 and not Entity.same_speech(start_tokens[start_l].pos_, l_bound.pos_):
            start_l -= 1

        if start_l < 0:
            if len(inner_items) >= 2:
                return self.bound_list(lst[1:])
            return None

        # Adjust Starting Item
        if l_bound.pos_ in Lists.NOUNS:
            start_l = self.expand_noun(start_tokens, start_l, -1)
        
        # Check Ending Item
        # Scan the last item from its start for a token whose tag matches the
        # right bound.
        end_tokens = Entity.tokens(ents=self.entities[lst[-1][0]:lst[-1][1]+1])
        end_r = 0
        num_end_tokens = len(end_tokens)

        while end_r < num_end_tokens and not Entity.same_speech(end_tokens[end_r].pos_, r_bound.pos_):
            end_r += 1

        if end_r >= num_end_tokens:
            return None

        # Adjust Ending Item
        if r_bound.pos_ in Lists.NOUNS:
            end_r = self.expand_noun(end_tokens, end_r, 1)

        # Create List
        # Bounds of the new entities are document token indices (token.i).
        entity_start_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=start_tokens[start_l].i, r=start_tokens[-1].i)
        entity_end_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=end_tokens[0].i, r=end_tokens[end_r].i)
        
        entity_list = Entity(self.main.sp_doc, label=Entity.LIST, l=start_tokens[start_l].i, r=end_tokens[end_r].i)
        entity_list.children.extend([entity_start_item, entity_end_item])

        # Every item between the first and the last becomes an ITEM as-is.
        for item in lst[1:-1]:
            tokens = Entity.tokens(ents=self.entities[item[0]:item[1]+1])
            entity_item = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[0].i, r=tokens[-1].i)
            entity_list.children.append(entity_item)

        return entity_list

    def char_bound_pair(self, pair):
        """Try to bound a pair ("X and Y") by a literally repeated word.

        ``pair`` is a one-item candidate ``[[l, r]]`` (inclusive indices into
        ``self.entities``). The left item starts at the closest token before
        the conjunction whose lower-cased text equals that of the token right
        after the conjunction; the right item ends at the first token after
        the conjunction whose POS matches the token right before it.

        Returns an ``Entity`` labelled LIST with two ITEM children, or
        ``None`` when the pair cannot be bounded (including when there is no
        token on one side of the conjunction).
        """
        tokens = Entity.tokens(ents=self.entities[pair[0][0]:pair[0][1]+1]) or []
        num_tokens = len(tokens)

        m = find_index(tokens, lambda t: Entity.is_conjunction(t))

        # A pair needs at least one token on each side of the conjunction;
        # this also covers "no conjunction found" (m == -1).
        if m < 1 or m + 1 >= num_tokens:
            return None

        before = tokens[m - 1]
        after = tokens[m + 1]

        # Bound L by R Token Characters
        i = m - 1
        while i >= 0 and tokens[i].lower_ != after.lower_:
            i -= 1

        if i < 0:
            return None

        # Bound R by L Token Speech
        j = m + 1
        while j < num_tokens and not Entity.same_speech(before.pos_, tokens[j].pos_):
            j += 1

        if j >= num_tokens:
            return None

        # Bounds of the new entities are document token indices (token.i).
        e_item_l = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[i].i, r=before.i)
        e_item_r = Entity(self.main.sp_doc, label=Entity.ITEM, l=after.i, r=tokens[j].i)
        e_list = Entity(self.main.sp_doc, label=Entity.LIST, l=tokens[i].i, r=tokens[j].i, children=[e_item_l, e_item_r])
        return e_list
    
    def bound_pair(self, pair):
        """Try to bound a pair ("X and Y") by part of speech.

        ``pair`` is a one-item candidate ``[[l, r]]`` (inclusive indices into
        ``self.entities``). Returns an ``Entity`` labelled LIST with two ITEM
        children (left, right), or ``None`` when no bound can be found.
        """
        tokens = Entity.tokens(ents=self.entities[pair[0][0]:pair[0][1]+1])
        tokens = sorted(tokens, key=lambda t: t.i)
        num_tokens = len(tokens)
        
        # m: position of the first conjunction; l / r: cursors that start on
        # the tokens directly left / right of it.
        m = find_index(tokens, lambda t: Entity.is_conjunction(t))
        l = m - 1
        r = m + 1

        speech = ["NOUN", "PROPN", "PRON", "VERB", "NUM"]

        # Find L Bound
        # The tag used to bound the LEFT item is taken from the first tagged
        # token to the RIGHT of the conjunction; that token also starts the
        # right item.
        l_bound = None
        l_bound_i = None
        for i in range(m + 1, num_tokens):
            if tokens[i].pos_ in speech:
                l_bound = tokens[i].pos_
                l_bound_i = tokens[i].i
                break

        if not l_bound:
            return None

        # Find R Bound
        # Symmetrically, the tag used to bound the RIGHT item comes from the
        # nearest tagged token to the LEFT of the conjunction; that token also
        # ends the left item.
        r_bound = None
        r_bound_i = None
        for i in range(m - 1, -1, -1):
            if tokens[i].pos_ in speech:
                r_bound = tokens[i].pos_
                r_bound_i = tokens[i].i
                break

        if not r_bound:
            return None

        # Bound L
        # Walk left until a token matches the left-bound tag.
        while l >= 0 and not Entity.same_speech(tokens[l].pos_, l_bound):
            l -= 1

        if l < 0:
            return None

        # Adjust L if Noun
        if l_bound in Lists.NOUNS:
            l = self.expand_noun(tokens, l, -1)
            
        # Bound R
        # Walk right until a token matches the right-bound tag.
        while r < num_tokens and not Entity.same_speech(tokens[r].pos_, r_bound):
            r += 1
        
        if r >= num_tokens:
            return None

        # Adjust R if Noun
        if r_bound in Lists.NOUNS:
            r = self.expand_noun(tokens, r, 1)
        
        # Bounds of the new entities are document token indices (token.i).
        e_item_l = Entity(self.main.sp_doc, label=Entity.ITEM, l=tokens[l].i, r=r_bound_i)
        e_item_r = Entity(self.main.sp_doc, label=Entity.ITEM, l=l_bound_i, r=tokens[r].i)

        e_list = Entity(self.main.sp_doc, label=Entity.LIST, l=tokens[l].i, r=tokens[r].i)
        e_list.children.extend([e_item_l, e_item_r])
        
        return e_list

    def bound_lists(self, lists):
        # print(f"Lists: {lists}")
        # for items in lists:
        #     print(f"\tList: {items}")
        #     for item in items:
        #         print(f"\t\tItem: {Entity.tokens(ents=self.entities[item[0]:item[1]+1])}")
        
        bound_lists = []
        
        for lst in lists:
            bound = None
        
            if len(lst) == 1:
                bound = self.char_bound_pair(lst)
                if not bound:
                    bound = self.bound_pair(lst)
            else:
                bound = self.char_bound_list(lst)
                if not bound:
                    bound = self.bound_list(lst)
            
            if bound:
                bound_lists.append(bound)

        # print(f"Bounded Lists: {bound_lists}")
        # for bound_list in bound_lists:
        #     print(f"\tBound List ({bound_list.l}, {bound_list.r}): {bound_list.span()}")
        
        # Map (L, R) to Entity List
        mapped_bounds = {}
        for lst in bound_lists:
            mapped_bounds[(lst.l, lst.r)] = lst
        bounds = list(mapped_bounds.keys())

        # Find Largest Coverage of Bounds
        max_coverage = []
        
        for bound in bounds:
            overlap = False
            for i, max_bound in enumerate(max_coverage):
                contains = max_bound[0] <= bound[0] <= max_bound[1] or max_bound[0] <= bound[1] <= max_bound[1]
                surround = bound[0] <= max_bound[0] <= bound[1] or bound[0] <= max_bound[1] <= bound[1]
                
                if contains or surround:
                    overlap = True
                
                    if bound[1] - bound[0] > max_bound[1] - max_bound[0]:
                        max_coverage[i] = bound
            
            if not overlap:
                max_coverage.append(bound)

        # print(f"(Max Coverage) Bounds: {max_coverage}")
        # print(f"Entities: {[(e.l, e.r) for e in self.entities]}")
        
        # Integrate Lists
        for bound in max_coverage:
            l_overlap = None
            l_overlap_i = None
            
            r_overlap = None
            r_overlap_i = None
            
            i = 0
            while i < len(self.entities):
                entity = self.entities[i]
                
                # Overlap w/ Left
                if not l_overlap and entity.l <= bound[0] <= entity.r:
                    l_overlap = entity
                    l_overlap_i = i
    
                # Overlap w/ Right
                if entity.l <= bound[1] <= entity.r:
                    r_overlap = entity
                    r_overlap_i = i

                if l_overlap and r_overlap:
                    break

                i += 1

            # if l_overlap:
            #     print(f"L Overlap: {l_overlap.span()}, {l_overlap.label}")

            # if r_overlap:
            #     print(f"R Overlap: {r_overlap.span()}, {r_overlap.label}")
            
            if l_overlap.label in [Entity.I_CLAUSE, Entity.D_CLAUSE, Entity.P_PHRASE]:
                if l_overlap.l == mapped_bounds[bound].l:
                    # Replace (Not in Use)
                    # self.entities = self.entities[:l_overlap_i] + self.entities[r_overlap_i+1:]
                    # self.entities.insert(l_overlap_i, mapped_bounds[bound])

                    # Add Children
                    l_overlap.r = max(l_overlap.r, mapped_bounds[bound].r)
                    l_overlap.children.append(mapped_bounds[bound])
                    self.entities = self.entities[:l_overlap_i+1] + self.entities[r_overlap_i+1:]
                else:
                    # Split (Not in Use)
                    # l_overlap.r = mapped_bounds[bound].l - 1
                    # self.entities = self.entities[:l_overlap_i+1] + self.entities[r_overlap_i+1:]
                    # self.entities.insert(l_overlap_i + 1, mapped_bounds[bound])
                    
                    # Add Children
                    l_overlap.r = max(l_overlap.r, mapped_bounds[bound].r)
                    l_overlap.children.append(mapped_bounds[bound])
                    self.entities = self.entities[:l_overlap_i+1] + self.entities[r_overlap_i+1:]
                    
            else:
                self.entities = self.entities[:l_overlap_i] + self.entities[r_overlap_i+1:]
                self.entities.insert(l_overlap_i, mapped_bounds[bound])
        
        return self.entities
        
    def identify(self, sep):
        """Run the whole list pipeline for separator ``sep`` and return the entity list.

        Candidates are found, de-conflicted, and finally bounded and spliced
        into the entities.
        """
        candidates = self.clean_lists(self.find_lists(sep))
        return self.bound_lists(candidates)

In [107]:
class Parts:
    """Builds and queries a per-sentence registry of labelled entities.

    After ``update()`` runs, ``self.reg`` holds one dict per sentence of
    ``main.sp_doc``. Each dict maps an inclusive ``(l, r)`` token-index
    pair to the ``Entity`` that covers that range.
    """

    def __init__(self, main):
        # Only `main.sp_doc` is read by this class.
        self.main = main
        self.root = Entity(self.main.sp_doc)
        # One registry dict per sentence; populated by update().
        self.reg = []

    def matches(self, token_i, sent_i, label):
        """Report whether the entity containing a token carries ``label``.

        Args:
            token_i: Token index to look up (compared against the inclusive
                ``(l, r)`` keys of the registry).
            sent_i: Index into ``self.reg`` selecting the sentence registry.
            label: Label value compared against the entity's ``label``.

        Returns:
            ``True``/``False`` for the FIRST registered range (in insertion
            order) that contains ``token_i``; ``False`` when no range
            contains it. Because ``load_registry`` inserts a parent before
            its children, the first hit is the outermost registered entity.

        Raises:
            IndexError: if ``sent_i`` is not a valid index into ``self.reg``.
        """
        sent_reg = self.reg[sent_i]
        # Iterate items(): iterating the dict directly yields only the
        # (l, r) keys, so the range and the entity could not be unpacked.
        for (l, r), ent in sent_reg.items():
            if l <= token_i <= r:
                return ent.label == label
        return False

    def load_registry(self, ent):
        """Flatten ``ent`` and its labelled descendants into a dict.

        The dict is keyed by ``(l, r)``. Children whose ``label`` is falsy
        are skipped together with their whole subtree. When a descendant has
        the same ``(l, r)`` as an entity registered earlier, the descendant
        replaces it (plain ``dict.update`` semantics).
        """
        reg = {(ent.l, ent.r): ent}
        for child in ent.children:
            if not child.label:
                continue
            reg.update(self.load_registry(child))
        return reg

    def update(self):
        """Rebuild ``self.reg`` with one registry per sentence of the doc."""
        self.reg = [
            self.load_entities(list(sent))
            for sent in self.main.sp_doc.sents
        ]

    def load_entities(self, tokens):
        """Build the ``(l, r) -> Entity`` registry for one sentence.

        Every token starts as its own unlabelled single-token ``Entity``.
        The identifier classes are then applied in a fixed order, each
        receiving and returning the entity list. Finally, runs of entities
        that are still unlabelled are merged into ``Entity.FRAGMENT``
        entities and everything is registered under a synthetic parent
        spanning ``(-1, -1)``.
        """
        entities = [
            Entity(self.main.sp_doc, l=token.i, r=token.i)
            for token in tokens
        ]

        entities = Quotes(self.main, entities).identify()
        entities = Brackets(self.main, entities).identify()
        entities = Separators(self.main, entities).identify()

        # A sentence containing an entity that starts with ';' is treated
        # as semicolon-separated; otherwise the separator is a comma.
        sep = ","
        for entity in entities:
            if ";" == entity.lower()[0]:
                sep = ";"
                break

        entities = Colons(self.main, entities).identify()
        entities = Dependent_Clauses(self.main, entities).identify(sep)
        entities = Independent_Clauses(self.main, entities).identify([Entity.END])
        entities = Prepositional_Phrases(self.main, entities).identify()
        entities = Lists(self.main, entities).identify(sep)
        entities = Independent_Clauses(self.main, entities).identify([Entity.AND_OR_END])

        # Diagnostic dump, emitted only when the notebook-level verbosity
        # switch is raised (it used to print unconditionally).
        if VERBOSE_LEVEL >= 1:
            for ent in entities:
                print(ent.label, ent.lower())

        # Merge Individual Entities
        # Each run of consecutive unlabelled entities collapses into its
        # first member, whose right bound grows by one per absorbed entity.
        i = 0
        while i < len(entities):
            if not entities[i].label:
                while i + 1 < len(entities) and not entities[i+1].label:
                    entities.pop(i+1)
                    entities[i].r += 1
                entities[i].label = Entity.FRAGMENT

            i += 1

        if VERBOSE_LEVEL >= 1:
            for ent in entities:
                print(ent.label, ent.lower())

        # Create Registry
        # The synthetic parent's (-1, -1) range never contains a real token
        # index, so it never satisfies matches().
        parent = Entity(self.main.sp_doc, l=-1, r=-1, children=entities)
        return self.load_registry(parent)

In [108]:
class Base:
    """Shared helpers for text clean-up, number inflection and token ranges."""

    # Irregular nouns have no rule-based conversion, so the singular form
    # is mapped to its plural form explicitly (SP = singular -> plural).
    IRREGULAR_NOUNS_SP = dict(
        ox="oxen",
        goose="geese",
        mouse="mice",
        bacterium="bacteria",
    )

    # The same pairs reversed (PS = plural -> singular).
    IRREGULAR_NOUNS_PS = dict(zip(IRREGULAR_NOUNS_SP.values(), IRREGULAR_NOUNS_SP.keys()))

    # Zero-plural nouns: the singular and plural spellings are identical.
    ZERO_PLURAL_NOUNS = "species deer fish moose sheep swine buffalo trout cattle".split()

    # Opening character -> closing character for symbols that enclose
    # other information in a text.
    ENCLOSURES = dict(zip("([{", ")]}"))

    # ENCLOSURES plus the em dash, which acts as its own closing character.
    LAX_ENCLOSURES = {**ENCLOSURES, "—": "—"}


    
    def __init__(self, main, irregular_nouns_sp=IRREGULAR_NOUNS_SP, irregular_nouns_ps=IRREGULAR_NOUNS_PS, zero_plural_nouns=ZERO_PLURAL_NOUNS):
        self.main = main
        self.zero_plural_nouns = zero_plural_nouns
        self.irregular_nouns_sp = irregular_nouns_sp
        self.irregular_nouns_ps = irregular_nouns_ps
        self.irregular_plural_nouns = list(self.irregular_nouns_sp.values())
        self.irregular_singular_nouns = list(self.irregular_nouns_sp.keys())



    def delete_extra_whitespace(self, string):
        """Normalise the whitespace in ``string`` and return the result.

        Every whitespace run becomes a single space, whitespace directly
        before ``?``, ``.``, ``!`` or ``,`` is dropped, and the ends are
        stripped.
        """
        single_spaced = re.sub(r"\s+", " ", string)
        return re.sub(r"\s+([?.!,])", r"\1", single_spaced).strip()



    def delete_outer_non_alnum(self, string):
        """Trim non-alphanumeric characters from both ends of ``string``.

        Interior characters are untouched. A falsy ``string`` is returned
        as-is; a string with no alphanumeric character becomes ``""``.
        """
        if not string:
            return string

        # Walk inward from each end until an alphanumeric character is met.
        first, past_last = 0, len(string)
        while first < past_last and not string[first].isalnum():
            first += 1
        while past_last > first and not string[past_last - 1].isalnum():
            past_last -= 1

        return string[first:past_last]



    def get_parentheticals(self, text, enclosures=ENCLOSURES, flatten=False):
        # The parenthetical would be the content inside of a pair
        # of matching parentheses, brackets, or braces.
        parentheticals = []
        
        # This contains the text that's not inside of any
        # enclosure.
        base_text = []
        
        # This is used for building groups, which often has a 
        # nested structure.
        stacks = []
        
        # These are the pairs of characters that we recognize
        # as defining the parenthetical.
        openers = list(enclosures.keys())
        closers = list(enclosures.values())
        
        # This contains the opening characters of the groups 
        # that are currently open (e.g. '(', '['). We use it 
        # so that we know whether to open or close a group.
        opened = []
        
        for i, char in enumerate(text):
            # Open Group
            if char in openers:
                stacks.append([])
                opened.append(char)
            # Close Group
            elif opened and char == enclosures.get(opened[-1], ""):
                parentheticals.append(stacks.pop())
                opened.pop()
            # Add to Group
            elif opened:
                stacks[-1].append(i)
            # Add to Base Text
            else:
                base_text.append(i)
        
        # We close the remaining groups that have not
        # been closed.
        while stacks:
            parentheticals.append(stacks.pop())
            
        # Cluster Groups' Indices
        # A list in the lists of indices (where each list represents a group of text) could have 
        # an interruption (e.g. [0, 1, 2, 10 15]) because of a parenthetical. So, we cluster the
        # indices in each list to make the output more useful (e.g. [(0, 3), (10, 16)]).
        lists_of_indices = [*parentheticals, base_text]        
        lists_of_clustered_indices = []

        for list_of_indices in lists_of_indices:
            if not list_of_indices:
                continue

            # We start off with a single cluster that is made up of the
            # first index. If the next index follows the first index, 
            # we continue the cluster. If it doesn't, we create a new cluster.
            clustered_indices = [[list_of_indices[0], list_of_indices[0] + 1]]
            
            for index in list_of_indices[1:]:
                if clustered_indices[-1][1] == index:
                    clustered_indices[-1][1] = index + 1
                else:
                    clustered_indices.append([index, index + 1])

            # Add Clustered Indices
            lists_of_clustered_indices.append(clustered_indices)
            
        if flatten:
            flattened_clusters = []
            # We are placing each cluster of indices into one list.
            # This removes the context of the larger parenthetical,
            # but the context may be cumbersome instead of useful.
            for list_of_clustered_indices in lists_of_clustered_indices:
                for clustered_indices in list_of_clustered_indices:
                    flattened_clusters.append(clustered_indices)
            lists_of_clustered_indices = flattened_clusters
        
        return lists_of_clustered_indices



    def separate_span_by_parenthetical(self, span):
        span_parentheticals = []
        
        # The clusters of the span represented with tuples of char indices
        # (e.g. [(0, 1), (1, 5), (5, 10)]. This is a list of clustered
        # indices (like above).
        text_clusters = self.get_parentheticals(span.text, flatten=True)
        
        for cluster in text_clusters:
            if span.text[cluster[0]:cluster[1]].isspace():
                continue

            l_char_index = span[0].idx + cluster[0]
            r_char_index = span[0].idx + cluster[1] - 1

            # Instead of having a tuple dictating the start and end of a cluster,
            # we can use a span -- it's much simpler.
            cluster_as_span = self.get_span_at_indices(l_char_index, r_char_index)
            if not cluster_as_span:
                continue
            
            span_parentheticals.append(cluster_as_span)

        return span_parentheticals



    def separate_spans_by_parenthetical(self, spans):
        """Apply ``separate_span_by_parenthetical`` to every span.

        Returns one flat list with the pieces of each span, in input order.
        """
        return [
            piece
            for span in spans
            for piece in self.separate_span_by_parenthetical(span)
        ]

    
 
    def singularize(self, string):
        string = string.lower()
        
        # The string to singularize should not have any
        # non-alphanumeric characters at the end, or else
        # the algorithm will not work.
        words = re.split(r" ", string)

        if not words:
            return [string]

        # If the last word in the string is a zero plural
        # or a singular irregular noun, there's no changes
        # to make. For example, "red sheep" and "ox" are 
        # already singular.
        if (
            words[-1] in self.zero_plural_nouns or 
            words[-1] in self.irregular_singular_nouns
        ):
            return [string]

        # If the last word in the string is an irregular
        # plural noun, we rely on a dictionary with the
        # corresponding mapping.
        if words[-1] in self.irregular_plural_nouns:
            words[-1] = self.irregular_nouns_ps[words[-1]]
            singulars = [self.delete_extra_whitespace(" ".join(words))]
            return singulars
        
        # We take the singular form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to uncertainty), we need to
        # include all possible versions.
        singulars = []
        singular_endings = self.get_singular(words[-1])

        if not singular_endings:
            return [string]
        
        for singular_ending in singular_endings:
            singular = self.delete_extra_whitespace(" ".join([*words[:-1], singular_ending]))
            singulars.append(singular)
            
        return singulars



    def get_singular(self, string):
        """Return the candidate singular spellings of one plural word.

        The first matching suffix rule wins. An empty list means that no
        rule matched.
        """
        # (pattern, characters to cut from the end, replacement endings),
        # ordered from the most specific suffix to the least specific.
        suffix_rules = (
            (r".*ies$", 3, ("y",)),        # -ies -> -y
            (r".*ves$", 3, ("f", "fe")),   # -ves -> -f and -fe
            (r".*es$", 2, ("",)),          # drop -es
            (r".*i$", 1, ("us",)),         # -i -> -us
            (r".*s$", 1, ("",)),           # drop -s
        )

        for pattern, cut, endings in suffix_rules:
            if re.fullmatch(pattern, string):
                stem = string[:-cut]
                return [f"{stem}{ending}" for ending in endings]

        return []


    
    def pluralize(self, string):
        string = string.lower()
        
        # The string to pluralize should not have any
        # non-alphanumeric characters at the end, or else
        # the algorithm will not work.
        words = re.split(r" ", string)

        if not words:
            return [string]

        # If the last word in the string is a zero plural
        # or a plural irregular noun, there's no changes
        # to make. For example, "red sheep" and "oxen" are 
        # already singular.
        if (
            words[-1] in self.zero_plural_nouns or 
            words[-1] in self.irregular_plural_nouns
        ):
            return [string]

        # If the last word in the string is an irregular
        # singular noun, we rely on a dictionary with the
        # corresponding mapping.
        if words[-1] in self.irregular_singular_nouns:
            words[-1] = self.irregular_nouns_sp[words[-1]]
            return [self.delete_extra_whitespace(" ".join(words))]
        
        # We take the singular form of the last word and
        # add it back in to the other words. As there could
        # be multiple forms (due to error), we need to
        # handle them all.
        plurals = []
        plural_endings = self.get_plural(words[-1])

        if not plural_endings:
            return [string]
            
        for plural_ending in plural_endings:
            plural = self.delete_extra_whitespace(" ".join([*words[:-1], plural_ending]))
            plurals.append(plural)
            
        return plurals

    
  
    def get_plural(self, string):
        """Return the candidate plural spellings of one singular word.

        The first matching rule wins. An empty list means that no rule
        matched, which happens for words that already end in ``s`` without
        triggering the -es rule.
        """
        def ends_like(pattern):
            return re.fullmatch(pattern, string) is not None

        # -us has two accepted plurals: cactus -> cactuses / cacti.
        if ends_like(r".*us$"):
            return [f'{string}es', f'{string[:-2]}i']

        # Sibilant endings take -es.
        if ends_like(r".*([^l]s|sh|ch|x|z)$"):
            return [f'{string}es']

        # Consonant + y: canary -> canaries.
        if ends_like(r".*([^aeiou])(y)$"):
            return [f'{string[:-1]}ies']

        # -f / -fe (but not -ff) is ambiguous, so both forms are offered.
        if ends_like(r".*(f)(e?)$") and not ends_like(r".*ff$"):
            stem = re.sub(r"(f)(e?)$", "", string)
            return [f'{stem}fs', f'{stem}ves']

        # Consonant + o takes -s or -es; both forms are offered.
        if ends_like(r".*([^aeiou])o$"):
            return [f'{string}s', f'{string}es']

        # Fallback: add -s to anything that does not already end in 's'.
        return [f'{string}s'] if ends_like(r".*[^s]$") else []


 
    def expand_unit(self, *, il_unit, ir_unit, il_boundary, ir_boundary, speech=[], literals=[], include=True, direction='BOTH', verbose=False):
        """Grow the token range ``[il_unit, ir_unit]`` outward token by token.

        All indices are inclusive token indices into ``self.main.sp_doc``.

        Args:
            il_unit, ir_unit: Current left/right ends of the unit.
            il_boundary, ir_boundary: Furthest indices the unit may reach.
            speech: ``pos_`` values that put a token "in the set".
            literals: ``lower_`` values that put a token "in the set".
            include: If true, expansion continues only over tokens in the
                set; if false, only over tokens NOT in the set.
            direction: ``'LEFT'``, ``'RIGHT'`` or ``'BOTH'``. Any other
                value leaves the unit unchanged.
            verbose: With ``VERBOSE_LEVEL >= 1``, print the result.

        Returns:
            The slice ``sp_doc[il_unit:ir_unit+1]`` after expansion, or
            ``None`` (after printing an error) when the unit is inverted or
            lies outside a boundary relevant to ``direction``.
        """
        doc = self.main.sp_doc
        starting_unit = doc[il_unit:ir_unit+1]
        grow_left = direction in ('BOTH', 'LEFT')
        grow_right = direction in ('BOTH', 'RIGHT')

        if il_unit > ir_unit:
            print(f"Error: il_unit of {il_unit} greater than ir_unit of {ir_unit}")
            return None

        if grow_left and il_boundary > il_unit:
            print(f"Error: il_unit of {il_unit} less than il_boundary of {il_boundary}")
            return None

        if grow_right and ir_boundary < ir_unit:
            print(f"Error: ir_unit of {ir_unit} greater than ir_boundary of {ir_boundary}")
            return None

        def blocks_expansion(token):
            # include=True  -> stop at the first token outside the set.
            # include=False -> stop at the first token inside the set.
            in_set = token.pos_ in speech or token.lower_ in literals
            return include ^ in_set

        # Move Left: the current edge is assumed valid, so the neighbour
        # on its left is inspected before stepping onto it.
        if grow_left:
            while il_unit > il_boundary and not blocks_expansion(doc[il_unit-1]):
                il_unit -= 1

        # Move Right: mirror image of the left pass.
        if grow_right:
            while ir_unit < ir_boundary and not blocks_expansion(doc[ir_unit+1]):
                ir_unit += 1

        # NOTE(review): both boundaries are asserted even when only one
        # direction was validated above — confirm single-direction callers
        # always pass a consistent opposite boundary.
        assert il_unit >= il_boundary and ir_unit <= ir_boundary

        expanded_unit = doc[il_unit:ir_unit+1]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Expanded Unit of '{starting_unit}': {expanded_unit}")

        return expanded_unit


    
    def contract_unit(self, *, il_unit, ir_unit, speech=[], literals=[], include=True, direction='BOTH', verbose=False):
        """Shrink the token range ``[il_unit, ir_unit]`` from its edges.

        All indices are inclusive token indices into ``self.main.sp_doc``.
        Edge tokens are dropped until a token that must be kept is met; the
        unit never shrinks below a single token.

        Args:
            il_unit, ir_unit: Current left/right ends of the unit.
            speech: ``pos_`` values that put a token "in the set".
            literals: ``lower_`` values that put a token "in the set".
            include: If true, tokens in the set are the ones to keep (edge
                tokens outside the set are dropped). If false, tokens in
                the set are the ones dropped.
            direction: ``'LEFT'`` trims the left edge, ``'RIGHT'`` the
                right edge, ``'BOTH'`` both. Any other value trims nothing.
            verbose: With ``VERBOSE_LEVEL >= 1``, print the result.

        Returns:
            The slice ``sp_doc[il_unit:ir_unit+1]`` after contraction, or
            ``None`` (after printing an error) when ``il_unit > ir_unit``.
        """
        doc = self.main.sp_doc
        starting_unit = doc[il_unit:ir_unit+1]

        if il_unit > ir_unit:
            print(f"Error: il_unit of {il_unit} greater than ir_unit of {ir_unit}")
            return None

        def must_keep(token):
            # include=True,  in the set      -> keep (stop contracting)
            # include=False, not in the set  -> keep (stop contracting)
            # every other combination        -> drop and keep contracting
            in_set = token.pos_ in speech or token.lower_ in literals
            return include == in_set

        # Trim the left edge: the left index moves to the right.
        if direction in ('BOTH', 'LEFT'):
            while il_unit < ir_unit and not must_keep(doc[il_unit]):
                il_unit += 1

        # Trim the right edge: the right index moves to the left.
        if direction in ('BOTH', 'RIGHT'):
            while ir_unit > il_unit and not must_keep(doc[ir_unit]):
                ir_unit -= 1

        assert il_unit <= ir_unit

        contracted_unit = doc[il_unit:ir_unit+1]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Contracted Unit of '{starting_unit}': {contracted_unit}")

        return contracted_unit


    
    def find_unit_context(self, *, il_unit, ir_unit, il_boundary, ir_boundary, speech=["ADJ", "NOUN", "ADP", "ADV", "PART", "PROPN", "VERB", "PRON", "DET", "AUX", "PART", "SCONJ"], literals=[], include=True, enclosures=LAX_ENCLOSURES, comma_encloses=False, verbose=False):
        """Return the tokens that form the context around a unit.

        All indices are inclusive token indices into ``self.main.sp_doc``.
        Three strategies are tried in order:

        1. Parenthetical: if an enclosing character is found on each side
           of the unit and ``enclosures`` maps the left one to the right
           one, every token from the opener to the closer (both included)
           is returned.
        2. Comma (only when ``comma_encloses`` is true): each side is
           scanned for the nearest ``,``, ``;`` or ``—``; if the token at
           which either scan stopped is a comma, the tokens between the
           two stopping points (both included) are returned.
        3. Expansion: the unit grows left and right one token at a time
           while the neighbouring token passes the ``speech`` /
           ``literals`` / ``include`` test. Bracketed groups met on the
           way are stepped over and left out of the result.

        Args:
            il_unit, ir_unit: Left/right ends of the unit.
            il_boundary, ir_boundary: Furthest indices the context may
                reach.
            speech: ``pos_`` values that put a token "in the set" (the
                default list contains "PART" twice; harmless).
            literals: ``lower_`` values that put a token "in the set".
            include: If true, expansion continues only over tokens in the
                set; if false, only over tokens not in the set.
            enclosures: Mapping of opening character -> closing character.
            comma_encloses: Enables strategy 2.
            verbose: With ``VERBOSE_LEVEL >= 1``, print the result.

        Returns:
            A list of tokens (not a span), or ``None`` (after printing an
            error) when the unit is inverted or lies outside a boundary.
        """
        UNIT = self.main.sp_doc[il_unit:ir_unit+1]
        
        if il_unit > ir_unit:
            print(f"Error: il_unit of {il_unit} greater than ir_unit of {ir_unit}")
            return None
        
        if il_boundary > il_unit:
            print(f"Error: il_unit of {il_unit} less than il_boundary of {il_boundary}")
            return None
        
        if ir_boundary < ir_unit:
            print(f"Error: ir_unit of {ir_unit} greater than ir_boundary of {ir_boundary}")
            return None
        
        # Caveat: Parentheticals
        # The context of a unit inside a set of enclosures should
        # not go farther than the boundaries of those enclosures.
        # However, we need to manually determine whether the unit
        # is in parentheses (or any set of the matching symbols
        # below).
        openers = list(enclosures.keys())
        closers = list(enclosures.values())
        enclosing_chars = [*closers, *openers]

        # Look for Group Punctuation on the Left
        # The scan starts on the unit's own left token and stops before
        # il_boundary, so the token at il_boundary is never inspected.
        # Any enclosing character (opener OR closer) ends the scan.
        i = il_unit
        opener = None
        while i > il_boundary:
            token = self.main.sp_doc[i]
            if token.lower_ in enclosing_chars and token.lower_ != ",":
                opener = token
                break
            i -= 1

        # Look for Group Punctuation on the Right
        # Mirror image of the scan above; the token at ir_boundary is
        # never inspected.
        i = ir_unit
        closer = None
        while i < ir_boundary:
            token = self.main.sp_doc[i]
            if token.lower_ in enclosing_chars and token.lower_ != ",":
                closer = token
                break
            i += 1

        # If there's a group punctuation on the left
        # and right, and they match each other (e.g. '(' and ')'),
        # we return the text between the punctuations.
        parenthetical = opener and closer and enclosures.get(opener.lower_) == closer.text
        if parenthetical:
            context = [t for t in self.main.sp_doc[opener.i:closer.i+1]]
            
            if verbose and VERBOSE_LEVEL >= 1:
                print(f"Parenthetical - Unit Context of '{UNIT}': {context}")
            
            return context

        # We can also check whether the unit is enclosed
        # by a comma or two, only if a comma can enclose.
        if comma_encloses:
            # i_token / j_token end up as the last token each scan looked
            # at: the separator that stopped it, or the last token
            # inspected before the boundary when no separator was found.
            i = il_unit
            i_token = self.main.sp_doc[i]
            while i > il_boundary:
                i_token = self.main.sp_doc[i]
                if i_token.lower_ in [",", ";", "—"]:
                    break
                i -= 1

            j = ir_unit
            j_token = self.main.sp_doc[j]
            while j < ir_boundary:
                j_token = self.main.sp_doc[j]
                if j_token.lower_ in [",", ";", "—"]:
                    break
                j += 1

            if (i_token and i_token.lower_ == ",") or (j_token and j_token.lower_ == ","):
                context = [t for t in self.main.sp_doc[i:j+1]]
            
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Comma - Unit Context of '{UNIT}': {context}")
                    
                return context
            
        # As the unit is not a parenthetical, we will expand
        # outwards until we run into a stopping token. The exclude
        # list contains tokens that should be excluded from the
        # context. Currently, it will contain any parentheticals
        # that we run into.
        exclude = []

        # We can modify the enclosures after handling the parenthetical
        # situation to make the code easier.
        # NOTE(review): the statement below is an annotation (`target: annotation`),
        # not an assignment, so it has no effect at runtime. Presumably
        # `enclosures[","] = ","` was intended — confirm. A real assignment
        # would mutate the shared default dict, and `openers` / `closers`
        # were already computed above, so they would not see the new entry.
        if comma_encloses:
            enclosures[","] : ","
        
        # Expand Left
        while il_unit > il_boundary:
            # Assuming that the current token is fine,
            # we look to the left.
            l_token = self.main.sp_doc[il_unit-1]

            if l_token.lower_ not in closers:
                in_set = l_token.pos_ in speech or l_token.lower_ in literals
                if in_set ^ include:
                    break
                il_unit -= 1
            # If it's a closing enclosure (e.g. ')', ']'),
            # we need to skip over whatever is contained in
            # that punctuation.
            else:
                i = il_unit - 1
                
                # The closer is recorded here and again on the first pass
                # of the loop below; the duplicate entry is harmless
                # because `exclude` is only used for membership tests.
                token = self.main.sp_doc[i]
                exclude.append(token)

                # We continue until we reach the boundary or
                # we find the matching opening character.
                # `closed` is a stack of the closing characters whose
                # openers have not been seen yet.
                closed = []
                
                while i > il_boundary:
                    token = self.main.sp_doc[i]
                    # Found Closer
                    if token.lower_ in closers:
                        exclude.append(token)
                        closed.append(token.lower_)
                    # Currently Closed
                    elif closed:
                        exclude.append(token)
                        # Found Opener
                        if enclosures.get(token.lower_) == closed[-1]:
                            closed.pop()
                    else:
                        break
                    i -= 1
                
                # NOTE(review): `i` is either il_boundary or the first token
                # to the left of the skipped group. That token becomes the
                # new left edge without passing the speech/literals test —
                # confirm this is intended.
                il_unit = i

        # Expand Right
        while ir_unit < ir_boundary:
            # We're checking the token to the right
            # to see if we can expand or not.
            r_token = self.main.sp_doc[ir_unit+1]

            if r_token.lower_ not in openers:
                in_set = r_token.pos_ in speech or r_token.lower_ in literals
                if in_set ^ include:
                    break
                ir_unit += 1
            # If the token to the right is an opener (e.g. '(', '['), we must skip
            # it, the parenthetical inside, and the closer.
            else:
                i = ir_unit + 1
                
                # As on the left side, the opener is recorded twice.
                token = self.main.sp_doc[i]
                exclude.append(token)

                # We continue until we reach the boundary or
                # we find all the closers for the openers.
                # `opened` is a stack of the opening characters whose
                # closers have not been seen yet.
                opened = []
                
                while i < ir_boundary:
                    token = self.main.sp_doc[i]
                    # Found Opener
                    if token.lower_ in openers:
                        exclude.append(token)
                        opened.append(token.lower_)
                    # Currently Opened
                    elif opened:
                        exclude.append(token)
                        # Found Closer
                        if token.lower_ == enclosures.get(opened[-1]):
                            opened.pop()
                    else:
                        break
                    i += 1
                
                # NOTE(review): same as on the left — the token at `i` becomes
                # the new right edge without being tested.
                ir_unit = i
        
        # We remove the excluded tokens and return the context.
        context = [t for t in self.main.sp_doc[il_unit:ir_unit+1] if t not in exclude]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Unit Context of '{UNIT}': {context}")
        
        return context


    
    def get_span_at_indices(self, l_index, r_index):
        """Return the doc slice covering an inclusive character range.

        Whitespace at either end of the range is skipped first. The tokens
        at the remaining end characters are looked up with
        ``self.main.token_at_char`` and the slice from the left token to
        the right token (both included) is returned.

        Args:
            l_index: Index of the first character, into ``sp_doc.text``.
            r_index: Index of the last character (inclusive).

        Returns:
            ``sp_doc[l_token.i:r_token.i+1]``, or ``None`` (after printing
            an error) when the range lies outside the text, contains only
            whitespace, or no token is found at one of its ends.
        """
        # Use the text as-is: lower-casing can change the length of a
        # string (e.g. "İ".lower() is two code points), which would shift
        # every character offset after it. isspace() does not depend on
        # case, so lower-casing adds nothing here.
        text = self.main.sp_doc.text

        if l_index < 0 or r_index >= len(text):
            print(f"Error: character range ({l_index}, {r_index}) is outside the text of length {len(text)}")
            return None

        # Both scans stop once the ends cross, so they can neither run
        # past the end of the text nor wrap around to negative indices.
        while l_index <= r_index and text[l_index].isspace():
            l_index += 1

        while r_index >= l_index and text[r_index].isspace():
            r_index -= 1

        if l_index > r_index:
            print(f"Error: l_index of {l_index} greater than r_index of {r_index}")
            return None

        l_token = self.main.token_at_char(l_index)
        r_token = self.main.token_at_char(r_index)

        # Guard in case token_at_char yields None for an unmapped character.
        if l_token is None or r_token is None:
            print(f"Error: no token found at character index {l_index} or {r_index}")
            return None

        return self.main.sp_doc[l_token.i:r_token.i+1]


    
    def get_base_nouns(self, span, return_tokens=False, immediate_stop=False):
        """Collect the run of nouns that ends ``span``, last noun first.

        The span is walked from its last token backwards and the walk ends
        at the first token whose ``pos_`` is neither ``NOUN`` nor
        ``PROPN``.

        Args:
            span: Iterable of tokens.
            return_tokens: If true, collect the tokens themselves;
                otherwise collect one-token slices of ``sp_doc``.
            immediate_stop: If true, stop after the first noun collected.
        """
        trailing_nouns = []

        for token in reversed(list(span)):
            if token.pos_ not in ["NOUN", "PROPN"]:
                break

            if return_tokens:
                trailing_nouns.append(token)
            else:
                trailing_nouns.append(self.main.sp_doc[token.i:token.i+1])

            if immediate_stop:
                break

        return trailing_nouns



    def flatten(self, arr):
        """Return the non-list leaves of ``arr`` as one flat list, in order.

        Only ``list`` instances are descended into; tuples and other
        containers count as leaves. A non-list ``arr`` is returned wrapped
        in a single-element list.
        """
        if not isinstance(arr, list):
            return [arr]

        leaves = []
        # Stack of iterators, one per list that is currently being walked.
        walkers = [iter(arr)]

        while walkers:
            for element in walkers[-1]:
                if isinstance(element, list):
                    # Descend now; the parent iterator resumes afterwards.
                    walkers.append(iter(element))
                    break
                leaves.append(element)
            else:
                # The innermost list is exhausted.
                walkers.pop()

        return leaves


    def is_same_text(self, sp_a, sp_b):
        """Tell whether two spans have the same text, ignoring case and number.

        The lower-cased texts are compared first. If they differ, each text
        is reduced to its singular candidates: the text itself when the
        span's last token is tagged ``NN``/``NNP``, otherwise the result of
        ``self.main.singularize``. The spans match when the two candidate
        lists share an entry.
        """
        b_text = sp_b.text.lower()
        a_text = sp_a.text.lower()

        if a_text == b_text:
            return True

        def singular_candidates(span, lowered_text):
            # A span ending in a singular noun needs no conversion.
            if span[-1].tag_ in ["NN", "NNP"]:
                return [lowered_text]
            return self.main.singularize(lowered_text)

        a_candidates = singular_candidates(sp_a, a_text)
        b_candidates = singular_candidates(sp_b, b_text)

        return not set(a_candidates).isdisjoint(b_candidates)



    def has_same_base_nouns(self, sp_a, sp_b):
        """Tell whether two spans refer to the same underlying noun(s).

        Two cases produce ``True``:

        * Shared head (e.g. 'Hyla' v. 'Hyla tadpoles'): both spans start
          with the same word, that first token is a noun in at least one
          of them, and one lower-cased text contains the other.
        * One extra adjective (e.g. 'dogs' v. 'red dogs'): exactly one of
          the spans has a single leading adjective while the other has
          none, and the singular forms of their nouns overlap.

        ``False`` is returned when either span has no noun.
        """
        def scan(span):
            # Returns the span's nouns plus the number of adjectives seen
            # before its first noun.
            nouns, leading_adjectives = [], 0
            for token in span:
                if not nouns and token.pos_ == "ADJ":
                    leading_adjectives += 1
                elif token.pos_ in ["PROPN", "NOUN"]:
                    nouns.append(token)
            return nouns, leading_adjectives

        def singular_candidates(nouns):
            # Nouns already ending in a singular tag need no conversion.
            joined = " ".join([noun.lower_ for noun in nouns])
            if nouns[-1].tag_ in ["NN", "NNP"]:
                return [joined]
            return self.main.singularize(joined)

        b_text = sp_b.text.lower()
        b_first_word = sp_b[0].lower_
        b_first_is_noun = sp_b[0].pos_ in ["NOUN", "PROPN"]

        b_nouns, b_adjectives = scan(sp_b)
        if not b_nouns:
            return False
        b_candidates = singular_candidates(b_nouns)

        a_text = sp_a.text.lower()
        a_first_word = sp_a[0].lower_
        a_first_is_noun = sp_a[0].pos_ in ["NOUN", "PROPN"]

        # Case Example: 'Hyla' v. 'Hyla tadpoles'
        shares_head = a_first_word == b_first_word and (a_first_is_noun or b_first_is_noun)
        if shares_head and (a_text in b_text or b_text in a_text):
            return True

        # Case Example: 'dogs' v. 'red dogs'
        a_nouns, a_adjectives = scan(sp_a)
        if not a_nouns:
            return False

        one_extra_adjective = (a_adjectives, b_adjectives) in ((1, 0), (0, 1))
        if not one_extra_adjective:
            return False

        a_candidates = singular_candidates(a_nouns)
        return not set(a_candidates).isdisjoint(b_candidates)

In [109]:
class Species:
    def __init__(self, main):
        """Set up the species tracker.

        Loads the TaxoNERD model and initialises every per-document
        attribute to None; those attributes are filled in by update().

        Parameters:
            main: the shared object that owns the SpaCy document and the
                helper functions this class calls (stored as self.main).
        """
        self.main = main

        # TaxoNERD pipeline used to detect species mentions.
        taxonerd = TaxoNERD(prefer_gpu=False)
        self.tn_nlp = taxonerd.load(
            model="en_ner_eco_biobert",
            exclude=["tagger", "parser", "attribute_ruler"],
        )
        self.tn_doc = None

        # Spans identified as a species.
        self.spans = None
        self.span_starts = None

        # Tokens identified as a species, or as part of one.
        self.tokens = None

        # Token -> span lookup, for finding the span a token belongs to.
        self.token_to_span = None

        # Maps a lowercased name to the list of names that were
        # identified as alternate names for it (and vice versa).
        self.alternate_names = None

        # Words that should be treated as species but are not always
        # identified as such.
        self.dictionary = [
            "juvenile", "juveniles",
            "adult", "adults",
            "prey",
            "predator", "predators",
            "species",
            "tree",
            "cat",
            "dog",
            "fly", "flies",
            "plant", "plants",
        ]



    def update(self, text, verbose=False):
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        self.tn_doc = self.tn_nlp(text)
        self.spans, self.tokens, self.token_to_span, self.span_starts = self.load_species(verbose=verbose)
        self.alternate_names = self.load_alternate_names(self.spans)



    def convert_tn_spans_to_sp_spans(self, tn_spans):
        sp_spans = []

        for tn_span in tn_spans:
            l_char_index = self.tn_doc[tn_span.start].idx
            r_char_index = l_char_index + len(tn_span.text) - 1

            try:
                l_sp_token_i = self.main.token_at_char(l_char_index).i
                r_sp_token_i = self.main.token_at_char(r_char_index).i
            except Exception as e:
                print(f"Error: Couldn't find token at character index of {l_char_index} and token index of {l_sp_token_i}.")
                print(f"Error: Couldn't find token at character index of {r_char_index} and token index of {r_sp_token_i}.")
                print(e)
                continue

            sp_span = self.main.sp_doc[l_sp_token_i:r_sp_token_i+1]
            if sp_span.text != tn_span:
                print(f"Error: SpaCy span does not match TaxoNerd span.")
                continue
            
            sp_spans.append(sp_span)

        return sp_spans



    def load_search_strings(self, verbose=False):
        search_strings = [*self.dictionary]
        
        # Creating a Broad Set of Species
        spans = self.convert_tn_spans_to_sp_spans(self.tn_doc.ents)
        spans = self.main.separate_spans_by_parenthetical(spans)

        # Add Ending Nouns to Set
        all_nouns = []
        for span in spans:
            nouns = self.main.get_base_nouns(span, immediate_stop=True)
            # print(f"Base Nouns for '{span}': {nouns}")
            if nouns:
                all_nouns.extend(nouns)
        # print(f"'All Nouns': {all_nouns}")
        spans.extend(all_nouns)

        # Adding Plural and Singular Versions of Spans
        for span in spans:
            text = span.text.lower()
            text = self.main.delete_extra_whitespace(self.main.delete_outer_non_alnum(text))

            # Blank Text or No Letters
            if not text or not [c for c in text if c.isalpha()]:
                continue

            search_strings.append(text)

            # Add Plural Version
            singular = span[-1].pos_ == "NOUN" and span[-1].tag_ == "NN"
            if singular:
                plural_version = self.main.pluralize(text)
                search_strings.extend(plural_version)

            # Add Singular Version
            plural = span[-1].pos_ == "NOUN" and span[-1].tag_ == "NNS"
            if plural:
                singular_version = self.main.singularize(text)
                search_strings.extend(singular_version)

        # Remove Duplicates
        search_strings = list(set(search_strings))

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Search Strings: {search_strings}")
        
        return search_strings



    def load_alternate_names(self, spans, verbose=False):
        spans.sort(key=lambda span: span.start)

        # It's useful to know if a different name refers to a
        # species we have already seen. For example, in
        # "predatory crab (Carcinus maenas)", "predatory crab"
        # is an alternative name for "Carcinus maenas" and
        # vice versa. This is used so that the species can be
        # properly tracked and redundant points are less
        # likely to be given.
        alternate_names = {}
        
        # Finding and Storing Alternative Names
        for i, species_span in enumerate(spans):
            # There's not a next species to
            # evaluate.
            if i + 1 >= len(spans):
                break
            
            next_species_span = spans[i+1]
            
            # If there's one token between the species and the next species,
            # we check if the next species is surrounded by punctuation.
            if next_species_span.start - species_span.end == 1:
                # Token Before and After the Next Species
                before_next = self.main.sp_doc[next_species_span.start-1]

                # Out of Bounds Error
                if next_species_span.end >= len(self.main.sp_doc):
                    continue
                    
                after_next = self.main.sp_doc[next_species_span.end]

                if before_next.lower_ in ["("] and after_next.lower_ in [")"]:
                    sp_1_text = species_span.text.lower()
                    sp_2_text = next_species_span.text.lower()
                    
                    if sp_1_text not in alternate_names:
                        alternate_names[sp_1_text] = []
                    
                    if sp_2_text not in alternate_names:
                        alternate_names[sp_2_text] = []
                    
                    alternate_names[sp_1_text].append(sp_2_text)
                    alternate_names[sp_2_text].append(sp_1_text)
            
            # If there's no token between the species and the next,
            # species we assume that they refer to the same species.
            elif next_species_span.start - species_span.end == 0:
                sp_1_text = species_span.text.lower()
                sp_2_text = next_species_span.text.lower()
                
                if sp_1_text not in alternate_names:
                    alternate_names[sp_1_text] = []
                
                if sp_2_text not in alternate_names:
                    alternate_names[sp_2_text] = []

                alternate_names[sp_1_text].append(sp_2_text)
                alternate_names[sp_2_text].append(sp_1_text)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Alternate Names: {alternate_names}")

        return alternate_names



    def bar(self, foo):
        if not foo or len(foo) == 1:
            return foo

        foo.sort()
        b = [foo[0]]
        
        l = 0
        
        for i in range(1, len(foo)):
            if foo[i] - foo[l] <= 1:
                l = i
            else:
                b.append(foo[i])
                l = i
            
        return b


    
    def load_species(self, verbose=False):
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        # Load Search Strings from Species Spans
        search_strings = self.load_search_strings(verbose=verbose)

        # Search for Species
        # The results are stored in different 
        # forms below.
        spans = []
        tokens = []
        token_to_span = {}

        # Where we're searching for species.
        text = self.main.sp_doc.text.lower()

        for string in search_strings:
            matches = re.finditer(re.escape(string), text, re.IGNORECASE)

            for l_char_index, r_char_index, matched_text in [(match.start(), match.end(), match.group()) for match in matches]:    
                # The full word must match, not just a substring inside of it.
                # So, if the species we're looking for is "ant", only "ant"
                # will match -- not "pants" or "antebellum". Therefore, the
                # characters to the left and right of the matched string cannot
                # be letters.
                l_char_is_letter = l_char_index > 0 and text[l_char_index-1].isalpha()
                r_char_is_letter = r_char_index < len(text) and text[r_char_index].isalpha()
                
                if l_char_is_letter or r_char_is_letter or not matched_text:
                    continue

                try:
                    l_token_i = self.main.token_at_char(l_char_index).i
                    r_token_i = self.main.token_at_char(r_char_index-1).i
                except Exception as e:
                    print(f"Error: Unable to find token at index of {l_char_index}.")
                    print(f"Error: Unable to find token at index of {r_char_index}.")
                    print(f"\tMatched: '{matched_text}'")
                    print(e)
                    continue

                # This is the matched substring (which would be
                # a species) as a span in the parent document.
                span = self.main.sp_doc[l_token_i:r_token_i+1]
                
                # Expand Species
                # Let's say there's a word like "squirrel". That's a bit ambiguous. 
                # Is it a brown squirrel, a bonobo? If the species is possibly missing
                # information (like an adjective to the left of it), we should expand
                # in order to get a full picture of the species.
                unclear_1 = len(span) == 1 and span[0].pos_ == "NOUN"
                unclear_2 = span.start > 0 and self.main.sp_doc[span.start-1].pos_ in ["ADJ"]
                
                if unclear_1 or unclear_2:
                    span = self.main.expand_unit(
                        il_unit=span.start, 
                        ir_unit=span.end-1,
                        il_boundary=0,
                        ir_boundary=len(self.main.sp_doc),
                        speech=["ADJ", "PROPN"],
                        literals=["-"],
                        include=True,
                        direction="LEFT",
                        verbose=verbose
                    )
                
                # Remove Outer Symbols
                # There are times where a species is identified with a parenthesis
                # nearby. Here, we remove that parenthesis (and any other symbols).
                span = self.main.contract_unit(
                    il_unit=span.start, 
                    ir_unit=span.end-1, 
                    speech=["PUNCT", "SYM", "DET", "PART"],
                    include=False,
                    verbose=verbose
                )

                if not span:
                    print(f"Error: Span does not exist; left character index {l_char_index}.")
                    print(f"\tMatched: '{matched_text}'")
                    continue
            
                # A species must have a noun or a
                # proper noun. This may help discard
                # bad results.
                letter_found = False
                for token in span:
                    if token.pos_ in ["NOUN", "PROPN"] or token.lower_ in self.dictionary:
                        letter_found = True
                        break

                if not letter_found:
                    continue

                # Adding Species
                spans.append(span)
                for token in span:
                    if token in tokens or token.pos_ in ["PUNCT", "SYM", "DET", "PART"]:
                        continue
                    tokens.append(token)
                    token_to_span[token] = span
        
        spans = list({span.start: span for span in spans}.values())
        spans.sort(key=lambda span: span.start)

        # for span in spans:
        #     print(span, span.start, span.end)
        
        # Remove Overlapping Spans
        i = 0
        while i < len(spans):
            j = i + 1
            while j < len(spans):
                if spans[i].start <= spans[j].start <= spans[i].end and spans[i].start <= spans[j].end <= spans[i].end:
                    # print(f"Span at i: ({spans[i].start}, {spans[i].end})")
                    # print(f"Span at j: ({spans[j].start}, {spans[j].end})")
                    # print(f'Overlap between "{spans[i]}" and "{spans[j]}"')
                    spans.pop(j)
                else:
                    j += 1
            i += 1
        
        
        
        span_starts = [span[0] for span in spans]
        span_indices = self.bar([span[0].i for span in spans])
        span_starts = [self.main.sp_doc[i] for i in span_indices]

        if verbose and VERBOSE_LEVEL >= 1:
            print("Output of load_species:")
            print(f"Spans: {spans}")
            print(f"Tokens: {tokens}")
            print(f"Mapped Tokens: {token_to_span}")
            print(f"Span Starts: {span_starts}")
        
        return (spans, tokens, token_to_span, span_starts)



    def is_alternate(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_a_text = sp_a.text.lower()
            
        # Species B is an alternate name for Species A
        if sp_b_text in self.alternate_names.get(sp_a_text, []):
            return True
        
        # Species A is an alternate name for Species B
        if sp_a_text in self.alternate_names.get(sp_b_text, []):
            return True

        return False



    def is_same_text(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_a_text = sp_a.text.lower()

        if sp_a_text == sp_b_text:
            return True
            
        sp_a_singular_texts = [sp_a_text] if sp_a[-1].tag_ in ["NN", "NNP"] else self.main.singularize(sp_a_text)
        sp_b_singular_texts = [sp_b_text] if sp_b[-1].tag_ in ["NN", "NNP"] else self.main.singularize(sp_b_text)

        if set(sp_a_singular_texts).intersection(sp_b_singular_texts):
            return True
        return False



    def has_same_base_nouns(self, sp_a, sp_b):
        sp_b_text = sp_b.text.lower()
        sp_b_0_text = sp_b[0].lower_
        sp_b_0_is_noun = sp_b[0].pos_ in ["NOUN", "PROPN"]
        
        sp_b_nouns = []
        sp_b_num_adjectives = 0
        
        for token in sp_b:
            if not sp_b_nouns and token.pos_ == "ADJ":
                sp_b_num_adjectives += 1
            elif token.pos_ in ["PROPN", "NOUN"]:
                sp_b_nouns.append(token)

        if not sp_b_nouns:
            return False

        sp_b_nouns_text = [noun.lower_ for noun in sp_b_nouns]
        sp_b_singular_texts = [" ".join(sp_b_nouns_text)] if sp_b_nouns[-1].tag_ in ["NN", "NNP"] else self.main.singularize(" ".join(sp_b_nouns_text))

        sp_a_text = sp_a.text.lower()
        sp_a_0_text = sp_a[0].lower_
        sp_a_0_is_noun = sp_a[0].pos_ in ["NOUN", "PROPN"]

        # Case Example: 'Hyla' v. 'Hyla tadpoles'
        if sp_a_0_text == sp_b_0_text and (sp_a_0_is_noun or sp_b_0_is_noun):
            if sp_a_text in sp_b_text or sp_b_text in sp_a_text:
                return True
        
        # Case Example: 'dogs' v. 'red dogs'
        sp_a_nouns = []
        sp_a_num_adjectives = 0
        for token in sp_a:
            if not sp_a_nouns and token.pos_ == "ADJ":
                sp_a_num_adjectives += 1
            elif token.pos_ in ["PROPN", "NOUN"]:
                sp_a_nouns.append(token)
        
        if not sp_a_nouns:
            return False
        
        sp_a_nouns_text = [noun.lower_ for noun in sp_a_nouns]
        
        if sp_a_nouns and sp_b_nouns and (
            (sp_a_num_adjectives == 1 and sp_b_num_adjectives == 0) or 
            (sp_b_num_adjectives == 1 and sp_a_num_adjectives == 0)
        ):
            sp_a_singular_texts = [" ".join(sp_a_nouns_text)] if sp_a_nouns[-1].tag_ in ["NN", "NNP"] else self.main.singularize(" ".join(sp_a_nouns_text))
            if set(sp_a_singular_texts).intersection(sp_b_singular_texts):
                return True

        return False



    def find_same_species(self, sp_A, sp_b, verbose=False):
        verbose=False
        # METHOD 1: Check for Literal Matches
        for sp_a in sp_A:
            if self.is_same_text(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 1: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        # METHOD 2: Check Alternate Names
        for sp_a in sp_A:
            if self.is_alternate(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 2: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a
        
        # METHOD 3: Check Nouns
        # This is used if one or none of the species being compared
        # has 1 adjective.
        for sp_a in sp_A:
            if self.has_same_base_nouns(sp_a, sp_b):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 3: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        # METHOD 4: Last Ditch Effort
        # If there's been no matches, we just look for one string inside of
        # another.
        for sp_a in sp_A:
            sp_a_text = sp_a.text.lower()
            sp_b_text = sp_b.text.lower()
            
            r_sp_a_text = re.compile(f"(\s|^){re.escape(sp_a_text)}(\s|$)", re.IGNORECASE)
            r_sp_b_text = re.compile(f"(\s|^){re.escape(sp_b_text)}(\s|$)", re.IGNORECASE)
            
            if re.match(r_sp_a_text, sp_b_text) or re.match(r_sp_b_text, sp_a_text):
                if verbose and VERBOSE_LEVEL >= 1:
                    print(f"Method 4: Match Between '{sp_a}' and '{sp_b}'")
                return sp_a

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"No Matches Between {sp_A} and {sp_b}")
        
        return None



    def span_at_token(self, token):
        if token in self.token_to_span:
            return self.token_to_span[token]
        return None



    def is_species(self, token):
        return token in self.tokens



    def has_species(self, tokens, verbose=False):
        for token in tokens:
            if token in self.tokens:
                if verbose and VERBOSE_LEVEL >= 2:
                    print(f"\tToken '{token}' is Species")
                return True
        return False

In [110]:
class Keywords:
    REGEX = "regex"
    VOCAB = "vocab"
    RULES = "rules"



    def __init__(self, main, *, regexes=[], vocab=[], patterns=[], def_pos=[], def_tag=[], def_threshold=0.7, def_weight=1.0):
        self.main = main

        # Constraints
        self.def_threshold = def_threshold
        self.def_tag = def_tag
        self.def_pos = def_pos
        self.def_weight = def_weight
        
        # Three Types of Matching
        self.vocab, self.vocab_data = self.load_vocab(vocab)
        self.regex, self.regex_data = self.load_regex(regexes)
        self.rules, self.rules_data = self.load_rules(patterns)

        # Quick Lookup
        self.match_type_to_data = {
            Keywords.REGEX: self.regex_data,
            Keywords.VOCAB: self.vocab_data,
            Keywords.RULES: self.rules_data
        }

    

    def update(self, verbose=False):
        # SpaCy Doc DNE or Indexing Map DNE
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        # Matched Tokens in Different Forms
        self.token_data, self.mapped_token_data, self.tokens = self.match_tokens(verbose=verbose)



    def load_regex(self, regexes):
        r = []
        r_data = {}

        for unit in regexes:
            if isinstance(unit, str):
                r.append(unit)
            else:
                regex = unit["regex"]
                r.append(regex)
                r_data[regex] = {
                    "types": unit.get("types", []),
                    "weight": unit.get("weight", self.def_weight)
                }

        return r, r_data



    def load_vocab(self, vocab):
        v = []
        v_data = {}
        
        for unit in vocab:
            if isinstance(unit, str):
                doc = self.main.sp_nlp(unit)
                v.append({
                    "doc": doc,
                    "lemma": " ".join([t.lemma_ for t in doc])
                })
            else:
                doc = self.main.sp_nlp(unit["word"])
                v.append({
                    "doc": doc,
                    "tag": unit.get("tag", self.def_tag),
                    "pos": unit.get("pos", self.def_pos),
                    "threshold": unit.get("threshold", self.def_threshold),
                    "lemma": " ".join([t.lemma_ for t in doc])
                })
                v_data[unit["word"]] = {
                    "types": unit.get("types") or [],
                    "weight": unit.get("weight", self.def_weight),
                }
        
        return v, v_data



    def load_rules(self, patterns):
        """Build the SpaCy Matcher from the pattern configuration.

        Parameters:
            patterns: iterable of pattern lists (registered under their
                position in `patterns` as a string), or dicts with "name"
                and "pattern" keys and optional "types" and "weight".

        Returns:
            (matcher, metadata): the Matcher, and a dict mapping each
            named rule to its "types" and "weight". Unnamed pattern lists
            get no metadata entry.
        """
        matcher = Matcher(self.main.sp_nlp.vocab)
        metadata = {}

        for position, unit in enumerate(patterns):
            # Unnamed rule: keyed by its position.
            if isinstance(unit, list):
                matcher.add(f"{position}", unit)
                continue

            rule_name = unit["name"]
            matcher.add(rule_name, unit["pattern"])
            metadata[rule_name] = {
                "types": unit.get("types") or [],
                "weight": unit.get("weight", self.def_weight),
            }

        return matcher, metadata



    def get_match_data(self, token, match_id, match_type):
        match_type_data = self.match_type_to_data[match_type]
        
        if match_id in match_type_data:
            return {
                "token": token,
                "types": match_type_data[match_id].get("types", []),
                "weight": match_type_data[match_id].get("weight", self.def_weight)
            }
        else:
            return {
                "token": token,
                "types": [],
                "weight": self.def_weight
            }



    def bad_pos(self, pos):
        return self.def_pos and pos not in self.def_pos



    def bad_tag(self, tag):
        return self.def_tag and tag not in self.def_tag



    def bad_token(self, token):
        return self.bad_pos(token.pos_) or self.bad_tag(token.tag_)



    def match_tokens(self, verbose=False):
        """Match the configured keywords against the main SpaCy document.

        Matching happens in three phases, in this order:
          1. Regex: every regex in self.regex is searched in the lowercased
             document text; the tokens of each matched span are recorded.
          2. Rules: the Matcher in self.rules is run on the document; the
             tokens of each matched span are recorded.
          3. Vocab: every token not yet matched is compared with each vocab
             word, first by lemma equality and then by vector similarity
             against the word's threshold; the first vocab word that
             matches wins.
        In every phase a token is skipped when bad_token() rejects it.
        Phases 1 and 2 do not check for tokens that were already recorded,
        so the same token can appear more than once in the results.

        Parameters:
            verbose: enables debug output, further gated by VERBOSE_LEVEL.

        Returns:
            (matched_data, mapped_matched_data, matched_tokens):
            the list of match dicts (see get_match_data), a dict from token
            to its match dict, and the list of matched tokens.

        Raises:
            Exception("DNE"): when main has no SpaCy document or no index map.
        """
        # SpaCy Doc DNE or Indexing Map DNE
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        
        matched_data = []
        matched_tokens = []

        # Match by Regex
        text = self.main.sp_doc.text.lower()
        
        for regex in self.regex:
            matches = [(match.start(), match.end()) for match in re.finditer(regex, text, re.IGNORECASE)]
            
            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\t'{regex}' Regex Matches: {matches}")
            
            for l_char_index, r_char_index in matches:
                # match.end() is exclusive, hence the -1 to pass the index of
                # the last matched character.
                # NOTE(review): presumably get_span_at_indices returns the span
                # covering both character indices -- confirm against main.
                span = self.main.get_span_at_indices(l_char_index, r_char_index - 1)

                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tSpan Matched: {span}")

                for token in span:
                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tPossible Regex Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")
                        
                    if self.bad_token(token):
                        continue
                    
                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tRegex Matched Token '{token}'")
                        
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, regex, Keywords.REGEX))

        # Match by Rules
        matches = self.rules(self.main.sp_doc)

        if verbose and VERBOSE_LEVEL >= 2:
            print(f"\tRule Matches: {matches}")
        
        for match_id, start, end in matches:
            span = self.main.sp_doc[start:end]
            # Translate the match id back into the rule's name, which is the
            # key used for the rule metadata.
            name = self.main.sp_nlp.vocab.strings[match_id]

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tPattern '{name}' Matched Span: {span}")
            
            for token in span:
                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tPossible Rule Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")
                    
                if self.bad_token(token):
                    continue
                
                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tRule Matched Token '{token}'")

                matched_tokens.append(token)
                matched_data.append(self.get_match_data(token, name, Keywords.RULES))

        # Match by Vocab
        for token in self.main.sp_doc:
            if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tPossible Vocab Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")
                    
            # Tokens already matched by a regex or a rule are not re-checked.
            if self.bad_token(token) or token in matched_tokens:
                continue

            # The lowercased token is parsed on its own: the tag, part of
            # speech, lemma and similarity used below come from this
            # one-word doc, not from the token in its sentence.
            token_doc = self.main.sp_nlp(token.lower_)
            token_lemma = " ".join([t.lemma_ for t in token_doc])
            
            for vocab_word in self.vocab:
                # Ensure Correct Tag
                if vocab_word.get("tag"):
                    if not [t for t in token_doc if t.tag_ in vocab_word.get("tag")]:
                        if verbose and VERBOSE_LEVEL >= 4:
                            print(f"\t\t\tToken '{token_doc}' not in Vocab Word '{vocab_word['doc']}' Tags ({vocab_word.get('tag')})")
                        continue
                
                # Ensure Correct PoS
                if vocab_word.get("pos"):
                    if not [t for t in token_doc if t.pos_ in vocab_word.get("pos")]:
                        if verbose and VERBOSE_LEVEL >= 4:
                            print(f"\t\t\tToken '{token_doc}' not in Vocab Word '{vocab_word['doc']}' Speech ({vocab_word.get('pos')})")
                        continue

                # Check Lemma
                if verbose and VERBOSE_LEVEL >= 4:
                    print(f"\t\t\t{token_doc} Lemma ({token_lemma}) and {vocab_word['doc']} Lemma ({vocab_word['lemma']})")
                    
                if token_lemma == vocab_word["lemma"]:
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, vocab_word["doc"].text, Keywords.VOCAB))

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tVocab (Lemma) Matched Token '{token}'")
                    
                    # First matching vocab word wins.
                    break

                # Check Similarity
                similarity = vocab_word["doc"].similarity(token_doc)

                if verbose and VERBOSE_LEVEL >= 4:
                    print(f"\t\t\t{token_doc} and {vocab_word['doc']} Similarity: {similarity}")
                    
                # Plain-string vocab words carry no threshold of their own and
                # use the default.
                if similarity >= vocab_word.get("threshold", self.def_threshold):
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, vocab_word["doc"].text, Keywords.VOCAB))

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tVocab Matched Token '{token}'")
                        
                    break

        # Mapping Match(ed Token) Data
        # When a token was matched more than once, the last match is kept.
        mapped_matched_data = {}
        for matched_token_data in matched_data:
            mapped_matched_data[matched_token_data["token"]] = matched_token_data

        if verbose and VERBOSE_LEVEL >= 1:
            print("Output of match_tokens")
            print(f"Token Data: {matched_data}")
            print(f"Mapped Token Data: {mapped_matched_data}")
            print(f"Token: {matched_tokens}")
        
        return matched_data, mapped_matched_data, matched_tokens

In [111]:
class ExperimentKeywords(Keywords):
    """Keywords suggesting that a text describes an experiment or study.

    Matching is vocabulary-based only, restricted to verbs, nouns and
    adjectives, with a similarity threshold of 0.8.
    """

    def __init__(self, main):
        super().__init__(
            main, 
            vocab=[
                "study", 
                "hypothesis", 
                "experiment", 
                "found", 
                "discover", 
                "compare", 
                "finding", 
                "result", 
                "test", 
                "examine", 
                "model",
                "measure",
                "manipulate",
                "assess",
                "conduct",
                "data",
                "analyze",
                "sample",
                "observe",
                "observation",
                "predict",
                "suggest",
                "method",
                "investigation",
                "trial",
                "experimental",
                "evidence",
                "demonstrate",
                "analysis",
                "show",
                "comparable",
                "control group", 
                "independent",
                "dependent",
                "applied",
                "treatment",
                # Was misspelled "survery", which is not a real word.
                "survey",
                "evaluate",
                "ran"
            ],
            def_pos=["VERB", "NOUN", "ADJ"], 
            def_threshold=0.8
        )

In [112]:
class NegativeExperimentKeywords(Keywords):
    """Keywords suggesting that a text is NOT reporting an experiment.

    Matching is rule-based only (theories, reviews and meta-analyses),
    restricted to verbs, nouns and adjectives.
    """

    def __init__(self, main):
        super().__init__(
            main,
            patterns=[
                [
                    [{"LOWER": {"IN": ["theory", "theorized", "theories", "review", "reviews", "meta-analysis"]}}],
                    # A single-token pattern only sees "meta-analysis" when
                    # the tokenizer keeps it as one token. SpaCy's default
                    # English tokenizer splits on the hyphen, so the
                    # three-token form is matched as well.
                    # NOTE(review): confirm against the pipeline loaded in
                    # main.sp_nlp.
                    [{"LOWER": "meta"}, {"ORTH": "-"}, {"LOWER": "analysis"}],
                ]
            ],
            def_pos=["VERB", "NOUN", "ADJ"], 
            def_threshold=0.8
        )

In [113]:
class NegativeTopicKeywords(Keywords):
    """Keywords for topics that count against a text (evolution, co-evolution).

    Matching is regex-based only, restricted to verbs, nouns and adjectives.
    """

    def __init__(self, main):
        topic_regexes = [
            r"co-?evolution",
            r"evolution",
        ]
        allowed_pos = ["VERB", "NOUN", "ADJ"]

        super().__init__(
            main,
            regexes=topic_regexes,
            def_pos=allowed_pos,
            def_threshold=0.8,
        )

In [114]:
class CauseKeywords(Keywords):
    """Keywords expressing that one thing causes or influences another.

    Matching is vocabulary-based only, restricted to verbs, subordinating
    conjunctions and nouns, with a similarity threshold of 0.8.
    """

    def __init__(self, main):
        super().__init__(
            main, 
            vocab=[
                "increase", 
                "decrease", 
                "change", 
                "shift", 
                "cause", 
                "produce", 
                "trigger", 
                "suppress", 
                "inhibit",
                "encourage",
                "allow",
                "influence",
                "affect",
                "alter",
                "induce",
                "result in",
                # "associated with",
                # "correlated with",
                "contribute",
                "impact",
                "deter",
                "depressed",
                "when",
                "because",
                "reduce",
                "killed",
                # "supported"
            ],
            def_pos=["VERB", "SCONJ", "NOUN"],
            # def_tag=["VB", "VBD", "WRB", "IN", "VBG"],
            # def_threshold=0.75
            def_threshold=0.8
        )


    
    def update(self, verbose=False):
        """Re-run matching, then pass the matched tokens through filter_tokens."""
        Keywords.update(self, verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)


    
    def filter_tokens(self, tokens, verbose=False):
        """Return `tokens` unchanged (no filtering is applied for causes).

        Raises:
            Exception("DNE"): when main has no SpaCy document or no index map.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")
        
        return tokens

In [115]:
class ChangeKeywords(Keywords):
    """Keywords that describe a change in quantity or degree (e.g. "more", "decline")."""

    def __init__(self, main):
        super().__init__(
            main,
            # NOTE: every entry needs its trailing comma. Adjacent string
            # literals are concatenated by Python, which previously turned
            # "greater"/"shift" into "greatershift" and "rise"/"surge" into
            # "risesurge".
            vocab=[
                "few",
                "more",
                "increase",
                "decrease",
                "less",
                "short",
                "long",
                "greater",
                "shift",
                "fluctuate",
                "adapt",
                "grow",
                "rise",
                "surge",
                "intensify",
                "amplify",
                "multiply",
                "decline",
                "reduce",
                "drop",
                "diminish",
                "fall",
                "lessen",
                "doubled",
                "tripled",
                "lower",
                "adjust",
                "reject",
            ],
            regexes=[
                # Match Examples:
                # 1. "one... as..."
                # 2. "2x than..."
                # The decimal point is escaped so that only real decimals
                # such as "1.5x" match (an unescaped "." matches any character).
                r"(one|two|three|four|five|six|seven|eight|nine|ten|twice|thrice|([0-9]+|[0-9]+\.[0-9]+)(x|%))[\s-]+[^\s]*[\s-]+(as|more|than|likely)([\s-]+|$)"
            ],
            def_pos=["NOUN", "ADJ", "ADV", "VERB", "NUM"],
            def_threshold=0.75
        )

    def update(self, verbose=False):
        """Re-run the base keyword matching, then extend it via `filter_tokens`."""
        Keywords.update(self, verbose=verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Return the change tokens of the document, in document order.

        A token is kept when it was already matched (is in `tokens`) or when
        it is a comparative adjective (POS "ADJ", tag "JJR"), e.g. "bigger"
        or "better".

        Raises:
            Exception: "DNE" when the main parser has no (non-empty) spaCy
                doc or no character index map yet.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        # Set membership avoids rescanning the match list for every token.
        matched = set(tokens)

        return [
            token
            for token in self.main.sp_doc
            if token in matched or (token.pos_ == "ADJ" and token.tag_ == "JJR")
        ]

In [116]:
class TraitKeywords(Keywords):
    """Keywords that name a trait of a species (size, behaviour, diet, ...).

    Some regexes carry "types" so that differently worded traits can be
    recognised as the same kind of trait (see `merge_traits`).
    """

    # Trait types attached to individual regexes.
    FOOD = "Food"
    PRESENT = "Present"
    NOT_APPLICABLE = "N/A"

    def __init__(self, main):
        super().__init__(
            main,
            regexes=[
                r"behaviou?r",
                r"[^A-Za-z]+rate",
                "colou?r",
                "biomass",
                r"[^A-Za-z]+mass",
                r"[^A-Za-z]+size",
                "number",
                "length",
                "pattern",
                "weight",
                "shape",
                "efficiency",
                "trait",
                "phenotype",
                "demography",
                "scent",
                "population (structure|mechanic)s?",
                "ability",
                "capacity",
                "height",
                "width",
                "[A-Za-z]+span",
                {"regex": "diet", "types": [TraitKeywords.FOOD]},
                {"regex": "food", "types": [TraitKeywords.FOOD, TraitKeywords.NOT_APPLICABLE]},
                {"regex": "feeding", "types": [TraitKeywords.FOOD]},
                "nest",
                "substrate",
                "breeding",
                r"[^A-Za-z]+age[^A-Za-z]+",
                "lifespan",
                "development",
                "output",
                "time",
                "period",
                "level",
                "configuration",
                "dimorphism",
                "capability",
                "regulation",
                "excretion",
                "luminescence",
                r"[^A-Za-z]+role",
                "sensitivity",
                "resistance",
                r"(un|(^|\s)[A-Za-z]*-)infected",
                "temperature",
                "density",
                {"regex": "presen(t|ce)", "types": [TraitKeywords.PRESENT]},
                {"regex": "absen(t|ce)", "types": [TraitKeywords.PRESENT]},
                "oviposition",
                "semiochemicals",
                "chemicals",
                "content",
            ],
            # Ideally, I would only include nouns, but sometimes
            # they're recognized as adjectives.
            def_pos=["NOUN", "ADJ"]
        )

    def update(self, verbose=False):
        """Re-run the base keyword matching, then narrow it via `filter_tokens`."""
        Keywords.update(self, verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Keep only the trait tokens that plausibly describe a species.

        A token survives when
          1. the unit it expands to (bounded by its sentence) contains a
             species according to `self.main.species.has_species`, and
          2. it is not a non-final token of a noun chunk, i.e. if it sits in
             a noun chunk it must be the chunk's last token.

        Args:
            tokens: candidate trait tokens.
            verbose: enables debug printing (also gated by VERBOSE_LEVEL)
                and is forwarded to the helpers.

        Raises:
            Exception: "DNE" when the main parser has no (non-empty) spaCy
                doc or no character index map yet.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Unfiltered Trait Tokens: {tokens}")

        # The noun chunks do not depend on the token, so compute them once
        # (and only when there is something to filter).
        noun_chunks = list(self.main.sp_doc.noun_chunks) if tokens else []

        filtered = []
        for token in tokens:
            expanded_token = self.main.expand_unit(
                il_unit=token.i,
                ir_unit=token.i,
                il_boundary=token.sent.start,
                ir_boundary=token.sent.end-1,
                speech=["PUNCT"],
                include=False,
                verbose=verbose
            )

            if not self.main.species.has_species(expanded_token, verbose=verbose):
                continue

            # Reject tokens that only modify the head of a noun chunk.
            if any(token in chunk and chunk[-1] != token for chunk in noun_chunks):
                continue

            filtered.append(token)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Filtered Trait Tokens: {filtered}")

        return filtered

    def _span_types(self, span):
        """Collect the trait types recorded for every token of `span`."""
        types = []
        for token in span:
            token_data = self.main.trait.mapped_token_data.get(token)
            if token_data:
                types.extend(token_data["types"])
        return types

    def merge_traits(self, traits):
        """Group trait mentions that refer to the same trait.

        Two mentions are grouped when they share base nouns, have the same
        text, or are both typed as `TraitKeywords.FOOD`.

        Args:
            traits: iterable of trait spans.

        Returns:
            dict mapping the first mention of each trait to the list of all
            of its mentions (the first mention included).
        """
        merged = {}

        for trait in traits:
            trait_types = None  # computed lazily, at most once per trait
            target = None

            for m_trait in merged:
                if self.main.has_same_base_nouns(trait, m_trait) or self.main.is_same_text(trait, m_trait):
                    target = m_trait
                    break

                if trait_types is None:
                    trait_types = self._span_types(trait)
                if not trait_types:
                    continue

                m_trait_types = self._span_types(m_trait)
                if not m_trait_types:
                    continue

                # Differently worded food traits ("diet", "feeding", ...)
                # count as the same trait.
                if TraitKeywords.FOOD in trait_types and TraitKeywords.FOOD in m_trait_types:
                    target = m_trait
                    break

            if target is None:
                merged[trait] = [trait]
            else:
                merged[target].append(trait)

        return merged

In [117]:
class TestKeywords(Keywords):
    """Keywords for verbs/nouns of testing: compare, examine, evaluate, assess."""

    def __init__(self, main):
        testing_vocab = ["compare", "examine", "evaluate", "assess"]
        allowed_pos = ["VERB", "NOUN"]

        super().__init__(
            main,
            vocab=testing_vocab,
            def_pos=allowed_pos,
            def_threshold=0.8,
        )

In [118]:
class VariabilityKeywords(Keywords):
    """Keywords and patterns that hint at a variable being compared or varied."""

    def __init__(self, main):
        super().__init__(
            main,
            vocab=[
                {"word": "different", "pos": ["ADJ", "NOUN"]},
                {"word": "vary", "pos": ["VERB", "NOUN"]},
                {"word": "varied", "pos": ["VERB", "NOUN"]},
                {"word": "compare", "pos": ["VERB"]}
            ],
            regexes=[
                r"between",
                r"against",
                r"independen(t|ts|tly|cy)",
                r"dependen(t|ts|tly|cy)",
                r"treatments?",
                r"effect",
                r"control",
                r"(with|without)[A-Za-z]*(with|without)",
                # "un-X ... X" / "X ... not X" contrasts. The separator class
                # is "[- ]" (hyphen or space); inside a character class "|"
                # is a literal pipe, not an alternation.
                r"(^| )(un|not)[- ]?([A-Za-z]+) [^!;?.\n]* \3",
                r"([A-Za-z]+) [^!;?.\n]* (un|not)[- ]?\1( |$)",
                # I've added these two words because they can sometimes
                # hint at a variable.
                r"when",
                r"where",

            ],
            patterns=[
                [[{"LOWER": {"IN": ["neither", "either", "both"]}}, {"OP": "*", "TAG": {"NOT_IN": ["."]}}, {"LOWER": {"IN": ["or", "and"]}}]],
                [[{"LOWER": {"IN": ["with", "without"]}}, {"OP": "*", "TAG": {"NOT_IN": ["."]}}, {"LOWER": {"IN": ["with", "without"]}}]],
                [[{"LOWER": {"IN": ["at"]}}, {"POS": "NUM"}]],
                [[{"LOWER": {"IN": ["at"]}}, {"LOWER": {"IN": ["several", "unique", "multiple", "different"]}}]],
            ],
        )

In [202]:
class Main(Base):
    def __init__(self):
        """Load the spaCy pipeline and create the parsers that analyse a document.

        No document is attached yet; `sp_doc` and `index_map` stay None
        until `update_doc` / `update_text` is called.
        """
        # The pipeline and the (empty) doc are set before the base
        # initialiser runs, which receives this instance as its `main`.
        self.sp_nlp = spacy.load("en_core_web_lg")
        self.sp_doc = None
        super().__init__(self)

        # Maps Character Position to Token in Document
        # Used to handle differences between different
        # pipelines and tools.
        self.index_map = None

        # Parsers
        # The commented-out parsers at the bottom are currently
        # not being used, I may add them back later.
        self.parts = Parts(self)
        self.species = Species(self)
        self.trait = TraitKeywords(self)
        self.cause = CauseKeywords(self)
        self.change = ChangeKeywords(self)
        # NOTE(review): `valid_trait_variation` reads `self.experiment`,
        # `self.variability` and `self.test`; unless they are provided
        # elsewhere, it fails with AttributeError while these lines stay
        # disabled - confirm.
        # self.experiment = ExperimentKeywords(self)
        # self.variability = VariabilityKeywords(self)
        # self.test = TestKeywords(self)
        # self.not_experiment = NegativeExperimentKeywords(self)  # NOTE(review): comment was garbled ("NegativeExperimentKplf)"); class name presumed - confirm


    
    def update_doc(self, doc, verbose=False):
        """Attach an already parsed spaCy doc and refresh every active parser.

        Args:
            doc: the spaCy doc to analyse.
            verbose: accepted for API symmetry; the parsers are always
                updated with `verbose=False`.
        """
        self.sp_doc = doc
        self.index_map = self.load_index_map()

        self.parts.update()
        self.species.update(doc.text, verbose=False)

        # Keyword parsers, refreshed in the order trait -> cause -> change.
        for keywords in (self.trait, self.cause, self.change):
            keywords.update(verbose=False)

        # Disabled together with the matching parsers in __init__:
        # self.experiment.update(verbose=False)
        # self.not_experiment.update(verbose=False)
        # self.not_topic.update(verbose=False)
        # self.variability.update(verbose=False)
        # self.test.update(verbose=False)


    
    def update_text(self, text, verbose=False):
        """Parse raw `text` with the spaCy pipeline and analyse the resulting doc."""
        parsed = self.sp_nlp(text)
        self.sp_doc = parsed
        self.update_doc(parsed, verbose=verbose)



    def load_index_map(self):
        """Build a lookup from character offset to the token covering it.

        Returns:
            dict mapping every character index inside a token
            (`token.idx` up to, but excluding, `token.idx + len(token)`)
            to that token.

        Raises:
            Exception: "DNE" when no spaCy doc has been set.
        """
        if self.sp_doc is None:
            raise Exception("DNE")

        return {
            char_index: token
            for token in self.sp_doc
            for char_index in range(token.idx, token.idx + len(token))
        }



    def token_at_char(self, char_index):
        """Return the token that covers the character offset `char_index`.

        Raises:
            Exception: "DNE" when the doc or the index map is missing or
                empty; a "Token at Index ... Not Found" message when no
                token covers `char_index`.
        """
        missing_state = not self.sp_doc or not self.index_map
        if missing_state:
            raise Exception("DNE")

        if char_index not in self.index_map:
            raise Exception(f"Token at Index {char_index} Not Found")

        return self.index_map[char_index]
    

    
    def valid_trait_token(self, data, verbose=False):
        """Rate how convincing a single trait token is.

        Args:
            data: dict holding the token under "token" and the cause/change
                tokens of its sentence under "sent_cause_tokens" and
                "sent_change_tokens".
            verbose: enables debug printing (also gated by VERBOSE_LEVEL)
                and is forwarded to `find_unit_context`. It used to be
                overwritten with True, which ignored the caller's choice.

        Returns:
            0 when the token is not a trait token or is typed
            `TraitKeywords.NOT_APPLICABLE`; 1.0 when a cause or change token
            of the sentence lies in the token's context; 0.25 otherwise.
        """
        token = data["token"]

        if token not in self.trait.tokens:
            return 0

        # Check if Applicable
        token_data = self.trait.mapped_token_data[token]

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Token '{token}'")
            print(f"Types: {token_data['types']}")

        if TraitKeywords.NOT_APPLICABLE in token_data["types"]:
            return 0

        # Context of the token, bounded by its own sentence.
        token_context = set(self.find_unit_context(
            il_unit=token.i,
            ir_unit=token.i,
            il_boundary=token.sent.start,
            ir_boundary=token.sent.end-1,
            verbose=verbose)
        )

        causes = set(data["sent_cause_tokens"]).intersection(token_context)
        changes = set(data["sent_change_tokens"]).intersection(token_context)

        return 1.0 if causes or changes else 0.25



    def valid_species_token(self, data, verbose=False):
        """Return 1 when a species token has a cause or change token in its context, else 0.

        `data` holds the token under "token" and the sentence's cause/change
        tokens under "sent_cause_tokens" / "sent_change_tokens". Tokens that
        are not species tokens score 0.
        """
        token = data["token"]
        if token not in self.species.tokens:
            return 0

        # Context of the token, bounded by its own sentence.
        context = set(
            self.find_unit_context(
                il_unit=token.i,
                ir_unit=token.i,
                il_boundary=token.sent.start,
                ir_boundary=token.sent.end - 1,
                verbose=verbose,
            )
        )

        nearby_causes = context.intersection(data["sent_cause_tokens"])
        nearby_changes = context.intersection(data["sent_change_tokens"])

        if nearby_causes or nearby_changes:
            return 1
        return 0
    
    

    def valid_trait(self, verbose=False):
        """Score the trait coverage of the whole document.

        Returns:
            1 when some trait is mentioned at least twice in at least two
            different sentences, 0.33 when any trait was found at all, and
            0 otherwise.
        """
        single_token_spans = [self.sp_doc[token.i:token.i+1] for token in self.trait.tokens]
        traits = self.trait.merge_traits(single_token_spans)

        # For full points, you should have a trait that's mentioned
        # at least two times in different areas.
        for mentions in traits.values():
            if len(mentions) >= 2:
                sentence_starts = {mention.sent.start for mention in mentions}
                if len(sentence_starts) >= 2:
                    return 1

        return 0.33 if traits else 0


    
    def valid_trait_variation(self, verbose=False):
        """Score (0 to 1) how strongly the text describes a tested trait variation.

        Every sentence I that contains a test or experiment keyword plus a
        trait or species is paired with every sentence J at or after it that
        contains a species together with a cause or change keyword.

        Points for I (at most 0.5):
          * 0.25 for a test keyword, otherwise 0.10 for an experiment keyword;
          * plus 0.25 when a variability keyword has a trait or species in
            its context (a "variable"), otherwise plus 0.15.

        Points for J (at most 0.5):
          * 0.50 when J is a different sentence that refers to one of the
            variables again (same trait type, same trait text or same
            species);
          * 0.25 otherwise.

        The sum is scaled by 0.6375 when I has no variability keyword. The
        best pair wins; a score of 1 ends the search early.

        NOTE(review): this reads `self.test`, `self.experiment` and
        `self.variability`, whose creation is commented out in `__init__`;
        confirm they are provided before this method is called.

        Args:
            verbose: enables debug printing (also gated by VERBOSE_LEVEL)
                and is forwarded to the helpers. It used to be overwritten
                with True, which ignored the caller's choice.

        Returns:
            The best score found, between 0 and 1.
        """
        log_l1 = verbose and VERBOSE_LEVEL >= 1
        log_l2 = verbose and VERBOSE_LEVEL >= 2
        log_l3 = verbose and VERBOSE_LEVEL >= 3

        max_trait_variation_points = 0

        sentences = list(self.sp_doc.sents)
        num_sentences = len(sentences)

        for i in range(num_sentences):
            sent_i = sentences[i]
            sent_i_tokens = set(sent_i)

            if log_l2:
                print(f"\tSentence I: {sent_i}")

            sent_i_test_tokens = sent_i_tokens.intersection(self.test.tokens)
            sent_i_experiment_tokens = sent_i_tokens.intersection(self.experiment.tokens)

            if log_l2:
                print(f"\tSentence I Test Tokens: {sent_i_test_tokens}")
                print(f"\tSentence I Experiment Tokens: {sent_i_experiment_tokens}")

            if not sent_i_test_tokens and not sent_i_experiment_tokens:
                if log_l2:
                    print(f"\tNo Experiment or Test Tokens in Sentence I")
                continue

            # A test keyword outranks an experiment keyword.
            trait_variation_points_i = 0.25 if sent_i_test_tokens else 0.10

            if log_l2:
                print(f"\tTrait Variation Points for I: {trait_variation_points_i}")

            sent_i_trait_tokens = sent_i_tokens.intersection(self.trait.tokens)
            sent_i_species_tokens = sent_i_tokens.intersection(self.species.tokens)

            if log_l2:
                print(f"\tSentence I Trait Tokens: {sent_i_trait_tokens}")

            if not sent_i_trait_tokens and not sent_i_species_tokens:
                if log_l2:
                    print(f"\tNo Trait or Specie Tokens in Sentence I")
                continue

            sent_i_variability_tokens = sent_i_tokens.intersection(self.variability.tokens)

            if log_l2:
                print(f"\tSentence I Variability Tokens: {sent_i_variability_tokens}")

            # Points are only deducted when the sentence has no variability
            # keyword at all.
            deduct_points = not sent_i_variability_tokens

            # Traits / species found in the context of a variability keyword.
            t_variables = set()
            s_variables = set()

            for token in sent_i_variability_tokens:
                # One context lookup serves both the trait and the species check.
                context = set(self.find_unit_context(
                    il_unit=token.i,
                    ir_unit=token.i,
                    il_boundary=token.sent.start,
                    ir_boundary=token.sent.end-1,
                    speech=["PUNCT"],
                    include=False,
                    comma_encloses=True,
                    verbose=verbose
                ))
                trait_in_context = context.intersection(self.trait.tokens)
                specie_in_context = context.intersection(self.species.tokens)

                if log_l3:
                    print(f"\t\tVariability Token '{token}' Traits in Context: {trait_in_context}")
                    print(f"\t\tVariability Token '{token}' Specie in Context: {specie_in_context}")

                if not specie_in_context and not trait_in_context:
                    if log_l3:
                        print(f"\t\tNo Traits in Variability Token '{token}' Context")
                    continue

                t_variables.update(trait_in_context)
                s_variables.update(specie_in_context)

            t_variables = list(t_variables)
            s_variables = list(s_variables)

            if log_l2:
                print(f"\tt_variables: {t_variables}")
                print(f"\ts_variables: {s_variables}")

            if t_variables or s_variables:
                trait_variation_points_i += 0.25
            else:
                trait_variation_points_i += 0.15

            if log_l2:
                print(f"\tTrait Variation Points for I: {trait_variation_points_i}")

            assert trait_variation_points_i <= 0.5

            for j in range(i, num_sentences):
                sent_j = sentences[j]
                sent_j_tokens = set(sent_j)

                if log_l2:
                    print(f"\tSentence J: {sent_j}")

                sent_j_cause_tokens = sent_j_tokens.intersection(self.cause.tokens)
                sent_j_change_tokens = sent_j_tokens.intersection(self.change.tokens)
                sent_j_species_tokens = sent_j_tokens.intersection(self.species.span_starts)
                sent_j_trait_tokens = sent_j_tokens.intersection(self.trait.tokens)

                if log_l2:
                    print(f"\tSentence J Cause Tokens: {sent_j_cause_tokens}")
                    print(f"\tSentence J Change Tokens: {sent_j_change_tokens}")
                    print(f"\tSentence J Species Tokens: {sent_j_species_tokens}")
                    print(f"\tSentence J Trait Tokens: {sent_j_trait_tokens}")

                if not sent_j_species_tokens or (not sent_j_cause_tokens and not sent_j_change_tokens):
                    if log_l2:
                        print(f"\tUnsatisfied Conditions for Sentence J")
                    continue

                # From here on sentence J is known to contain a species.
                if i == j or (not t_variables and not s_variables):
                    trait_variation_points_j = 0.25
                else:
                    # Check if Variable Referenced Again via Types
                    variable_types = set(self.flatten([self.trait.mapped_token_data[token]["types"] for token in t_variables]))
                    sent_j_trait_types = set(self.flatten([self.trait.mapped_token_data[token]["types"] for token in sent_j_trait_tokens]))

                    if log_l2:
                        print(f"\tVariable Types: {variable_types}")
                        print(f"\tTrait Types in Sentence J: {sent_j_trait_types}")

                    # Check if Variable Referenced Again via Literals
                    variable_strings = set([token.lower_ for token in t_variables])
                    sent_j_trait_strings = set([token.lower_ for token in sent_j_trait_tokens])

                    if log_l2:
                        print(f"\tVariable Trait (as Strings): {variable_strings}")
                        print(f"\tTrait (as Strings) in Sentence J: {sent_j_trait_strings}")

                    # Check if Trait Referenced
                    t_variable_referenced = bool(variable_types & sent_j_trait_types) or bool(variable_strings & sent_j_trait_strings)

                    # Check if Species Referenced
                    sent_j_species_spans = [self.sp_doc[token.i:token.i+1] for token in sent_j_species_tokens]
                    s_variables_spans = [self.sp_doc[token.i:token.i+1] for token in s_variables]
                    overlap = [self.species.find_same_species(sent_j_species_spans, species) for species in s_variables_spans]
                    s_variable_referenced = any(overlap)

                    if log_l2:
                        print(f"\tT Variable Referenced? {t_variable_referenced}")
                        print(f"\tS Variable Referenced? {s_variable_referenced}")

                    if t_variable_referenced or s_variable_referenced:
                        trait_variation_points_j = 0.50
                    else:
                        trait_variation_points_j = 0.25

                if log_l2:
                    print(f"\tTrait Variation Points for J: {trait_variation_points_j}")

                assert trait_variation_points_j <= 0.5

                if log_l2:
                    print(f"\ti: {i}")
                    print(f"\tj: {j}")

                trait_variation_points = trait_variation_points_i + trait_variation_points_j

                if deduct_points:
                    trait_variation_points *= 0.6375

                if log_l2:
                    print(f"\tTrait Variation Points: {trait_variation_points}")

                max_trait_variation_points = max(max_trait_variation_points, trait_variation_points)

                # Nothing can beat a full score.
                if max_trait_variation_points >= 1:
                    return 1

        if log_l1:
            print(f"Max Trait Variation Points: {max_trait_variation_points}")

        return max_trait_variation_points


    
    def update_seen_species(self, data, verbose=False):
        """Record the species starting at `data["token"]` in the document- and sentence-level tallies.

        `data["seen_species"]` (species -> number of mentions in the text)
        and `data["sent_seen_species"]` (species -> {"visits", "points"} for
        the current sentence) are updated in place and also handed back.

        Returns:
            dict with the keys "seen_species", "sent_seen_species", "ref",
            "seen_in_sent" and "sent_num_unique_species".
        """
        # Unpacking Data
        token = data["token"]
        seen_species = data["seen_species"]
        sent_seen_species = data["sent_seen_species"]
        sent_num_unique_species = data["sent_num_unique_species"]

        span = self.species.span_at_token(token)

        # Tally for the entire text: count the mention under the species'
        # earlier reference when there is one.
        text_ref = self.species.find_same_species(seen_species.keys(), span, verbose=verbose)
        if text_ref:
            seen_species[text_ref] += 1
        else:
            seen_species[span] = 1

        # Tally for the sentence. Only a species that is new to the sentence
        # raises the unique-species count; the "points" entry lets the
        # caller make sure a species is not rewarded twice per sentence.
        sent_ref = self.species.find_same_species(sent_seen_species.keys(), span, verbose=verbose)
        if not sent_ref:
            sent_seen_species[span] = {
                "visits": 1,
                "points": 0
            }
            ref = span
            sent_num_unique_species += 1
        else:
            sent_seen_species[sent_ref]["visits"] += 1
            ref = sent_ref

        return {
            "seen_species": seen_species,
            "sent_seen_species": sent_seen_species,
            # Reference to the species, either its previous
            # reference, or the given species.
            "ref": ref,
            # Reference to the species that was last seen
            # in the sentence.
            "seen_in_sent": sent_ref,
            "sent_num_unique_species": sent_num_unique_species,
        }

    
    
    def score(self, verbose=False):
        NUM_CATEGORIES = 6

        TRAIT = 0
        SPECIES = 1
        EXPERIMENT = 2
        INTERACTION = 3
        NOT_TOPIC = 4
        TRAIT_VARIATION = 5

        # Max # of Points of Category per Sentence (MPC)
        # A category can collect points from each sentence. However,
        # there's a maximum number of points it can collect. This is
        # determined by the MPC.
        MPC = [1] * NUM_CATEGORIES
    
        # Points per Instance of Category (PIC)
        # Each token is evaluated to check whether a category
        # can be given points. The number of points given, if
        # the token is determined to be satisfactory, is the PIC.
        # The PIC is less than or equal to the MPC for the corresponding
        # category. The idea behind the PIC and MPC is similar to how
        # sets work in tennis: you're not immediately awarded the full points
        # for the set (MPC) if your opponent fails to return the ball,
        # instead you're given a smaller # of points (PIC) that allow you to
        # incrementally win the set (category).
        PIC = [0] * NUM_CATEGORIES
        PIC[TRAIT] = MPC[TRAIT]
        PIC[SPECIES] = MPC[SPECIES]/2.0
        PIC[EXPERIMENT] = MPC[EXPERIMENT]*1
        PIC[INTERACTION] = MPC[INTERACTION]/2.0
        PIC[NOT_TOPIC] = MPC[NOT_TOPIC]

        for i in range(NUM_CATEGORIES):
            assert 0 <= PIC[i] <= MPC[i]

        # Category Weights (CW)
        # It may be helpful to weigh a certain category's fraction of total points
        # more or less than another's. Thus, at the end, we'll take a
        # weighted average of the category's FTP. The weights must add up to 1.
        CW = [0] * NUM_CATEGORIES
        CW[TRAIT] = 0.3
        CW[SPECIES] = 0.1
        CW[EXPERIMENT] = 0.1
        CW[INTERACTION] = 0.1
        CW[NOT_TOPIC] = 0.1
        CW[TRAIT_VARIATION] = 0.3

        assert round(np.sum(CW)) == 1

        # Leniency
        # There are certain categories that aren't going to be as frequent as others.
        # For example, the trait category. You could try and decrease the influence
        # of said category by lowering its MPC and/or increasing the PIC (so that it's
        # easier to achieve the FTP). However, this could make it harder to meaningfully
        # represent the category. The idea of leniency is to remove (some) sentences that had 0
        # points from the scoring. This increases the FTP as, for example, instead of comparing
        # 0.5 points to a total of 2.5 points, you can compare 0.5 to 2.0 points, and so on.
        # A leniency of 1 means that all sentences that received 0 points will be removed from
        # the scoring. A leniency of 0 means that all the sentences are included in the scoring.
        LEN = [0] * NUM_CATEGORIES
        LEN[TRAIT] = 0
        LEN[SPECIES] = 0.5
        LEN[EXPERIMENT] = 0.5
        LEN[INTERACTION] = 0.2
        LEN[NOT_TOPIC] = 0

        for i in range(NUM_CATEGORIES):
            assert 0 <= LEN[i] <= 1

        # Banned Sentences
        # Not allowed to benefit from leniency.
        banned = [[0] * len(list(self.sp_doc.sents)) for _ in range(NUM_CATEGORIES)]

        # Points
        points = [0] * NUM_CATEGORIES
        binned_points = [0] * NUM_CATEGORIES
        
        num_zero_pt_sents = [0] * NUM_CATEGORIES
        seen_species = {}

        # For Testing
        species_instances = []
        interaction_instances = []
        
        if verbose and VERBOSE_LEVEL >= 1:
            print("Extracted Information")
            print(f"Cause Tokens: {self.cause.tokens}")
            print(f"Change Tokens: {self.change.tokens}")
            print(f"Trait Tokens: {self.trait.tokens}")
            print(f"Species Tokens: {self.species.tokens}")
            print(f"Experiment Tokens: {self.experiment.tokens}")
            print(f"Not-Experiment Tokens: {self.not_experiment.tokens}")
            print(f"Not-Topic Tokens: {self.not_topic.tokens}")
            print(f"Variability Tokens: {self.variability.tokens}")
            print(f"Test Tokens: {self.test.tokens}")
        
        for sent_i, sent in enumerate(self.sp_doc.sents):
            # Current Points in Sentence
            curr_points = [0] * NUM_CATEGORIES
            sent_tokens = [token for token in sent]

            data = {
                "sent_cause_tokens": set(sent_tokens).intersection(self.cause.tokens),
                "sent_change_tokens": set(sent_tokens).intersection(self.change.tokens),
                "sent_seen_species": {},
                "sent_num_unique_species": 0    
            }
            
            species_instances.append([])
            interaction_instances.append([])

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tSentence: {sent}")
                print(f"\tSentence Cause Tokens: {data['sent_cause_tokens']}")
                print(f"\tSentence Change Tokens: {data['sent_change_tokens']}")
            
            for token in sent_tokens:
                # If each category has reached their maximum number of points,
                # we can end the loop early.
                all_maxed = True
                for i in range(NUM_CATEGORIES):
                    if i == TRAIT_VARIATION:
                        continue
                    if curr_points[i] < MPC[i]:
                        all_maxed = False

                if all_maxed:
                    break

                # Update Token in Data
                data["token"] = token
                
                # Not Topic Points
                if curr_points[NOT_TOPIC] < MPC[NOT_TOPIC]:
                    if token in self.not_topic.tokens:
                        curr_points[NOT_TOPIC] += PIC[NOT_TOPIC]

                        if verbose and VERBOSE_LEVEL >= 3:
                            print(f"\t\t+ Points for Not-Topic")

                        
                # Trait Points
                if curr_points[TRAIT] < MPC[TRAIT]:
                    if token in self.trait.tokens:
                        scale = self.valid_trait_token(data, verbose=verbose)
                        curr_points[TRAIT] += scale * PIC[TRAIT]

                        if verbose and VERBOSE_LEVEL >= 3 and scale:
                            print(f"\t\t+ Points for Trait")
                
                
                # Not Experiment Points
                if token in self.not_experiment.tokens:
                    curr_points[EXPERIMENT] -= 2 * PIC[EXPERIMENT]
                    banned[EXPERIMENT][sent_i] = 1
                    
                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\t- Points for Experiment")
                
                
                # Experiment Points
                elif curr_points[EXPERIMENT] < MPC[EXPERIMENT]:
                    if token in self.experiment.tokens:
                        scale = 1 if token.pos_ == "VERB" else 0.5
                        curr_points[EXPERIMENT] += scale * PIC[EXPERIMENT]

                        if verbose and VERBOSE_LEVEL >= 3:
                            print(f"\t\t+ Points for Experiment")
                
                        
                # Species and/or Interaction Points
                if token in self.species.span_starts:
                    # Update Species
                    update_data = data
                    update_data["seen_species"] = seen_species
                    updated_data = self.update_seen_species(update_data, verbose=verbose)

                    # Unpacking Updated Data
                    seen_species = updated_data["seen_species"]
                    seen_in_sent = updated_data["seen_in_sent"]
                    sent_seen_species = updated_data["sent_seen_species"]
                    sent_num_unique_species = updated_data["sent_num_unique_species"]
                    ref = updated_data["ref"]
                    
                    data["sent_seen_species"] = sent_seen_species
                    data["sent_num_unique_species"] = sent_num_unique_species

                    if not seen_in_sent and sent_num_unique_species == 1 and sent_seen_species[ref]["visits"] == 1:
                        interaction_instances[-1].append(ref.text)
                    
                    if not seen_in_sent:
                        # Interaction Points
                        # NOTE: the `sent_seen_species[ref]["points"] <= 0` guard used
                        # for Species Points was considered for Interaction Points
                        # too, but it is deliberately not applied here.
                        if curr_points[INTERACTION] < MPC[INTERACTION]:
                            if sent_num_unique_species == 2:
                                curr_points[INTERACTION] = 2.0 * PIC[INTERACTION]
                                interaction_instances[-1].append(ref.text)
                                
                                if verbose and VERBOSE_LEVEL >= 3:
                                    print(f"\t\t+ Points for Interaction")
                            
                            elif sent_num_unique_species > 2:
                                curr_points[INTERACTION] += PIC[INTERACTION]
                                interaction_instances[-1].append(ref.text)
    
                                if verbose and VERBOSE_LEVEL >= 3:
                                    print(f"\t\t+ Points for Interaction")

                    # Species Points
                    # This check is intentionally separate from seen_in_sent:
                    # a species may already have been seen in the sentence, but
                    # if it has not been awarded any points yet, it can still
                    # be used to earn Species Points.
                    if sent_seen_species[ref]["points"] <= 0:
                        if curr_points[SPECIES] < MPC[SPECIES]:
                            scale = self.valid_species_token(data)
                            
                            curr_points[SPECIES] += scale * PIC[SPECIES]
                            sent_seen_species[ref]["points"] += scale * PIC[SPECIES]

                            if scale:
                                species_instances[-1].append(ref.text)
                            
                            if verbose and scale and VERBOSE_LEVEL >= 3:
                                print(f"\t\t+ Points for Species")

            
            # Add Sentence Points to Total Points
            for i in range(NUM_CATEGORIES):
                is_banned = banned[i][sent_i]

                if curr_points[i] <= 0 and not is_banned:
                    num_zero_pt_sents[i] += 1
                
                if not is_banned:
                    points[i] += max(0, min(curr_points[i], MPC[i]))

        
        # Trait & Trait Variation Points
        points[TRAIT_VARIATION] = self.valid_trait_variation(verbose=False)
        points[TRAIT] = self.valid_trait(verbose=False)
        
        # Bins
        bins = []
        for i in range(NUM_CATEGORIES):
            bins.append([-math.inf, 0.33, 0.66, math.inf])
        bins[TRAIT_VARIATION] = [-math.inf, 0.5, 1, math.inf]
        
        # Calculating Score            
        NUM_SENTENCES = len(list(self.sp_doc.sents))
        score = 0
        binned_score = 0

        for i in range(NUM_CATEGORIES):
            if i not in [TRAIT, TRAIT_VARIATION]:
                num_non_zero_pt_sents = NUM_SENTENCES - num_zero_pt_sents[i]
                
                banned_tax = 0
                for b in banned[i]:
                    if b:
                        banned_tax += 1
                
                # print(f"Banned Tax: {banned_tax}")
                lenient_num_sentences = max(num_non_zero_pt_sents, (1 - LEN[i]) * (NUM_SENTENCES) + banned_tax)
    
                # Calculating FTP
                points[i] = points[i] / (MPC[i] * lenient_num_sentences)
    
                # Take the Inverse for Not-Topic
                if i == NOT_TOPIC:
                    points[i] = 1 - points[i]

            # Bin Points
            for b in range(len(bins[i]) - 2, -1, -1):
                # print(bins[i][b], "<=", points[i], "<=", bins[i][b+1])

                if bins[i][b] <= points[i] <= bins[i][b+1]:
                    binned_points[i] = b + 1
                    # binned_points[i] = bins[i][b]
                    break

            points[i] = max(0, min(points[i], 1))
            binned_points[i] = max(0, min(binned_points[i], math.inf))
            
            score += max(0, min(points[i], 1)) * CW[i]
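            # The clamp below is redundant (binned_points[i] was already
            # clamped a few lines above), but it is harmless and kept as-is.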
            binned_score += max(0, min(binned_points[i], math.inf)) * CW[i]

        # Enforcing 3 or More Species            
        # if len(seen_species) < 3:
        #     return 0, 0

        # The range assertions below are disabled because the score is now
        # on a scale from 0.0 to 3.0; the bounds were not updated to match.
        # assert 0.0 <= score <= 1.0
        # assert 0.0 <= binned_score <= 1.0

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Score, Points: {score}, {points}")
            print(f"Binned Score, Binned Points: {binned_score}, {binned_points}")
    
        return {
            "s_score": 0 if len(seen_species) < 3 else score,
            "s_bin_score": 0 if len(seen_species) < 3 else binned_score,
            "score": score,
            "bin_score": binned_score,
            "points": {
                "trait": points[TRAIT],
                "species": points[SPECIES],
                "experiment": points[EXPERIMENT],
                "interaction": points[INTERACTION],
                "not_topic": points[NOT_TOPIC],
                "trait_variation": points[TRAIT_VARIATION],
            },
            "binned_points": {
                "trait": binned_points[TRAIT],
                "species": binned_points[SPECIES],
                "experiment": binned_points[EXPERIMENT],
                "interaction": binned_points[INTERACTION],
                "not_topic": binned_points[NOT_TOPIC],
                "trait_variation": binned_points[TRAIT_VARIATION],
            },
            "interaction_instances": interaction_instances,
            "species_instances": species_instances,
            "causes": self.cause.tokens,
            "changes": self.change.tokens,
            "traits": self.trait.tokens,
            "species": self.species.tokens,
            "experiments": self.experiment.tokens,
            "not_experiments": self.not_experiment.tokens,
            "not_topics": self.not_topic.tokens,
            "variability": self.variability.tokens,
            "tests": self.test.tokens,
            "seen_species": seen_species
        }