In [None]:
class Keywords:
    """Find keyword tokens in a parsed document using three strategies.

    Tokens are collected by (1) regular expressions searched in the document
    text, (2) token-pattern rules run through ``Matcher`` and (3) vocabulary
    entries compared by lemma and by ``Doc.similarity`` score.

    The owning ``main`` object is expected to expose ``sp_nlp``, ``sp_doc``,
    ``index_map`` and ``get_span_at_indices``; these are the only attributes
    of ``main`` this class touches. Call :meth:`update` once ``main`` holds a
    parsed document to populate ``token_data``, ``mapped_token_data`` and
    ``tokens``.
    """

    # Keys of ``match_type_to_data``: identify which strategy matched a token.
    REGEX = "regex"
    VOCAB = "vocab"
    RULES = "rules"

    def __init__(self, main, *, regexes=None, vocab=None, patterns=None, def_pos=None, def_tag=None, def_threshold=0.7, def_weight=1.0):
        """Store the matching constraints and load the three matchers.

        Args:
            main: Owner object providing ``sp_nlp``/``sp_doc``/``index_map``.
            regexes: Regex strings, or dicts with a ``"regex"`` key and
                optional ``"types"``/``"weight"``.
            vocab: Words, or dicts with a ``"word"`` key and optional
                ``"tag"``/``"pos"``/``"threshold"``/``"types"``/``"weight"``.
            patterns: ``Matcher`` pattern lists, or dicts with ``"name"`` and
                ``"pattern"`` keys and optional ``"types"``/``"weight"``.
            def_pos: Allowed ``token.pos_`` values; empty means no filter.
            def_tag: Allowed ``token.tag_`` values; empty means no filter.
            def_threshold: Default similarity threshold for vocab entries.
            def_weight: Default weight reported for a match.
        """
        self.main = main

        # Constraints
        # ``None`` defaults (instead of ``[]``) keep instances from sharing
        # one list object created at function-definition time.
        self.def_threshold = def_threshold
        self.def_tag = [] if def_tag is None else def_tag
        self.def_pos = [] if def_pos is None else def_pos
        self.def_weight = def_weight

        # Three Types of Matching
        self.vocab, self.vocab_data = self.load_vocab([] if vocab is None else vocab)
        self.regex, self.regex_data = self.load_regex([] if regexes is None else regexes)
        self.rules, self.rules_data = self.load_rules([] if patterns is None else patterns)

        # Quick Lookup
        self.match_type_to_data = {
            Keywords.REGEX: self.regex_data,
            Keywords.VOCAB: self.vocab_data,
            Keywords.RULES: self.rules_data
        }

    def _require_document(self):
        """Raise ``Exception("DNE")`` when ``main`` lacks a doc or index map."""
        # SpaCy Doc DNE or Indexing Map DNE
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

    def update(self, verbose=False):
        """Re-run matching on ``main.sp_doc`` and cache the results.

        Sets ``token_data`` (list of match dicts), ``mapped_token_data``
        (token -> match dict) and ``tokens`` (list of matched tokens).

        Raises:
            Exception: ``"DNE"`` if ``main.sp_doc`` or ``main.index_map`` is falsy.
        """
        self._require_document()
        # Matched Tokens in Different Forms
        self.token_data, self.mapped_token_data, self.tokens = self.match_tokens(verbose=verbose)

    def load_regex(self, regexes):
        """Split regex entries into a pattern list and a metadata map.

        Returns:
            ``(patterns, data)`` where ``patterns`` lists every regex string in
            input order and ``data`` maps each dict-style regex to its
            ``"types"`` and ``"weight"``. Plain-string entries get no metadata.
        """
        r = []
        r_data = {}

        for unit in regexes:
            if isinstance(unit, str):
                r.append(unit)
                continue

            regex = unit["regex"]
            r.append(regex)
            r_data[regex] = {
                # ``or []`` (as in load_vocab/load_rules) also covers an
                # explicit ``"types": None``.
                "types": unit.get("types") or [],
                "weight": unit.get("weight", self.def_weight)
            }

        return r, r_data

    def load_vocab(self, vocab):
        """Parse vocabulary entries with ``main.sp_nlp``.

        Returns:
            ``(entries, data)``. Every entry holds the parsed ``"doc"`` and its
            space-joined ``"lemma"``; dict-style entries additionally carry
            ``"tag"``, ``"pos"`` and ``"threshold"`` (falling back to the
            instance defaults). ``data`` maps the word of each dict-style entry
            to its ``"types"`` and ``"weight"``.
        """
        v = []
        v_data = {}

        for unit in vocab:
            if isinstance(unit, str):
                doc = self.main.sp_nlp(unit)
                v.append({
                    "doc": doc,
                    "lemma": " ".join([t.lemma_ for t in doc])
                })
                continue

            doc = self.main.sp_nlp(unit["word"])
            v.append({
                "doc": doc,
                "tag": unit.get("tag", self.def_tag),
                "pos": unit.get("pos", self.def_pos),
                "threshold": unit.get("threshold", self.def_threshold),
                "lemma": " ".join([t.lemma_ for t in doc])
            })
            v_data[unit["word"]] = {
                "types": unit.get("types") or [],
                "weight": unit.get("weight", self.def_weight),
            }

        return v, v_data

    def load_rules(self, patterns):
        """Register token patterns on a new ``Matcher``.

        List entries are registered under their position (as a string); dict
        entries are registered under ``unit["name"]``.

        Returns:
            ``(matcher, data)`` where ``data`` maps each named rule to its
            ``"types"`` and ``"weight"``.
        """
        r = Matcher(self.main.sp_nlp.vocab)
        r_data = {}

        for i, unit in enumerate(patterns):
            if isinstance(unit, list):
                r.add(f"{i}", unit)
                continue

            r.add(unit["name"], unit["pattern"])
            r_data[unit["name"]] = {
                "types": unit.get("types") or [],
                "weight": unit.get("weight", self.def_weight),
            }

        return r, r_data

    def get_match_data(self, token, match_id, match_type):
        """Build the match record for ``token``.

        Args:
            token: The matched token.
            match_id: Regex string, vocab word text or rule name.
            match_type: One of ``Keywords.REGEX``/``VOCAB``/``RULES``.

        Returns:
            Dict with ``"token"``, ``"types"`` and ``"weight"``; entries without
            stored metadata get no types and the default weight.
        """
        stored = self.match_type_to_data[match_type].get(match_id)

        if stored is None:
            return {
                "token": token,
                "types": [],
                "weight": self.def_weight
            }

        return {
            "token": token,
            "types": stored.get("types", []),
            "weight": stored.get("weight", self.def_weight)
        }

    def bad_pos(self, pos):
        """Return True when a POS filter is set and ``pos`` is not allowed."""
        return bool(self.def_pos) and pos not in self.def_pos

    def bad_tag(self, tag):
        """Return True when a tag filter is set and ``tag`` is not allowed."""
        return bool(self.def_tag) and tag not in self.def_tag

    def bad_token(self, token):
        """Return True when ``token`` fails the POS filter or the tag filter."""
        return self.bad_pos(token.pos_) or self.bad_tag(token.tag_)

    def match_tokens(self, verbose=False):
        """Run regex, rule and vocab matching over ``main.sp_doc``.

        The strategies run in that order; vocab matching skips tokens that an
        earlier strategy already matched. A token hit by several regexes or
        rules is recorded once per hit.

        Returns:
            ``(matched_data, mapped_matched_data, matched_tokens)``: the list of
            match records, a token -> record map (later records win) and the
            list of matched tokens.

        Raises:
            Exception: ``"DNE"`` if ``main.sp_doc`` or ``main.index_map`` is falsy.
        """
        self._require_document()

        matched_data = []
        matched_tokens = []

        self._match_by_regex(matched_data, matched_tokens, verbose)
        self._match_by_rules(matched_data, matched_tokens, verbose)
        self._match_by_vocab(matched_data, matched_tokens, verbose)

        # Mapping Match(ed Token) Data
        mapped_matched_data = {}
        for matched_token_data in matched_data:
            mapped_matched_data[matched_token_data["token"]] = matched_token_data

        if verbose and VERBOSE_LEVEL >= 1:
            print("Output of match_tokens")
            print(f"Token Data: {matched_data}")
            print(f"Mapped Token Data: {mapped_matched_data}")
            print(f"Token: {matched_tokens}")

        return matched_data, mapped_matched_data, matched_tokens

    def _match_by_regex(self, matched_data, matched_tokens, verbose):
        """Append every acceptable token covered by a regex hit."""
        # NOTE(review): the search is already case-insensitive; lowering first
        # could shift character offsets for the rare characters whose
        # lowercase form has a different length. Kept as in the original.
        text = self.main.sp_doc.text.lower()

        for regex in self.regex:
            matches = [(match.start(), match.end()) for match in re.finditer(regex, text, re.IGNORECASE)]

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\t'{regex}' Regex Matches: {matches}")

            for l_char_index, r_char_index in matches:
                # ``match.end()`` is exclusive; the helper is given the index
                # of the last matched character instead.
                span = self.main.get_span_at_indices(l_char_index, r_char_index - 1)

                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tSpan Matched: {span}")

                for token in span:
                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tPossible Regex Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")

                    if self.bad_token(token):
                        continue

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tRegex Matched Token '{token}'")

                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, regex, Keywords.REGEX))

    def _match_by_rules(self, matched_data, matched_tokens, verbose):
        """Append every acceptable token inside a ``Matcher`` rule hit."""
        matches = self.rules(self.main.sp_doc)

        if verbose and VERBOSE_LEVEL >= 2:
            print(f"\tRule Matches: {matches}")

        for match_id, start, end in matches:
            span = self.main.sp_doc[start:end]
            # The matcher reports an id; look up the name it was added under.
            name = self.main.sp_nlp.vocab.strings[match_id]

            if verbose and VERBOSE_LEVEL >= 2:
                print(f"\tPattern '{name}' Matched Span: {span}")

            for token in span:
                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tPossible Rule Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")

                if self.bad_token(token):
                    continue

                if verbose and VERBOSE_LEVEL >= 3:
                    print(f"\t\tRule Matched Token '{token}'")

                matched_tokens.append(token)
                matched_data.append(self.get_match_data(token, name, Keywords.RULES))

    def _match_by_vocab(self, matched_data, matched_tokens, verbose):
        """Append unmatched tokens that equal or resemble a vocab entry."""
        if not self.vocab:
            return

        for token in self.main.sp_doc:
            if verbose and VERBOSE_LEVEL >= 3:
                print(f"\t\tPossible Vocab Match for Token '{token}' (Position: {token.pos_} and Tag: {token.tag_})")

            if self.bad_token(token) or token in matched_tokens:
                continue

            # The token is re-parsed on its own (lower-cased), so its
            # tag/POS/lemma here are context-free.
            token_doc = self.main.sp_nlp(token.lower_)
            token_lemma = " ".join([t.lemma_ for t in token_doc])

            for vocab_word in self.vocab:
                # Ensure Correct Tag
                tags = vocab_word.get("tag")
                if tags and not any(t.tag_ in tags for t in token_doc):
                    if verbose and VERBOSE_LEVEL >= 4:
                        print(f"\t\t\tToken '{token_doc}' not in Vocab Word '{vocab_word['doc']}' Tags ({tags})")
                    continue

                # Ensure Correct PoS
                speech = vocab_word.get("pos")
                if speech and not any(t.pos_ in speech for t in token_doc):
                    if verbose and VERBOSE_LEVEL >= 4:
                        print(f"\t\t\tToken '{token_doc}' not in Vocab Word '{vocab_word['doc']}' Speech ({speech})")
                    continue

                # Check Lemma
                if verbose and VERBOSE_LEVEL >= 4:
                    print(f"\t\t\t{token_doc} Lemma ({token_lemma}) and {vocab_word['doc']} Lemma ({vocab_word['lemma']})")

                if token_lemma == vocab_word["lemma"]:
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, vocab_word["doc"].text, Keywords.VOCAB))

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tVocab (Lemma) Matched Token '{token}'")

                    # First matching vocab entry wins.
                    break

                # Check Similarity
                similarity = vocab_word["doc"].similarity(token_doc)

                if verbose and VERBOSE_LEVEL >= 4:
                    print(f"\t\t\t{token_doc} and {vocab_word['doc']} Similarity: {similarity}")

                if similarity >= vocab_word.get("threshold", self.def_threshold):
                    matched_tokens.append(token)
                    matched_data.append(self.get_match_data(token, vocab_word["doc"].text, Keywords.VOCAB))

                    if verbose and VERBOSE_LEVEL >= 3:
                        print(f"\t\tVocab Matched Token '{token}'")

                    break

In [None]:
class ExperimentKeywords(Keywords):
    """Keywords whose regexes name experiment-related vocabulary.

    Matching is restricted to verbs, nouns and adjectives.
    """

    def __init__(self, main):
        """Configure the experiment regexes for the given ``main`` owner."""
        super().__init__(
            main,
            regexes=[
                "study",
                "hypothesis",
                "experiment",
                "found",
                "discover",
                "compare",
                "finding",
                "result",
                "test",
                "examine",
                "model",
                "measure",
                "manipulate",
                "assess",
                "conduct",
                "data",
                "analyze",
                "sample",
                "observe",
                "observation",
                "predict",
                "suggest",
                "method",
                "investigation",
                "trial",
                "experimental",
                "evidence",
                "demonstrate",
                "analysis",
                "show",
                # NOTE(review): "compare" is listed twice, so each hit is
                # recorded twice by the base matcher. Kept to preserve the
                # existing match counts - confirm whether that is intended.
                "compare",
                "comparable",
                "control group",
                "independent",
                "dependent",
                "applied",
                "treatment",
                # Was misspelled "survery", which never matched "survey".
                "survey",
                "evaluate",
                "ran",
            ],
            def_pos=["VERB", "NOUN", "ADJ"],
            def_threshold=0.8,
        )

In [None]:
class NegativeExperimentKeywords(Keywords):
    """Keywords with a single rule matching theory/review vocabulary."""

    def __init__(self, main):
        """Register one single-token rule matched on the lower-cased token text."""
        # NOTE(review): "meta-analysis" only matches if the tokenizer keeps it
        # as one token - confirm against the pipeline in use.
        non_experiment_words = ["theory", "theorized", "theories", "review", "reviews", "meta-analysis"]
        single_token_rule = [{"LOWER": {"IN": non_experiment_words}}]

        super().__init__(
            main,
            patterns=[[single_token_rule]],
            def_pos=["VERB", "NOUN", "ADJ"],
            def_threshold=0.8,
        )

In [None]:
class NegativeTopicKeywords(Keywords):
    """Keywords whose regexes match evolution-related terms."""

    def __init__(self, main):
        """Configure the two evolution regexes; only verbs, nouns and adjectives pass."""
        topic_regexes = [r"co-?evolution", r"evolution"]
        allowed_pos = ["VERB", "NOUN", "ADJ"]

        super().__init__(
            main,
            regexes=topic_regexes,
            def_pos=allowed_pos,
            def_threshold=0.8,
        )

In [None]:
class CauseKeywords(Keywords):
    """Keywords whose regexes name causal verbs and connectives.

    Matching is restricted to verbs, subordinating conjunctions and nouns.
    """

    def __init__(self, main):
        """Configure the cause regexes for the given ``main`` owner."""
        super().__init__(
            main,
            regexes=[
                "increas(e|es|ed|ing)",
                "decreas(e|es|ed|ing)",
                "chang(e|es|ed|ing)",
                "shift",
                "caus(e|es|ed|ing)",
                "produc(e|es|ed|ing)",
                "trigger",
                "suppress",
                "inhibit",
                "encourag(e|es|ed|ing)",
                "allow",
                "influenc(e|es|ed|ing)",
                "affect",
                "alter",
                "induc(e|es|ed|ing)",
                "result in",
                "contribute",
                "impact",
                "deter",
                "depressed",
                "when",
                "because",
                "reduc(e|es|ed|ing)",
                # Was "kill(e|es|ed|ing)", which required "kille"/"killes" and
                # so missed "kill" and "kills". The word boundary keeps the
                # now-optional suffix from matching inside "skill".
                r"\bkill(s|ed|ing)?",
                "result",
                "made",
            ],
            def_pos=["VERB", "SCONJ", "NOUN"],
            def_threshold=0.8,
        )

    def update(self, verbose=False):
        """Run the base matching, then pass ``tokens`` through ``filter_tokens``."""
        super().update(verbose=verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Return ``tokens`` unchanged; currently a hook with no filtering.

        Raises:
            Exception: ``"DNE"`` if ``main.sp_doc`` or ``main.index_map`` is falsy.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        return tokens

In [None]:
class ChangeKeywords(Keywords):
    """Keywords whose regexes name quantity and direction-of-change terms.

    After the base matching, every comparative adjective in the document is
    added to ``tokens`` as well.
    """

    def __init__(self, main):
        """Configure the change regexes for the given ``main`` owner."""
        super().__init__(
            main,
            regexes=[
                "few",
                "more",
                "increase",
                "decrease",
                "less",
                "short",
                "long",
                # A missing comma used to fuse the next two entries into
                # "greatershift", so neither word was matched.
                "greater",
                "shift",
                "fluctuate",
                "adapt",
                "grow",
                # Same problem here: "rise" and "surge" were fused into
                # "risesurge".
                "rise",
                "surge",
                "intensify",
                "amplify",
                "multiply",
                "decline",
                "reduce",
                "drop",
                "diminish",
                "fall",
                "lessen",
                "doubled",
                "tripled",
                "lower",
                "adjust",
                "reject",
                # Match Examples:
                # 1. "one... as..."
                # 2. "2x than..."
                r"(one|two|three|four|five|six|seven|eight|nine|ten|twice|thrice|([0-9]+|[0-9]+.[0-9]+)(x|%))[\s-]+[^\s]*[\s-]+(as|more|than|likely)([\s-]+|$)",
            ],
            def_pos=["NOUN", "ADJ", "ADV", "VERB", "NUM"],
            def_threshold=0.75,
        )

    def update(self, verbose=False):
        """Run the base matching, then pass ``tokens`` through ``filter_tokens``."""
        Keywords.update(self, verbose=verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Return matched tokens plus comparative adjectives, in document order.

        Each document token appears at most once in the result, even when
        ``tokens`` contains it several times.

        Raises:
            Exception: ``"DNE"`` if ``main.sp_doc`` or ``main.index_map`` is falsy.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        filtered = []
        for token in self.main.sp_doc:
            # Already Matched
            if token in tokens:
                filtered.append(token)

            # Comparative Adjective
            # Looking for words like "bigger" and "better".
            elif token.pos_ == "ADJ" and token.tag_ == "JJR":
                filtered.append(token)

        return filtered

In [None]:
class TraitKeywords(Keywords):
    """Keywords whose regexes name traits (size, colour, diet, ...).

    Some regexes carry type labels, which ``merge_traits`` uses to group
    food-related traits together.
    """

    # Type labels attached to individual regex entries below.
    FOOD = "Food"
    PRESENT = "Present"
    NOT_APPLICABLE = "N/A"

    def __init__(self, main):
        """Configure the trait regexes for the given ``main`` owner."""
        super().__init__(
            main,
            regexes=[
                r"behaviou?r",
                r"[^A-Za-z]+rate",
                "colou?r",
                "biomass",
                r"[^A-Za-z]+mass",
                r"[^A-Za-z]+size",
                "number",
                "length",
                "pattern",
                "weight",
                "shape",
                "efficiency",
                "trait",
                "phenotype",
                "demography",
                "scent",
                "population (structure|mechanic)s?",
                "ability",
                "capacity",
                "height",
                "width",
                "[A-Za-z]+span",
                {"regex": "diet", "types": [TraitKeywords.FOOD]},
                {"regex": "food", "types": [TraitKeywords.FOOD, TraitKeywords.NOT_APPLICABLE]},
                {"regex": "feeding", "types": [TraitKeywords.FOOD]},
                "nest",
                "substrate",
                "breeding",
                r"[^A-Za-z]+age[^A-Za-z]+",
                "lifespan",
                "development",
                "output",
                "time",
                "period",
                "level",
                "configuration",
                "dimorphism",
                "capability",
                "regulation",
                "excretion",
                "luminescence",
                r"[^A-Za-z]+role",
                "sensitivity",
                "resistance",
                r"(un|(^|\s)[A-Za-z]*-)infected",
                "temperature",
                "density",
                {"regex": "presen(t|ce)", "types": [TraitKeywords.PRESENT]},
                {"regex": "absen(t|ce)", "types": [TraitKeywords.PRESENT]},
                "oviposition",
                "semiochemicals",
                "chemicals",
                "content",
                # NOTE(review): "level" already appears above, so each hit is
                # recorded twice. Kept to preserve the existing match counts -
                # confirm whether that is intended.
                "level",
            ],
            # Ideally, I would only include nouns, but sometimes
            # they're recognized as adjectives.
            def_pos=["NOUN", "ADJ"],
        )

    def update(self, verbose=False):
        """Run the base matching, then pass ``tokens`` through ``filter_tokens``."""
        Keywords.update(self, verbose)
        self.tokens = self.filter_tokens(self.tokens, verbose)

    def filter_tokens(self, tokens, verbose=False):
        """Keep tokens that sit near a species and end their noun chunk.

        A token is kept when ``main.species.has_species`` accepts the unit
        returned by ``main.expand_unit`` for it, and the token is not a
        non-final member of any noun chunk of the document.

        Raises:
            Exception: ``"DNE"`` if ``main.sp_doc`` or ``main.index_map`` is falsy.
        """
        if not self.main.sp_doc or not self.main.index_map:
            raise Exception("DNE")

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Unfiltered Trait Tokens: {tokens}")

        # Read the noun chunks once rather than once per candidate token.
        noun_chunks = list(self.main.sp_doc.noun_chunks)

        filtered = []
        for token in tokens:
            # NOTE(review): presumably grows the unit outward within the
            # sentence, stopping at punctuation - confirm against expand_unit.
            expanded_token = self.main.expand_unit(
                il_unit=token.i,
                ir_unit=token.i,
                il_boundary=token.sent.start,
                ir_boundary=token.sent.end-1,
                speech=["PUNCT"],
                include=False,
                verbose=verbose
            )

            # Check for Species
            if not self.main.species.has_species(expanded_token, verbose=verbose):
                continue

            # Check Noun Requirement
            # Inside a noun chunk, only the chunk's last token may be a trait.
            if any(token in chunk and chunk[-1] != token for chunk in noun_chunks):
                continue

            filtered.append(token)

        if verbose and VERBOSE_LEVEL >= 1:
            print(f"Filtered Trait Tokens: {filtered}")

        return filtered

    def _types_of(self, unit):
        """Collect the type labels recorded for the matched tokens in ``unit``."""
        collected = []
        for thing in unit:
            data = self.main.trait.mapped_token_data.get(thing)
            if data:
                collected.extend(data["types"])
        return collected

    def merge_traits(self, traits):
        """Group traits that refer to the same thing.

        A trait joins the first existing group whose key has the same base
        nouns or the same text (per the ``main`` helpers), or with which it
        shares the ``FOOD`` type label. Otherwise it starts a new group.

        Args:
            traits: Iterable of hashable, iterable units of tokens.

        Returns:
            Dict mapping the first trait of each group to the list of traits in
            that group (the key itself included).
        """
        merged = {}

        for trait in traits:
            # Computed on first need only, then reused for every group key.
            trait_types = None
            found_trait = False

            for m_trait in merged:
                if self.main.has_same_base_nouns(trait, m_trait) or self.main.is_same_text(trait, m_trait):
                    merged[m_trait].append(trait)
                    found_trait = True
                    break

                if trait_types is None:
                    trait_types = self._types_of(trait)
                if not trait_types:
                    continue

                m_trait_types = self._types_of(m_trait)
                if not m_trait_types:
                    continue

                # Both sides carrying the food label is what the original
                # intersection against ['Food'] checked.
                if TraitKeywords.FOOD in trait_types and TraitKeywords.FOOD in m_trait_types:
                    merged[m_trait].append(trait)
                    found_trait = True
                    break

            if not found_trait:
                merged[trait] = [trait]

        return merged

In [None]:
class TestKeywords(Keywords):
    """Keywords whose regexes name testing verbs (compare, examine, ...)."""

    def __init__(self, main):
        """Configure the four testing regexes; only verbs and nouns pass."""
        testing_regexes = ["compare", "examine", "evaluate", "assess"]

        super().__init__(
            main,
            regexes=testing_regexes,
            def_pos=["VERB", "NOUN"],
            def_threshold=0.8,
        )

In [None]:
class VariablityKeywords(Keywords):
    """Keywords signalling variation or contrast between conditions.

    Combines vocab entries, regexes and token-pattern rules; no POS or tag
    filter is set.
    """

    def __init__(self, main):
        """Configure the vocab, regexes and rules for the given ``main`` owner."""

        def lower_in(*words):
            # Token spec: lower-cased token text is one of ``words``.
            return {"LOWER": {"IN": list(words)}}

        def until_sentence_end():
            # Token spec: any run of tokens whose tag is not ".".
            return {"OP": "*", "TAG": {"NOT_IN": ["."]}}

        vocab_entries = [
            {"word": "different", "pos": ["ADJ", "NOUN"]},
            {"word": "vary", "pos": ["VERB", "NOUN"]},
            {"word": "varied", "pos": ["VERB", "NOUN"]},
            {"word": "compare", "pos": ["VERB"]},
        ]

        regex_sources = [
            r"between",
            r"against",
            r"independen(t|ts|tly|cy)",
            r"dependen(t|ts|tly|cy)",
            r"treatments?",
            r"effect",
            r"control",
            r"(with|without)[A-Za-z]*(with|without)",
            # A negated word followed later by the same word, and the reverse.
            r"(^| )(un|not)[-| ]?([A-Za-z]+) [^!;?.\n]* \3",
            r"([A-Za-z]+) [^!;?.\n]* (un|not)[-| ]?\1( |$)",
            r"when",
            r"where",
        ]

        # Each entry is a list holding one token pattern.
        rule_patterns = [
            [[lower_in("neither", "either", "both"), until_sentence_end(), lower_in("or", "and")]],
            [[lower_in("with", "without"), until_sentence_end(), lower_in("with", "without")]],
            [[lower_in("at"), {"POS": "NUM"}]],
            [[lower_in("at"), lower_in("several", "unique", "multiple", "different")]],
        ]

        super().__init__(
            main,
            vocab=vocab_entries,
            regexes=regex_sources,
            patterns=rule_patterns,
        )

In [None]:
class CausalKeywords(Keywords):
    """Keywords with a single rule: a pronoun optionally followed by an auxiliary."""

    def __init__(self, main):
        """Register the pronoun rule; no regexes, vocab or POS filter are set."""
        pronoun = {"POS": "PRON"}
        optional_auxiliary = {"POS": "AUX", "OP": "?"}
        pronoun_rule = [pronoun, optional_auxiliary]

        super().__init__(main, patterns=[[pronoun_rule]])