In [None]:
# Store Data
# This part is similar to the first part
# of this process.
import os
import csv
import sys
import json
import glob
import pandas as pd

# File-name prefixes. Filtered results are written to files that start
# with BASE_FILE_NAME; the unfiltered input is read from files that start
# with MASTER_BASE_FILE_NAME.
MASTER_BASE_FILE_NAME = "OUT"
BASE_FILE_NAME = "FILTERED_OUT"

# Keys of a paper record, which are also the CSV column headers.
# TODO: these overlap with the first part of the process and belong in a
# shared module; they are kept here for now.
TITLE = "title"
DOI = "doi"
TEXT = "text"

# We store one subset of papers at a time. The subset's number is
# embedded in the output file names so that the files can be traced
# back to the subset they came from.
def write_papers(number, papers):
    """Write one numbered subset of papers as a CSV table and a JSON dump.

    Args:
        number: Identifier of the subset; it is embedded in both file names.
        papers: Mapping whose values are row dicts using the TITLE, DOI and
            TEXT keys.

    Two files are created (relative to the current working directory):
    ``{BASE_FILE_NAME}_DATA_{number}.csv`` holds a header plus one row per
    paper, and ``{BASE_FILE_NAME}_DUMP_{number}.csv`` holds the whole
    mapping serialised as JSON.
    """
    # NOTE(review): the dump keeps a ".csv" extension although it holds JSON.
    table_path = f"{BASE_FILE_NAME}_DATA_{number}.csv"
    dump_path = f"{BASE_FILE_NAME}_DUMP_{number}.csv"

    with open(table_path, 'w', newline='', encoding='utf-8-sig') as table_file:
        table = csv.DictWriter(table_file, fieldnames=[TITLE, DOI, TEXT])
        table.writeheader()
        table.writerows(papers.values())

    with open(dump_path, 'w', newline='', encoding='utf-8') as dump_file:
        json.dump(papers, dump_file)

# Registry of every paper found through the various sources, keyed by a
# running integer index. Always add entries through add_paper() rather
# than writing to the dictionary directly.
papers = {}

def add_paper(title, doi, text):
    """Append one paper record to the module-level ``papers`` registry.

    Args:
        title: Title of the paper.
        doi: DOI of the paper.
        text: Text stored for the paper.

    The record is stored under the current size of the registry, so keys
    run 0, 1, 2, ... as long as no entry is ever removed.
    """
    # The key is derived from the registry itself; no separate counter is
    # kept (the original incremented a local that was discarded at once).
    papers[len(papers)] = {TITLE: title, DOI: doi, TEXT: text}

# We read in a CSV file containing a list of papers to be filtered.
# The number indicates the number in the filename.
def read_papers(number):
    """Load one numbered file of unfiltered papers.

    Args:
        number: The number that appears in the file name.

    Returns:
        The pandas DataFrame parsed from
        ``{MASTER_BASE_FILE_NAME}_DATA_{number}.csv``.
    """
    # pandas does the parsing. An earlier reader built on the csv module
    # (with a raised csv.field_size_limit) was dropped as too much work.
    return pd.read_csv(f"{MASTER_BASE_FILE_NAME}_DATA_{number}.csv")

In [None]:
# These patterns extract possessive relationships from a sentence. They
# have no established names, so they are simply numbered. Likewise,
# OWNER and OWNED below are the best names found so far for the two
# ends of such a relationship.
# Node ids shared by every pattern: the two ends of a possessive relation.
OWNER = "owner"
OWNED = "owned"

# In every pattern the first node is the anchor; each later node hangs off
# an earlier one through REL_OP ">" (in spaCy's DependencyMatcher the left
# node is the immediate head of the right node).

# Pattern 1: OWNED noun -> direct `poss` dependent (OWNER).
pattern_1 = [
    {"RIGHT_ID": OWNED, "RIGHT_ATTRS": {"POS": {"IN": ["NOUN"]}}},
    {
        "LEFT_ID": OWNED, "REL_OP": ">", "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {"DEP": "poss"},
    },
]

# Pattern 2: OWNED noun -> `prep` adposition -> `pobj` noun (OWNER).
pattern_2 = [
    {"RIGHT_ID": OWNED, "RIGHT_ATTRS": {"POS": {"IN": ["NOUN"]}}},
    {
        "LEFT_ID": OWNED, "REL_OP": ">", "RIGHT_ID": "adp",
        "RIGHT_ATTRS": {"DEP": "prep", "POS": {"IN": ["ADP"]}},
    },
    {
        "LEFT_ID": "adp", "REL_OP": ">", "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {"DEP": "pobj", "POS": {"IN": ["NOUN"]}},
    },
]

# Pattern 3: verb with a pronoun `nsubj` (OWNER) and a noun `dobj` (OWNED).
pattern_3 = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}},
    {
        "LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {"DEP": "nsubj", "POS": {"IN": ["PRON"]}},
    },
    {
        "LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": OWNED,
        "RIGHT_ATTRS": {"DEP": "dobj", "POS": {"IN": ["NOUN"]}},
    },
]

# Pattern 4: verb with a noun `nsubj` (OWNED) and a `prep` adposition whose
# `pobj` noun is the OWNER.
pattern_4 = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": {"IN": ["VERB"]}}},
    {
        "LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": OWNED,
        "RIGHT_ATTRS": {"DEP": "nsubj", "POS": {"IN": ["NOUN"]}},
    },
    {
        "LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "adp",
        "RIGHT_ATTRS": {"DEP": "prep", "POS": {"IN": ["ADP"]}},
    },
    {
        "LEFT_ID": "adp", "REL_OP": ">", "RIGHT_ID": OWNER,
        "RIGHT_ATTRS": {"DEP": "pobj", "POS": {"IN": ["NOUN"]}},
    },
]

# Matcher key -> pattern. The keys are what a match id resolves back to.
patterns = dict(
    Pattern1=pattern_1,
    Pattern2=pattern_2,
    Pattern3=pattern_3,
    Pattern4=pattern_4,
)

def dependency_matcher(sp_nlp):
    """Create a DependencyMatcher loaded with every entry of ``patterns``.

    Args:
        sp_nlp: Object exposing ``vocab``; the matcher is built on it.

    Returns:
        A new DependencyMatcher in which each pattern is registered under
        its key from ``patterns``.
    """
    dep_matcher = DependencyMatcher(sp_nlp.vocab)
    for name in patterns:
        # add() receives a list of patterns per key; each key has one.
        dep_matcher.add(name, [patterns[name]])
    return dep_matcher

In [None]:
import spacy
import stanza
import textacy
from fastcoref import FCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher
# !pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_md-1.1.0.tar.gz
!pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_biobert-1.1.0.tar.gz
# !pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_md_weak-1.1.0.tar.gz
# !pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_biobert_weak-1.1.0.tar.gz

In [None]:
# Load every model once, at module level; is_relevant() reads these
# globals instead of taking the pipelines as arguments.
# spaCy pipeline: is_relevant() parses text with it and the dependency
# matcher is built on its vocab.
sp_nlp = spacy.load("en_core_web_sm")
# NOTE(review): st_nlp is not referenced by any cell visible here —
# confirm it is still needed before keeping this load.
st_nlp = stanza.Pipeline(lang='en', processors='tokenize')
# Coreference model; is_relevant() calls fcoref.predict() on each text.
fcoref = FCoref(enable_progress_bar=False)
# TaxoNERD pipeline; the entities of the docs it produces are treated as
# species mentions by species_indices().
taxonerd = TaxoNERD()
tn_nlp = taxonerd.load(model="en_ner_eco_biobert")

In [26]:
def index_to_token(sp_doc):
    """Index the tokens of a document by their ``idx`` attribute.

    Args:
        sp_doc: Iterable of tokens exposing ``idx`` (for spaCy tokens, the
            character offset of the token within the document).

    Returns:
        Dict mapping each ``token.idx`` to the token itself.
    """
    return {token.idx: token for token in sp_doc}

def index_to_cluster(fc_predictions):
    """Index coreference clusters by the first element of each member.

    Args:
        fc_predictions: Iterable of prediction objects exposing
            ``get_clusters(as_strings=False)``.

    Returns:
        Dict mapping ``mention[0]`` to the cluster that contains the
        mention, for every mention of every cluster (presumably the start
        offset of a (start, end) span — confirm against fastcoref). When
        two mentions share the same key, the later cluster wins.
    """
    return {
        mention[0]: cluster
        for prediction in fc_predictions
        for cluster in prediction.get_clusters(as_strings=False)
        for mention in cluster
    }

def index_to_chunk(sp_doc):
    """Index the noun chunks of a document by the ``idx`` of their tokens.

    Args:
        sp_doc: Document exposing ``noun_chunks``, an iterable of chunks
            that are themselves iterables of tokens with ``idx``.

    Returns:
        Dict mapping the ``idx`` of every token inside a noun chunk to
        that chunk.
    """
    return {
        token.idx: chunk
        for chunk in sp_doc.noun_chunks
        for token in chunk
    }

def index_to_what(sp_nlp, sp_doc, what_matches):
    """Index owner/owned tokens of dependency matches by each other's ``idx``.

    Args:
        sp_nlp: Object whose ``vocab.strings`` resolves a match id to the
            key under which the pattern is stored in ``patterns``.
        sp_doc: Document indexable by token position.
        what_matches: Iterable of ``(match_id, token_ids)`` pairs, where
            ``token_ids[i]`` belongs to node ``i`` of the matched pattern.

    Returns:
        Dict from token ``idx`` to a list of tokens: the owner's ``idx``
        lists the tokens it owns and the owned token's ``idx`` lists its
        owners.
    """
    related = {}
    for match_id, token_ids in what_matches:
        pattern = patterns[sp_nlp.vocab.strings[match_id]]
        # Find which matched tokens play the two roles; other pattern
        # nodes (e.g. the connecting verb or adposition) are ignored.
        roles = {OWNER: None, OWNED: None}
        for position, token_id in enumerate(token_ids):
            role = pattern[position]["RIGHT_ID"]
            if role in roles:
                roles[role] = sp_doc[token_id]
        owner, owned = roles[OWNER], roles[OWNED]
        # Record the relation in both directions.
        related.setdefault(owner.idx, []).append(owned)
        related.setdefault(owned.idx, []).append(owner)

    return related

def species_indices(tn_doc):
    """Collect the ``idx`` of every token inside an entity of ``tn_doc``.

    Args:
        tn_doc: Document exposing ``ents``, an iterable of entity spans
            that are themselves iterables of tokens with ``idx``.

    Returns:
        List of token ``idx`` values, in document order of the entities.
    """
    return [token.idx for entity in tn_doc.ents for token in entity]

def context(sp_doc, tokens, token_map, cluster_map, chunk_map, what_map):
    """Gather the tokens related to ``tokens`` through three lookups.

    Args:
        sp_doc: Unused; kept for the callers' signature.
        tokens: Tokens (exposing ``idx``) to look up.
        token_map: ``idx`` -> token, used to resolve cluster mentions.
        cluster_map: ``idx`` -> cluster of mentions; each mention's first
            element is resolved through ``token_map``.
        chunk_map: ``idx`` -> iterable of tokens of the enclosing chunk.
        what_map: ``idx`` -> list of owner/owned tokens.

    Returns:
        A pair ``(all_tokens, parts)`` where ``parts`` has the keys
        "clusters", "chunks" and "what", and ``all_tokens`` is the three
        lists concatenated in that order.
    """
    offsets = [token.idx for token in tokens]

    clusters = [
        token_map[mention[0]]
        for offset in offsets if offset in cluster_map
        for mention in cluster_map[offset]
    ]
    chunks = [
        member
        for offset in offsets if offset in chunk_map
        for member in chunk_map[offset]
    ]
    what = [
        partner
        for offset in offsets if offset in what_map
        for partner in what_map[offset]
    ]

    parts = {"clusters": clusters, "chunks": chunks, "what": what}
    return (clusters + chunks + what, parts)

def is_species(tokens, context, species_indices):
    """Tell whether any of the given tokens is a species token.

    Note that the ``context`` and ``species_indices`` parameters shadow the
    module-level functions of the same names inside this body.

    Args:
        tokens: Tokens of the phrase itself.
        context: Additional tokens related to the phrase.
        species_indices: Collection of ``idx`` values of species tokens.

    Returns:
        True as soon as one token of ``tokens`` or ``context`` (searched in
        that order) has its ``idx`` in ``species_indices``; the first hit
        is printed. False when there is none.
    """
    candidates = [*tokens, *context]
    hit = next((tok for tok in candidates if tok.idx in species_indices), None)
    if hit is None:
        return False
    print(f"\t\t\tToken '{hit.text}' is a Species")
    return True

def is_relevant(text):
    """Report whether ``text`` contains a triple whose subject and object are species.

    The text is parsed with ``sp_nlp``, species tokens come from the
    entities of ``tn_nlp(text)``, and coreference clusters from
    ``fcoref.predict``. For every subject-verb-object triple found by
    textacy, the subject and the object are each expanded with their
    context (clusters, noun chunks, owner/owned tokens) and checked against
    the species tokens. Diagnostics are printed for every triple.

    Args:
        text: The raw text to analyse.

    Returns:
        True if at least one triple has both a species subject and a
        species object, False otherwise. All triples are always examined.
    """
    sp_doc = sp_nlp(text)

    # Lookup tables keyed by token.idx.
    token_map = index_to_token(sp_doc)
    chunk_map = index_to_chunk(sp_doc)
    # NOTE(review): a fresh matcher is built on every call.
    what_map = index_to_what(sp_nlp, sp_doc, dependency_matcher(sp_nlp)(sp_doc))

    species = species_indices(tn_nlp(text))
    cluster_map = index_to_cluster(fcoref.predict(texts=[text]))

    def role_is_species(label, role_tokens):
        # Expand one side of a triple, print its context, and test it.
        combined, parts = context(sp_doc, role_tokens, token_map, cluster_map, chunk_map, what_map)
        print(f"\t\t{label} Context:")
        print(f"\t\t\tCluster: {parts['clusters']}")
        print(f"\t\t\tChunks: {parts['chunks']}")
        print(f"\t\t\tWhat: {parts['what']}")
        return is_species(role_tokens, combined, species)

    found_instance = False
    for svo_triple in textacy.extract.subject_verb_object_triples(sp_doc):
        print(f"\tTriple: {svo_triple}")
        # Both sides are always evaluated so that both are printed.
        valid_sub = role_is_species("Subject", svo_triple.subject)
        valid_obj = role_is_species("Object", svo_triple.object)

        if valid_sub and valid_obj:
            print(f"\tValid Subject and Object")
            found_instance = True
        print()
    return found_instance