In [125]:
import re
import csv
import time
import glob
import numpy as np
import spacy
import random
import pickle
import pymupdf
import textacy
import requests
from transformers import pipeline
from pprint import pprint
from fastcoref import FCoref, LingMessCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher
from pyalex import Works
from IPython.display import clear_output

In [126]:
def clean_text(text):
    """Normalise raw text extracted from a PDF.

    Applies, in order: URL removal, re-joining of words hyphenated across a
    line break, collapsing of every whitespace run to a single space, and
    removal of whitespace directly before ``?``, ``.``, ``!`` or ``,``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Every pattern is a raw string: a plain "\s" is an invalid escape
    # sequence that recent Python versions warn about at compile time.
    substitutions = (
        (r'http\S+', ''),          # Remove URLs
        (r'-\n', ''),              # Remove Hyphenations
        (r'\s+', ' '),             # Remove Duplicate Spaces
        (r'\s+([?.!,])', r'\1'),   # Remove Spaces Before Punctuation
    )
    cleaned_text = text
    for pattern, replacement in substitutions:
        cleaned_text = re.sub(pattern, replacement, cleaned_text)
    return cleaned_text

def pdf_to_text(url):
    """Download the PDF at ``url`` and return the text of all its pages.

    This is a best-effort helper: any failure (network error, HTTP error
    status, unreadable PDF) yields an empty string so that the caller can
    fall back to other text for the work.

    Args:
        url: Address of the PDF to download.

    Returns:
        The concatenated page text, or ``""`` when it could not be obtained.
    """
    try:
        # The bytes are fetched here with `requests`; the `pdf_bytes` helper
        # previously called is not defined in any visible cell, and the
        # blanket `except` below turned that NameError into a silent "".
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # The context manager closes the document once the text is extracted.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)
    except Exception:
        # Deliberately broad: one bad link must not abort a long crawl.
        return ""

def load_local_documents(group="Cleared"):
    """Read and clean the text of every PDF in a local example folder.

    Args:
        group: Name of the sub-folder of ``../Week 7/Examples`` to read.

    Returns:
        A list with one cleaned text (see ``clean_text``) per PDF that
        contains any non-whitespace text.
    """
    documents = []
    filenames = glob.glob(f"../Week 7/Examples/{group}/*.pdf")
    for filename in filenames:
        # The context manager closes the file handle after extraction.
        with pymupdf.open(filename) as doc:
            full_text = "".join(" " + page.get_text() for page in doc)
        # Each page contributes at least the " " separator, so a plain
        # truthiness test would accept PDFs without any extractable text.
        if full_text.strip():
            documents.append(clean_text(full_text))
    return documents

def load_documents():
    """Collect texts of works whose title matches the search keywords.

    Each keyword is used for a title search through ``Works().search_filter``.
    For every work the full text is taken from the PDF of its primary
    location when available, otherwise the abstract is used; works with
    neither are dropped. A work returned by several keyword searches is kept
    only once.

    Returns:
        A list of ``(index, title, doi, text)`` tuples, where ``index`` is the
        position of the tuple in the list.
    """
    # Cross Product
    keywords = ["higher-order interactions", "trait-mediated interaction modification", "trait-mediated interaction", "polymorphism", "apparent competition", "resource competition", "keystone predation", "intraguild predation", "intransitive competition", "trophic chains", "competition chains", "mutual competition"]
    number_keywords = len(keywords)
    # Pair each of the first four keywords with each of the remaining ones.
    for i in range(4):
        for j in range(4, number_keywords, 1):
            keywords.append(f"{keywords[i]} {keywords[j]}")

    # Loading Texts
    texts = []
    seen_work_ids = set()
    number_unfiltered_works = 0
    number_keywords = len(keywords)

    for k, keyword in enumerate(keywords):
        print(f"({k + 1}/{number_keywords}) Searching Keyword '{keyword}'")
        pager = Works().search_filter(title=keyword).paginate(per_page=200)
        for page in pager:
            for work in page:
                number_unfiltered_works += 1

                # The same work can match several keywords (the combined
                # keywords reuse the single ones); handle it only once so it
                # is neither downloaded nor stored again.
                work_id = work['id']
                if work_id in seen_work_ids:
                    continue
                seen_work_ids.add(work_id)

                title = work['title']
                abstract = work['abstract']
                doi = work['doi']

                # Find Full Text
                url = None
                if work["primary_location"]:
                    url = work["primary_location"]["pdf_url"]
                full_text = "" if not url else pdf_to_text(url)

                if not abstract and not full_text:
                    continue
                # Prefer the full text; fall back to the abstract.
                texts.append((len(texts), title, doi, full_text or abstract))
        clear_output(wait=True)

    number_works = len(texts)
    print(f"Number Documents: {number_works}, Number Unfiltered Documents: {number_unfiltered_works}")
    return texts

In [127]:
# Zero-shot classification pipeline; on_topic() calls it with a text chunk
# and a list of candidate labels.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def _split_on_word_boundaries(text, chunk_length):
    """Cut ``text`` into consecutive pieces of ``chunk_length`` characters,
    each extended up to (not including) the next space so no word is split."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_length
        # Ensure Full Words
        while end < len(text) and text[end] != " ":
            end += 1
        chunks.append(text[start:end])
        start = end
    return chunks


def on_topic(text, chunk_length=None):
    """Decide whether ``text`` is about every required topic.

    The text is split into chunks, each chunk is scored by the module-level
    ``classifier`` against one topic label at a time, and the mean score over
    the chunks must reach the topic's threshold for every topic.

    Args:
        text: The document text to classify.
        chunk_length: Minimum number of characters per chunk. ``None`` (the
            default) classifies the whole text as a single chunk.

    Returns:
        ``True`` when every topic reaches its threshold, ``False`` otherwise.
        Empty or whitespace-only text is never on topic.

    Raises:
        ValueError: If ``chunk_length`` is given but smaller than 1.
    """
    # Without this guard an empty text produced zero chunks, a NaN mean and,
    # because NaN < threshold is False, a result of True.
    if not text or not text.strip():
        return False

    if chunk_length is None:
        chunk_length = len(text)
    elif chunk_length < 1:
        raise ValueError("chunk_length must be a positive integer")

    # Topic and Threshold
    topics = [("ecology", 0.75), ("interaction", 0.75)]

    # Break Into Parts (identical for every topic, so done once)
    chunks = _split_on_word_boundaries(text, chunk_length)

    for topic, threshold in topics:
        # Classify
        total_score = 0.0
        for chunk in chunks:
            result = classifier(chunk, [topic])
            # A single candidate label is passed, so there is a single score.
            total_score += result["scores"][0]

        if total_score / len(chunks) < threshold:
            return False

    return True

In [128]:
class Controller:
    """Holds the NLP models and the state of the document being analysed.

    Attributes:
        sp_nlp: spaCy pipeline loaded from "en_core_web_lg".
        tn_nlp: TaxoNERD pipeline loaded from "en_ner_eco_biobert".
        fcoref: FCoref coreference model running on the CPU.
        sp_doc: The document passed to the last ``update`` call.
        tn_doc: Result of running ``tn_nlp`` on the text of ``sp_doc``.
        tk_map: Maps the character offset of each ``sp_doc`` token to its
            token index.
    """

    def __init__(self):
        """Load the three models; no document is set yet."""
        self.sp_nlp = spacy.load("en_core_web_lg")
        self.tn_nlp = TaxoNERD(prefer_gpu=False).load(model="en_ner_eco_biobert", exclude=["tagger", "parser", "attribute_ruler", "lemmatizer"])
        self.fcoref = FCoref(enable_progress_bar=False, device='cpu')

        self.sp_doc = None
        self.tn_doc = None
        self.tk_map = None

    def update(self, doc):
        """Make ``doc`` the current document and refresh the derived state.

        Args:
            doc: A document produced by ``sp_nlp``.
        """
        self.sp_doc = doc
        self.tn_doc = self.tn_nlp(doc.text)
        self.tk_map = self.load_token_map()

    def load_token_map(self):
        """Return a dict mapping each token's ``idx`` to its ``i`` for ``sp_doc``."""
        return {token.idx: token.i for token in self.sp_doc}

In [129]:
class Species:
    """Tracks which tokens of the controller's current document are species.

    Species mentions are the entities of ``controller.tn_doc``; they are
    translated into spans and token indices of ``controller.sp_doc`` through
    ``controller.tk_map``.

    Attributes:
        species_spans: Slices of ``sp_doc``, one per aligned species entity.
        species_indices: Token indices covered by ``species_spans``.
    """

    def __init__(self, controller):
        self.controller = controller
        # Start empty (not None) so the query methods work before any update.
        self.species_spans = []
        self.species_indices = []

    def update(self):
        """Recompute the species spans and indices for the current document."""
        if not self.controller.sp_doc or not self.controller.tn_doc:
            # Nothing to analyse: drop the results of the previous document
            # instead of leaving them in place.
            self.species_spans, self.species_indices = [], []
            return
        self.species_spans, self.species_indices = self.load_species_spans()

    def load_species_spans(self):
        """Map every entity of ``tn_doc`` onto ``sp_doc``.

        Returns:
            A ``(spans, indices)`` tuple: the ``sp_doc`` slices of the
            entities and the token indices they cover.
        """
        spans = []
        indices = []
        tk_map = self.controller.tk_map
        for species_span in self.controller.tn_doc.ents:
            l_species_idx = species_span[0].idx
            r_species_idx = species_span[-1].idx

            # An entity whose boundaries do not coincide with a token start
            # of sp_doc cannot be mapped. It is skipped, as References does
            # for unmappable mentions, rather than aborting the document.
            if l_species_idx not in tk_map or r_species_idx not in tk_map:
                continue

            l_species_i = tk_map[l_species_idx]
            r_species_i = tk_map[r_species_idx]

            span = self.controller.sp_doc[l_species_i:r_species_i+1]
            spans.append(span)
            indices.extend(token.i for token in span)
        return (spans, indices)

    def is_species(self, token):
        """Return True if ``token`` is part of a species span."""
        return token.i in self.species_indices

    def has_species(self, tokens):
        """Return True if any of ``tokens`` is part of a species span."""
        return any(token.i in self.species_indices for token in tokens)

In [130]:
class Keywords:
    """Finds tokens that match, or are similar to, a set of keyword literals.

    A token matches when its part of speech is in ``pos_types`` and either
    its lemma is one of the literals, or the lemma's similarity to at least
    one keyword exceeds ``threshold``.

    Attributes:
        literals: The keyword strings.
        pos_types: Part-of-speech tags a token must have to be considered.
        threshold: Similarity a lemma must exceed to count as a keyword.
        keywords: The literals processed by ``controller.sp_nlp``.
        keyword_indices: Token indices found by the last ``update``.
    """

    def __init__(self, controller, literals, pos_types, threshold=0.7):
        self.controller = controller
        self.literals = literals
        self.threshold = threshold
        self.pos_types = pos_types
        self.keywords = [self.controller.sp_nlp(literal) for literal in self.literals]
        self.keyword_indices = []
        # lemma -> bool, so each lemma goes through sp_nlp only once.
        # Assumes keywords and threshold are not changed after construction.
        self._similar_lemmas = {}

    def update(self):
        """Recompute ``keyword_indices`` for the controller's current document."""
        if not self.controller.sp_doc or not self.controller.sp_nlp:
            # Do not keep the indices of the previous document.
            self.keyword_indices = []
            return
        self.keyword_indices = self.load_keyword_indices()

    def is_keyword(self, token):
        """Return True if ``token`` was found by the last ``update``."""
        return token.i in self.keyword_indices

    def has_keyword(self, tokens):
        """Return True if any of ``tokens`` was found by the last ``update``."""
        return any(token.i in self.keyword_indices for token in tokens)

    def load_keyword_indices(self):
        """Return the keyword token indices of the whole current document."""
        return self.find_keyword_indices(self.controller.sp_doc)

    def find_keyword_indices(self, tokens):
        """Return the indices of the keyword tokens among ``tokens``.

        Each matching token is reported once, in the order of ``tokens``.
        """
        indices = []
        for token in tokens:
            if token.pos_ not in self.pos_types:
                continue
            # Fast Check. It runs before do_not_check so that literals of
            # five characters or fewer ("shift", "cause") can match.
            if token.lemma_ in self.literals:
                indices.append(token.i)
                continue
            if self.do_not_check(token):
                continue
            # Comparing Similarity
            if self._is_similar(token.lemma_):
                indices.append(token.i)
        return indices

    def _is_similar(self, lemma):
        """Return True if ``lemma`` is more similar than ``threshold`` to any keyword."""
        if lemma not in self._similar_lemmas:
            lemma_doc = self.controller.sp_nlp(lemma)
            self._similar_lemmas[lemma] = any(
                keyword.similarity(lemma_doc) > self.threshold
                for keyword in self.keywords
            )
        return self._similar_lemmas[lemma]

    def do_not_check(self, token):
        """Return True for tokens excluded from the similarity comparison:
        five characters or fewer, or containing non-word characters."""
        return len(token) <= 5 or re.match(r'^[\w]+$', token.text) is None

In [131]:
class ChangeKeywords(Keywords):
    """Keywords preset for nouns and verbs that express a change."""

    def __init__(self, controller):
        change_literals = {"increase", "decrease", "change", "shift", "cause", "produce"}
        accepted_pos = ["NOUN", "VERB"]
        super().__init__(controller, change_literals, accepted_pos, 0.6)

In [132]:
class References:
    """Coreference information for the controller's current document.

    The document is cut into pieces of ``text_size_in_tokens`` tokens, each
    piece is passed to ``controller.fcoref``, and the predicted clusters are
    translated into tokens of ``controller.sp_doc``.

    Attributes:
        predictions: Result of the last ``fcoref.predict`` call, or None.
        cluster_map: Maps a token index to the other tokens of its cluster.
        text_size_in_tokens: Number of tokens per piece given to ``fcoref``.
    """

    def __init__(self, controller, texts=None):
        self.controller = controller
        self.predictions = None
        # Start empty (not None) so lookups work before the first update.
        self.cluster_map = {}
        self.text_size_in_tokens = 100
        if texts:
            self.update(texts)

    def update(self, text):
        """Recompute the predictions and the cluster map.

        Args:
            text: Not used; the text is taken from ``controller.sp_doc``.
        """
        doc = self.controller.sp_doc
        if not doc:
            # Do not keep the clusters of the previous document.
            self.predictions = None
            self.cluster_map = {}
            return

        texts = []
        offsets = []
        for i in range(0, len(doc), self.text_size_in_tokens):
            texts.append(doc[i:i+self.text_size_in_tokens].text)
            # Character position of the piece within the document.
            offsets.append(doc[i].idx)
        self.predictions = self.controller.fcoref.predict(texts=texts)
        self.cluster_map = self.load_cluster_map(self.predictions, offsets)

    def load_cluster_map(self, predictions, offsets):
        """Build the token-index -> co-referring-tokens mapping.

        Args:
            predictions: One prediction per text piece.
            offsets: Character offset of each piece within the document.

        Returns:
            A dict mapping the index of each mapped mention token to the
            other tokens of the same cluster.
        """
        cluster_map = {}
        for prediction, offset in zip(predictions, offsets):
            clusters = prediction.get_clusters(as_strings=False)
            for cluster in clusters:
                # Converting Spans to Tokens
                token_cluster = []
                for span in cluster:
                    if not span:
                        continue
                    # Only the start of the mention is used, shifted from
                    # piece coordinates to document coordinates.
                    index = span[0] + offset
                    if index not in self.controller.tk_map:
                        # The mention does not start on a token boundary.
                        continue
                    index = self.controller.tk_map[index]
                    token_cluster.append(self.controller.sp_doc[index])
                # Mapping
                for token in token_cluster:
                    cluster_map[token.i] = [other for other in token_cluster if other != token]
        return cluster_map

    def get_references(self, tokens):
        """Return the tokens that co-refer with any of ``tokens``."""
        refs = []
        for token in tokens:
            refs += self.cluster_map.get(token.i, [])
        return refs

    def same_reference(self, token_a, token_b):
        """Return True if both tokens share a lemma (case-insensitively) or a cluster."""
        if token_a.lemma_.lower() == token_b.lemma_.lower():
            return True
        if token_b in self.cluster_map.get(token_a.i, []):
            return True
        if token_a in self.cluster_map.get(token_b.i, []):
            return True
        return False

    def same_reference_span(self, span_a, span_b):
        """Return True if the spans have equal text (case-insensitively) or
        any pair of their tokens satisfies ``same_reference``."""
        if span_a.text.lower() == span_b.text.lower():
            return True
        return any(
            self.same_reference(token_a, token_b)
            for token_a in span_a
            for token_b in span_b
        )

In [136]:
class Scanner():
    """Scores a document by how often its species occur next to change words
    and to other species.

    Attributes:
        controller: Shared models and current-document state.
        species: Species helper bound to ``controller``.
        references: References helper bound to ``controller``.
        changes: ChangeKeywords helper bound to ``controller``.
        level1, level2, level3: Word sets; not referenced by any method of
            this class.
    """

    def __init__(self, controller):
        # Helpers
        self.controller = controller
        self.species = Species(self.controller)
        self.references = References(self.controller)
        self.changes = ChangeKeywords(self.controller)

        # Used to Evaluate Points
        self.level1 = {"ecological", "community", "interaction", "trait", "ecosystem", "ecology"}
        self.level2 = {"model"}
        self.level3 = {"predator", "prey", "competitor", "resource", "predation", "specie", "result", "effect", "population", "species", "invade", "presence"}

    def update(self, doc):
        """Make ``doc`` the current document of the controller and helpers."""
        self.controller.update(doc)
        self.references.update(doc.text)
        self.species.update()

    def get_full_species(self):
        """Extend the species spans with the tokens that co-refer with them.

        Returns:
            A ``(spans, indices)`` tuple: the species spans followed by a
            one-token span for every co-referring token not already covered,
            and the token indices of all of them.
        """
        full_species = [*self.species.species_spans]
        full_indices = [*self.species.species_indices]
        # Sets give constant-time membership tests; the lists keep the order.
        species_index_set = set(full_indices)
        known_indices = set(full_indices)
        doc = self.controller.sp_doc

        for k, v in self.references.cluster_map.items():
            # A species token: every token of its cluster is a species too.
            if k in species_index_set:
                for token in v:
                    if token.i not in known_indices:
                        full_species.append(doc[token.i:token.i+1])
                        full_indices.append(token.i)
                        known_indices.add(token.i)
            # A token whose cluster contains a species is a species too.
            if any(token.i in species_index_set for token in v):
                if k not in known_indices:
                    full_species.append(doc[k:k+1])
                    full_indices.append(k)
                    known_indices.add(k)

        return (full_species, full_indices)

    def get_points(self):
        """Score the current document.

        For every species span whose sentence contains a change keyword
        outside the span, points are added: 10 per visit count for the
        change, the same again when another species token is in the
        sentence, plus a repetition bonus capped at 100. The total is divided
        by the number of sentences.

        Returns:
            The score, or 0 when fewer than three distinct species were seen
            or the document has no sentences.
        """
        points = 0
        doc = self.controller.sp_doc

        # Species Work
        visited_species = {}
        species_spans, species_indices = self.get_full_species()
        # Change keywords per sentence range, computed once per sentence
        # instead of once per species span.
        sentence_changes = {}

        for species_span in species_spans:
            # Adjusting Species if Vague
            if species_span[0].text.lower() in ["species"]:
                i = species_span[0].i
                j = i
                # Grow the span to the left over the preceding modifiers.
                while j > 0:
                    prev_token = doc[j-1]
                    if prev_token.pos_ not in ["NOUN", "ADJ", "PROPN", "SYM"]:
                        break
                    species_span = doc[j-1:i+1]
                    j -= 1

            # Repeating Species
            past_visits = 0
            for sp in visited_species.keys():
                visited_span = doc[sp[0]:sp[1]+1]
                if self.references.same_reference_span(species_span, visited_span):
                    past_visits = visited_species[sp]
                    visited_species[sp] += 1
                    break
            if past_visits == 0:
                visited_species[(species_span[0].i, species_span[-1].i)] = 1
            if past_visits > 50:
                continue

            li = species_span[0].sent.start
            ri = species_span[-1].sent.end

            sli = species_span[0].i
            sri = species_span[-1].i

            l_token_indices = set([token.i for token in doc[li:sli]])
            r_token_indices = set([token.i for token in doc[sri+1:ri]])

            # Nearby Actions (Modification)
            bounds = (li, ri)
            if bounds not in sentence_changes:
                sentence_changes[bounds] = set(self.changes.find_keyword_indices(doc[li:ri]))
            change_indices = sentence_changes[bounds]
            l_changes = l_token_indices.intersection(change_indices)
            r_changes = r_token_indices.intersection(change_indices)

            # There must be a change.
            if not l_changes and not r_changes:
                continue

            # Nearby Species (Interaction)
            l_species = l_token_indices.intersection(species_indices)
            r_species = r_token_indices.intersection(species_indices)

            # A change is guaranteed at this point.
            points += 10 * (past_visits + 1)
            if l_species or r_species:
                points += 10 * (past_visits + 1)
            points += min(past_visits * 10, 100)

        print(f"Species Points After: {points}")

        # Adjustments
        number_sentences = len(list(doc.sents))
        if number_sentences == 0:
            # Nothing to normalise by; avoids a ZeroDivisionError.
            return 0
        points /= number_sentences
        print(f"Adjust Points After: {points}")

        print(f"Visited Species: {visited_species}")
        if len(visited_species) < 3:
            return 0
        return points

In [137]:
# # (1) Load and (2) Filter Documents by Topic
# labeled_documents = list(filter(lambda d: on_topic(d[-1]), load_documents()))
# documents = [d[-1] for d in labeled_documents]
# print(f"Number of Documents: {len(documents)}")

# # Save Documents
# with open('documents.pickle', 'wb') as file:
#     pickle.dump(labeled_documents, file)

# Load Documents
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# a documents.pickle that was written by this notebook.
with open('documents.pickle', 'rb') as file:
    labeled_documents = pickle.load(file)
documents = [d[-1] for d in labeled_documents]
print(f"Number of Documents: {len(documents)}")

# Scan Documents
scanner = Scanner(Controller())
output = []
# The last saved run of this cell ended in a MemoryError. pipe() processes
# the texts in batches, so a batch size of 1 keeps only one parsed document
# in flight at a time.
# NOTE(review): the batch size is presumed to be the main memory cost --
# confirm with a memory profiler.
doc_stream = scanner.controller.sp_nlp.pipe(documents, batch_size=1)
for i, doc in enumerate(doc_stream):
    print(f"{i+1}/{len(documents)}")
    scanner.update(doc)

    points = scanner.get_points()
    # Each labeled document is (index, title, doi, text).
    index, title, doi = labeled_documents[i][:3]
    output.append((index, title, doi, points))
    clear_output(wait=True)

clear_output(wait=True)

MemoryError: 

In [123]:
# # (1) Load and (2) Filter Documents by Topic
# # documents = list(filter(lambda d: on_topic(d), load_local_documents()))
# print(f"Number of Documents: {len(documents)}")

# scanner = Scanner(Controller())
# i = 0
# output = []
# for doc in scanner.controller.sp_nlp.pipe(documents):
#     print(f"{i+1}/{len(documents)}")
#     scanner.update(doc)
    
#     points = scanner.get_points()
#     output.append((i, points))
#     print(f"'{i}' Points: {points}\n")
#     i += 1
#     # clear_output(wait=True)
    
# # clear_output(wait=True)

' 2072 Ecology, 82(7), 2001, pp. 2072–2081 q 2001 by the Ecological Society of America EFFECTS OF TOP PREDATOR SPECIES ON DIRECT AND INDIRECT INTERACTIONS IN A FOOD WEB OSWALD J. SCHMITZ1 AND K. BLAKE SUTTLE2 Yale University, School of Forestry and Environmental Studies and Department of Ecology and Evolutionary Biology, New Haven, Connecticut 06511 USA Abstract. Current theory on trophic interactions in food webs assumes that ecologically similar species can be treated collectively as a single f'

In [None]:
# Rank the documents by points, highest first.
output.sort(key=lambda d: d[-1], reverse=True)

# data = [["Index", "Points"], *output]
data = [["Index", "Title", "DOI", "Points"], *output]
# newline="" hands line-ending control to the csv module; without it, extra
# blank rows appear on platforms that translate "\n" to "\r\n".
with open('test_output.csv', 'w', encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(data)