In [1]:
import re
import time
import glob
import spacy
import pymupdf
import textacy
import requests
from pprint import pprint
from fastcoref import FCoref, LingMessCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher
from pyalex import Works
from IPython.display import clear_output
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Controller:
    """Owns the loaded NLP models and the state derived from the current document.

    Attributes:
        sp_nlp: spaCy pipeline loaded from "en_core_web_lg".
        tn_nlp: TaxoNERD pipeline loaded from "en_ner_eco_biobert".
        fcoref: FCoref model running on the CPU.
        sp_doc: the document most recently passed to ``update`` (None before).
        tn_doc: result of running ``tn_nlp`` on ``sp_doc.text`` (None before).
        tk_map: dict mapping each token's ``idx`` in ``sp_doc`` to its ``i``
            (None before the first ``update``).
    """

    def __init__(self):
        """Load the three models; no document is attached yet."""
        self.sp_nlp = spacy.load("en_core_web_lg")
        self.tn_nlp = TaxoNERD(prefer_gpu=False).load(
            model="en_ner_eco_biobert",
            exclude=["tagger", "parser", "attribute_ruler", "lemmatizer"],
        )
        self.fcoref = FCoref(enable_progress_bar=False, device='cpu')

        self.sp_doc = None
        self.tn_doc = None
        self.tk_map = None

    def update(self, doc):
        """Make ``doc`` the current document and rebuild the derived state.

        Args:
            doc: an already-parsed document exposing ``.text`` and iterable
                tokens with ``.idx`` and ``.i``.
        """
        self.sp_doc = doc
        self.tn_doc = self.tn_nlp(doc.text)
        self.tk_map = self.load_token_map()

    def load_token_map(self):
        """Return a dict mapping ``token.idx`` to ``token.i`` for every token of ``sp_doc``."""
        return {token.idx: token.i for token in self.sp_doc}

In [3]:
class Species:
    """Tracks which tokens of the controller's current document are species mentions.

    The mentions come from the entities of ``controller.tn_doc`` and are
    re-expressed as slices and token indices of ``controller.sp_doc``.

    Attributes:
        species_spans: list of ``sp_doc`` slices, one per entity (None before
            the first ``update``).
        species_indices: flat list of the token indices covered by those
            slices (None before the first ``update``).
    """

    def __init__(self, controller):
        self.controller = controller
        self.species_spans = None
        self.species_indices = None

    def update(self):
        """Recompute the spans and indices for the controller's current document.

        When either document is missing or empty the state is reset to empty
        lists, so results from a previously processed document cannot leak
        into the current one.
        """
        if not self.controller.sp_doc or not self.controller.tn_doc:
            self.species_spans, self.species_indices = [], []
            return
        self.species_spans, self.species_indices = self.load_species_spans()

    def load_species_spans(self):
        """Map every entity of ``tn_doc`` onto ``sp_doc``.

        The first and last token of each entity are matched by their ``idx``
        through ``controller.tk_map``.

        Returns:
            ``(spans, indices)``: the ``sp_doc`` slices and the flat list of
            token indices they cover.

        Raises:
            Exception: if an entity boundary has no entry in ``tk_map``.
        """
        tk_map = self.controller.tk_map
        spans = []
        indices = []
        for species_span in self.controller.tn_doc.ents:
            l_species_idx = species_span[0].idx
            r_species_idx = species_span[-1].idx

            if l_species_idx not in tk_map or r_species_idx not in tk_map:
                raise Exception("Invalid Token")

            span = self.controller.sp_doc[tk_map[l_species_idx]:tk_map[r_species_idx] + 1]
            spans.append(span)
            indices.extend(token.i for token in span)
        return (spans, indices)

    def is_species(self, token):
        """Return True if ``token`` lies inside a known species span."""
        return token.i in (self.species_indices or ())

    def has_species(self, tokens):
        """Return True if any of ``tokens`` lies inside a known species span."""
        known = self.species_indices or ()
        return any(token.i in known for token in tokens)

In [4]:
class Keywords:
    """Finds tokens that match a set of keyword literals, exactly or by similarity.

    A token is a keyword when its part of speech is in ``pos_types``, it is
    not excluded by ``do_not_check``, and either its lemma is one of
    ``literals`` or the lemma (run through ``controller.sp_nlp``) has a
    similarity above ``threshold`` to at least one keyword.

    Attributes:
        literals: collection of keyword lemmas used for the exact check.
        pos_types: part-of-speech tags a token must have to be considered.
        threshold: similarity a lemma must exceed to count as a match.
        keywords: each literal run through ``controller.sp_nlp``.
        keyword_indices: token indices found by the last ``update``.
    """

    def __init__(self, controller, literals, pos_types, threshold=0.7):
        self.controller = controller
        self.literals = literals
        self.threshold = threshold
        self.pos_types = pos_types
        self.keywords = [self.controller.sp_nlp(literal) for literal in self.literals]
        self.keyword_indices = []
        # lemma -> bool. The similarity verdict depends only on the lemma
        # string, so each distinct lemma is run through the pipeline once.
        # Must be cleared if ``keywords`` or ``threshold`` change afterwards.
        self._similarity_cache = {}

    def update(self):
        """Recompute ``keyword_indices`` for the controller's current document.

        With no (or an empty) document the indices are reset, so matches from
        a previously processed document are not kept.
        """
        if not self.controller.sp_doc or not self.controller.sp_nlp:
            self.keyword_indices = []
            return
        self.keyword_indices = self.load_keyword_indices()

    def is_keyword(self, token):
        """Return True if ``token`` was found by the last ``update``."""
        return token.i in self.keyword_indices

    def has_keyword(self, tokens):
        """Return True if any of ``tokens`` was found by the last ``update``."""
        return any(token.i in self.keyword_indices for token in tokens)

    def load_keyword_indices(self):
        """Return the indices of all keyword tokens in the current document."""
        return self.find_keyword_indices(self.controller.sp_doc)

    def find_keyword_indices(self, tokens):
        """Return the index of every token in ``tokens`` that is a keyword.

        Each matching token is reported once, in input order.
        """
        return [
            token.i
            for token in tokens
            if token.pos_ in self.pos_types
            and not self.do_not_check(token)
            and self._matches_keyword(token.lemma_)
        ]

    def _matches_keyword(self, lemma):
        """Return True if ``lemma`` is a literal or similar enough to a keyword."""
        # Fast check: exact literal.
        if lemma in self.literals:
            return True
        # Slow check: similarity, computed once per distinct lemma.
        verdict = self._similarity_cache.get(lemma)
        if verdict is None:
            lemma_doc = self.controller.sp_nlp(lemma)
            verdict = any(
                keyword.similarity(lemma_doc) > self.threshold
                for keyword in self.keywords
            )
            self._similarity_cache[lemma] = verdict
        return verdict

    def do_not_check(self, token):
        """Return True for tokens that are skipped outright.

        Skipped are tokens whose ``len`` is 5 or less and tokens whose text is
        not made up purely of word characters.
        """
        # NOTE(review): the length cut-off also skips 5-letter literals such
        # as "shift" and "cause" — confirm that this is intended.
        return len(token) <= 5 or re.match(r'^\w+$', token.text) is None

In [5]:
class ChangeKeywords(Keywords):
    """``Keywords`` preset: a fixed literal set, NOUN/VERB tokens only, threshold 0.6."""

    def __init__(self, controller):
        change_literals = {"increase", "decrease", "change", "weaken", "shift", "cause", "produce", "invade", "modify", "affect"}
        considered_pos = ["NOUN", "VERB"]
        super().__init__(controller, change_literals, considered_pos, 0.6)

In [7]:
class References:
    """Coreference lookup for the controller's current document.

    The document is split into chunks of ``text_size_in_tokens`` tokens, each
    chunk is passed to ``controller.fcoref.predict``, and the resulting
    clusters are turned into ``cluster_map``: token index -> list of the
    other tokens in the same cluster.

    Attributes:
        predictions: raw result of the last ``predict`` call (None before the
            first ``update``).
        cluster_map: see above (None before the first ``update``).
        text_size_in_tokens: chunk size, in tokens, of each text sent to the
            coreference model.
    """

    def __init__(self, controller, texts=None):
        self.controller = controller
        self.predictions = None
        self.cluster_map = None
        self.text_size_in_tokens = 100
        if texts:
            self.update(texts)

    def update(self, text):
        """Recompute predictions and the cluster map for the current document.

        Args:
            text: unused; the chunks are taken from ``controller.sp_doc``.
                Kept for compatibility with existing callers.
        """
        doc = self.controller.sp_doc
        if not doc:
            # No document (or an empty one): drop whatever the previous
            # document left behind instead of keeping a stale cluster map.
            self.predictions = []
            self.cluster_map = {}
            return

        texts = []
        offsets = []
        for start in range(0, len(doc), self.text_size_in_tokens):
            texts.append(doc[start:start + self.text_size_in_tokens].text)
            # Character position of the chunk's first token; added back to
            # the chunk-relative positions in load_cluster_map.
            offsets.append(doc[start].idx)
        self.predictions = self.controller.fcoref.predict(texts=texts)
        self.cluster_map = self.load_cluster_map(self.predictions, offsets)

    def load_cluster_map(self, predictions, offsets, verbose=False):
        """Build the token-index -> co-referring-tokens map.

        Args:
            predictions: one prediction per chunk, each providing
                ``get_clusters(as_strings=False)``.
            offsets: for each chunk, the value added to a span's first element
                before it is looked up in ``controller.tk_map``.
            verbose: print every cluster and span while processing.

        Returns:
            dict mapping a token index to the list of the other tokens of its
            cluster. A mention is represented by the token found at its start.

        Raises:
            Exception: if a span start has no entry in ``controller.tk_map``.
        """
        tk_map = self.controller.tk_map
        sp_doc = self.controller.sp_doc
        cluster_map = {}
        for prediction, offset in zip(predictions, offsets):
            clusters = prediction.get_clusters(as_strings=False)
            if verbose:
                print(f"Clusters: {clusters}")
            for cluster in clusters:
                if verbose:
                    print(f"\tCluster: {cluster}")
                # Converting Spans to Tokens
                token_cluster = []
                for span in cluster:
                    if verbose:
                        print(f"\t\tSpan: {span}")
                    # NOTE(review): a span without a position (None) makes
                    # span[0] fail with "'NoneType' object is not
                    # subscriptable"; this is presumably what stopped the
                    # batch run — confirm against fastcoref. Such mentions
                    # cannot be mapped to a token and are skipped.
                    if span is None or span[0] is None:
                        continue
                    index = span[0] + offset
                    if index not in tk_map:
                        raise Exception("Invalid Token")
                    token_cluster.append(sp_doc[tk_map[index]])
                # Mapping
                for token in token_cluster:
                    cluster_map[token.i] = [t for t in token_cluster if t.i != token.i]
        return cluster_map

    def get_references(self, tokens):
        """Return the tokens that co-refer with any of ``tokens`` (may repeat)."""
        cluster_map = self.cluster_map or {}
        refs = []
        for token in tokens:
            refs += cluster_map.get(token.i, [])
        return refs

    def same_reference(self, token_a, token_b):
        """Return True if the tokens share a lemma or belong to the same cluster."""
        if token_a.lemma_ == token_b.lemma_:
            return True
        return self._in_cluster_of(token_a, token_b) or self._in_cluster_of(token_b, token_a)

    def _in_cluster_of(self, owner, candidate):
        """Return True if ``candidate`` is listed among the tokens co-referring with ``owner``."""
        cluster_map = self.cluster_map or {}
        # The map stores tokens, so compare by token index rather than
        # testing an int for membership in a list of tokens.
        return any(t.i == candidate.i for t in cluster_map.get(owner.i, ()))

    def same_reference_span(self, span_a, span_b):
        """Return True if the spans share a lemma or any pair of their tokens co-refers."""
        if span_a.lemma_ == span_b.lemma_:
            return True
        return any(
            self.same_reference(token_a, token_b)
            for token_a in span_a
            for token_b in span_b
        )

In [8]:
class Scanner():
    """Scores the controller's current document from species mentions and keywords.

    The score combines (a) sentences in which a species mention occurs
    together with a change keyword, optionally alongside another species, and
    (b) occurrences of fixed ecological vocabulary, divided by the number of
    sentences.
    """

    def __init__(self, controller):
        # Helpers
        self.controller = controller
        self.species = Species(self.controller)
        self.references = References(self.controller)
        self.changes = ChangeKeywords(self.controller)

        # Used to Evaluate Points (lemma -> weight tier, see get_points)
        self.level1 = {"ecological", "community", "interaction", "trait", "ecosystem", "ecology"}
        self.level2 = {"model"}
        self.level3 = {"predator", "prey", "competitor", "resource", "predation", "specie", "result", "effect", "population", "species", "invade", "presence"}

    def update(self, doc):
        """Make ``doc`` the current document and refresh all helpers."""
        self.controller.update(doc)
        self.references.update(doc.text)
        self.species.update()

    def get_full_species(self):
        """Return the species spans extended by their coreferences.

        Starts from the spans and indices of ``self.species`` and adds, as
        one-token spans, every token that the cluster map links to a species
        token.

        Returns:
            ``(full_species, full_indices)``: the list of spans and the list
            of token indices they account for.
        """
        full_species = [*self.species.species_spans]
        full_indices = [*self.species.species_indices]
        base_indices = set(full_indices)  # indices of the original mentions
        seen = set(full_indices)          # everything added so far

        def add_token(index):
            full_species.append(self.controller.sp_doc[index:index + 1])
            full_indices.append(index)
            seen.add(index)

        for k, v in self.references.cluster_map.items():
            # k is a species token: its co-referring tokens are species too.
            if k in base_indices:
                for token in v:
                    if token.i not in seen:
                        add_token(token.i)
            # k co-refers with a species token: k is a species too.
            if self.species.has_species(v) and k not in seen:
                add_token(k)

        return (full_species, full_indices)

    def get_points(self, verbose=True):
        """Compute the score of the current document.

        Args:
            verbose: print the running total before and after each stage.

        Returns:
            0 if no change keyword was found next to a species or fewer than
            three distinct species spans were visited; otherwise the
            accumulated points floor-divided by the number of sentences.
        """
        doc = self.controller.sp_doc
        points = 0
        change_found = False

        # Species Work
        if verbose:
            print(f"Species Points Before: {points}")
        visited_species_spans = {}
        species_spans, species_indices = self.get_full_species()

        species_sent = None
        for species_span in species_spans:
            # Skip further mentions in the sentence that was scored last.
            if species_span[0].sent.start == species_sent:
                continue

            # Repeating Species
            past_visits = 0
            for sp in visited_species_spans.keys():
                if self.references.same_reference_span(species_span, doc[sp[0]:sp[1]+1]):
                    past_visits = visited_species_spans[sp]
                    visited_species_spans[sp] += 1
            if past_visits == 0:
                visited_species_spans[(species_span[0].i, species_span[-1].i)] = 1

            li = species_span[0].sent.start
            ri = species_span[-1].sent.end

            # Tokens of the sentence before and after the mention itself.
            # The split point is the span's own boundary, not a variable
            # picked up from the surrounding notebook namespace.
            span_start = species_span[0].i
            span_end = species_span[-1].i
            l_token_indices = {token.i for token in doc[li:span_start]}
            r_token_indices = {token.i for token in doc[span_end+1:ri]}

            # Nearby Actions (Modification)
            change_indices = set(self.changes.find_keyword_indices(doc[li:ri]))
            l_changes = l_token_indices.intersection(change_indices)
            r_changes = r_token_indices.intersection(change_indices)

            # There must be a change.
            if not l_changes and not r_changes:
                continue
            points += 10 * (past_visits + 1)
            change_found = True

            # Nearby Species (Interaction)
            l_species = l_token_indices.intersection(species_indices)
            r_species = r_token_indices.intersection(species_indices)
            if l_species or r_species:
                points += 10 * (past_visits + 1)

            # Bonus for species that were already seen earlier.
            points += min(past_visits * 10, 1000)

            species_sent = li
        if verbose:
            print(f"Species Points After: {points}")

        # Keyword Work
        if verbose:
            print(f"Keyword Points Before: {points}")
        for token in doc:
            if token.pos_ not in ["NOUN", "VERB"]:
                continue

            lemma = token.lemma_
            if lemma in self.level1:
                points += 100
            elif lemma in self.level2:
                points += 25
            elif lemma in self.level3:
                points += 1
        if verbose:
            print(f"Keyword Points After: {points}")

        # Fairness: normalise by document length in sentences.
        if verbose:
            print(f"Fairness Points Before: {points}")
        sentence_count = len(list(doc.sents))
        # A document without sentences would otherwise divide by zero.
        points //= max(sentence_count, 1)
        if verbose:
            print(f"Fairness Points After: {points}")

        if not change_found or len(visited_species_spans) < 3:
            return 0
        return points

In [9]:
def clean_text(text):
    """Normalise raw extracted text.

    Applies, in order: removal of URLs (anything starting with "http" up to
    the next whitespace), re-joining of words hyphenated across a line break,
    collapsing of every whitespace run to a single space, and removal of
    whitespace in front of ``?``, ``.``, ``!`` and ``,``.

    Args:
        text: the raw text.

    Returns:
        The cleaned text.
    """
    cleaned_text = text
    cleaned_text = re.sub(r'http\S+', '', cleaned_text) # Remove URLs
    cleaned_text = re.sub(r'-\n', '', cleaned_text) # Remove Hyphenations
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    cleaned_text = re.sub(r"\s+", " ", cleaned_text) # Remove Duplicate Spaces
    cleaned_text = re.sub(r"\s+([?.!,])", r"\1", cleaned_text) # Remove Spaces Before Punctuation
    return cleaned_text

def load_documents(group="Cleared", base_dir="../Week 7/Examples"):
    """Load and clean the text of every PDF in ``{base_dir}/{group}``.

    Args:
        group: name of the sub-folder to read.
        base_dir: folder containing the group folders.

    Returns:
        List with the cleaned full text of each PDF that has at least one
        page, in sorted filename order.
    """
    documents = []
    # Sorted so the document order does not depend on the file system.
    filenames = sorted(glob.glob(f"{base_dir}/{group}/*.pdf"))
    for filename in filenames:
        # The context manager closes the PDF once its text has been read.
        with pymupdf.open(filename) as doc:
            full_text = "".join(" " + page.get_text() for page in doc)
        if full_text:
            documents.append(clean_text(full_text))
    return documents

def pdf_to_text(url):
    """Return the concatenated page text of the PDF at ``url``.

    Best effort: any failure while fetching or parsing yields ``""``.

    Args:
        url: location of the PDF, passed to ``pdf_bytes``.

    Returns:
        The text of all pages, or ``""`` on any error.
    """
    try:
        # NOTE(review): pdf_bytes is not defined in the visible part of this
        # notebook. If it is missing at run time, the resulting NameError is
        # swallowed below and every call silently returns "" — verify.
        stream = pdf_bytes(url)
        # The context manager closes the document even if a page fails.
        with pymupdf.open(stream=stream) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception:
        return ""

def load_documents_from_api():
    """Collect texts for works whose title matches a fixed list of search terms.

    The search terms are twelve base keywords plus each of the first four
    combined with each of the remaining eight. For every work returned by the
    title search, the full text (when a PDF URL is present and yields text) is
    preferred over the abstract; works with neither are dropped.

    Returns:
        ``(texts, documents)`` where ``texts`` is a list of
        ``(search term index, title, doi, text)`` tuples and ``documents`` is
        the list of just the text of each tuple, in the same order.
    """
    keywords = ["higher-order interactions", "trait-mediated interaction modification", "trait-mediated interaction", "polymorphism", "apparent competition", "resource competition", "keystone predation", "intraguild predation", "intransitive competition", "trophic chains", "competition chains", "mutual competition"]
    # First four keywords, each paired with every keyword from the fifth on.
    combined = [
        f"{keywords[i]} {keywords[j]}"
        for i in range(4)
        for j in range(4, len(keywords), 1)
    ]
    all_keywords = [*keywords, *combined]
    number_keywords = len(all_keywords)

    # Loading Texts
    texts = []
    number_works = 0
    number_unfiltered_works = 0
    for k, keyword in enumerate(all_keywords):
        print(f"({k + 1}/{number_keywords}) Searching Keyword '{keyword}'")
        pager = Works().search_filter(title=keyword).paginate(per_page=200)
        for page in pager:
            for work in page:
                number_unfiltered_works += 1

                title = work['title']
                abstract = work['abstract']
                doi = work['doi']

                # Find Full Text
                primary_location = work["primary_location"]
                url = primary_location["pdf_url"] if primary_location else None
                full_text = pdf_to_text(url) if url else ""

                if not (abstract or full_text):
                    continue
                # Full text wins; the abstract is only the fallback.
                texts.append((k, title, doi, full_text or abstract))
                number_works += 1
        clear_output(wait=True)

    assert len(texts) == number_works
    print(f"Number Documents: {len(texts)}, Number Unfiltered Documents: {number_unfiltered_works}")
    documents = [entry[-1] for entry in texts]
    return (texts, documents)

In [9]:
# Fetch the corpus, then score every document with one shared Scanner.
labeled_documents, documents = load_documents_from_api()
labeled_points = []

scanner = Scanner(Controller())
# `documents` and `labeled_documents` are parallel lists, so the position of a
# parsed doc in the pipe output identifies its (search term index, title, doi).
for i, doc in enumerate(scanner.controller.sp_nlp.pipe(documents)):
    print(f"{i+1}/{len(documents)}")
    scanner.update(doc)

    points = scanner.get_points(verbose=False)
    keyword_index, title, doi = labeled_documents[i][:3]
    labeled_points.append((keyword_index, title, doi, points))
    clear_output(wait=True)

2777/10356


05/09/2025 20:28:45 - INFO - 	 Tokenize 2 inputs...
Map: 100%|██████████| 2/2 [00:00<00:00, 31.43 examples/s]
05/09/2025 20:28:45 - INFO - 	 ***** Running Inference on 2 texts *****


TypeError: 'NoneType' object is not subscriptable

In [10]:
# Rank the scored documents, highest score first, and export them as CSV.
labeled_points.sort(key=lambda tup: tup[-1], reverse=True)

data = [
    ["Index", "Title", "DOI", "Points"],
    *labeled_points
]

file_path = 'output.csv'
# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate "\n" end up with an extra blank line per row.
with open(file_path, 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    writer.writerows(data)

In [None]:
# Number of document texts that were loaded for scoring.
len(documents)

In [13]:
# Re-run only the document on which the batch run stopped (2777 of 10356),
# to reproduce the failure in isolation.
FAILING_INDEX = 2776
# (search term index, title, doi) — the full text is left out on purpose.
print(labeled_documents[FAILING_INDEX][:3])

for doc in scanner.controller.sp_nlp.pipe([documents[FAILING_INDEX]]):
    print(f"{FAILING_INDEX+1}/{len(documents)}")
    scanner.update(doc)

2777/10356


05/09/2025 21:29:51 - INFO - 	 Tokenize 2 inputs...
Map: 100%|██████████| 2/2 [00:00<00:00, 21.23 examples/s]
05/09/2025 21:29:51 - INFO - 	 ***** Running Inference on 2 texts *****


TypeError: 'NoneType' object is not subscriptable