In [1]:
import re
import time
import glob
import spacy
import pymupdf
import textacy
import requests
from pprint import pprint
from fastcoref import FCoref, LingMessCoref
from taxonerd import TaxoNERD
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher, PhraseMatcher

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# spacy.require_gpu()

True

In [3]:
class Controller:
    """Holds the NLP models and the per-document state shared by the analysers.

    Attributes:
        sp_nlp: spaCy pipeline loaded from "en_core_web_lg".
        tn_nlp: TaxoNERD pipeline loaded from "en_ner_eco_biobert".
        fcoref: FCoref coreference model created with device='cpu'.
        sp_doc: the document most recently passed to `update` (None until then).
        tn_doc: result of running `tn_nlp` on `sp_doc.text` (None until then).
        tk_map: dict mapping each `token.idx` of `sp_doc` to its `token.i`
            (None until then).
    """

    def __init__(self):
        """Load the three models, printing how long each load takes."""
        print("Loading SP_NLP")
        t0 = time.time()
        self.sp_nlp = spacy.load("en_core_web_lg")
        t1 = time.time()
        print(f"SP_NLP: {t1-t0}s")

        print("Loading TN_NLP")
        # Pipeline components left out of the TaxoNERD model when loading it.
        excluded_components = ["tagger", "parser", "attribute_ruler", "lemmatizer"]
        t0 = time.time()
        taxonerd = TaxoNERD(prefer_gpu=False)
        self.tn_nlp = taxonerd.load(
            model="en_ner_eco_biobert",
            exclude=excluded_components,
        )
        t1 = time.time()
        print(f"TN_NLP: {t1-t0}s")

        print("Loading FCOREF")
        t0 = time.time()
        self.fcoref = FCoref(enable_progress_bar=False, device='cpu')
        t1 = time.time()
        print(f"FCOREF: {t1-t0}s")

        # Per-document state; filled in by update().
        self.sp_doc = None
        self.tn_doc = None
        self.tk_map = None

    def update(self, doc):
        """Make `doc` the current document and rebuild the derived state.

        Args:
            doc: an object exposing `.text` and iterable as tokens that carry
                `.idx` and `.i` (a spaCy Doc in this notebook).
        """
        self.sp_doc = doc

        print("Updating TN_DOC")
        t0 = time.time()
        self.tn_doc = self.tn_nlp(doc.text)
        t1 = time.time()
        print(f"TN_DOC: {t1-t0}s")

        print("Updating TK_MAP")
        t0 = time.time()
        self.tk_map = self.load_token_map()
        t1 = time.time()
        print(f"TK_MAP: {t1-t0}s")

    def load_token_map(self):
        """Return a dict mapping `token.idx` to `token.i` for every token in `sp_doc`."""
        return {token.idx: token.i for token in self.sp_doc}

In [4]:
class Species:
    """Tracks which tokens of the current document belong to a species entity.

    The entities come from `controller.tn_doc.ents`; each entity token is
    translated to an index into `controller.sp_doc` through
    `controller.tk_map` (keyed by `token.idx`).

    Attributes:
        controller: object providing `sp_doc`, `tn_doc` and `tk_map`.
        species_indices: list of unique `sp_doc` token indices, in the order
            they were first encountered. Empty until `update` succeeds.
    """

    def __init__(self, controller):
        self.controller = controller
        # Start empty (not None) so is_species/has_species are safe to call
        # before the first update(), matching Keywords.keyword_indices.
        self.species_indices = []

    def update(self):
        """Recompute `species_indices` for the controller's current document.

        When either document is missing or empty the indices are cleared, so
        indices belonging to a previously processed document cannot leak into
        the new one.
        """
        if not self.controller.sp_doc or not self.controller.tn_doc:
            self.species_indices = []
            return
        print("Updating Species Indices")
        t0 = time.time()
        self.species_indices = self.load_species_indices()
        t1 = time.time()
        print(f"Load Species Indices: {t1-t0}s")

    def load_species_indices(self):
        """Return the unique `sp_doc` indices of all species-entity tokens.

        Returns:
            list[int]: indices in first-seen order.

        Raises:
            Exception: if an entity token's `idx` has no entry in `tk_map`.
        """
        tk_map = self.controller.tk_map
        indices = []
        seen = set()  # constant-time duplicate check; `indices` keeps the order

        for species_span in self.controller.tn_doc.ents:
            for species in species_span:
                if species.idx not in tk_map:
                    raise Exception("Invalid Token")
                index = tk_map[species.idx]
                if index in seen:
                    continue
                seen.add(index)
                indices.append(index)

        return indices

    def is_species(self, token):
        """Return True if `token.i` is one of the species indices."""
        return token.i in self.species_indices

    def has_species(self, tokens):
        """Return True if any token in `tokens` is a species token."""
        return any(token.i in self.species_indices for token in tokens)

In [5]:
class Keywords:
    """Finds tokens of the current document that match a set of keywords.

    A token matches when its part of speech is in `pos_types`, it passes
    `do_not_check`, and its lemma is either one of `literals` or has a
    similarity above `threshold` to at least one parsed literal.

    Attributes:
        controller: object providing `sp_nlp` (callable on a string) and
            `sp_doc` (iterable of tokens).
        literals: collection of keyword lemmas.
        pos_types: collection of accepted `token.pos_` values.
        threshold: similarity a lemma must exceed to count as a match.
        keywords: `sp_nlp(literal)` for every literal.
        keyword_indices: `token.i` of every matching token (each index once).
    """

    def __init__(self, controller, literals, pos_types, threshold=0.7):
        self.controller = controller
        self.literals = literals
        self.threshold = threshold
        self.pos_types = pos_types
        self.keywords = [self.controller.sp_nlp(literal) for literal in self.literals]
        self.keyword_indices = []

    def update(self):
        """Recompute `keyword_indices` for the controller's current document.

        When the document or pipeline is missing/empty the indices are
        cleared, so indices from a previously processed document do not
        linger.
        """
        if not self.controller.sp_doc or not self.controller.sp_nlp:
            self.keyword_indices = []
            return
        print("Updating Keyword Indices")
        t0 = time.time()
        self.keyword_indices = self.load_keyword_indices()
        t1 = time.time()
        print(f"Keyword Indices: {t1-t0}s")

    def is_keyword(self, token):
        """Return True if `token.i` is one of the keyword indices."""
        return token.i in self.keyword_indices

    def has_keyword(self, tokens):
        """Return True if any token in `tokens` is a keyword token."""
        return any(token.i in self.keyword_indices for token in tokens)

    def load_keyword_indices(self):
        """Return the indices of all tokens in `sp_doc` that match a keyword.

        Each token index appears at most once, even when the lemma is similar
        to several keywords. The similarity verdict is cached per lemma so a
        lemma is parsed by `sp_nlp` only once per call.

        Returns:
            list[int]: matching `token.i` values in document order.
        """
        indices = []
        similar_by_lemma = {}  # lemma -> bool: similar to at least one keyword?
        for token in self.controller.sp_doc:
            if token.pos_ not in self.pos_types or self.do_not_check(token):
                continue
            lemma = token.lemma_
            # Fast check: exact lemma match needs no similarity computation.
            if lemma in self.literals:
                indices.append(token.i)
                continue
            # Similarity check, computed once per distinct lemma.
            if lemma not in similar_by_lemma:
                lemma_doc = self.controller.sp_nlp(lemma)
                similar_by_lemma[lemma] = any(
                    keyword.similarity(lemma_doc) > self.threshold
                    for keyword in self.keywords
                )
            if similar_by_lemma[lemma]:
                indices.append(token.i)
        return indices

    def do_not_check(self, token):
        """Return True for tokens that should be skipped entirely.

        A token is skipped when `len(token) <= 5` or when its text is not made
        up solely of word characters.

        NOTE(review): this filter runs before the literal check, so a token
        whose own text is a short literal (five characters or fewer) is
        skipped too -- confirm that is intended.
        """
        return len(token) <= 5 or re.match(r'^[\w]+$', token.text) is None

In [6]:
class ChangeKeywords(Keywords):
    """Keywords preset for change-related nouns and verbs (threshold 0.7)."""

    def __init__(self, controller):
        # Lemmas that are treated as literal keyword matches.
        change_lemmas = {
            "increase", "decrease", "change", "weaken", "shift",
            "cause", "produce", "invade", "modify", "affect",
        }
        # Only tokens with one of these `pos_` values are considered.
        accepted_pos = ["NOUN", "VERB"]
        super().__init__(controller, change_lemmas, accepted_pos, 0.7)

In [7]:
class References:
    """Maps tokens of the current document to their coreferent tokens.

    The document is split into consecutive chunks of `text_size_in_tokens`
    tokens, each chunk is passed to `controller.fcoref.predict`, and every
    predicted mention is reduced to the document token that starts at the
    mention's first character.

    Attributes:
        controller: object providing `sp_doc`, `tk_map` and `fcoref`.
        predictions: return value of the last `fcoref.predict` call (None
            when no document has been processed).
        cluster_map: dict mapping `token.i` to the list of the other tokens in
            the same cluster. Empty until `update` succeeds.
        text_size_in_tokens: number of tokens per chunk.
    """

    def __init__(self, controller, texts=None):
        self.controller = controller
        self.predictions = None
        # Start empty (not None) so get_references is safe before update().
        self.cluster_map = {}
        self.text_size_in_tokens = 100
        if texts:
            self.update(texts)

    def update(self, text):
        """Re-run coreference prediction on the controller's current document.

        Args:
            text: unused; the text is always taken from `controller.sp_doc`.
                Kept so existing callers continue to work.

        When there is no (or an empty) current document, the stored
        predictions and cluster map are cleared so results from an earlier
        document do not linger.
        """
        if not self.controller.sp_doc:
            self.predictions = None
            self.cluster_map = {}
            return
        print("Updating Predictions")
        t0 = time.time()
        texts = []
        offsets = []
        for i in range(0, len(self.controller.sp_doc), self.text_size_in_tokens):
            texts.append(self.controller.sp_doc[i:i+self.text_size_in_tokens].text)
            # `idx` of the chunk's first token; added back to the chunk-relative
            # span starts in load_cluster_map.
            offsets.append(self.controller.sp_doc[i].idx)
        self.predictions = self.controller.fcoref.predict(texts=texts)
        t1 = time.time()
        print(f"Predictions: {t1-t0}s")

        print("Updating Cluster Map")
        t0 = time.time()
        self.cluster_map = self.load_cluster_map(self.predictions, offsets)
        t1 = time.time()
        print(f"Cluster Map: {t1-t0}s")

    def load_cluster_map(self, predictions, offsets):
        """Build the token-index -> coreferent-tokens mapping.

        Args:
            predictions: objects exposing `get_clusters(as_strings=False)`,
                one per chunk; each cluster is a sequence of spans whose first
                element is the span start relative to the chunk.
            offsets: for each prediction, the value added to a span start to
                obtain a key of `controller.tk_map`.

        Returns:
            dict: `token.i` -> list of the other tokens in the same cluster.

        Raises:
            Exception: if a shifted span start has no entry in `tk_map`.
        """
        sp_doc = self.controller.sp_doc
        tk_map = self.controller.tk_map
        cluster_map = {}
        for prediction, offset in zip(predictions, offsets):
            for cluster in prediction.get_clusters(as_strings=False):
                # Reduce every span to the document token it starts at.
                token_cluster = []
                for span in cluster:
                    index = span[0] + offset
                    if index not in tk_map:
                        raise Exception("Invalid Token")
                    token_cluster.append(sp_doc[tk_map[index]])
                # Each token maps to every other token of its cluster.
                for token in token_cluster:
                    cluster_map[token.i] = [other for other in token_cluster if other != token]
        return cluster_map

    def get_references(self, tokens):
        """Return the concatenated coreferent tokens of all `tokens`.

        Tokens without an entry in `cluster_map` contribute nothing.
        """
        refs = []
        for token in tokens:
            refs += self.cluster_map.get(token.i, [])
        return refs

In [8]:
class Scanner:
    """Bundles a controller with the analysers that read from it.

    Attributes:
        controller: shared model/document holder.
        species: Species analyser bound to `controller`.
        references: References analyser bound to `controller`.
        change_keywords: ChangeKeywords analyser bound to `controller`.
    """

    def __init__(self, controller):
        self.controller = controller
        self.species = Species(controller)
        self.references = References(controller)
        self.change_keywords = ChangeKeywords(controller)

    def update(self, doc):
        """Load `doc` into the controller, then refresh every analyser.

        The controller is updated first; the analysers follow in the order
        references, species, change keywords.
        """
        self.controller.update(doc)
        self.references.update(doc.text)
        for analyser in (self.species, self.change_keywords):
            analyser.update()

In [9]:
def clean_text(text):
    """Normalise raw extracted text before it is parsed.

    Steps, applied in order:
      1. drop every run of non-whitespace characters that starts with "http";
      2. join words split by a hyphen at a line break ("-\\n");
      3. collapse every run of whitespace into a single space;
      4. remove whitespace directly before ?, ., ! or a comma.

    Args:
        text (str): raw text.

    Returns:
        str: the cleaned text (leading/trailing spaces are not stripped).
    """
    # Raw strings throughout: "\s" in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    cleaned_text = re.sub(r'http\S+', '', text)  # Remove URLs
    cleaned_text = re.sub(r'-\n', '', cleaned_text)  # Remove Hyphenations
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Remove Duplicate Spaces
    cleaned_text = re.sub(r"\s+([?.!,])", r"\1", cleaned_text)  # Remove Spaces Before Punctuation
    return cleaned_text

In [10]:
def load_documents(pattern="../Week 7/Examples/Cleared/*.pdf"):
    """Read every PDF matching `pattern` and return its cleaned text.

    Args:
        pattern (str): glob pattern of the PDF files to load. Defaults to the
            notebook's example folder.

    Returns:
        list[str]: one cleaned text per PDF that contains any non-whitespace
        text, in sorted filename order.
    """
    documents = []
    # Sorted so the document order is reproducible between runs.
    for filename in sorted(glob.glob(pattern)):
        # Context manager closes the PDF handle even if extraction fails.
        with pymupdf.open(filename) as pdf:
            full_text = "".join(" " + page.get_text() for page in pdf)
        # Every page contributes at least the " " separator, so test for real
        # content instead of mere non-emptiness.
        if full_text.strip():
            documents.append(clean_text(full_text))
    return documents

In [11]:
# ANSI escape codes used to highlight each printed sentence.
RED = '\033[91m'
GREEN = '\033[92m'
BLUE = '\033[94m'
RESET = '\033[0m'

# Lemmas that each add 2 points per occurrence.
CONTEXT_KEYWORDS = {"ecological", "community", "interaction", "competition", "model", "ecosystem", "resource", "competitor", "predator", "predation", "prey", "trait"}

scanner = Scanner(Controller())

documents = load_documents()
for doc in scanner.controller.sp_nlp.pipe(documents):
    t0 = time.time()
    scanner.update(doc)
    t1 = time.time()
    print(f"Total Time: {t1-t0}")

    points = 0
    sp_doc = scanner.controller.sp_doc
    change_indices = set(scanner.change_keywords.keyword_indices)

    # Gathering All the Species: the detected species tokens plus every token
    # that shares a coreference cluster with one of them.
    species_indices = set(scanner.species.species_indices)
    full_species = set(species_indices)
    for anchor_index, coreferents in scanner.references.cluster_map.items():
        if anchor_index in species_indices:
            full_species.update(token.i for token in coreferents)
        if scanner.species.has_species(coreferents):
            full_species.add(anchor_index)

    # Evaluating Points
    # Iterate in document order so runs of adjacent species tokens are
    # recognised regardless of the order in which they were gathered.
    for i in sorted(full_species):
        # Remove Redundancy: a run of adjacent species tokens is one mention,
        # scored once at its last token.
        if i + 1 in full_species:
            continue
        mention_start = i
        while mention_start - 1 in full_species:
            mention_start -= 1

        token = sp_doc[i]
        li = token.sent.start
        ri = token.sent.end  # exclusive: first token index after the sentence

        # Left context stops before the mention itself so a multi-token
        # species does not count its own tokens as a neighbouring species.
        li_tokens = set(range(li, mention_start))
        # Right context stays inside the sentence (`ri` is already exclusive).
        ri_tokens = set(range(i + 1, ri))

        l_changes = li_tokens & change_indices
        r_changes = ri_tokens & change_indices

        l_species = li_tokens & full_species
        r_species = ri_tokens & full_species

        l_color = BLUE if l_changes else RESET
        r_color = GREEN if r_changes else RESET

        if l_changes:
            points += 7.5
        if r_changes:
            points += 7.5
        if l_species:
            points += 10
        if r_species:
            points += 10

        print(f"{l_color}{sp_doc[li:i]}{RESET} {RED}{sp_doc[i]}{RESET} {r_color}{sp_doc[i+1:ri]}{RESET}")

    # Keywords
    for token in sp_doc:
        if token.lemma_ in CONTEXT_KEYWORDS:
            points += 2

    print(f"Points: {points}")

Loading SP_NLP


05/08/2025 09:23:46 - INFO - 	 GPU is available
05/08/2025 09:23:46 - INFO - 	 TaxoNERD will use GPU


SP_NLP: 8.488579273223877s
Loading TN_NLP
TN_NLP: 39.38270711898804s
Loading FCOREF


05/08/2025 09:24:30 - INFO - 	 missing_keys: []
05/08/2025 09:24:30 - INFO - 	 unexpected_keys: []
05/08/2025 09:24:30 - INFO - 	 mismatched_keys: []
05/08/2025 09:24:30 - INFO - 	 error_msgs: []
05/08/2025 09:24:30 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


FCOREF: 3.8099234104156494s
Updating TN_DOC


OutOfMemoryError: CUDA out of memory. Tried to allocate 834.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 3.48 GiB is allocated by PyTorch, and 812.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Re-scores the document currently loaded in `scanner`.
# NOTE(review): this repeats the scoring logic of the processing loop in the
# previous cell; consider extracting a shared function.

# Start from zero instead of relying on a `points` value left in the kernel by
# an earlier cell (which would double-count the last document).
points = 0

# ANSI escape codes used to highlight each printed sentence.
RED = '\033[91m'
GREEN = '\033[92m'
BLUE = '\033[94m'
RESET = '\033[0m'

sp_doc = scanner.controller.sp_doc
change_indices = set(scanner.change_keywords.keyword_indices)

# Gathering All the Species: the detected species tokens plus every token that
# shares a coreference cluster with one of them.
species_indices = set(scanner.species.species_indices)
full_species = set(species_indices)
for anchor_index, coreferents in scanner.references.cluster_map.items():
    if anchor_index in species_indices:
        full_species.update(token.i for token in coreferents)
    if scanner.species.has_species(coreferents):
        full_species.add(anchor_index)

# Evaluating Points
# Iterate in document order so runs of adjacent species tokens are recognised
# regardless of the order in which they were gathered.
for i in sorted(full_species):
    # Remove Redundancy: a run of adjacent species tokens is one mention,
    # scored once at its last token.
    if i + 1 in full_species:
        continue
    mention_start = i
    while mention_start - 1 in full_species:
        mention_start -= 1

    token = sp_doc[i]
    li = token.sent.start
    ri = token.sent.end  # exclusive: first token index after the sentence

    # Left context stops before the mention itself so a multi-token species
    # does not count its own tokens as a neighbouring species.
    li_tokens = set(range(li, mention_start))
    # Right context stays inside the sentence (`ri` is already exclusive).
    ri_tokens = set(range(i + 1, ri))

    l_changes = li_tokens & change_indices
    r_changes = ri_tokens & change_indices

    l_species = li_tokens & full_species
    r_species = ri_tokens & full_species

    l_color = BLUE if l_changes else RESET
    r_color = GREEN if r_changes else RESET

    if l_changes:
        points += 7.5
    if r_changes:
        points += 7.5
    if l_species:
        points += 10
    if r_species:
        points += 10

    print(f"{l_color}{sp_doc[li:i]}{RESET} {RED}{sp_doc[i]}{RESET} {r_color}{sp_doc[i+1:ri]}{RESET}")

# Keywords: each occurrence of one of these lemmas adds 2 points.
keywords = {"ecological", "community", "interaction", "competition", "model", "ecosystem", "resource", "competitor", "predator", "predation", "prey", "trait"}
for token in sp_doc:
    if token.lemma_ in keywords:
        points += 2

print(f"Points: {points}")