In [81]:
import os
import re
import glob
import numpy as np
import pymupdf
# from transformers import pipeline

In [69]:
def clean_text(text):
    cleaned_text = text
    cleaned_text = re.sub(r'http\S+', '', cleaned_text) # Remove URLs
    cleaned_text = re.sub(r'-\n', '', cleaned_text) # Remove Hyphenations
    cleaned_text = re.sub("\s+", " ", cleaned_text) # Remove Duplicate Spaces
    cleaned_text = re.sub(r"\s+([?.!,])", r"\1", cleaned_text) # Remove Spaces Before Punctuation
    return cleaned_text

def load_documents(group="Cleared"):
    documents = []
    filenames = glob.glob(f"../Week 7/Examples/{group}/*.pdf")
    for filename in filenames:
        full_text = ""
        doc = pymupdf.open(filename)
        for page in doc:
            text = page.get_text()
            full_text += " " + text
        if full_text:
            documents.append(clean_text(full_text))
    return documents

def pdf_to_text(url):
    try:
        text = ""
        f = pdf_bytes(url)
        doc = pymupdf.open(stream=f)
        for d in doc:
            text += d.get_text()
        return text
    except Exception as e:
        return ""

def load_documents_from_api():
    keywords = ["higher-order interactions", "trait-mediated interaction modification", "trait-mediated interaction", "polymorphism", "apparent competition", "resource competition", "keystone predation", "intraguild predation", "intransitive competition", "trophic chains", "competition chains", "mutual competition"]
    number_keywords = len(keywords)
    all_keywords = [*keywords]
    for i in range(4):
        for j in range(4, number_keywords, 1):
            all_keywords.append(f"{keywords[i]} {keywords[j]}")

    # Loading Texts
    texts = []
    number_works = 0
    number_unfiltered_works = 0
    number_keywords = len(all_keywords)
    k = 0
    for keyword in all_keywords:
        print(f"({k + 1}/{number_keywords}) Searching Keyword '{keyword}'")
        pager = Works().search_filter(title=keyword).paginate(per_page=200)
        for page in pager:
            for work in page:
                number_unfiltered_works += 1
                
                title = work['title']
                abstract = work['abstract']
                doi = work['doi']
                
                # Find Full Text
                url = None
                if work["primary_location"]:
                    url = work["primary_location"]["pdf_url"]
                full_text = "" if not url else pdf_to_text(url)
                
                if not abstract and not full_text:
                    continue
                texts.append((k, title, doi, abstract if abstract and not full_text else full_text))
                number_works += 1
        k += 1
        clear_output(wait=True)        

    assert len(texts) == number_works
    print(f"Number Documents: {len(texts)}, Number Unfiltered Documents: {number_unfiltered_works}")
    return (texts, [text[-1] for text in texts])

In [70]:
documents = load_documents()

In [None]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [90]:
text = documents[0]

chunks = []
chunk_length = 2000

i = 0
while i < len(text):
    chunk = text[i:i+chunk_length]

    j = i + chunk_length
    while j < len(text) and text[j] != " ":
        chunk += text[j]
        j += 1
    
    chunks.append(chunk)
    i = j

print(f"Number Chunks: {len(chunks)}")

Number Chunks: 66


In [92]:
labels = ["ecology", "ecosystem", "trait-mediated interactions", "predator-prey interactions", "ecological interactions", "behavioral ecology", "species coexistence", "trophic dynamics", "phenotypic plasticity", "functional ecology", "indirect ecological effects", "community ecology"]

scores = np.zeros(len(labels))
for chunk in chunks:
    result = classifier(chunk, labels)
    scores += np.array(result["scores"])

scores /= len(chunks)

print(np.mean(scores))

0.08333333409414863


In [176]:
def zero_shot(texts=documents):
    for text in texts:
        # Break Into Parts
        chunks = []
        chunk_length = len(text)
        
        i = 0
        while i < len(text):
            chunk = text[i:i+chunk_length]
        
            j = i + chunk_length
            while j < len(text) and text[j] != " ":
                chunk += text[j]
                j += 1
            
            chunks.append(chunk)
            i = j

        # Classify
        labels = ["trait-mediated interaction modification"]
        scores = {}
        for label in labels:
            scores[label] = 0
            
        for i, chunk in enumerate(chunks):
            print(f"Chunk {i+1}/{len(chunks)}")
            result = classifier(chunk, labels)
            print(result["labels"], result["scores"])
            for label, score in zip(result["labels"], result["scores"]):
                scores[label] += score

        print(scores)
        scores = np.array(list(scores.values()))
        scores /= len(chunks)

        print(f"Document Scores: {np.mean(scores)}")

In [177]:
zero_shot()
zero_shot(["A study has been made of the effect of “crowding” on both prenatal and postnatal control of wing development in aphids. In experiments on prenatal form control it was shown that brief controlled encounters of only a minute's duration between two adult aphids could cause them to switch from producing apterous to alate progeny. Aphids which had been exposed to contact with other aphids continued to produce alate progeny for several days. Evidence from a number of experiments suggests that the principle stimulus involved is tactile. In experiments on postnatal form control, it was shown that more alates developed among larvae which were reared together than among larvae reared in isolation."])

Chunk 1/1
['trait-mediated interaction modification'] [0.7951579093933105]
{'trait-mediated interaction modification': 0.7951579093933105}
Document Scores: 0.7951579093933105
Chunk 1/1
['trait-mediated interaction modification'] [0.8306777477264404]
{'trait-mediated interaction modification': 0.8306777477264404}
Document Scores: 0.8306777477264404
Chunk 1/1
['trait-mediated interaction modification'] [0.9232949614524841]
{'trait-mediated interaction modification': 0.9232949614524841}
Document Scores: 0.9232949614524841
Chunk 1/1
['trait-mediated interaction modification'] [0.9911022782325745]
{'trait-mediated interaction modification': 0.9911022782325745}
Document Scores: 0.9911022782325745
Chunk 1/1
['trait-mediated interaction modification'] [0.26065483689308167]
{'trait-mediated interaction modification': 0.26065483689308167}
Document Scores: 0.26065483689308167
