In [None]:
import os
import csv
import json
import glob

# Column names: they form the CSV header and are
# the keys of every stored paper record.
TITLE = "title"
DOI = "doi"
TEXT = "text"

# Every output file name starts with this string.
BASE_FILE_NAME = "OUT"

# There may be value in saving previous
# outputs of this file. To do so, you'll
# need to know what files already exist.
def next_file_number(base_name=None, directory="."):
    """Return the first run number not used by an existing output file.

    Output files are named ``<base>_<TAG>_<number>.<ext>`` (for example
    ``OUT_DATA_3.csv`` and ``OUT_DUMP_3.txt``). Both the ``.csv`` and the
    ``.txt`` files are inspected, because a run writes one of each under the
    same number and neither should be overwritten. Files that merely share
    the prefix but do not follow the naming scheme are ignored instead of
    raising ``ValueError``.

    Args:
        base_name: Prefix of the output files. Defaults to ``BASE_FILE_NAME``.
        directory: Directory to scan. Defaults to the current working
            directory.

    Returns:
        int: One more than the highest run number found, or ``0`` when no
        numbered output file exists.
    """
    if base_name is None:
        base_name = BASE_FILE_NAME
    prefix = f"{base_name}_"

    max_number = -1
    for entry in os.listdir(directory):
        stem, extension = os.path.splitext(entry)
        if extension not in (".csv", ".txt") or not stem.startswith(prefix):
            continue
        # What remains must look like "<TAG>_<number>"; the tag may have any
        # length, so split on the last underscore rather than a fixed offset.
        _tag, separator, number_text = stem[len(prefix):].rpartition("_")
        if not separator or not number_text.isdecimal():
            continue  # e.g. "OUT_notes.csv" is not one of the numbered outputs
        max_number = max(max_number, int(number_text))
    return max_number + 1

# Pick the run number once, when this cell runs, and derive both
# output paths from it so the CSV file and the JSON dump written
# by store_papers share the same numeric suffix.
file_number = next_file_number()
data_file_name = f"./{BASE_FILE_NAME}_DATA_{file_number}.csv"
dump_file_name = f"./{BASE_FILE_NAME}_DUMP_{file_number}.txt"

def store_papers(papers):
    """Write the collected papers to the CSV file and to the JSON dump.

    Args:
        papers: Dictionary whose values are paper records keyed by
            TITLE, DOI and TEXT.

    The CSV (``data_file_name``) gets a header row followed by one row
    per record; the dump (``dump_file_name``) receives the whole
    dictionary serialised as JSON. Both files are overwritten.
    """
    columns = [TITLE, DOI, TEXT]
    with open(data_file_name, mode='w', encoding='utf-8-sig', newline='') as csv_handle:
        table = csv.DictWriter(csv_handle, fieldnames=columns)
        table.writeheader()
        table.writerows(papers.values())

    with open(dump_file_name, mode='w', encoding='utf-8', newline='') as dump_handle:
        json.dump(papers, dump_handle)

In [None]:
# PDF to Text
import io
import pymupdf
import requests

def pdf_to_text(pdf_url):
    """Download the PDF at ``pdf_url`` and return the text of all its pages.

    Args:
        pdf_url: Address of the PDF to download.

    Returns:
        str | None: The text of every page concatenated in page order, or
        ``None`` when the download or the parsing fails (a message is
        printed in that case).
    """
    try:
        # Fetch the address that was passed in (not a global that happens
        # to be called `url`), and never wait forever on a stalled server.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()
        doc = pymupdf.open(stream=io.BytesIO(response.content), filetype="pdf")
        try:
            return "".join(page.get_text() for page in doc)
        finally:
            doc.close()
    except Exception as error:
        # Best effort: one bad PDF must not abort the whole collection run.
        # Catching Exception (not everything) keeps Ctrl-C / kernel
        # interrupts working.
        print(f"Unable to Get PDF ({pdf_url}): {error!r}")
        return None

In [None]:
# Every paper found through any of the sources ends up
# in this dictionary, keyed by its insertion index.
papers = {}


# Always add papers through this function so that
# every record carries the same three keys.
def add_paper(title, doi, text):
    """Store one paper record in ``papers`` under the next index.

    The index is the number of papers stored so far.
    """
    record = {TITLE: title, DOI: doi, TEXT: text}
    papers[len(papers)] = record

In [None]:
# Per-keyword cap used by the collection loop.
max_number_papers_per_keyword = 10

# Keywords
# Placeholder search terms: the real list is expected
# to come from elsewhere.
keywords = [
    "trait-mediated interaction modification",
    "plasticity",
    "plastic responses",
    "phenotypic",
    "polymorphism",
]

In [None]:
# Source 1: OpenAlex
from pyalex import Works

# For every keyword, page through the OpenAlex works whose title matches
# it and store each work that exposes a PDF link, until
# max_number_papers_per_keyword papers have been stored for that keyword.
number_papers = 0
for keyword in keywords:
    # The quota is per keyword, so start counting again.
    number_papers = 0

    pager = Works().search_filter(title=keyword).paginate(per_page=200)

    for page in pager:
        for work in page:
            primary_location = work["primary_location"]
            if not primary_location:
                continue
            url = primary_location["pdf_url"]
            if not url:
                continue

            # text is None when the download or the parsing failed;
            # the title and DOI are still recorded in that case.
            text = pdf_to_text(url)
            add_paper(work['title'], work['doi'], text)

            # Count only works that were actually stored, so a work
            # without any location no longer uses up the quota.
            number_papers += 1
            if number_papers >= max_number_papers_per_keyword:
                break

        # Leave the pagination too once the quota is reached, so that
        # no further result pages are requested.
        if number_papers >= max_number_papers_per_keyword:
            break

In [None]:
# Once all the papers are gathered, persist them by calling
# store_papers, which writes the CSV file and the JSON dump
# of the whole dictionary.
store_papers(papers)