In [1]:
# OpenAlex: Finding Papers
# Let's say we have a set or an array of keywords.
# We can use OpenAlex to find a large number of papers
# that, in some way, match those keywords. This is an
# example of how it could work. Furthermore, Veronica
# mentioned how there's other characteristics that you
# may be looking for, like how far back you want to go
# in searching for papers.
from pyalex import Works

# Later, I'll use these URLs to try out the PDF to text
# tools. If there's any.
urls = []
number_urls = 0

keywords = ["Machine Learning"]

pager = Works().search_filter(title=keywords[0]).paginate(per_page=200)
number_works = 0

for page in pager:
    for work in page:        
        print(f"Title: {work['title']}")
        # You'll see that some PDFs do include URLs,
        # while some do not. Maybe in these cases, we could
        # instead use the abstract that OpenAlex provides.
        # However, there are still chances that the abstract
        # is also unavailable. Also, there was an instance of
        # an incorrect abstract, so that's also something to 
        # consider.
        print(f"Abstract: {work['abstract']}")
        if work["primary_location"]:
            url = work["primary_location"]["pdf_url"]
            if url:
                print(url)
                urls.append(url)
                number_urls += 1 
        number_works += 1
        print()
    if number_urls >= 10 or number_works >= 50:
        break

print(f"URLs: {urls}")

Title: Scikit-learn: Machine Learning in Python
Abstract: Scikit-learn is a Python module integrating a wide range of state-of-the-art machine learning algorithms for medium-scale supervised and unsupervised problems. This package focuses on bringing machine learning to non-specialists using a general-purpose high-level language. Emphasis is put on ease of use, performance, documentation, and API consistency. It has minimal dependencies and is distributed under the simplified BSD license, encouraging its use in both academic and commercial settings. Source code, binaries, and documentation can be downloaded from http://scikit-learn.org.

Title: Genetic algorithms in search, optimization, and machine learning
Abstract: From the Publisher:
This book brings together - in an informal and tutorial fashion - the computer techniques, mathematical tools, and research results that will enable both students and practitioners to apply genetic algorithms to problems in many fields. 

Major concept

In [2]:
# URLs
# As you may be able to see, the number of URLs provided are not plenty.
# I'm not sure if this is important, but what if good papers are missed out on
# because they don't have a PDF listed? Also, it seems that OpenAlex offers
# everything, so I'm not sure if these are research papers that one would want
# to use. Possibly a problem for a later day.
# In case I can't find enough URLs, this serves to find any paper with an URL.
# This is just for the "testing" later on.
print(f"URLs: {urls}")
if len(urls) < 10:
    while len(urls) < 10:
        work = Works().random()
        while not work["primary_location"] or not work["primary_location"]["pdf_url"]:
            work = Works().random()
        urls.append(work["primary_location"]["pdf_url"])
print(f"URLs: {urls}")

URLs: ['https://link.springer.com/content/pdf/10.1023/A:1022602019183.pdf', 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0169748&type=printable', 'https://www.frontiersin.org/articles/10.3389/fninf.2014.00014/pdf', 'https://pubs.rsc.org/en/content/articlepdf/2018/sc/c7sc02664a', 'https://www.mdpi.com/1424-8220/18/8/2674/pdf?version=1534247979', 'https://link.aps.org/accepted/10.1103/PhysRevLett.108.058301', 'https://journals.sagepub.com/doi/pdf/10.1177/2053951715622512', 'https://aclanthology.org/D14-1179.pdf', 'https://direct.mit.edu/books/oa-monograph/chapter-pdf/2025421/c000700_9780262301183.pdf', 'https://www.aeaweb.org/articles/pdf/doi/10.1257/jep.31.2.87', 'https://direct.mit.edu/books/oa-monograph/chapter-pdf/2025438/c002700_9780262301183.pdf', 'https://link.springer.com/content/pdf/10.1007/s12525-021-00475-2.pdf', 'https://aip.scitation.org/doi/pdf/10.1063/1.4966192', 'https://academic.oup.com/bib/article-pdf/7/1/86/23992771/bbk007.pdf', 'https://www.

In [3]:
# PDF to Text
# It's not logical to physically download a PDF
# to reference it in the code. Especially if you're
# going through thousands of PDFs. So, we can download
# the bytes of an URL that links to said PDF to get the
# PDF file. This code contains the simple process of
# downloading the bytes of the PDF.
import io
import requests

def pdf_bytes(url):
    r = requests.get(url)
    f = io.BytesIO(r.content)
    return f

In [4]:
# PDF to Text: PyPDF
# This doesn't seem to be useless, but it throws
# exceptions easily. It's understandable why an
# error would be thrown if the PDF cannot be parsed.
# However, I wonder if this would cause problems with
# the number of issues a PDF could have for whatever
# reason.
# This also works, but the text is not impressive.
from pypdf import PdfReader

def pdf_to_text_pypdf(url):
    f = pdf_bytes(url)
    reader = PdfReader(f)
    contents = reader.get_page(0).extract_text().split('\n')
    print(contents)
    print(" ".join(contents))

pdf_to_text_pypdf("https://link.springer.com/content/pdf/10.1023/A:1022602019183.pdf")

ModuleNotFoundError: No module named 'pypdf'

In [None]:
# PDF to Text: Tika
# This has been running for at least
# 13 minutes, possibly more. I can't tell
# if it's stuck in an infinite loop, or if the
# PDF is that long. It could be that the link 
# to the PDF is a download. I'm not sure if this 
# is affecting the parser. I'm going to stop it
# and try a different URL. Now that I'm using a
# PDF that works, I'm seeing that the server can't
# be started. Honestly, with the performance of the
# parser below, I don't think I'm going to try to
# fix it.
import tika
from tika import parser

def pdf_to_text_tika(url):
    f = pdf_bytes(url)
    parsed = parser.from_buffer(f)
    print(parsed["metadata"])
    print(parsed["content"])

# pdf_to_text_tika("https://nnjournal.net/article/download/75")
pdf_to_text_tika("https://link.springer.com/content/pdf/10.1023/A:1022602019183.pdf")

In [None]:
# PDF to Text: Textract
# I've seen this used by other packages I've installed.
# The documentation does not look that helpful.
# Also, I don't think they allow for file objects.
# Onwards!
# I think the prior parser was not working because the URL
# was a download link and not the URL of a PDF.

In [7]:
# PDF to Text: PyMuPDF
# This looks like it may be good. The documentation
# looks great. I think PyMuPDF will be my saving grace
# for this portion of the project. I will be choosing
# PyMuPDF.
import pymupdf

def pdf_to_text_pymupdf(url):
    try:
        text = ""
        f = pdf_bytes(url)
        doc = pymupdf.open(stream=f)
        # print(doc[0].get_text())
        for d in doc:
            text += d.get_text()
        return text
    except Exception as e:
        return ""

pdf_to_text_pymupdf("https://link.springer.com/content/pdf/10.1023/A:1022602019183.pdf")

'Machine Learning 3: 95-99, 1988\n© 1988 Kluwer Academic Publishers - Manufactured in The Netherlands\nGUEST EDITORIAL\nGenetic Algorithms and Machine Learning\nMetaphors for learning\nThere is no a priori reason why machine learning must borrow from nature.\nA field could exist, complete with well-defined algorithms, data structures,\nand theories of learning, without once referring to organisms, cognitive or\ngenetic structures, and psychological or evolutionary theories. Yet at the end\nof the day, with the position papers written, the computers plugged in, and\nthe programs debugged, a learning edifice devoid of natural metaphor would\nlack something. It would ignore the fact that all these creations have become\npossible only after three billion years of evolution on this planet. It would\nmiss the point that the very ideas of adaptation and learning are concepts\ninvented by the most recent representatives of the species Homo sapiens from\nthe careful observation of themselves an