In [None]:
# OpenAlex: Finding Papers
# Let's say we have a set or an array of keywords.
# We can use OpenAlex to find a large number of papers
# that, in some way, match those keywords. This is an
# example of how it could work. Furthermore, Veronica
# mentioned how there's other characteristics that you
# may be looking for, like how far back you want to go
# in searching for papers.
from pyalex import Works

# Later, I'll use these URLs to try out the PDF to text
# tools. If there's any.
urls = []
number_urls = 0

keywords = ["Machine Learning"]

pager = Works().search_filter(title=keywords[0]).paginate(per_page=200)
number_works = 0

for page in pager:
    for work in page:        
        print(f"Title: {work['title']}")
        # You'll see that some PDFs do include URLs,
        # while some do not. Maybe in these cases, we could
        # instead use the abstract that OpenAlex provides.
        # However, there are still chances that the abstract
        # is also unavailable. Also, there was an instance of
        # an incorrect abstract, so that's also something to 
        # consider.
        print(f"Abstract: {work['abstract']}")
        if work["primary_location"]:
            url = work["primary_location"]["pdf_url"]
            if url:
                print(url)
                urls.append(url)
                number_urls += 1 
        number_works += 1
        print()
    if number_urls >= 10 or number_works >= 50:
        break

print(f"URLs: {urls}")

In [None]:
# URLs
# As you may be able to see, the number of URLs provided are not plenty.
# I'm not sure if this is important, but what if good papers are missed out on
# because they don't have a PDF listed? Also, it seems that OpenAlex offers
# everything, so I'm not sure if these are research papers that one would want
# to use. Possibly a problem for a later day.
# In case I can't find enough URLs, this serves to find any paper with an URL.
# This is just for the "testing" later on.
print(f"URLs: {urls}")
if len(urls) < 10:
    while len(urls) < 10:
        work = Works().random()
        while not work["primary_location"] or not work["primary_location"]["pdf_url"]:
            work = Works().random()
        urls.append(work["primary_location"]["pdf_url"])
print(f"URLs: {urls}")

In [2]:
# PDF to Text
# It's not logical to physically download a PDF
# to reference it in the code. Especially if you're
# going through thousands of PDFs. So, we can download
# the bytes of an URL that links to said PDF to get the
# PDF file. This code contains the simple process of
# downloading the bytes of the PDF.
import io
import requests

def pdf_bytes(url):
    r = requests.get(url)
    f = io.BytesIO(r.content)
    return f

In [None]:
# PDF to Text: PyPDF
# This doesn't seem to be useless, but it throws
# exceptions easily. It's understandable why an
# error would be thrown if the PDF cannot be parsed.
# However, I wonder if this would cause problems with
# the number of issues a PDF could have for whatever
# reason.
# This also works, but the text is not impressive.
from pypdf import PdfReader

def pdf_to_text_pypdf(url):
    f = pdf_bytes(url)
    reader = PdfReader(f)
    contents = reader.get_page(0).extract_text().split('\n')
    print(contents)
    print(" ".join(contents))

pdf_to_text_pypdf("https://link.springer.com/content/pdf/10.1023/A:1022602019183.pdf")

In [None]:
# PDF to Text: Tika
# This has been running for at least
# 13 minutes, possibly more. I can't tell
# if it's stuck in an infinite loop, or if the
# PDF is that long. It could be that the link 
# to the PDF is a download. I'm not sure if this 
# is affecting the parser. I'm going to stop it
# and try a different URL. Now that I'm using a
# PDF that works, I'm seeing that the server can't
# be started. Honestly, with the performance of the
# parser below, I don't think I'm going to try to
# fix it.
import tika
from tika import parser

def pdf_to_text_tika(url):
    f = pdf_bytes(url)
    parsed = parser.from_buffer(f)
    print(parsed["metadata"])
    print(parsed["content"])

# pdf_to_text_tika("https://nnjournal.net/article/download/75")
pdf_to_text_tika("https://link.springer.com/content/pdf/10.1023/A:1022602019183.pdf")

In [None]:
# PDF to Text: Textract
# I've seen this used by other packages I've installed.
# The documentation does not look that helpful.
# Also, I don't think they allow for file objects.
# Onwards!
# I think the prior parser was not working because the URL
# was a download link and not the URL of a PDF.

In [7]:
# PDF to Text: PyMuPDF
# This looks like it may be good. The documentation
# looks great. I think PyMuPDF will be my saving grace
# for this portion of the project. I will be choosing
# PyMuPDF.
import pymupdf

def pdf_to_text_pymupdf(url):
    f = pdf_bytes(url)
    print(f)
    doc = pymupdf.open(stream=f)
    # print(doc[0].get_text())
    for d in doc:
        print(d.get_text())

pdf_to_text_pymupdf("https://link.springer.com/content/pdf/10.1023/A:1022602019183.pdf")

<_io.BytesIO object at 0x000001FB86C3F100>
Machine Learning 3: 95-99, 1988
© 1988 Kluwer Academic Publishers - Manufactured in The Netherlands
GUEST EDITORIAL
Genetic Algorithms and Machine Learning
Metaphors for learning
There is no a priori reason why machine learning must borrow from nature.
A field could exist, complete with well-defined algorithms, data structures,
and theories of learning, without once referring to organisms, cognitive or
genetic structures, and psychological or evolutionary theories. Yet at the end
of the day, with the position papers written, the computers plugged in, and
the programs debugged, a learning edifice devoid of natural metaphor would
lack something. It would ignore the fact that all these creations have become
possible only after three billion years of evolution on this planet. It would
miss the point that the very ideas of adaptation and learning are concepts
invented by the most recent representatives of the species Homo sapiens from
the careful o