In [1]:
import json
import re
import os
from collections import defaultdict
from datetime import datetime

# Helper that turns a paper title into a safe file name for the logs folder
def sanitize_filename(filename, max_length=100):
    """Turn an arbitrary string into a filesystem-friendly name.

    Every character that is not an ASCII letter or digit is replaced by an
    underscore, and the result is cut to at most ``max_length`` characters.
    """
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    cleaned = non_alnum.sub('_', filename)
    return cleaned[:max_length]

# Folder that receives the scraped full text, one .txt file per paper.
logs_folder_path = 'autoscious_logs/'
# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# folder is already there and cannot race with another process creating it.
os.makedirs(logs_folder_path, exist_ok=True)

# Scraping stops once the logs folder holds this many files.
NUM_PAPERS_TO_SCRAPE = 250

In [2]:
# Helper function code to get the PDF content if possible

import requests
from PyPDF2 import PdfReader 
from io import BytesIO

def try_getting_pdf(url, timeout=30):
    """Check whether ``url`` serves something PyPDF2 can open as a PDF.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        True if the download succeeded and ``PdfReader`` accepted the bytes,
        False on any failure (the reason is printed).
    """
    try:
        response = requests.get(url, verify=True, timeout=timeout)
        # An HTTP error page is not the document we asked for.
        response.raise_for_status()
        # Constructing the reader is the validation step; the object itself is not needed.
        PdfReader(BytesIO(response.content))
        return True
    except Exception as exc:
        # `Exception` rather than a bare except so Ctrl-C / kernel interrupts still propagate.
        print("Could not get pdf:", exc)
        return False

# Get the PDF content
def try_getting_pdf_content(url, timeout=30):
    """Download ``url`` and return the text PyPDF2 extracts from it.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The text of all pages concatenated in page order, or "" if the
        download or the PDF parsing fails (the reason is printed).
    """
    try:
        print("trying url:", url)
        response = requests.get(url, verify=True, timeout=timeout)
        # An HTTP error page is not the document we asked for.
        response.raise_for_status()
        pdf = PdfReader(BytesIO(response.content))

        # `or ""` keeps one page without extractable text from discarding the
        # whole document; join avoids repeated string concatenation.
        content = "".join(page.extract_text() or "" for page in pdf.pages)

        print("SUCCESSSS")
        return content
    except Exception as exc:
        # `Exception` rather than a bare except so Ctrl-C / kernel interrupts still propagate.
        print("Error getting PDF content:", exc)
        return ""

In [3]:
# Load paper metadata from openalex_papers.json.
# The encoding is explicit because the titles contain non-ASCII characters and
# the default text encoding of open() depends on the platform.
with open('openalex_papers.json', encoding='utf-8') as metadata_file:
    open_alex_results = json.load(metadata_file)

In [4]:
# Fetch the full text of each paper: direct PDF download first, Selenium as fallback.
from util import scrape_text_with_selenium_no_agent
MAX_RETRIES = 3  # NOTE(review): not referenced in this cell; confirm whether retry logic was intended

for res in open_alex_results:
    # Stop once the logs folder holds the target number of files.
    if len(os.listdir(logs_folder_path)) >= NUM_PAPERS_TO_SCRAPE:
        break

    title = res.get("title")
    # Prefer the open-access URL and fall back to the DOI link.
    open_access = res.get('open_access') or {}
    url = open_access.get('oa_url') or res.get('doi')
    print("title: ", title, "url: ", url)

    # Validate before building the path: sanitize_filename() raises on a None title.
    if not (title and url):
        continue

    full_text_path = os.path.join(logs_folder_path, f'{sanitize_filename(title)}.txt')
    # Already scraped on a previous run.
    if os.path.exists(full_text_path):
        continue

    print("TRYING PDF!")
    pdf_text = try_getting_pdf_content(url)
    if pdf_text:
        print("PDF WORKED!")

        # Record pdf text
        with open(full_text_path, 'w', encoding='utf-8') as f:
            f.write(pdf_text)
        print("PDF WORKED! 2")
        continue

    print("PDF DIDN'T WORK, TRYING SELENIUM")
    text = scrape_text_with_selenium_no_agent(url, None, search_engine='chrome')

    # Presumably the helper returns the page text as a string; verify against util.
    # An empty result is skipped so it neither crashes f.write() nor leaves an
    # empty file that counts toward NUM_PAPERS_TO_SCRAPE.
    if not text:
        print("SELENIUM RETURNED NO TEXT, SKIPPING")
        continue

    # Record the text scraped from the web page
    with open(full_text_path, 'w', encoding='utf-8') as f:
        f.write(text)

title:  CSM-AB: graph-based antibody–antigen binding affinity prediction and docking scoring function url:  https://doi.org/10.1093/bioinformatics/btab762
title:  ZDOCK: An initial‐stage protein‐docking algorithm url:  https://doi.org/10.1002/prot.10389
title:  Updates to the Integrated Protein–Protein Interaction Benchmarks: Docking Benchmark Version 5 and Affinity Benchmark Version 2 url:  https://europepmc.org/articles/pmc4677049?pdf=render
title:  Removal of N-Linked Glycosylation Enhances PD-L1 Detection and Predicts Anti-PD-1/PD-L1 Therapeutic Efficacy url:  http://www.cell.com/article/S1535610819302995/pdf
title:  Practical Theoretic Guidance for the Design of Tumor-Targeting Agents url:  https://europepmc.org/articles/pmc3978464?pdf=render
title:  Binding affinity prediction for antibody–protein antigen complexes: A machine learning analysis based on interface and surface areas url:  https://doi.org/10.1016/j.jmgm.2022.108364
title:  Mathematical theory of cross-reactive radioi

Multiple definitions in dictionary at byte 0x99b97 for key /MediaBox
Multiple definitions in dictionary at byte 0x99dc4 for key /MediaBox
Multiple definitions in dictionary at byte 0x99f56 for key /MediaBox
Multiple definitions in dictionary at byte 0x9a140 for key /MediaBox
Multiple definitions in dictionary at byte 0x9a2ea for key /MediaBox
Multiple definitions in dictionary at byte 0x9a4d9 for key /MediaBox
Multiple definitions in dictionary at byte 0x9a6a4 for key /MediaBox
Multiple definitions in dictionary at byte 0x9a85f for key /MediaBox
Multiple definitions in dictionary at byte 0x9aa12 for key /MediaBox
Multiple definitions in dictionary at byte 0x9abf9 for key /MediaBox
Multiple definitions in dictionary at byte 0x9adc4 for key /MediaBox
Multiple definitions in dictionary at byte 0x9afc6 for key /MediaBox
Multiple definitions in dictionary at byte 0x9b1f0 for key /MediaBox


SUCCESSSS
PDF WORKED!
PDF WORKED! 2
title:  Cooperative interactions at the SLP-76 complex are critical for actin polymerization url:  https://europepmc.org/articles/pmc2910278?pdf=render
TRYING PDF!
trying url: https://europepmc.org/articles/pmc2910278?pdf=render
SUCCESSSS
PDF WORKED!
PDF WORKED! 2
title:  Enhanced periplasmic expression of high affinity humanized scFv against Hepatitis B surface antigen by codon optimization url:  https://doi.org/10.1016/j.pep.2010.06.006
TRYING PDF!
trying url: https://doi.org/10.1016/j.pep.2010.06.006
Error getting PDF content
PDF DIDN'T WORK, TRYING SELENIUM
Going through url:  https://doi.org/10.1016/j.pep.2010.06.006
select firefox options!
hard coding chrome
setting up chrome driver
Driver is getting url
set timeout!
Page loaded within 15 seconds
Driver got url
Driver has found page source
Handing off to Beautiful Soup!
done extractin
Text:  ! There was a problem providing the content you requested
Please contact us via our
support center for m



SUCCESSSS
PDF WORKED!
PDF WORKED! 2
title:  The c-fos protein interacts with c-JunAP-1 to stimulate transcription of AP-1 responsive genes url:  https://doi.org/10.1016/0092-8674(88)90076-1
TRYING PDF!
trying url: https://doi.org/10.1016/0092-8674(88)90076-1
Error getting PDF content
PDF DIDN'T WORK, TRYING SELENIUM
Going through url:  https://doi.org/10.1016/0092-8674(88)90076-1
select firefox options!
hard coding chrome
setting up chrome driver
Driver is getting url
set timeout!
Page loaded within 15 seconds
Driver got url
Driver has found page source
Handing off to Beautiful Soup!
done extractin
Text:  Enable JavaScript and cookies to continue
title:  VLA Proteins in the Integrin Family: Structures, Functions, and Their Role on Leukocytes url:  https://doi.org/10.1146/annurev.iy.08.040190.002053
TRYING PDF!
trying url: https://doi.org/10.1146/annurev.iy.08.040190.002053
Error getting PDF content
PDF DIDN'T WORK, TRYING SELENIUM
Going through url:  https://doi.org/10.1146/annurev.iy.