In [None]:
import asyncio
import json
import os
import time
from urllib.request import urlopen

import arxiv
import requests
import tqdm
from numpy import random

# Retrieve arXiv papers for a given category

In [None]:
# arXiv category to query, e.g. one of: cs.CL, cs.AI, cs.LG, cs.MA, cs.CV.
# Other cells build file and directory names from it by replacing "." with "_".
cat = "cs.AI"

In [None]:
# Describe the query: up to 2000 entries of the chosen category, ordered by
# submission date. Nothing is fetched until the results are iterated.
search = arxiv.Search(
    query="cat:{}".format(cat),
    sort_by=arxiv.SortCriterion.SubmittedDate,
    max_results=2000,
)

In [None]:
# Search.results() is deprecated in arxiv.py 2.x; results are fetched through a
# Client instead. list() drains the generator so all metadata is held in memory.
papers = list(arxiv.Client().results(search))

In [None]:
# Flatten every result into a plain dict that json.dumps can serialise:
# author names and categories are joined into comma-separated strings, and the
# published/updated values are passed through str().
dataset = [
    {
        "title": paper.title,
        "authors": ", ".join(author.name for author in paper.authors),
        "categories": ", ".join(paper.categories),
        "comment": paper.comment,
        "doi": paper.doi,
        "entry_id": paper.entry_id,
        "journal_ref": paper.journal_ref,
        "pdf_url": paper.pdf_url,
        "primary_category": paper.primary_category,
        "published": str(paper.published),
        "summary": paper.summary,
        "updated": str(paper.updated),
    }
    for paper in papers
]

In [None]:
# Persist the records as JSON Lines: one JSON object per line.
jsonl_path = f"../data/{cat.replace('.', '_')}papers.jsonl"

# On a fresh checkout the data folder may not exist yet, and open(..., 'w')
# does not create missing parent directories.
os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)

with open(jsonl_path, 'w', encoding="utf-8") as outfile:
    for entry in dataset:
        outfile.write(json.dumps(entry) + '\n')

# Scrape PDFs

In [None]:
# Read the JSON Lines file for the current category back in, one raw line per record.
with open(f"../data/{cat.replace('.', '_')}papers.jsonl", 'r') as infile:
    json_list = infile.readlines()

# Decode each line into a dict; `papers` now holds plain dicts keyed by field name.
papers = [json.loads(json_str) for json_str in json_list]

# NON ASYNC SCRAPING

# Download the PDFs one at a time, with a random pause after every request.
pdf_dir = f"../data/pdfs/{cat.replace('.', '_')}"
os.makedirs(pdf_dir, exist_ok=True)  # open(..., "wb") fails if the folder is missing

for paper in tqdm.tqdm(papers):

    # The last segment of the stored URL is used as the paper id, and the
    # download URL is rebuilt on the export.arxiv.org host.
    paper_id = paper["pdf_url"].split("/")[-1]
    pdf_url = f"https://export.arxiv.org/pdf/{paper_id}"

    try:
        # Without a timeout a stalled connection would block the loop forever.
        res = requests.get(pdf_url, timeout=60)
        res.raise_for_status()
    except requests.RequestException as err:
        # Never store an HTTP error page under a .pdf name; report and move on.
        tqdm.tqdm.write(f"Skipping {paper_id}: {err}")
    else:
        with open(f"{pdf_dir}/{paper_id}.pdf", "wb") as outfile:
            outfile.write(res.content)

    # Random 0-2 s pause between consecutive requests.
    time.sleep(random.uniform(0.0, 2.0))

### Asyncio scraping

In [None]:
# The output folder follows the notebook-level `cat` by default, so it no longer
# has to be edited by hand to match the category being scraped.

def fetch_pdf(paper, category=None):
    """Download one paper's PDF from export.arxiv.org and save it to disk.

    The file is written to ``../data/pdfs/<category with '.' -> '_'>/<id>.pdf``,
    where ``<id>`` is the last path segment of ``paper["pdf_url"]``.

    Args:
        paper: Mapping with a ``"pdf_url"`` entry (string URL).
        category: arXiv category such as ``"cs.LG"`` that selects the output
            folder. When omitted, the notebook-level ``cat`` is used.

    Returns:
        None. Failed requests (network errors, timeouts, non-2xx responses)
        are reported with ``print`` and skipped, so no error page is ever
        saved with a ``.pdf`` extension.
    """
    if category is None:
        category = cat  # looked up at call time, so later changes to `cat` are honoured

    paper_id = paper["pdf_url"].split("/")[-1]
    pdf_url = f"https://export.arxiv.org/pdf/{paper_id}"
    pdf_dir = f"../data/pdfs/{category.replace('.', '_')}"

    try:
        # A timeout keeps a stalled connection from blocking its worker thread forever.
        res = requests.get(pdf_url, timeout=60)
        res.raise_for_status()
    except requests.RequestException as err:
        print(f"Skipping {paper_id}: {err}")
        return

    # exist_ok=True makes this safe when several workers reach it at once.
    os.makedirs(pdf_dir, exist_ok=True)
    with open(f"{pdf_dir}/{paper_id}.pdf", "wb") as outfile:
        outfile.write(res.content)

async def scrape_arxiv(paper, loop):
    """Run the blocking ``fetch_pdf(paper)`` off the event loop and wait for it.

    Args:
        paper: Record forwarded unchanged to ``fetch_pdf``.
        loop: Event loop whose default executor (``None``) runs the call.

    Returns:
        None; the result of ``fetch_pdf`` is awaited but not returned.
    """
    download = loop.run_in_executor(None, fetch_pdf, paper)
    await download


def main(papers):
    """Download the PDFs of ``papers`` concurrently and block until all finish.

    One ``scrape_arxiv`` task is scheduled per paper on the current event loop,
    and the loop is then driven until every one of those tasks has completed.

    Args:
        papers: Iterable of paper records accepted by ``scrape_arxiv``.

    Raises:
        RuntimeError: from ``run_until_complete`` if the loop is already running.
            NOTE(review): this is expected inside kernels that run cells on a
            live event loop (recent Jupyter) - confirm for your environment.
    """
    loop = asyncio.get_event_loop()

    # Keep handles to the tasks created here and wait on exactly those.
    # asyncio.all_tasks(loop) also returns unrelated tasks that live on the same
    # loop, which would make this call wait on work it does not own.
    tasks = [loop.create_task(scrape_arxiv(paper, loop)) for paper in papers]

    loop.run_until_complete(asyncio.gather(*tasks))

In [None]:
def chunks(items, size):
    """Yield consecutive slices of ``items`` with at most ``size`` elements each.

    NOTE(review): no ``chunks`` helper is defined in this notebook, so the loop
    below raised NameError; this assumes the second argument is the chunk size.
    """
    for start in range(0, len(items), size):
        yield items[start:start + size]


# NOTE(review): the call below schedules every paper at once, and the loop after
# it schedules all of them again in groups of 5, so each PDF is requested twice.
# Keep only the variant you actually want.
main(papers)

for papers_chunk in tqdm.tqdm(chunks(papers, 5)):
    main(papers_chunk)

In [None]:
def load_html(url):
    """Fetch ``url`` and return the response body as text.

    The URL is printed before the request is made. The body is decoded with
    ``bytes.decode()``'s default codec (UTF-8).

    Args:
        url: Anything accepted by ``urllib.request.urlopen``.

    Returns:
        The decoded response body as ``str``.
    """
    print(url)
    # The context manager closes the response even if read()/decode() raises.
    with urlopen(url) as res:
        return res.read().decode()


def main(*, urls=None, out_path='/tmp/j_dump'):
    """Fetch several pages concurrently and dump their bodies to a JSON file.

    Each URL is handed to ``load_html`` on the event loop's default executor;
    the decoded bodies are gathered in input order and written as a JSON list.

    NOTE(review): this redefines ``main`` and therefore replaces the earlier
    ``main(papers)`` PDF scraper once this cell has run. The parameters are
    keyword-only so a stale ``main(papers)`` call still fails loudly.

    Args:
        urls: Iterable of URLs to fetch. Defaults to the module-level ``URLS``.
            NOTE(review): ``URLS`` is not defined in this notebook - pass
            ``urls`` explicitly or define it first.
        out_path: Destination file for the JSON dump.

    Raises:
        RuntimeError: from ``run_until_complete`` if the loop is already running.
    """
    if urls is None:
        urls = URLS

    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(None, load_html, url) for url in urls]
    data = loop.run_until_complete(asyncio.gather(*tasks))
    with open(out_path, 'w') as fp:
        json.dump(data, fp)