In [75]:
import bs4 as bs
import pandas as pd
import csv
import urllib
import re
import wget
import os
from collections import defaultdict

## Grab all Paper links for a particular search query

In [None]:
# List of all searches to be completed

In [375]:
def archive_pagelinks_scraper(links_dict,url):
    """Scrapes the links from the search page for a
    particular query.
    A dictionary containing the link to the page with
    the paper information and a dictionary is returned"""
    prefix = 'http://search.arxiv.org:8081/'
    fhand = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(fhand,"lxml")
    links = soup.find_all('a',{'class':'url'})
    next_page = soup.find_all('p',{'align':'center'})
    
    for link in links:
        paper_page = link.string
        ref = re.findall('[0-9]+.[0-9]+',paper_page)[0]
        links_dict[ref].append(paper_page)
    
    for each in next_page[0].find_all('a'):
        if 'Next' in each.string:
            next_link = each.get('href')
            new_url = prefix+next_link
        else:
            new_url = ''
    return links_dict,new_url
    

In [376]:
def search_page_looper(start_page):
    """This function loops through each page of a response
    to a search query returning all the links"""
    links_dict = defaultdict(list)
    links_dict, new_url = archive_pagelinks_scraper(links_dict,start_page)
    while len(new_url) > 0:
        links_dict, new_url = archive_pagelinks_scraper(links_dict,new_url)
    return links_dict

In [380]:
def paper_pdf_scraper(full_links_dict):
    """Scrapes the individual paper pages grabbing
    metadata and the link to download the pdf"""
    pdf_dict = defaultdict(list)
    for key, links in full_links_dict.items():
        fh = urllib.request.urlopen(links[0]).read()
        page_soup = bs.BeautifulSoup(fh,"lxml")
        categories = page_soup.find('div',class_='subheader').h1.string.split('>')
        main_category = categories[0]
        if len(categories) > 1:
            sub_category = categories[1]
        else:
            sub_category = ''
        paper_title = page_soup.find('h1',class_='title mathjax').text.strip('Title:\n')
        authors = page_soup.find('div',class_='authors')
        author_lst = []
        for author in authors.find_all('a'):
            author_lst.append(author.string)
        abstract = page_soup.find('blockquote',class_='abstract mathjax')
        abstract = abstract.text.strip('\n').replace('\n', ' ')
        pdf_link =  page_soup.find('div',class_='full-text').a.get('href')
        pdf_link = 'https://arxiv.org' + pdf_link
        pdf_dict[key] = [links[0],main_category,sub_category,paper_title,author_lst,abstract, pdf_link]
    return pdf_dict


In [384]:
def paper_downloader(pdf_dict):
    """This function downloads all the papers based on 
    a list of links sent to it"""
    for k,v in pdf_dict.items():
        url = v[-1]
        output_directory = v[1].strip().replace(' ','_').lower()
        filename = str(k) + '.pdf'
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        filename = wget.download(url, out=output_directory)

In [183]:
start_page = 'http://search.arxiv.org:8081/?query=Machine+Learning&in='

In [184]:
full_links_dict = search_page_looper(start_page)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43


In [381]:
pdf_dict = paper_pdf_scraper(full_links_dict)
df = pd.DataFrame.from_dict(pdf_dict,orient='index')
df.to_csv('pdf_links.csv')

In [382]:
df

Unnamed: 0,0,1,2,3,4,5,6
1707.06742,https://arxiv.org/abs/1707.06742,Computer Science,Learning,Machine Teaching: A New Paradigm for Building ...,"[Patrice Y. Simard, Saleema Amershi, David M. ...",Abstract: The current processes for building m...,https://arxiv.org/pdf/1707.06742
1707.09562,https://arxiv.org/abs/1707.09562,Computer Science,"Distributed, Parallel, and Cluster Computing",mlbench: How Good Are Machine Learning Clouds ...,"[Hantian Zhang, Luyuan Zeng, Wentao Wu, Ce Zhang]",Abstract: We conduct an empirical study of mac...,https://arxiv.org/pdf/1707.09562
1707.03184,https://arxiv.org/abs/1707.03184,Computer Science,Artificial Intelligence,A Survey on Resilient Machine Learning,"[Atul Kumar, Sameep Mehta]",Abstract: Machine learning based system are in...,https://arxiv.org/pdf/1707.03184
1707.08561,https://arxiv.org/abs/1707.08561,Quantum Physics,,Quantum machine learning: a classical perspectiv,"[Carlo Ciliberto, Mark Herbster, Alessandro Da...","Abstract: Recently, increased computational po...",https://arxiv.org/pdf/1707.08561
1707.09050,https://arxiv.org/abs/1707.09050,Computer Science,Computation and Language,A Shared Task on Bandit Learning for Machine T...,"[Artem Sokolov, Julia Kreutzer, Kellen Sunderl...",Abstract: We introduce and describe the result...,https://arxiv.org/pdf/1707.09050
1611.00379,https://arxiv.org/abs/1611.00379,Computer Science,Human-Computer Interaction,he Machine Learning Algorithm as Creative Musi...,"[Rebecca Fiebrink, Baptiste Caramiaux]",Abstract: Machine learning is the capacity of ...,https://arxiv.org/pdf/1611.00379
1707.04849,https://arxiv.org/abs/1707.04849,Computer Science,Learning,Minimax deviation strategies for machine learn...,"[Michail Schlesinger, Evgeniy Vodolazskiy]",Abstract: The article is devoted to the proble...,https://arxiv.org/pdf/1707.04849
1602.00198,https://arxiv.org/abs/1602.00198,Computer Science,Artificial Intelligence,Discussion on Mechanical Learning and Learning...,[Chuyu Xiong],Abstract: Mechanical learning is a computing s...,https://arxiv.org/pdf/1602.00198
1705.10201,https://arxiv.org/abs/1705.10201,Computer Science,Artificial Intelligence,Machine Learned Learning Machines,"[Leigh Sheneman, Arend Hintze]",Abstract: There are two common approaches for ...,https://arxiv.org/pdf/1705.10201
1610.08251,https://arxiv.org/abs/1610.08251,Quantum Physics,,Quantum-enhanced machine learning,"[Vedran Dunjko, Jacob M. Taylor, Hans J. Briegel]",Abstract: The emerging field of quantum machin...,https://arxiv.org/pdf/1610.08251


In [385]:
paper_downloader(pdf_dict)