In [170]:
import copy
import re
import urllib.request
import xml.etree.ElementTree
from collections import OrderedDict

import pandas as pd
import requests
import xmljson
from bs4 import BeautifulSoup as bs
from xmljson import badgerfish as bf

In [44]:
def prep_terms(terms):
    """Join search terms into a single query string.

    Inputs:
        terms    Iterable of strings; search terms
    Outputs:
        String; each term prefixed with 'all:' and the pieces joined by '+AND+'
    """
    clauses = []
    for term in terms:
        clauses.append('all:' + term)
    return '+AND+'.join(clauses)

def process_article_metadata(metadata):
    """Extract metadata for one article and organize metadata into a dictionary.

    Elements or attributes that are missing or empty are recorded as None
    (single values) or left out (list/dict entries) instead of raising.

    Inputs:
        metadata    String; markup of a single article entry, read in from API call
    Outputs:
        d  Dictionary of article metadata with keys 'id', 'updated', 'published',
           'title', 'summary', 'doi', 'arxiv:primary_category',
           'arxiv_categories' (list), 'links' (dict of link title -> href)
           and 'authors' (list)
    """

    def first_child(tag):
        """Return the first child of `tag`, or None if `tag` is missing or empty."""
        if tag is None or not tag.contents:
            return None
        return tag.contents[0]

    # for each article extract and organize metadata
    metadata = bs(metadata.strip(), 'lxml')

    # each article's metadata contained in a dictionary
    d = {}

    # add single-value attributes
    # NOTE: the calling cell rewrites 'arxiv:doi' to 'doi' before this runs,
    # presumably so that the plain 'doi' lookup below finds it
    single_value_attributes = ['id', 'updated', 'published', 'title', 'summary', 'doi']
    for single_value_attribute in single_value_attributes:
        d[single_value_attribute] = first_child(metadata.find(single_value_attribute))

    # add multiple-value attributes and edge-case single-value attributes
    primary_category = metadata.find('arxiv:primary_category')
    d['arxiv:primary_category'] = primary_category.get('term') if primary_category is not None else None
    d['arxiv_categories'] = [x['term'] for x in metadata.find_all('category') if x.has_attr('term')]
    # only links that carry both a title and an href can be stored as title -> href
    d['links'] = {
        link.attrs['title']: link.attrs['href']
        for link in metadata.find_all('link')
        if 'title' in link.attrs and 'href' in link.attrs
    }
    # skip empty <name> elements, which have no first child to take
    d['authors'] = [x.contents[0] for x in metadata.find_all('name') if x.contents]

    return d

In [183]:
# build query
terms = ['susceptible', 'infected', 'recovered']

url = 'http://export.arxiv.org/api/query?search_query=' + prep_terms(terms)
params = {'start': 10, 'max_results': 10}

# query api
# the timeout keeps the cell from hanging indefinitely; raise_for_status stops here on
# an HTTP error instead of parsing an error page as if it were search results
r = requests.get(url, params=params, timeout=30)
r.raise_for_status()
xml_text = r.text

# parse xml
soup = bs(xml_text, 'lxml')
for title_tag in soup.find_all('title'):
    # titles that carry a 'type' attribute are skipped; only untyped titles are printed
    if not title_tag.has_attr('type'):
        title_text = ''.join(title_tag.contents).replace('\n', ' ')  # make string from list results
        title_text = re.sub(' +', ' ', title_text).strip()  # remove extra whitespace
        print(title_text)

Pair approximation of the stochastic susceptible-infected-recovered-susceptible epidemic model on the hypercubic lattice
Behavior of susceptible-vaccinated--infected--recovered epidemics with diversity in the infection rate of the individuals
Fluctuating epidemics on adaptive networks
Stochastic dynamics of dengue epidemics
An epidemic in a dynamic population with importation of infectives
On the critical behavior of the Susceptible-Infected-Recovered (SIR) model on a square lattice
Impact of asymptomatic infection on coupled disease-behavior dynamics in complex networks
Identification of highly susceptible individuals in complex networks
Elementary proof of convergence to the mean-field model for the SIR process
Solution of an infection model near threshold


In [171]:
# prep metadata returned by api query
# collapse newlines and repeated spaces, rename 'arxiv:doi' to 'doi', then split into one
# chunk per '<entry>' marker (the text before the first entry is dropped by the [1:] slice)
articles_metadata = re.sub(' +', ' ', xml_text.replace('arxiv:doi', 'doi').replace('\n', ' ')).strip().split('<entry>')[1:]

# iterate over each article and extract and organize metadata
d_metadata = OrderedDict()
for article_metadata in articles_metadata:
    v = copy.deepcopy(process_article_metadata(article_metadata))
    k = v.pop('doi')
    if k is None:
        # articles without a DOI would otherwise all share the key None and overwrite
        # each other; fall back to the article id, which stays in v as well
        k = v['id']
    d_metadata[k] = v

# d_metadata can be accessed like a list using the approach below:
# items = list(d_metadata.items())
# items[0]

In [120]:

# Scratch cell: walk the <link> tags so that `link` stays bound to the last one.
# NOTE(review): after the extraction cell above, `article_metadata` is a plain string
# produced by `str.split`, which has no `find_all`; this presumably ran against a parsed
# object from an earlier kernel state — confirm or delete.
for link in article_metadata.find_all('link'):
    pass

In [133]:

# NOTE(review): scratch cell. None of the link attribute dicts shown in this notebook has
# an 'author' key (process_article_metadata tests for 'title'), so this branch is
# presumably never taken. In the cells shown, `d_links` is only assigned inside
# process_article_metadata, not at cell level — confirm or delete.
if 'author' in link.attrs:
    k, v = link.attrs['title'], link.attrs['href']
    d_links[k] = v

In [134]:
link.attrs

{'title': 'pdf',
 'href': 'http://arxiv.org/pdf/1411.2370v2',
 'rel': ['related'],
 'type': 'application/pdf'}

In [132]:
v

'http://arxiv.org/pdf/1411.2370v2'

In [124]:
link

<link href="http://arxiv.org/pdf/1411.2370v2" rel="related" title="pdf" type="application/pdf"/>

In [119]:
link.attrs

{'title': 'doi',
 'href': 'http://dx.doi.org/10.1109/TIT.2017.2698504',
 'rel': ['related']}

In [108]:
# Show the attributes of the first <link> tag found in `article_metadata`.
# NOTE(review): needs `article_metadata` to be a parsed object with `find_all`; the
# extraction loop in this notebook leaves it bound to a string — confirm.
article_metadata.find_all('link')[0].attrs

{'title': 'doi',
 'href': 'http://dx.doi.org/10.1109/TIT.2017.2698504',
 'rel': ['related']}

In [14]:

# Flatten the first <summary> and the first <title> in `soup` into single-line strings.
# Per the recorded output, the first <title> is the feed-level query title.
summary, title = (
    ''.join(soup.find(tag_name).contents).strip().replace('\n', ' ')
    for tag_name in ('summary', 'title')
)
title

'ArXiv Query: search_query=all:electron&id_list=&start=0&max_results=1'

In [15]:

# List every <title> element in `soup`. In the recorded output the first one carries
# type="html" and holds the query string; the untyped one holds an article title.
soup.find_all('title')

[<title type="html">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1</title>,
 <title>Impact of Electron-Electron Cusp on Configuration Interaction Energies</title>]

In [None]:
'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10'

In [None]:
import xml

In [None]:
# Parse the local file 'xml.xml' with the standard-library ElementTree parser.
# NOTE(review): no cell shown here writes 'xml.xml' — presumably a saved API response; confirm.
e = xml.etree.ElementTree.parse('xml.xml')

In [None]:
# NOTE(review): findall with a bare tag name only matches direct children of the root, and
# namespaced documents need the namespace-qualified tag, so this may return [] if 'xml.xml'
# is a namespaced feed — confirm.
e.findall('title')

In [None]:
# badgerfish's data() converts a parsed XML element rather than raw text, so the response
# body is parsed first; the raw bytes are passed so the parser handles the declared encoding
bf.data(xml.etree.ElementTree.fromstring(r.content))

In [None]:
r.text

In [None]:
import urllib

base = 'http://export.arxiv.org/api/'
method = 'query'
search_term = 'electron'
# query string: search all fields for search_term, first 10 results
parameters = 'search_query=all:' + search_term + '&start=0&max_results=10'
query = base + method + '?' + parameters
# the with-block closes the connection once the body has been read;
# the timeout keeps the cell from hanging indefinitely
with urllib.request.urlopen(query, timeout=30) as response:
    data = response.read()
print(data)

In [None]:
data.