In [178]:
import pandas as pd
import requests
import xmljson
from xmljson import badgerfish as bf
from bs4 import BeautifulSoup as bs
from collections import OrderedDict
import copy
import re
import feedparser
import time

In [192]:
def prep_terms(terms):
    """Combine search terms into one query string.

    Every term is prefixed with 'all:' and the prefixed terms are joined
    with '+AND+'. An empty list yields an empty string.
    """
    fielded_terms = []
    for term in terms:
        fielded_terms.append('all:' + term)
    return '+AND+'.join(fielded_terms)


def get_total_number_of_results(url, params):
    """Fetch `url` with `params` and return the feed's total result count.

    The count is read from the first child of the 'opensearch:totalresults'
    tag of the parsed response and converted to int.
    """
    response = requests.get(url, params=params)
    soup = bs(response.text, 'lxml')
    # the tag is looked up in lower case even though the raw feed spells it
    # 'opensearch:totalResults'
    total_tag = soup.find('opensearch:totalresults')
    return int(total_tag.contents[0])


def process_article_metadata(metadata):
    """Extract metadata for one article and organize it into a dictionary.

    Inputs:
        metadata    String; markup of a single feed entry (the caller splits
                    the API response on '<entry>' and renames 'arxiv:doi'
                    to 'doi' before passing it in)
    Outputs:
        d  Dictionary of article metadata with keys 'id', 'updated',
           'published', 'title', 'summary', 'doi', 'arxiv:primary_category',
           'arxiv_categories', 'links' and 'authors'. Single-value fields
           that are absent or empty are None, except 'doi', which falls back
           to the value stored under 'id'.
    """

    # parse the markup for this one article
    entry = bs(metadata.strip(), 'lxml')

    # each article's metadata contained in a dictionary
    d = {}

    # add single-value attributes; 'id' must stay ahead of 'doi' because the
    # doi fallback below reads d['id']
    single_value_attributes = ['id', 'updated', 'published', 'title', 'summary', 'doi']
    for single_value_attribute in single_value_attributes:
        try:
            v = entry.find(single_value_attribute).contents[0].strip()
        except (AttributeError, IndexError):
            # AttributeError: the tag is absent (find() returned None)
            # IndexError: the tag is present but has no content
            if single_value_attribute == 'doi':
                # some articles don't have doi numbers so fall back on arxiv id
                v = d['id']
            else:
                v = None
        d[single_value_attribute] = v

    # add multiple-value attributes and edge-case single-value attributes
    primary_category = entry.find('arxiv:primary_category')
    if primary_category is None:
        d['arxiv:primary_category'] = None
    else:
        d['arxiv:primary_category'] = primary_category.attrs.get('term')

    # skip category tags that carry no 'term' instead of raising KeyError
    d['arxiv_categories'] = [
        x['term'] for x in entry.find_all('category') if x.has_attr('term')
    ]

    # only links that have both a title and a target are recorded, keyed by title
    d_links = {}
    for link in entry.find_all('link'):
        if 'title' in link.attrs and 'href' in link.attrs:
            d_links[link.attrs['title']] = link.attrs['href']
    d['links'] = d_links

    # convert to plain str so the record does not keep references into the
    # parse tree; empty <name> tags are skipped
    d['authors'] = [str(x.contents[0]) for x in entry.find_all('name') if x.contents]

    return d


def get_metadata_from_page(xml_text):
    """Turn one page of API output into an OrderedDict of article records.

    The text is flattened to a single line, split on '<entry>', and each
    chunk after the first is handed to process_article_metadata. The
    resulting record is stored under its 'doi' value, which is removed from
    the record itself.

    Usage of output:
        The returned OrderedDict supports normal dictionary access. For
        positional access (Python 3):
            items = list(d_page_metadata.items())
            items[0]
    """

    # prep metadata returned by api query: rename the doi tag, drop newlines,
    # collapse runs of spaces, then cut into per-article chunks
    flattened = xml_text.replace('arxiv:doi', 'doi').replace('\n', ' ')
    collapsed = re.sub(' +', ' ', flattened).strip()
    entry_chunks = collapsed.split('<entry>')[1:]

    # build the page dictionary in feed order
    page_records = OrderedDict()
    for entry_chunk in entry_chunks:
        record = copy.deepcopy(process_article_metadata(entry_chunk))
        doi = record.pop('doi')
        page_records[doi] = record

    return page_records


def query_api(url, params, wait_time=3, verbose=False, source=None, terms=None):
    """Page through all results of a query and collect the parsed feed of each page.

    Inputs:
        url        String; query URL with the search terms already embedded
        params     Dictionary of request parameters; must contain
                   'max_results', which is used as the page size
        wait_time  Seconds to sleep after each page request
        verbose    Bool; print the total count and one '*' per requested result
        source     Optional label stored with every page. When omitted, the
                   module-level `source` is used if it exists, else None
        terms      Optional search terms stored with every page. When omitted,
                   the module-level `terms` is used if it exists, else None
    Outputs:
        metadata   List with one dictionary per page, with keys 'source',
                   'n_results', 'terms', 'url', 'params' and 'page_feed'
    """

    # earlier versions read `source` and `terms` straight from the notebook's
    # globals; keep that as a fallback so existing calls behave the same, but
    # without raising NameError when the globals are not defined
    if source is None:
        source = globals().get('source')
    if terms is None:
        terms = globals().get('terms')

    # get total number of results
    n_results = get_total_number_of_results(url, {'start': 0, 'max_results': 1})
    if verbose:
        print('%s total results, %s second wait time between each call' % (str(n_results), str(wait_time)))

    # build list of page offsets to iterate over
    page_size = params['max_results']
    starts = list(range(0, n_results, page_size))  # start, stop, step

    metadata = []

    # iterate over list to get all results
    for start in starts:
        if verbose:
            print(page_size * '*', end='')

        # copy so the caller's params dictionary is never modified
        params_ = copy.deepcopy(params)
        params_['start'] = start

        # ping api and retrieve xml for all articles in page
        xml_text = requests.get(url, params=params_).text

        # process xml page feed
        page_feed = feedparser.parse(xml_text)
        metadata.append({
            'source': source,
            'n_results': n_results,
            'terms': terms,
            'url': url,
            'params': params_,
            'page_feed': page_feed,
        })
        time.sleep(wait_time)

    if verbose:
        print('')
    return metadata

In [193]:
# Scratch copy of the per-page request made inside query_api's loop.
# NOTE(review): `params`, `start` and `url` are not assigned in this cell; it
# only runs if they are already present in the kernel.
params_ = copy.deepcopy(params) 
params_['start'] = start

# ping api and retrieve xml for all articles in page
xml_text = requests.get(url, params=params_).text

In [194]:
# build query
source = 'arxiv'
terms = ['susceptible', 'infected', 'recovered']
url = 'http://export.arxiv.org/api/query?search_query=' + prep_terms(terms)
params = {'start': 0, 'max_results': 20, 'sortBy': 'relevance', 'sortOrder': 'descending'}
wait_time = 3  # seconds to pause after each page request

# pass the pause explicitly so changing wait_time above actually takes effect
md = query_api(url, params, wait_time=wait_time, verbose=True)

179 total results, 3 second wait time between each call
************************************************************************************************************************************************************************************


In [204]:
# Second query: same request parameters (`params`), different search terms.
# NOTE(review): query_api reads the module-level `terms` when it labels each
# page, so `terms` has to be rebound before the call; `url` is rebuilt from it.
terms = ['sir', 'model', 'disease']
url = 'http://export.arxiv.org/api/query?search_query=' + prep_terms(terms)
md2 = query_api(url, params, verbose=True)

In [207]:
md2[0]['page_feed']['feed']['opensearch_totalresults']

'185'

In [201]:
# Third query: a single hyphenated term, run without progress output.
# NOTE(review): this rebinds the module-level `terms` and `url` used by the
# other query cells, so the result depends on the order the cells are run in.
terms = ['susceptible-infected-recovered']
url = 'http://export.arxiv.org/api/query?search_query=' + prep_terms(terms)
md3 = query_api(url, params)

In [203]:
import os

# Springer Nature API endpoint; the query text is appended after 'q='
url = 'http://api.springernature.com/meta/v1/json?q='
# take the key from the environment so a real credential is never saved in the
# notebook; '123' is the original placeholder and remains the fallback
api_key = os.environ.get('SPRINGER_API_KEY', '123')
# bare string, evaluated only for display; presumably a query constraint to
# append to `url` -- TODO confirm
'openaccess:true'

'http://export.arxiv.org/api/query?search_query=all:susceptible-infected-recovered'

In [196]:
# NOTE(review): md2 is indexed with a string key here, but the TypeError
# recorded below shows it is a list; select an element first (e.g. md2[0])
# before looking up keys.
md2['n_results']

TypeError: list indices must be integers or slices, not str

In [197]:
md2[0]['page_feed']['feed']['opensearch_totalresults']

'493'

In [198]:
md2[0].keys()

dict_keys(['source', 'n_results', 'terms', 'url', 'params', 'page_feed'])

In [130]:
# ps2ascii, gzip
import gzip
import os

In [132]:
f = gzip.open(raw_src, 'rb')

In [133]:
ps = f.read()

OSError: Not a gzipped file (b'%!')

In [134]:
f.close()

In [135]:
# Location of the downloaded source file for one arXiv article.
# NOTE(review): absolute, machine-specific Windows path; it will need editing
# on any other machine.
raw_dir = r'E:\Users\Peter_Rasmussen\gh\multivac\data\raw\arxiv'
raw_src = os.path.join(raw_dir, '1411.2370v2.ps.gz')
# Direct gzip read, left disabled: the gzip.open/read attempt recorded in the
# cells above failed with "Not a gzipped file (b'%!')" despite the .gz suffix.
# with gzip.open(raw_src, 'rb') as f:
#     ps = f.read()
    
def opener(filename):
    """Open `filename` for binary reading, decompressing it if it is gzipped.

    The first two bytes are checked against the gzip magic number, so a file
    that merely has a '.gz' suffix but holds plain data is returned as an
    ordinary binary file object.

    Inputs:
        filename    Path of the file to open
    Outputs:
        A binary file object positioned at the start of the (decompressed)
        data. It can be used in a `with` statement and must be closed by the
        caller.
    """
    # the file is read in binary mode, so the magic number has to be compared
    # as bytes; comparing against a str is always False on Python 3
    with open(filename, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'

    if is_gzip:
        # gzip.open owns its underlying handle, so closing the returned
        # object also closes the file on disk
        return gzip.open(filename, 'rb')
    return open(filename, 'rb')
f = opener(raw_src)

In [137]:
ps = f.read()
f.close()

In [148]:
with opener(raw_src) as f:
    ps = f.read().decode('iso-8859-1')

In [152]:
len(ps)

6591707

In [125]:
list(d_metadata.items())[0]

('10.1109/TIT.2017.2698504',
 {'id': 'http://arxiv.org/abs/1411.2370v2',
  'updated': '2016-10-27T04:57:49Z',
  'published': '2014-11-10T10:25:10Z',
  'title': 'On the Universality of Jordan Centers for Estimating Infection Sources in Tree Networks',
  'summary': 'Finding the infection sources in a network when we only know the network topology and infected nodes, but not the rates of infection, is a challenging combinatorial problem, and it is even more difficult in practice where the underlying infection spreading model is usually unknown a priori. In this paper, we are interested in finding a source estimator that is applicable to various spreading models, including the Susceptible-Infected (SI), Susceptible-Infected-Recovered (SIR), Susceptible-Infected-Recovered-Infected (SIRI), and Susceptible-Infected-Susceptible (SIS) models. We show that under the SI, SIR and SIRI spreading models and with mild technical assumptions, the Jordan center is the infection source associated with th

In [116]:
params_

{'start': 100,
 'max_results': 100,
 'sortBy': 'relevance',
 'sortOrder': 'descending'}

In [104]:
n_results

179

In [103]:
len(d_metadata.keys())

102

In [90]:
next(a)

This is printed second


2

In [65]:
print()
print(80 * '*')
print(xml_text)


179
********************************************************************************
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3Dall%3Asusceptible%20AND%20all%3Ainfected%20AND%20all%3Arecovered%26id_list%3D%26start%3D10%26max_results%3D10" rel="self" type="application/atom+xml"/>
  <title type="html">ArXiv Query: search_query=all:susceptible AND all:infected AND all:recovered&amp;id_list=&amp;start=10&amp;max_results=10</title>
  <id>http://arxiv.org/api/7stV8zGmuSvcdG3QhsnFL5iaBTs</id>
  <updated>2018-12-06T00:00:00-05:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">179</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">10</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">10</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org

In [120]:

for link in article_metadata.find_all('link'):
    pass

In [133]:

# record the current link under its title (e.g. 'doi', 'pdf'); the guard has
# to test the same attribute the body reads, as process_article_metadata does
if 'title' in link.attrs:
    k, v = link.attrs['title'], link.attrs['href']
    d_links[k] = v

In [134]:
link.attrs

{'title': 'pdf',
 'href': 'http://arxiv.org/pdf/1411.2370v2',
 'rel': ['related'],
 'type': 'application/pdf'}

In [132]:
v

'http://arxiv.org/pdf/1411.2370v2'

In [124]:
link

<link href="http://arxiv.org/pdf/1411.2370v2" rel="related" title="pdf" type="application/pdf"/>

In [119]:
link.attrs

{'title': 'doi',
 'href': 'http://dx.doi.org/10.1109/TIT.2017.2698504',
 'rel': ['related']}

In [108]:
article_metadata.find_all('link')[0].attrs

{'title': 'doi',
 'href': 'http://dx.doi.org/10.1109/TIT.2017.2698504',
 'rel': ['related']}

In [14]:

# Join the children of the first <summary> / <title> tag into one string,
# trim it, and replace newlines with spaces.
summary = ''.join(soup.find('summary').contents).strip().replace('\n', ' ')
# NOTE(review): find() returns the first match; per the output below that is
# the feed-level "ArXiv Query: ..." title, not an article title.
title = ''.join(soup.find('title').contents).strip().replace('\n', ' ')
title

'ArXiv Query: search_query=all:electron&id_list=&start=0&max_results=1'

In [15]:

soup.find_all('title')

[<title type="html">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1</title>,
 <title>Impact of Electron-Electron Cusp on Configuration Interaction Energies</title>]

In [None]:
'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10'

In [None]:
import xml

In [None]:
# `import xml` on its own does not load the etree subpackage, so import the
# module that is actually used
import xml.etree.ElementTree

# parse the locally saved response into an ElementTree
e = xml.etree.ElementTree.parse('xml.xml')

In [None]:
e.findall('title')

In [None]:
bf.data(r.text)

In [None]:
r.text

In [None]:
# urllib.request is a submodule and must be imported explicitly
import urllib.request

# assemble the query URL from its parts
base = 'http://export.arxiv.org/api/'
method = 'query'
search_term = 'electron'
parameters = 'search_query=all:' + search_term + '&start=0&max_results=10'
query = base + method + '?' + parameters

# fetch the raw response body (bytes) and show it
data = urllib.request.urlopen(query).read()
print(data)

In [None]:
data.