# Prepare notebook

In [1]:
%load_ext dotenv
%load_ext autoreload
%autoreload 2
import os
# Move the current working directory up two levels to the repository root so
# that `src` and `settings` (imported below) can be found.
# The move is skipped when a `src` directory is already visible from the
# current directory, so re-running this cell does not climb another two levels.
# NOTE(review): assumes `src/` exists only at the repository root — confirm.
if not os.path.isdir('src'):
    os.chdir(os.path.join(os.pardir, os.pardir))
print('Current working directory is %s' % os.getcwd())
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from src import utilities
import settings
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from collections import OrderedDict
import copy
import re
import feedparser
import time
import re

# Create a .env file in the repository root (the current working directory at
# this point) if you'd like to use dotenv.
# .env is not included in version control, so credentials can be stored in it.
# Expected contents:
#
#   SPRINGER_API_KEY=your_springer_api_key
#   IEEE_API_KEY=your_ieee_api_key

env_path = Path('.') / '.env'
load_dotenv(env_path)

# load environment variables from .env
# don't print the values in the notebook, in case the notebook output
# accidentally gets incorporated into version control
springer_api_key = os.environ.get('SPRINGER_API_KEY')
ieee_api_key = os.environ.get('IEEE_API_KEY')

# report the NAMES (never the values) of any keys that could not be found, so
# a missing credential is noticed here rather than as a failed API call later
missing_keys = [
    name
    for name, value in (('SPRINGER_API_KEY', springer_api_key), ('IEEE_API_KEY', ieee_api_key))
    if not value
]
if missing_keys:
    print('Missing API keys (set them in .env): %s' % ', '.join(missing_keys))

Current working directory is E:\Users\Peter_Rasmussen\gh\multivac
Directory data\raw already exists
Directory data\interim already exists
Directory data\processed already exists


# Arxiv

In [None]:
def prep_terms(terms):
    """Combine search terms into a single AND-ed query fragment.

    Inputs:
        terms   Iterable of strings; the individual search terms
    Outputs:
        String; every term prefixed with 'all:' and the results joined
        with '+AND+' (an empty string when no terms are given)
    """
    prefixed_terms = []
    for term in terms:
        prefixed_terms.append('all:' + term)
    return '+AND+'.join(prefixed_terms)


def get_total_number_of_results(url, params):
    """Ask the API how many results a query has in total.

    Inputs:
        url     String; query URL to request
        params  Dictionary of query parameters sent along with the request
    Outputs:
        Integer; the first child of the response's
        'opensearch:totalresults' element, converted to int
    """
    response = requests.get(url, params=params)
    soup = bs(response.text, 'lxml')
    total_tag = soup.find('opensearch:totalresults')
    return int(total_tag.contents[0])


def query_api(url, terms, params, wait_time=3, verbose=False):
    """Page through every result of a query and collect the feed entries.

    Inputs:
        url         String; query URL (already containing the search query)
        terms       Not used by this function; kept so existing calls keep working
        params      Dictionary of query parameters; must contain 'max_results',
                    which is used as the page size. It is not modified.
        wait_time   Seconds to sleep between consecutive page requests
        verbose     If True, print the total number of results and the wait time
    Outputs:
        metadata    List of feedparser entries from all pages, in page order
    """
    page_size = params['max_results']

    # get total number of results
    n_results = get_total_number_of_results(url, {'start': 0, 'max_results': 1})
    if verbose:
        print('%s total results, %s second wait time between each call' % (str(n_results), str(wait_time)))

    metadata = []

    # one request per page; `start` is the offset of the page's first result
    for ix, start in enumerate(range(0, n_results, page_size)):
        # pause between page requests only: not before the first page and
        # not after the last one
        if ix > 0:
            time.sleep(wait_time)

        # copy so the caller's params dictionary is left untouched
        params_ = dict(params, start=start)

        # ping api and retrieve xml for all articles in page
        xml_text = requests.get(url, params=params_).text

        # process xml page feed
        page_feed = feedparser.parse(xml_text)
        metadata.extend(page_feed['entries'])

    if verbose:
        print('')
    return metadata

In [48]:
# build query
source = 'arxiv'
params = {'source': source, 'start': 0, 'max_results': 20, 'sortBy': 'relevance', 'sortOrder': 'descending'}
wait_time = 3

terms1 = ['susceptible', 'infected', 'recovered']
terms2 = ['sir', 'model', 'disease']
# NOTE(review): terms3 is defined but is not part of the query built below —
# confirm whether it should be OR-ed in as well
terms3 = ['irSIR', 'model']
q = '%28' + prep_terms(terms1) + '%29' + 'OR' + '%28' + prep_terms(terms2) + '%29'
url = 'http://export.arxiv.org/api/query?search_query=' + q

# `terms` was never defined in this notebook, so the call below raised a
# NameError on a fresh kernel; query_api does not use the argument, so it is
# set to the terms that make up the query for reference
terms = terms1 + terms2
arxiv_metadata = query_api(url, terms, params, wait_time=wait_time, verbose=True)


# url2 = 'http://export.arxiv.org/api/query?search_query=' + prep_terms(terms2)
# md2 = query_api(url2, terms2, params, verbose=True)

# md = m1 + md2

309 total results, 3 second wait time between each call



# Springer

In [82]:
li = ['sir model', 'susceptible infected recovered', 'irSIR model', 'SIS epidemic model', 'susceptible-exposed-infected']


def make_q(li):
    """Return the phrases in `li`, each double-quoted, joined with ' OR ' and wrapped in parentheses."""
    return '(' + ' OR '.join('"' + s + '"' for s in li) + ')'


q = make_q(li)

base = 'http://api.springernature.com/openaccess/json?q='
url = base + q
# 'p' is the page size and 's' the index of the first record of the page;
# the loop below advances 's' by 'p' after every page
params = {'source': 'springer', 'openaccess': 'true', 'api_key': springer_api_key, 'p': 20, 's': 1}
params_ = copy.deepcopy(params)
springer_metadata = []

# request page after page until one comes back without records
# (wait_time is the pause between calls defined in the arXiv query cell)
while True:
    r = requests.get(url, params_)
    records = r.json()['records']  # parse the response body once per page
    if len(records) == 0:
        break
    springer_metadata += records
    params_['s'] = params_['s'] + params_['p']
    time.sleep(wait_time)
print(len(springer_metadata))

424


# IEEE Xplore

In [97]:
base = 'http://ieeexploreapi.ieee.org/api/v1/search/articles?'
# the search expression is sent through the 'querytext' parameter below, so it
# is no longer also appended to the URL as a key-less query string
url = base
# `q` is the search expression built in the Springer cell and `wait_time` the
# pause between calls defined in the arXiv query cell
params = {'max_records': 20, 'start_record': 1, 'querytext': q, 'apikey': ieee_api_key}
params_ = copy.deepcopy(params)

ieee_metadata = []
while True:
    r = requests.get(url, params_)
    response = r.json()  # parse the response body once per page
    # stop once the next page would start past the last record
    if params_['start_record'] > response['total_records']:
        break
    for article in response.get('articles', []):
        # keep only articles that are not access-locked
        # (this previously read the undefined name `i` instead of `article`)
        if article['access_type'] != 'LOCKED':
            ieee_metadata.append(article)
    params_['start_record'] = params_['start_record'] + params_['max_records']
    time.sleep(wait_time)
print(len(ieee_metadata))

0


# Pubmed (Entrez)
* https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/

In [98]:
# tbd

# Old

In [47]:



# def process_article_metadata(metadata):
#     """Extract metadata for one article and organize metadata into a dictionary.
#     Inputs:
#         metadata    String; Read in from API call
#     Outputs:
#         d  Dictionary of article metadata
#     """

#     # for each article extract and organize metadata
#     metadata = bs(metadata.strip(), 'lxml')

#     # each article's metadata contained in a dictionary
#     d = {}

#     # add single-value attributes
#     single_value_attributes = ['id', 'updated', 'published', 'title', 'summary', 'doi']
#     for single_value_attribute in single_value_attributes:
#         try:
#             v = metadata.find(single_value_attribute).contents[0].strip()
#         except AttributeError:
#             # some articles don't have doi numbers so fall back on arxiv doi
#             if single_value_attribute=='doi':
#                 v = d['id']
#             else:
#                 v = None
#         d[single_value_attribute] = v

#     # add multiple-value attributes and edge-case single-value attributes
#     d['arxiv:primary_category'] = metadata.find('arxiv:primary_category').attrs['term']
#     d['arxiv_categories'] = [x['term'] for x in metadata.find_all('category')]
#     d_links = {}
#     for link in metadata.find_all('link'):
#         if 'title' in link.attrs:
#             k, v = link.attrs['title'], link.attrs['href']
#             d_links[k] = v
#     d['links'] = d_links
#     d['authors'] = [x.contents[0] for x in metadata.find_all('name')]

#     return d


# def get_metadata_from_page(xml_text):
#     """
#     Usage of output:
#         As an OrderedDict, d_page_metadata has the usual dictionary functionality
#         It can also be accessed like a list using the approach below (Python 3 approach below):
#             items = list(d_page_metadata.items())
#             items[0]
#     """
    
#     # prep metadata returned by api query
#     articles_metadata = re.sub(' +', ' ', xml_text.replace('arxiv:doi', 'doi').replace('\n', ' ')).strip().split('<entry>')[1:]

#     # iterate over each article and extract and organize metadata
#     d_page_metadata = OrderedDict()
#     for article_metadata in articles_metadata:
#         v = copy.deepcopy(process_article_metadata(article_metadata))
#         k = v.pop('doi')
#         d_page_metadata[k] = v
    
#     return d_page_metadata



In [None]:
# ps2ascii, gzip
import gzip
import os

In [None]:
# Open the gzip-compressed source file for binary reading.
# NOTE(review): raw_src is not assigned anywhere above this cell — it is only
# defined in a later cell — so this fails on a fresh top-to-bottom run.
f = gzip.open(raw_src, 'rb')

In [None]:
ps = f.read()

In [None]:
f.close()

In [None]:
# Location of the downloaded arXiv source file, relative to the repository
# root (the setup cell makes the repository root the working directory), so
# the notebook is not tied to one user's absolute path.
raw_dir = os.path.join('data', 'raw', 'arxiv')
raw_src = os.path.join(raw_dir, '1411.2370v2.ps.gz')
# with gzip.open(raw_src, 'rb') as f:
#     ps = f.read()
    
def opener(filename):
    """Open a file for binary reading, transparently decompressing gzip.

    Inputs:
        filename    String; path of the file to open
    Outputs:
        A binary file object positioned at the start of the data: a gzip
        reader when the file starts with the gzip magic number, otherwise
        the plain file. The caller is responsible for closing it (it can be
        used in a `with` statement).
    """
    # sniff the first two bytes; the file is read in binary mode, so the
    # magic number must be compared as bytes (a str never equals bytes)
    with open(filename, 'rb') as f:
        is_gzip = f.read(2) == b'\x1f\x8b'

    if is_gzip:
        # gzip.open owns the underlying file and closes it with the reader
        return gzip.open(filename, 'rb')
    return open(filename, 'rb')
f = opener(raw_src)

In [None]:
ps = f.read()
f.close()

In [None]:
with opener(raw_src) as f:
    ps = f.read().decode('iso-8859-1')

In [None]:
len(ps)

In [None]:
list(d_metadata.items())[0]

In [None]:
params_

In [None]:
n_results

In [None]:
len(d_metadata.keys())

In [None]:
next(a)

In [None]:
print()
print(80 * '*')
print(xml_text)


In [None]:

for link in article_metadata.find_all('link'):
    pass

In [None]:

if 'author' in link.attrs:
    k, v = link.attrs['title'], link.attrs['href']
    d_links[k] = v

In [None]:
link.attrs

In [None]:
v

In [None]:
link

In [None]:
link.attrs

In [None]:
article_metadata.find_all('link')[0].attrs

In [None]:

summary = ''.join(soup.find('summary').contents).strip().replace('\n', ' ')
title = ''.join(soup.find('title').contents).strip().replace('\n', ' ')
title

In [None]:

soup.find_all('title')

In [None]:
'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10'

In [None]:
# import the submodule explicitly: a bare `import xml` does not load
# xml.etree.ElementTree, which the next cell uses
import xml.etree.ElementTree

In [None]:
e = xml.etree.ElementTree.parse('xml.xml')

In [None]:
e.findall('title')

In [None]:
bf.data(r.text)

In [None]:
r.text

In [None]:
# Minimal example of querying the arXiv API with the standard library only.
# urllib.request must be imported explicitly; a bare `import urllib` does not
# load the `request` submodule.
import urllib.request

base = 'http://export.arxiv.org/api/'
method = 'query'
search_term = 'electron'
# search all fields for search_term and ask for the first 10 results
parameters = 'search_query=all:' + search_term + '&start=0&max_results=10'
query = base + method + '?' + parameters
# the with-block closes the connection once the response has been read
with urllib.request.urlopen(query) as response:
    data = response.read()
print(data)

In [None]:
data.