# Prepare notebook

In [1]:
%load_ext dotenv
%load_ext autoreload
%autoreload 2
import os
# Move the working directory up two levels so that `src` and `settings`
# can be imported from the project root.
# The directory the notebook started in is remembered the first time this cell
# runs, so re-running the cell always lands on the same root instead of
# climbing another two levels each time.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.abspath(os.path.join(NOTEBOOK_START_DIR, os.pardir, os.pardir)))
print('Current working directory is %s' % os.getcwd())
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from src import utilities
import settings
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from collections import OrderedDict
import copy
import re
import feedparser
import pubmed_parser
import time
import re
import pickle


# create a .env file in the root directory wp/ if you'd like to use dotenv
# .env not included in version control, so credentials can be stored in this file
"""
SPRINGER_API_KEY=your_springer_api_key
"""

# Load any variables stored in the .env file of the current working directory.
env_path = Path('.env')
load_dotenv(env_path)

# Look up credentials from the environment; each one is None when its variable is unset.
# Do not echo these values in cell output, so they cannot slip into version control.
springer_api_key = os.getenv('SPRINGER_API_KEY')
ieee_api_key = os.getenv('IEEE_API_KEY')
user_email = os.getenv('USER_EMAIL')  # courtesy to NIH to include your email

# seconds to pause between consecutive API calls
wait_time = 3

Current working directory is E:\Users\Peter_Rasmussen\gh\multivac
Directory data\raw already exists
Directory data\interim already exists
Directory data\processed already exists
Directory data\raw\arxiv already exists
Directory data\raw\pubmed already exists
Directory data\raw\springer already exists


# Arxiv

In [9]:
def prep_terms(terms):
    """Combine search terms into a single clause that requires all of them.

    Inputs:
        terms   Iterable of strings
    Outputs:
        String; every term prefixed with 'all:' and the pieces joined by '+AND+'
        (an empty string when terms is empty)
    """
    clauses = ('all:' + word for word in terms)
    return '+AND+'.join(clauses)


def get_total_number_of_results(url, params):
    """Return the total number of results reported by the feed at url.

    Inputs:
        url     String; query URL
        params  Dictionary of query parameters passed to requests.get
    Outputs:
        Integer read from the first 'opensearch:totalresults' element of the response
    """
    response = requests.get(url, params=params)
    feed = bs(response.text, 'lxml')
    total_tag = feed.find('opensearch:totalresults')
    return int(total_tag.contents[0])


def query_api(url, terms, params, wait_time=3, verbose=False):
    """Page through the feed at url and collect the entries of every page.

    Inputs:
        url         String; query URL
        terms       Unused; kept so that existing calls keep working
        params      Dictionary of query parameters; params['max_results'] is used as the page size
        wait_time   Seconds to sleep between consecutive page requests
        verbose     Boolean; when True, print the total result count and the wait time
    Outputs:
        metadata    List of the entries parsed by feedparser, in page order
    """
    # get total number of results
    n_results = get_total_number_of_results(url, {'start': 0, 'max_results': 1})
    if verbose:
        print('%s total results, %s second wait time between each call' % (str(n_results), str(wait_time)))

    # one request per page: start offsets 0, page_size, 2 * page_size, ...
    page_size = params['max_results']
    metadata = []
    for ix, start in enumerate(range(0, n_results, page_size)):
        # wait between calls only; there is nothing to wait for after the last page
        if ix > 0:
            time.sleep(wait_time)

        # leave the caller's params untouched and override only the offset
        page_params = {**params, 'start': start}

        # ping api and retrieve xml for all articles in page
        xml_text = requests.get(url, params=page_params).text

        # process xml page feed
        metadata.extend(feedparser.parse(xml_text)['entries'])
    if verbose:
        print('')
    return metadata

In [10]:
# build query and get metadata of articles matching our search criteria
params = {'start': 0, 'max_results': 20, 'sortBy': 'relevance', 'sortOrder': 'descending'}
li = [x.replace('-', ' ').split(' ') for x in settings.terms]
# NOTE(review): the parenthesised groups are joined with a bare 'OR' (no '+' around it,
# unlike the '+AND+' used inside each group) - confirm the API parses this as intended
q = 'OR'.join(['%28' + prep_terms(x) + '%29' for x in li])
url = 'http://export.arxiv.org/api/query?search_query=' + q
arxiv_metadata = query_api(url, q, params, verbose=True)

# save pdfs of articles that matched our search criteria
# each pdf is named after the part of the entry url that follows '/abs/', with '/' replaced by '_'
for ix, md in enumerate(arxiv_metadata):
    url = md['id']
    pdf_url = url.replace('/abs/', '/pdf/')
    fn = url.split('/abs/')[-1]
    fn = '_'.join(fn.split('/')) + '.pdf'
    md['fn'] = fn  # specify filename so we can associate each pdf with its metadata down the road
    dst = settings.arxiv_dir / fn
    if not os.path.exists(dst):
        r = requests.get(pdf_url)
        time.sleep(0.3)
        # do not store an error page as a pdf: the exists() check above would
        # then skip this article on every later run
        if not r.ok:
            print('Skipping %s: HTTP status %s' % (pdf_url, r.status_code))
            continue
        with open(dst, 'wb') as f:
            f.write(r.content)

# save arxiv metadata
dst = settings.metadata_dir / 'arxiv.pkl'
with open(dst, 'wb') as f:
    pickle.dump(arxiv_metadata, f)

310 total results, 3 second wait time between each call



# Springer

In [52]:
# NOTE(review): `springer_metadata` is first assigned by the retrieval cell further down,
# so on a fresh kernel this cell raises NameError; run it only after that cell.
springer_metadata[0]['doi']

'10.1038/s41598-018-36116-6'

In [4]:
# build query to retrieve metadata
def make_q(li):
    """Return the phrases in li, each double-quoted, OR-ed together inside parentheses."""
    return '(' + ' OR '.join('"' + s + '"' for s in li) + ')'


q = make_q(settings.terms)
base = 'http://api.springernature.com/openaccess/json?q='
url = base + q
params = {'source': 'springer', 'openaccess': 'true', 'api_key': springer_api_key, 'p': 20, 's': 1}
params_ = copy.deepcopy(params)

# retrieve metadata page by page until a page comes back without records
# ('s' is advanced by 'p' after every page)
springer_metadata = []
while True:
    r = requests.get(url, params_)
    records = r.json()['records']  # parse the response body only once
    if len(records) == 0:
        break
    springer_metadata += records
    params_['s'] = params_['s'] + params_['p']
    time.sleep(wait_time)
print(len(springer_metadata))

# iterate over springer metadata and download html for each article
# on a connection error we retry the same article with increasing wait times (in seconds);
# the schedule starts over for every article so it can never run dry
waits = [2**x for x in range(0, 6)]
for ix, md in enumerate(springer_metadata):
    fn = md['doi'].replace('/', '-')
    if len(fn) == 0:
        # identifiers look like 'doi:10.1038/...'; strip the characters that are not valid in a filename
        fn = md['identifier'].replace('/', '-').replace(':', '-')
    fn = fn + '.html'
    md['fn'] = fn
    dst = settings.springer_dir / fn
    if not os.path.exists(dst):
        article_url = md['url'][0]['value']
        for wait in waits + [None]:
            try:
                r = requests.get(article_url)
                break
            # requests raises its own ConnectionError, which is not a subclass of the builtin one
            except requests.exceptions.ConnectionError:
                if wait is None:
                    raise  # every retry failed; surface the error
                time.sleep(wait)
        time.sleep(3)
        # do not store an error page as an article: the exists() check above would
        # then skip this article on every later run
        if not r.ok:
            print('Skipping %s: HTTP status %s' % (article_url, r.status_code))
            continue
        # parser named explicitly, matching the 'lxml' parser used for the arxiv feed
        html = bs(r.text, 'lxml').encode('utf-8').decode('utf-8')
        with open(dst, 'w', encoding='utf-8') as f:
            f.write(html)

# save springer metadata
dst = settings.metadata_dir / 'springer.pkl'
with open(dst, 'wb') as f:
    pickle.dump(springer_metadata, f)

382


In [23]:
springer_metadata[0]

{'contentType': 'Article',
 'identifier': 'doi:10.1038/s41598-018-36116-6',
 'url': [{'format': '',
   'platform': '',
   'value': 'http://dx.doi.org/10.1038/s41598-018-36116-6'}],
 'title': 'How Physical Proximity Shapes Complex Social Networks',
 'creators': [{'creator': 'Stopczynski, Arkadiusz'},
  {'creator': 'Pentland, Alex ‘Sandy’'},
  {'creator': 'Lehmann, Sune'}],
 'publicationName': 'Scientific Reports',
 'issn': '',
 'eissn': '2045-2322',
 'openaccess': 'true',
 'journalid': '41598',
 'doi': '10.1038/s41598-018-36116-6',
 'publisher': 'Nature',
 'publicationDate': '2018-12-07',
 'onlineDate': '2018-12-07',
 'coverDate': '2018-12',
 'printDate': '',
 'volume': '8',
 'number': '1',
 'issuetype': 'Regular',
 'topicalCollection': '',
 'startingPage': '1',
 'endingPage': '10',
 'copyright': '©2018 The Author(s)',
 'genre': 'OriginalPaper',
 'abstract': 'AbstractSocial interactions among humans create complex networks and – despite a recent increase of online communication – the in

In [None]:
soup = bs(r.text)

In [None]:
li = [x.contents[0] for x in soup.find('article').find_all('p', {'class':'Para'})]
li

In [None]:
# print(soup.find('article').prettify())
li = [x.get_text() for x in soup.find('article').find_all('p')]
s = 'Springer Nature remains neutral '
li

# Pubmed Central (Entrez)

In [16]:
# search pubmed central for free full text articles containing selected query

# get the ids which we then use to get the xml text data
# each phrase is wrapped in url-encoded quotes (%22) with its spaces turned into '+'
terms = ['%22' + s.replace(' ', '+') + '%22' for s in settings.terms]
term = 'term=' + '%28' + '+OR+'.join(terms) + '%29'
fulltext = 'free+fulltext%5bfilter%5d'
base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc'
# the search term is already url-encoded, so it stays in the url string;
# the remaining parameters go through `params` so that the email is actually sent
# (requests leaves out the email when user_email is None)
params = {'retmax': 2000, 'email': user_email}
url = base + '&' + term + '+' + fulltext
r = requests.get(url, params=params)
ids = [str(x.contents[0]) for x in bs(r.text, 'lxml').find_all('id')]

# get xml text data and save to disk
efetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
for i in ids:
    pmc_id = 'pmc' + i
    fn = pmc_id + '.xml'
    dst = settings.pubmed_dir / fn
    if not os.path.exists(dst):
        # the id is passed once, through params, rather than in both the url and params
        r = requests.get(efetch_url, params={'db': 'pmc', 'id': i, 'email': user_email})
        time.sleep(0.5)
        # do not store an error response as an article: the exists() check above would
        # then skip this id on every later run
        if not r.ok:
            print('Skipping %s: HTTP status %s' % (pmc_id, r.status_code))
            continue
        # write the bytes exactly as served so the result does not depend on the
        # platform's default text encoding
        with open(dst, 'wb') as f:
            f.write(r.content)

# STOP

# IEEE Xplore

In [None]:
# base = 'http://ieeexploreapi.ieee.org/api/v1/search/articles?'
# url = base + q
# params = {'max_records': 20, 'start_record': 1, 'querytext': q, 'apikey': ieee_api_key}
# params_ = copy.deepcopy(params)

# ieee_metadata = []
# while True:
#     r = requests.get(url, params_)
#     if params_['start_record'] > r.json()['total_records']:
#         break
#     for article in r.json()['articles']:
#         if i['access_type'] != 'LOCKED':
#             ieee_metadata.append(article)
#     params_['start_record'] = params_['start_record'] + params_['max_records']
#     time.sleep(wait_time)
# print(len(ieee_metadata))

In [None]:
# NOTE(review): `path` and `pmc_articles` are not assigned anywhere in the visible notebook;
# presumably `path` is a pathlib.Path to one saved PMC xml file and `pmc_articles` is a dict - confirm
# parse first, so that text, metadata and doi exist before they are used below
text = pubmed_parser.parse_pubmed_paragraph(str(path.absolute()), all_paragraph=True)
metadata = pubmed_parser.parse_pubmed_xml(str(path.absolute()))
doi = metadata.pop('doi')

# keep the article only when the paragraph parser returned something
if len(text) > 0:
    pmc_articles[doi] = {'metadata': metadata, 'text': text}


In [None]:
pmc_articles[doi]

In [None]:
len(pmc_articles)

In [None]:
pmc_articles

In [None]:
paragraphs

In [None]:
# r = requests.get(url, params={'id': i})


In [None]:
src = r'E:\Users\Peter_Rasmussen\gh\multivac\data\raw\pubmed\pmc4760143'

In [None]:
# the parsing answer: https://github.com/titipata/pubmed_parser


In [None]:
url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=6214536'
r = requests.get(url)

In [None]:
pubmed_parser.parse_xml_web(ids[28], save_xml=False)

In [None]:
xml = r.text

In [None]:
imp

In [None]:
import pubmed_parser

In [None]:
dicts_out = pp.parse_pubmed_paragraph('data/6605965a.nxml', all_paragraph=False)

In [None]:
print(xml.keys())

In [None]:
oa_file_list_ = pd.read_csv('ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv')

In [None]:
oa_file_list_['Article Citation'].unique()

In [None]:
# tbd

url = 'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=ListRecords&metadataPrefix=pmc&term=%22sir+model%22+OR%22susceptible+-+infected+-+recovered%22&set=pmc-open'
r = requests.get(url)

In [None]:
r.text[:1000]

# Old

In [None]:



# def process_article_metadata(metadata):
#     """Extract metadata for one article and organize metadata into a dictionary.
#     Inputs:
#         metadata    String; Read in from API call
#     Outputs:
#         d  Dictionary of article metadata
#     """

#     # for each article extract and organize metadata
#     metadata = bs(metadata.strip(), 'lxml')

#     # each article's metadata contained in a dictionary
#     d = {}

#     # add single-value attributes
#     single_value_attributes = ['id', 'updated', 'published', 'title', 'summary', 'doi']
#     for single_value_attribute in single_value_attributes:
#         try:
#             v = metadata.find(single_value_attribute).contents[0].strip()
#         except AttributeError:
#             # some articles don't have doi numbers so fall back on arxiv doi
#             if single_value_attribute=='doi':
#                 v = d['id']
#             else:
#                 v = None
#         d[single_value_attribute] = v

#     # add multiple-value attributes and edge-case single-value attributes
#     d['arxiv:primary_category'] = metadata.find('arxiv:primary_category').attrs['term']
#     d['arxiv_categories'] = [x['term'] for x in metadata.find_all('category')]
#     d_links = {}
#     for link in metadata.find_all('link'):
#         if 'title' in link.attrs:
#             k, v = link.attrs['title'], link.attrs['href']
#             d_links[k] = v
#     d['links'] = d_links
#     d['authors'] = [x.contents[0] for x in metadata.find_all('name')]

#     return d


# def get_metadata_from_page(xml_text):
#     """
#     Usage of output:
#         As an OrderedDict, d_page_metadata has the usual dictionary functionality
#         It can also be accessed like a list using the approach below (Python 3 approach below):
#             items = list(d_page_metadata.items())
#             items[0]
#     """
    
#     # prep metadata returned by api query
#     articles_metadata = re.sub(' +', ' ', xml_text.replace('arxiv:doi', 'doi').replace('\n', ' ')).strip().split('<entry>')[1:]

#     # iterate over each article and extract and organize metadata
#     d_page_metadata = OrderedDict()
#     for article_metadata in articles_metadata:
#         v = copy.deepcopy(process_article_metadata(article_metadata))
#         k = v.pop('doi')
#         d_page_metadata[k] = v
    
#     return d_page_metadata



In [None]:
# ps2ascii, gzip
import gzip
import os

In [None]:
f = gzip.open(raw_src, 'rb')

In [None]:
ps = f.read()

In [None]:
f.close()

In [None]:
# use the configured arxiv directory (the one the download cell writes to) instead of a
# machine-specific absolute path
# NOTE(review): assumes settings.arxiv_dir resolves to data/raw/arxiv under the project root - confirm
raw_dir = settings.arxiv_dir
raw_src = os.path.join(raw_dir, '1411.2370v2.ps.gz')
# with gzip.open(raw_src, 'rb') as f:
#     ps = f.read()
    
def opener(filename):
    """Open filename for binary reading, decompressing it when it is gzip data.

    Inputs:
        filename    Path of the file to open
    Outputs:
        Binary file object; a gzip reader when the file starts with the bytes
        0x1f 0x8b, otherwise the plain file positioned at its start
    """
    f = open(filename, 'rb')
    # the file is read in binary mode, so the signature has to be compared as bytes
    if f.read(2) == b'\x1f\x8b':
        # reopen through gzip so that closing the returned object also closes the file
        f.close()
        return gzip.open(filename, 'rb')
    f.seek(0)
    return f
f = opener(raw_src)

In [None]:
ps = f.read()
f.close()

In [None]:
with opener(raw_src) as f:
    ps = f.read().decode('iso-8859-1')

In [None]:
len(ps)

In [None]:
list(d_metadata.items())[0]

In [None]:
params_

In [None]:
n_results

In [None]:
len(d_metadata.keys())

In [None]:
next(a)

In [None]:
print()
print(80 * '*')
print(xml_text)


In [None]:

for link in article_metadata.find_all('link'):
    pass

In [None]:

if 'author' in link.attrs:
    k, v = link.attrs['title'], link.attrs['href']
    d_links[k] = v

In [None]:
link.attrs

In [None]:
v

In [None]:
link

In [None]:
link.attrs

In [None]:
article_metadata.find_all('link')[0].attrs

In [None]:

summary = ''.join(soup.find('summary').contents).strip().replace('\n', ' ')
title = ''.join(soup.find('title').contents).strip().replace('\n', ' ')
title

In [None]:

soup.find_all('title')

In [None]:
'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10'

In [None]:
import xml

In [None]:
e = xml.etree.ElementTree.parse('xml.xml')

In [None]:
e.findall('title')

In [None]:
bf.data(r.text)

In [None]:
r.text

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is loaded
import urllib.request

# minimal arxiv api call: first 10 results whose fields match the search term
base = 'http://export.arxiv.org/api/'
method = 'query'
search_term = 'electron'
parameters = 'search_query=all:' + search_term + '&start=0&max_results=10'
query = base + method + '?' + parameters
data = urllib.request.urlopen(query).read()
print(data)

In [None]:
data.