# Prepare notebook

In [1]:
%load_ext dotenv
%load_ext autoreload
%autoreload 2
import os
# Move the working directory up two levels to the project root.
# The root is computed only the first time this cell runs in a kernel and is
# cached in PROJECT_ROOT, so re-running the cell is safe: it no longer climbs
# another two directories on every execution.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
os.chdir(PROJECT_ROOT)
print('Current working directory is %s' % os.getcwd())
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from src import utilities
import settings
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from collections import OrderedDict
import copy
import re
import feedparser
import pubmed_parser
import time
import slate
import re
import pickle

Current working directory is E:\Users\Peter_Rasmussen\gh\multivac
Directory data\raw already exists
Directory data\interim already exists
Directory data\processed already exists
Directory data\raw\arxiv already exists
Directory data\raw\pubmed already exists
Directory data\raw\springer already exists


# Arxiv

In [120]:
def parse_pdf(src):
    """Extract the main body text from a PDF file.

    The file is read with ``slate.PDF``; every item it yields (presumably one
    string per page -- TODO confirm) has its whitespace collapsed, and the items
    are joined with single spaces. The result is trimmed to what follows the
    last ' Abstract ' marker and precedes the first ' Acknowledgments ' and
    ' ∗ ∗ ∗ ' markers.

    Parameters
    ----------
    src : str or path-like
        Location of the PDF file.

    Returns
    -------
    str or None
        The extracted text, or None when the file cannot be opened or parsed.
    """
    try:
        with open(src, 'rb') as f:
            pages = slate.PDF(f)

        # collapse newlines and repeated spaces inside each page, then join pages
        flat = ' '.join(' '.join(page.split()) for page in pages)

        # keep only the span between the abstract heading and the back matter
        body = flat.split(' Abstract ')[-1]
        body = body.split(' Acknowledgments ')[0]
        text = body.split(' ∗ ∗ ∗ ')[0].strip()

    # Best-effort parsing: any ordinary failure (missing file, PDF syntax error, ...)
    # yields None. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit still stop a long batch run.
    except Exception:
        text = None

    return text

def parse_html(src):
    """Return the whitespace-normalised text of the ``<article>`` element of an HTML file.

    The file at `src` is read as UTF-8 and parsed with BeautifulSoup (`bs`).
    Returns None when the document contains no ``<article>`` element.
    """
    with open(src, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is installed.
    soup = bs(markup)

    try:
        article_text = soup.find('article').get_text()
    except AttributeError:
        # find() returned None: there is no <article> in this document
        return None

    return ' '.join(article_text.split())



In [118]:
article_metadata['fn']

'1810.11918v1.pdf'

In [121]:
# Locations for the arXiv corpus: metadata pickle, raw-file directory and interim output pickle.
# NOTE(review): all three names are reassigned to the Springer locations further down in this
# cell before parse_articles_data() is called, so the arXiv values are never used here.
metadata_src = settings.metadata_dir / 'arxiv.pkl'
data_raw_dir = settings.arxiv_dir
data_interim_dst = settings.interim_dir / 'arxiv.pkl'

def parse_articles_data(source, metadata_src, data_raw_dir, data_interim_dst, verbose=False, skip_empty=True):
    """Parse the raw article files of one source and pickle the combined result.

    Parameters
    ----------
    source : str
        Either 'arxiv' (PDF files, parsed with parse_pdf) or 'springer'
        (HTML files, parsed with parse_html).
    metadata_src : path-like
        Pickle file holding an iterable of per-article metadata dicts. Every
        dict needs an 'fn' entry (file name inside `data_raw_dir`); Springer
        entries also need 'doi'.
    data_raw_dir : pathlib.Path
        Directory containing the raw article files (joined to 'fn' with `/`).
    data_interim_dst : path-like
        Where the resulting OrderedDict is pickled.
    verbose : bool, default False
        Print each file name as it is processed.
    skip_empty : bool, default True
        Leave out articles whose parsed text is None or empty, as
        aggregate_pubmed does. Pass False to keep every article.
        NOTE(review): the original cell stopped at an unfinished ``if len()``
        here; filtering on empty text is the presumed intent -- confirm.

    Returns
    -------
    collections.OrderedDict
        Maps an article key (arXiv: file name without the '.pdf' suffix;
        Springer: DOI) to an OrderedDict with 'metadata' (a deep copy of the
        input metadata plus a 'source' entry) and 'text'.

    Raises
    ------
    ValueError
        If `source` is not 'arxiv' or 'springer'.
    """
    # fail fast, before any file is read or written
    if source == 'pubmed':
        raise ValueError('pubmed not supported. Only "arxiv" and "springer" supported. Try "parse_pubmed() function"')
    if source not in ('arxiv', 'springer'):
        raise ValueError('Only "arxiv" and "springer" supported as sources.')

    # load metadata
    with open(metadata_src, 'rb') as f:
        metadata_ = pickle.load(f)

    # parsed articles, keyed on doi or other id, in metadata order
    data = OrderedDict()

    for article_metadata in metadata_:

        # copy so the caller's metadata objects are never mutated
        temp = OrderedDict()
        temp['metadata'] = copy.deepcopy(article_metadata)
        temp['metadata']['source'] = source
        fn = article_metadata['fn']
        if verbose:
            print(fn)
        src = data_raw_dir / fn

        # define key and value
        if source == 'arxiv':
            # remove the suffix only; str.strip('.pdf') would also eat leading or
            # trailing '.', 'p', 'd' and 'f' characters belonging to the id itself
            k = fn[:-len('.pdf')] if fn.endswith('.pdf') else fn
            temp['text'] = parse_pdf(src)
        else:
            k = article_metadata['doi']
            temp['text'] = parse_html(src)

        # populate interim dictionary
        if skip_empty and not temp['text']:
            continue
        data[k] = temp

    # save intermediate outputs
    with open(data_interim_dst, 'wb') as f:
        pickle.dump(data, f)

    return data

# Parse every downloaded Springer HTML article and pickle the result to the interim directory.
source = 'springer'
metadata_src = settings.metadata_dir / 'springer.pkl'
data_raw_dir = settings.springer_dir
data_interim_dst = settings.interim_dir / 'springer.pkl'
# pass `source` rather than repeating the literal, so the label stored in the
# metadata cannot drift from the configuration above
springer_data = parse_articles_data(source, metadata_src, data_raw_dir, data_interim_dst, verbose=True)

10.1038-s41598-018-36116-6.html
10.1007-s40096-018-0271-3.html
10.1007-s10816-018-9391-1.html
10.1186-s13662-018-1891-5.html
10.1186-s13662-018-1875-5.html
10.1038-s42003-018-0197-1.html
10.1186-s12976-018-0090-0.html
10.1038-s41419-018-1189-2.html
10.1186-s12936-018-2560-6.html
10.1140-epjb-e2018-90136-3.html
10.1007-s11222-017-9789-8.html
10.1007-s11538-018-0491-6.html
10.1186-s12942-018-0157-5.html
10.1186-s13662-018-1839-9.html
10.1038-s41598-018-33313-1.html
10.1186-s13662-018-1820-7.html
10.1186-s12879-018-3388-y.html
10.1186-s13662-018-1808-3.html
10.1186-s12711-018-0418-6.html
10.1007-s11538-018-0484-5.html
10.1007-s10287-018-0315-z.html
10.1186-s13662-018-1805-6.html
10.1186-s13756-018-0406-1.html
10.1038-s41598-018-32631-8.html
10.1186-s13662-018-1791-8.html
10.1007-s11538-018-0509-0.html
10.1140-epjds-s13688-018-0162-8.html
10.1038-s41598-018-31346-0.html
10.1007-s40864-018-0084-6.html
10.1007-s40096-018-0261-5.html
10.1186-s13662-018-1759-8.html
10.1186-s40537-018-0137-4.ht

10.1007-s10867-014-9373-9.html
10.1007-s10955-014-1024-9.html
10.1186-s13662-014-0332-3.html
10.1186-s13662-015-0355-4.html
10.1007-978-3-319-13755-1_8.html
10.1007-978-3-319-20591-5_13.html
10.1186-1742-4682-11-52.html
10.1186-1687-1847-2014-320.html
10.1186-1475-2875-13-486.html
10.1186-s12911-014-0108-4.html
10.1186-s40294-014-0006-8.html
10.1186-s13059-014-0541-9.html
10.1186-s13059-014-0538-4.html
10.1186-s12918-014-0126-y.html
10.1186-1687-1847-2014-278.html
10.1140-epjds-s13688-014-0029-6.html
10.1186-1687-1847-2014-270.html
10.1186-s40649-014-0003-2.html
10.1186-1471-2334-14-539.html
10.1186-1471-2458-14-1019.html
10.1186-s13388-014-0009-1.html
10.1186-1687-1847-2014-246.html
10.1186-1471-2334-14-505.html
10.1186-1687-1499-2014-127.html
10.1186-1687-1847-2014-172.html
10.1186-1687-1847-2014-168.html
10.1186-1687-1847-2014-164.html
10.1186-1297-9716-45-58.html
10.1007-s11538-014-9942-x.html
10.1186-1746-6148-10-101.html
10.1186-2049-9957-3-12.html
10.1007-s00477-013-0776-0.html


In [134]:
src

'pmc1181873.xml'

In [147]:
# make pubmed metadata
source = 'pubmed'
metadata_src = settings.metadata_dir / 'pubmed.pkl'
data_raw_dir = settings.pubmed_dir
data_interim_dst = settings.interim_dir / 'pubmed.pkl'


def parse_pubmed(src):
    """Parse one PubMed XML file with ``pubmed_parser``.

    Returns a ``(metadata, text)`` tuple: `metadata` is whatever
    ``parse_pubmed_xml`` returns for `src`; `text` is the 'text' field of
    every paragraph joined into one whitespace-normalised string.
    """
    article_metadata = pubmed_parser.parse_pubmed_xml(src)
    paragraphs = pubmed_parser.parse_pubmed_paragraph(src, all_paragraph=True)

    # join the paragraphs first, then squeeze every run of whitespace to one space
    joined = ' '.join([paragraph['text'] for paragraph in paragraphs])
    normalised = ' '.join(joined.split())

    return article_metadata, normalised

# Only the XML files, in a deterministic order. os.listdir() returns every
# directory entry (including non-XML files) in arbitrary order.
# NOTE(review): assumes settings.pubmed_dir is a pathlib.Path, as the `/` joins elsewhere suggest.
srcs = sorted(settings.pubmed_dir.glob('*.xml'))

def aggregate_pubmed(srcs):
    """Parse a collection of PubMed XML files into one ordered mapping.

    Parameters
    ----------
    srcs : iterable of path-like
        Locations of the XML files to parse.

    Returns
    -------
    collections.OrderedDict
        Maps each article's DOI to an OrderedDict with 'metadata' and 'text'.
        When the metadata has no DOI (missing or empty), the file name without
        its extension is used as the key. Articles with empty text are left out.

    Files that fail to parse are reported with an 'Error: ...' line and
    skipped; every file that parsed is echoed to stdout.
    """
    pubmed_data = OrderedDict()
    for src in srcs:
        src_path = Path(src)

        # Only the parse itself is best-effort. The original wrapped the whole
        # loop body in a bare `except`, which hid programming errors.
        try:
            metadata, text = parse_pubmed(str(src_path.absolute()))
        except Exception:
            print('Error: %s' % src)
            continue

        # A missing key raises KeyError, not AttributeError, and Path has no
        # .strip(), so the original fallback could never work.
        # NOTE(review): assumes metadata is a dict (or None) -- confirm against pubmed_parser.
        k = (metadata or {}).get('doi') or src_path.stem

        if len(text) > 0:
            temp = OrderedDict()
            temp['metadata'] = metadata
            temp['text'] = text
            pubmed_data[k] = temp
        print(src)
    return pubmed_data

pubmed_data = aggregate_pubmed(srcs)

data\raw\pubmed\pmc1181873.xml
data\raw\pubmed\pmc1183543.xml
data\raw\pubmed\pmc130633.xml
data\raw\pubmed\pmc1336740.xml
data\raw\pubmed\pmc1360062.xml
data\raw\pubmed\pmc1364502.xml
data\raw\pubmed\pmc1413717.xml
data\raw\pubmed\pmc1462849.xml
data\raw\pubmed\pmc153427.xml
data\raw\pubmed\pmc1560197.xml
data\raw\pubmed\pmc1560306.xml
data\raw\pubmed\pmc1564099.xml
data\raw\pubmed\pmc1564112.xml
data\raw\pubmed\pmc1570461.xml
data\raw\pubmed\pmc1578275.xml
data\raw\pubmed\pmc1578276.xml
data\raw\pubmed\pmc1584413.xml
data\raw\pubmed\pmc1592533.xml
data\raw\pubmed\pmc1599940.xml
data\raw\pubmed\pmc1634794.xml
data\raw\pubmed\pmc1634916.xml
data\raw\pubmed\pmc1635475.xml
data\raw\pubmed\pmc1635496.xml
data\raw\pubmed\pmc1664645.xml
data\raw\pubmed\pmc1679884.xml
data\raw\pubmed\pmc1688553.xml
data\raw\pubmed\pmc1688890.xml
data\raw\pubmed\pmc1689474.xml
data\raw\pubmed\pmc1689913.xml
data\raw\pubmed\pmc1689929.xml
data\raw\pubmed\pmc1689932.xml
data\raw\pubmed\pmc1690196.xml
data\raw\p

data\raw\pubmed\pmc3004963.xml
data\raw\pubmed\pmc3006469.xml
data\raw\pubmed\pmc3011950.xml
data\raw\pubmed\pmc3012516.xml
data\raw\pubmed\pmc3017469.xml
data\raw\pubmed\pmc3018805.xml
data\raw\pubmed\pmc3020521.xml
data\raw\pubmed\pmc3023704.xml
data\raw\pubmed\pmc3024411.xml
data\raw\pubmed\pmc3024819.xml
data\raw\pubmed\pmc3024853.xml
data\raw\pubmed\pmc3029241.xml
data\raw\pubmed\pmc3030819.xml
data\raw\pubmed\pmc3030822.xml
data\raw\pubmed\pmc3033029.xml
data\raw\pubmed\pmc3033994.xml
data\raw\pubmed\pmc3034726.xml
data\raw\pubmed\pmc3037387.xml
data\raw\pubmed\pmc3040644.xml
data\raw\pubmed\pmc3040699.xml
data\raw\pubmed\pmc3041824.xml
data\raw\pubmed\pmc3045989.xml
data\raw\pubmed\pmc3049840.xml
data\raw\pubmed\pmc3057370.xml
data\raw\pubmed\pmc3059657.xml
data\raw\pubmed\pmc3060161.xml
data\raw\pubmed\pmc3060792.xml
data\raw\pubmed\pmc3061093.xml
data\raw\pubmed\pmc3062980.xml
data\raw\pubmed\pmc3062985.xml
data\raw\pubmed\pmc3062991.xml
data\raw\pubmed\pmc3063006.xml
data\raw

data\raw\pubmed\pmc3751816.xml
data\raw\pubmed\pmc3752018.xml
data\raw\pubmed\pmc3753349.xml
data\raw\pubmed\pmc3756967.xml
data\raw\pubmed\pmc3758010.xml
data\raw\pubmed\pmc3758189.xml
data\raw\pubmed\pmc3758892.xml
data\raw\pubmed\pmc3767637.xml
data\raw\pubmed\pmc3767837.xml
data\raw\pubmed\pmc3779324.xml
data\raw\pubmed\pmc3779923.xml
data\raw\pubmed\pmc3780334.xml
data\raw\pubmed\pmc3781295.xml
data\raw\pubmed\pmc3785461.xml
data\raw\pubmed\pmc3785835.xml
data\raw\pubmed\pmc3785836.xml
data\raw\pubmed\pmc3786716.xml
data\raw\pubmed\pmc3799982.xml
data\raw\pubmed\pmc3800049.xml
data\raw\pubmed\pmc3805536.xml
data\raw\pubmed\pmc3806247.xml
data\raw\pubmed\pmc3812071.xml
data\raw\pubmed\pmc3813836.xml
data\raw\pubmed\pmc3814409.xml
data\raw\pubmed\pmc3823847.xml
data\raw\pubmed\pmc3836038.xml
data\raw\pubmed\pmc3836275.xml
data\raw\pubmed\pmc3841118.xml
data\raw\pubmed\pmc3842399.xml
data\raw\pubmed\pmc3843840.xml
data\raw\pubmed\pmc3844175.xml
data\raw\pubmed\pmc3846174.xml
data\raw

data\raw\pubmed\pmc4575748.xml
data\raw\pubmed\pmc4578724.xml
data\raw\pubmed\pmc4585889.xml
data\raw\pubmed\pmc4585982.xml
data\raw\pubmed\pmc4587782.xml
data\raw\pubmed\pmc4589282.xml
data\raw\pubmed\pmc4590493.xml
data\raw\pubmed\pmc4590508.xml
data\raw\pubmed\pmc4590968.xml
data\raw\pubmed\pmc4593682.xml
data\raw\pubmed\pmc4595069.xml
data\raw\pubmed\pmc4605924.xml
data\raw\pubmed\pmc4609962.xml
data\raw\pubmed\pmc4611170.xml
data\raw\pubmed\pmc4614486.xml
data\raw\pubmed\pmc4617890.xml
data\raw\pubmed\pmc4619523.xml
data\raw\pubmed\pmc4619720.xml
data\raw\pubmed\pmc4622568.xml
data\raw\pubmed\pmc4623768.xml
data\raw\pubmed\pmc4623926.xml
data\raw\pubmed\pmc4624994.xml
data\raw\pubmed\pmc4626091.xml
data\raw\pubmed\pmc4627738.xml
data\raw\pubmed\pmc4629194.xml
data\raw\pubmed\pmc4629340.xml
data\raw\pubmed\pmc4632181.xml
data\raw\pubmed\pmc4633029.xml
data\raw\pubmed\pmc4633114.xml
data\raw\pubmed\pmc4634202.xml
data\raw\pubmed\pmc4635205.xml
data\raw\pubmed\pmc4640514.xml
data\raw

data\raw\pubmed\pmc5341736.xml
data\raw\pubmed\pmc5348083.xml
data\raw\pubmed\pmc5349474.xml
data\raw\pubmed\pmc5349521.xml
data\raw\pubmed\pmc5350287.xml
data\raw\pubmed\pmc5352813.xml
data\raw\pubmed\pmc5353708.xml
data\raw\pubmed\pmc5354248.xml
data\raw\pubmed\pmc5354280.xml
data\raw\pubmed\pmc5354285.xml
data\raw\pubmed\pmc5358897.xml
data\raw\pubmed\pmc5360116.xml
data\raw\pubmed\pmc5368649.xml
data\raw\pubmed\pmc5373349.xml
data\raw\pubmed\pmc5377920.xml
data\raw\pubmed\pmc5380336.xml
data\raw\pubmed\pmc5382368.xml
data\raw\pubmed\pmc5383836.xml
data\raw\pubmed\pmc5386204.xml
data\raw\pubmed\pmc5388502.xml
data\raw\pubmed\pmc5388773.xml
data\raw\pubmed\pmc5394686.xml
data\raw\pubmed\pmc5398581.xml
data\raw\pubmed\pmc5402400.xml
data\raw\pubmed\pmc5404831.xml
data\raw\pubmed\pmc5405516.xml
data\raw\pubmed\pmc5405999.xml
data\raw\pubmed\pmc5412123.xml
data\raw\pubmed\pmc5413039.xml
data\raw\pubmed\pmc5413876.xml
data\raw\pubmed\pmc5414272.xml
data\raw\pubmed\pmc5423408.xml
data\raw

data\raw\pubmed\pmc6153944.xml
data\raw\pubmed\pmc6160446.xml
data\raw\pubmed\pmc6165116.xml
data\raw\pubmed\pmc6168140.xml
data\raw\pubmed\pmc6170776.xml
data\raw\pubmed\pmc6171287.xml
data\raw\pubmed\pmc6173453.xml
data\raw\pubmed\pmc6182842.xml
data\raw\pubmed\pmc6185890.xml
data\raw\pubmed\pmc6185894.xml
data\raw\pubmed\pmc6192647.xml
data\raw\pubmed\pmc6193587.xml
data\raw\pubmed\pmc6193630.xml
data\raw\pubmed\pmc6195572.xml
data\raw\pubmed\pmc6198259.xml
data\raw\pubmed\pmc6202716.xml
data\raw\pubmed\pmc6202722.xml
data\raw\pubmed\pmc6208014.xml
data\raw\pubmed\pmc6209289.xml
data\raw\pubmed\pmc6211691.xml
data\raw\pubmed\pmc6214278.xml
data\raw\pubmed\pmc6214536.xml
data\raw\pubmed\pmc6223030.xml
data\raw\pubmed\pmc6227249.xml
data\raw\pubmed\pmc6241031.xml
data\raw\pubmed\pmc6245120.xml
data\raw\pubmed\pmc6258546.xml
data\raw\pubmed\pmc6264791.xml
data\raw\pubmed\pmc6275108.xml


In [146]:
pubmed_data.keys()
pubmed_data['10.1371/journal.pmed.0020174']['text']

"The past decade has seen a dramatic increase in the significance attached to infectious diseases from the public health perspective. This trend is due in part to the emergence of new and highly pathogenic infections such as Ebola [1], West Nile virus [2], and SARS [3]. There are also well-publicized concerns surrounding the deliberate introduction of pathogens as bioterrorism weapons [4,5], and the continued persistence and resurgence of older infections, several of which now boast strains resistant to more than one drug [6]. In addition, there have been a number of high-profile and economically expensive disease outbreaks in domestic livestock [7–9] as well as wildlife populations [10]. The effective management and control of such infections is increasingly done with substantial input from mathematical models, which are used not only to provide information on the nature of the infection itself, through estimates of key parameters such as the basic reproductive ratio R 0 [11], but als

In [149]:
# Merge the three corpora into one mapping, starting from a deep copy of the PubMed data.
# dict.update() overwrites existing keys, so for a key present in several corpora
# the entry from the later update (arXiv, then Springer) wins.
# NOTE(review): `arxiv_data` is not built by any live cell in this notebook (the code that
# creates it is commented out), so this cell depends on leftover kernel state -- TODO confirm.
output = copy.deepcopy(pubmed_data)
output.update(arxiv_data)
output.update(springer_data)

In [154]:
import json

# Destination directory for the export; utilities.mkdir is called on it before writing.
# NOTE(review): hard-coded absolute path -- consider moving it into settings.
dst_dir = os.path.join(r'L:\Atlanta-30_days_only', 'Domonique')
utilities.mkdir(dst_dir)

# Serialise the merged corpus to <dst_dir>/data.json.
dst = os.path.join(dst_dir, 'data.json')
with open(dst, 'w') as f:
    json.dump(output, f)

Directory L:\Atlanta-30_days_only\Domonique already exists


In [155]:
print(len(output))
output.keys()


odict_keys(['10.1371/journal.pmed.0020174', '10.1186/1475-925X-4-69', '10.1186/1476-072X-5-4', '10.1186/1475-925X-2-4', '10.1186/1742-4682-3-32', '10.1371/journal.pmed.0030387', '10.1371/journal.pone.0000012', '10.1371/journal.pone.0000165', '10.1371/journal.pmed.0040015', '10.1371/journal.pcbi.0030085', '10.1371/journal.pmed.0040174', '10.1186/1746-6148-3-10', '10.1098/rspb.2006.0030', '10.1016/j.jtbi.2006.04.003', '10.1371/journal.pone.0000747', '10.1186/1471-2458-3-23', '10.1371/journal.pcbi.0030159', '10.1371/journal.pone.0000758', '10.1371/journal.pone.0001458', '10.1186/1471-2334-7-132', '10.1186/1741-7015-5-34', '10.1155/2007/64870', '10.1186/1471-2334-3-19', '10.1186/1751-0473-3-3', '10.1371/journal.pone.0001941', '10.1186/1746-6148-4-11', '10.1371/journal.pone.0002185', '10.1098/rsif.2007.1272', '10.1371/journal.pone.0002299', '10.1111/j.1365-2656.2007.01328.x', '10.1186/1753-4631-2-2', '10.1371/journal.pone.0002710', '10.1186/1476-072X-7-35', '10.1371/journal.pmed.0050200', '

In [137]:
len(pubmed_data)

937

In [86]:
# # load metadata
# with open(settings.metadata_dir / 'arxiv.pkl', 'rb') as f:
#     arxiv_metadata_ = pickle.load(f)

# we'll just add the text to a new arxiv object, an ordered dict keyed on doi or other id
# arxiv_data = OrderedDict()

# arxiv_srcs = [settings.arxiv_dir / x for x in os.listdir(settings.arxiv_dir)]

# for ix, article_metadata in enumerate(arxiv_metadata_[109:]):
#     fn = article_metadata['fn']
#     # define key, usually the doi
#     k = article_metadata['id'].split('/abs/')[-1]
    
#     # initialize temp dictionary
#     temp = OrderedDict()
#     temp['metadata'] = copy.deepcopy(article_metadata)
#     src = settings.arxiv_dir / fn
#     temp['data'] = parse_pdf(src)
    
#     arxiv_data[k] = temp
    
# Pickle `arxiv_data` to the interim directory.
# NOTE(review): the code above that would build `arxiv_data` is commented out, so this
# relies on a value left in the kernel by an earlier run -- TODO confirm / restore.
dst = settings.interim_dir / 'arxiv.pkl'
with open(dst, 'wb') as f:
    pickle.dump(arxiv_data, f)









































































































































































































































































































































































































































































































































































































































































































































































































































































































































# Springer

In [95]:
srcs = [settings.springer_dir / x for x in os.listdir(settings.springer_dir)]

# Try the parser on the first file only.
# NOTE(review): the original loop had no body (a syntax error). Parsing into `data`
# is presumed from the following cell, which displays `data` as article text -- confirm.
for src in srcs[:1]:
    data = parse_html(src)


In [113]:
data

'Ranaviruses pp 209-240 | Cite asDesign and Analysis of Ranavirus Studies: Surveillance and Assessing RiskAuthorsAuthors and affiliationsMatthew J. GrayJesse L. BrunnerJulia E. EarlEllen ArielOpen AccessChapter 2 Citations 11 Readers 5.3k Downloads AbstractRanaviruses are pathogens that cause disease in ectothermic vertebrate species, and are responsible for die-off events in multiple taxa across the globe. Understanding the threat of ranavirus in wild and captive populations is a growing conservation and economic interest. Quantifying risk is a central premise to understanding the threat of a pathogen, and surveillance studies are a logical starting point. In this chapter, we discuss how to design surveillance studies for ranavirus, required sample sizes, statistical analyses commonly used to analyze data, and modeling approaches to predict disease outcomes. Additionally, we cover the process of Import Risk Analysis, which quantifies the threat of ranavirus introduction into a new are

In [103]:
with open(settings.metadata_dir / 'springer.pkl', 'rb') as f:
    metadata_ = pickle.load(f)

In [106]:
metadata_[0]['fn']

'10.1038-s41598-018-36116-6.html'

'\nRanaviruses\n                pp 209-240 |\n                Cite asDesign and Analysis of Ranavirus Studies: Surveillance and Assessing RiskAuthorsAuthors and affiliationsMatthew\xa0J.\xa0GrayJesse\xa0L.\xa0BrunnerJulia\xa0E.\xa0EarlEllen\xa0ArielOpen AccessChapter \n\n\n2\nCitations\n\n\n\n\n11\nReaders\n\n\n\n\n5.3k\nDownloads\n\n\n\nAbstractRanaviruses are pathogens that cause disease in ectothermic vertebrate species, and are responsible for die-off events in multiple taxa across the globe. Understanding the threat of ranavirus in wild and captive populations is a growing conservation and economic interest. Quantifying risk is a central premise to understanding the threat of a pathogen, and surveillance studies are a logical starting point. In this chapter, we discuss how to design surveillance studies for ranavirus, required sample sizes, statistical analyses commonly used to analyze data, and modeling approaches to predict disease outcomes. Additionally, we cover the process of

In [None]:
# 
# Download the landing page of every Springer article and save it as
# <doi with '/' replaced by '-'>.html under raw_dir/springer.
# NOTE(review): `springer_metadata` is not defined in any visible cell -- TODO confirm its source.
for md in springer_metadata:
    # the DOI doubles as the file name; fall back to the identifier when it is empty
    fn = md['doi'].replace('/', '-')
    if len(fn) == 0:
        fn = md['identifier']
    dst = settings.raw_dir / 'springer' / (fn + '.html')

    # a timeout keeps one stalled connection from hanging the whole run
    r = requests.get(md['url'][0]['value'], timeout=30)
    if r.ok:
        html = bs(r.text).encode('utf-8').decode('utf-8')
        with open(dst, 'w', encoding='utf-8') as f:
            f.write(html)
    else:
        # do not save an HTTP error page as if it were the article
        print('Error: HTTP %s for %s' % (r.status_code, r.url))

    # pause between requests
    time.sleep(0.3)

In [32]:
soup = bs(r.text)

In [45]:
li = [x.contents[0] for x in soup.find('article').find_all('p', {'class':'Para'})]
li

['This paper investigates an SIS epidemic model with variable population size including a vaccination program. Dynamics of the endemic equilibrium of the model are obtained, and it will be shown that this equilibrium exists and is locally asymptotically stable when ',
 'The susceptible–infected–susceptible (SIS) model is one of the most well-known type of epidemic models. These models are appropriate for some infections, for instance, common cold and influenza, or bacterial diseases such as meningitis and cholera, or sexually transmitted diseases, that do not cause permanent immunity after recovery. To immunize individuals from infection and control the infectious diseases, vaccination is usually preferred because of its efficiency compared with other drug and non-drug interventions. There are many epidemic models [',
 'Here, the population has been divided into three subpopulations as susceptible, infected, and vaccinated individuals. The size of population in each class at time ',
 '

In [54]:
# print(soup.find('article').prettify())
li = [x.get_text() for x in soup.find('article').find_all('p')]
s = 'Springer Nature remains neutral '
li

['Mathematical Sciences',
 'December 2018, Volume 12, Issue\xa04,\n                        pp 313–320 | Cite as',
 'This paper investigates an SIS epidemic model with variable population size including a vaccination program. Dynamics of the endemic equilibrium of the model are obtained, and it will be shown that this equilibrium exists and is locally asymptotically stable when \\({\\mathcal {R}}_0 > 1\\). In this case, the disease uniformly persists, and moreover, using a geometric approach we conclude that the model is globally asymptotically stable under some conditions. Also, a numerical discussion is given to verify the theoretical results.',
 'The susceptible–infected–susceptible (SIS) model is one of the most well-known type of epidemic models. These models are appropriate for some infections, for instance, common cold and influenza, or bacterial diseases such as meningitis and cholera, or sexually transmitted diseases, that do not cause permanent immunity after recovery. To immu

In [30]:
text = pubmed_parser.parse_pubmed_paragraph(str((settings.raw_dir / 'springer' / 'test.xml').absolute()), all_paragraph=True)

Error: it was not able to read a path, a file-like object, or a string as an XML


XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1 (<string>, line 1)

In [27]:
r = requests.get(springer_metadata[1]['url'][0]['value'])


{'feed': {'html': {'lang': 'en-gb', 'class': 'no-js'},
  'meta': {'name': 'msapplication-TileImage',
   'content': '/springerlink-static/1645515599/images/favicon/ic_launcher_xxhdpi.png'},
  'links': [{'rel': 'canonical',
    'href': 'https://link.springer.com/article/10.1007/s40096-018-0271-3',
    'type': 'text/html'},
   {'rel': 'shortcut icon',
    'href': '/springerlink-static/1645515599/images/favicon/favicon.ico',
    'type': 'text/html'},
   {'rel': 'icon',
    'sizes': '16x16 32x32 48x48',
    'href': '/springerlink-static/1645515599/images/favicon/favicon.ico',
    'type': 'text/html'},
   {'rel': 'icon',
    'sizes': '16x16',
    'type': 'image/png',
    'href': '/springerlink-static/1645515599/images/favicon/favicon-16x16.png'},
   {'rel': 'icon',
    'sizes': '32x32',
    'type': 'image/png',
    'href': '/springerlink-static/1645515599/images/favicon/favicon-32x32.png'},
   {'rel': 'icon',
    'sizes': '48x48',
    'type': 'image/png',
    'href': '/springerlink-static/16

# IEEE Xplore

In [None]:
# Page through the IEEE Xplore search API and collect the metadata of every
# article whose access_type is not 'LOCKED'.
# NOTE(review): `q`, `ieee_api_key` and `wait_time` are not defined in any visible cell.
# NOTE(review): `q` is appended to the URL and also sent as `querytext` -- confirm which is intended.
base = 'http://ieeexploreapi.ieee.org/api/v1/search/articles?'
url = base + q
params = {'max_records': 20, 'start_record': 1, 'querytext': q, 'apikey': ieee_api_key}
params_ = copy.deepcopy(params)  # working copy; `params` keeps the first-page values

ieee_metadata = []
while True:
    r = requests.get(url, params_)
    payload = r.json()  # decode the response once per page
    if params_['start_record'] > payload['total_records']:
        break
    for article in payload['articles']:
        # the original tested `i['access_type']`, a name unrelated to this loop
        if article['access_type'] != 'LOCKED':
            ieee_metadata.append(article)
    # advance to the next page
    params_['start_record'] = params_['start_record'] + params_['max_records']
    time.sleep(wait_time)
print(len(ieee_metadata))

# Pubmed (Entrez)
* https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/

In [25]:
# Download all of it
# get the ids
url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=%22sir+model%22&set=pmc-open&metadataPrefix=pmc&usehistory=y&retmax=2000'


Directory data already exists
Directory data\raw already exists
Directory data\raw\arxiv already exists
Directory data\raw\springer already exists
Directory data\raw\pubmed already exists
Directory data\interim already exists
Directory data\processed already exists
Directory data\processed\metadata already exists


In [None]:
# search pubmed central for free full text articles containing selected query
# Build the ESearch URL by hand: each term is quoted (%22), spaces become '+', the
# terms are OR-ed inside parentheses (%28 ... %29) and the free-fulltext filter is appended.
terms = ['sir model', 'susceptible-infected-recovered']
replace = lambda s: s.replace(' ', '+')
quote = lambda s: '%22' + s + '%22'
terms = [quote(replace(s)) for s in terms]
term = 'term='+ '%28'+ '+OR+'.join(terms) + '%29'
fulltext = 'free+fulltext%5bfilter%5d'
retmax = 'retmax=2000'
base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc'
# The original had an unfinished, unused `params = {'retmax': 2000, 'email': }` here,
# which is a syntax error and has been dropped.
# TODO: pass a contact `email` once a value is available.
url = base + '&' + term + '+' + fulltext + '&' + retmax
r = requests.get(url)
# the article ids are the contents of the <id> elements of the response
ids = [x.contents[0] for x in bs(r.text).find_all('id')]
ids[5]

In [None]:
len(ids)

In [None]:
# Fetch the XML of every article id found by the search and save it as
# pmc<id>.xml under raw_dir/pubmed.
pmc_articles = OrderedDict()
for i in ids:
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=' + str(i)
    # the id is already in the URL, so it is not repeated via `params`;
    # the timeout keeps a stalled request from hanging the loop
    r = requests.get(url, timeout=60)
    xml = r.text
    pmc_id = 'pmc' + str(i)
    fn = (pmc_id + '.xml')
    path = settings.raw_dir / 'pubmed' / fn
    # explicit UTF-8, as in the Springer download: the platform default
    # encoding may not be able to represent every character
    with open(path, 'w', encoding='utf-8') as f:
        f.write(xml)
    time.sleep(0.5)

In [None]:
# Parse the file at `path`, then store it under its DOI.
# The original cell used `text`, `metadata` and `doi` before assigning them,
# so on a fresh kernel it failed (or silently stored values from an earlier run).
text = pubmed_parser.parse_pubmed_paragraph(str(path.absolute()), all_paragraph=True)
metadata = pubmed_parser.parse_pubmed_xml(str(path.absolute()))
doi = metadata.pop('doi')

# keep only articles for which at least one paragraph was extracted
if len(text) > 0:
    pmc_articles[doi] = {'metadata': metadata, 'text': text}


In [None]:
pmc_articles[doi]

In [None]:
len(pmc_articles)

In [None]:
pmc_articles

In [None]:
paragraphs

In [None]:
# r = requests.get(url, params={'id': i})


In [None]:
src = r'E:\Users\Peter_Rasmussen\gh\multivac\data\raw\pubmed\pmc4760143'

In [None]:
# the parsing answer: https://github.com/titipata/pubmed_parser


In [None]:
url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=6214536'
r = requests.get(url)

In [None]:
pubmed_parser.parse_xml_web(ids[28], save_xml=False)

In [None]:
xml = r.text

In [None]:
imp

In [None]:
import pubmed_parser

In [None]:
# The module is imported as `pubmed_parser`; the alias `pp` is never defined in this notebook.
dicts_out = pubmed_parser.parse_pubmed_paragraph('data/6605965a.nxml', all_paragraph=False)

In [None]:
print(xml.keys())

In [None]:
oa_file_list_ = pd.read_csv('ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv')

In [None]:
oa_file_list_['Article Citation'].unique()

In [None]:
# tbd

url = 'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=ListRecords&metadataPrefix=pmc&term=%22sir+model%22+OR%22susceptible+-+infected+-+recovered%22&set=pmc-open'
r = requests.get(url)

In [None]:
r.text[:1000]

# Old

In [None]:



# def process_article_metadata(metadata):
#     """Extract metadata for one article and organize metadata into a dictionary.
#     Inputs:
#         metadata    String; Read in from API call
#     Outputs:
#         d  Dictionary of article metadata
#     """

#     # for each article extract and organize metadata
#     metadata = bs(metadata.strip(), 'lxml')

#     # each article's metadata contained in a dictionary
#     d = {}

#     # add single-value attributes
#     single_value_attributes = ['id', 'updated', 'published', 'title', 'summary', 'doi']
#     for single_value_attribute in single_value_attributes:
#         try:
#             v = metadata.find(single_value_attribute).contents[0].strip()
#         except AttributeError:
#             # some articles don't have doi numbers so fall back on arxiv doi
#             if single_value_attribute=='doi':
#                 v = d['id']
#             else:
#                 v = None
#         d[single_value_attribute] = v

#     # add multiple-value attributes and edge-case single-value attributes
#     d['arxiv:primary_category'] = metadata.find('arxiv:primary_category').attrs['term']
#     d['arxiv_categories'] = [x['term'] for x in metadata.find_all('category')]
#     d_links = {}
#     for link in metadata.find_all('link'):
#         if 'title' in link.attrs:
#             k, v = link.attrs['title'], link.attrs['href']
#             d_links[k] = v
#     d['links'] = d_links
#     d['authors'] = [x.contents[0] for x in metadata.find_all('name')]

#     return d


# def get_metadata_from_page(xml_text):
#     """
#     Usage of output:
#         As an OrderedDict, d_page_metadata has the usual dictionary functionality
#         It can also be accessed like a list using the approach below (Python 3 approach below):
#             items = list(d_page_metadata.items())
#             items[0]
#     """
    
#     # prep metadata returned by api query
#     articles_metadata = re.sub(' +', ' ', xml_text.replace('arxiv:doi', 'doi').replace('\n', ' ')).strip().split('<entry>')[1:]

#     # iterate over each article and extract and organize metadata
#     d_page_metadata = OrderedDict()
#     for article_metadata in articles_metadata:
#         v = copy.deepcopy(process_article_metadata(article_metadata))
#         k = v.pop('doi')
#         d_page_metadata[k] = v
    
#     return d_page_metadata



In [None]:
# ps2ascii, gzip
import gzip
import os

In [None]:
f = gzip.open(raw_src, 'rb')

In [None]:
ps = f.read()

In [None]:
f.close()

In [None]:
raw_dir = r'E:\Users\Peter_Rasmussen\gh\multivac\data\raw\arxiv'
raw_src = os.path.join(raw_dir, '1411.2370v2.ps.gz')
# with gzip.open(raw_src, 'rb') as f:
#     ps = f.read()
    
def opener(filename):
    """Open `filename` for binary reading, decompressing it when it is gzip data.

    The first two bytes are compared with the gzip magic number. The file is
    opened in binary mode, so the magic must be a bytes literal: the original
    compared against a str, which is never equal in Python 3, and gzip files
    were therefore returned undecompressed.

    Returns a ``gzip.GzipFile`` for gzip input and a plain binary file object
    otherwise. Both are usable as context managers, and closing either one
    releases the underlying file.
    """
    with open(filename, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'

    # Reopen by name instead of wrapping the probe handle: GzipFile(fileobj=...)
    # does not close the wrapped file, which leaked a handle per call.
    if is_gzip:
        return gzip.open(filename, 'rb')
    return open(filename, 'rb')
f = opener(raw_src)

In [None]:
ps = f.read()
f.close()

In [None]:
with opener(raw_src) as f:
    ps = f.read().decode('iso-8859-1')

In [None]:
len(ps)

In [None]:
list(d_metadata.items())[0]

In [None]:
params_

In [None]:
n_results

In [None]:
len(d_metadata.keys())

In [None]:
next(a)

In [None]:
print()
print(80 * '*')
print(xml_text)


In [None]:

for link in article_metadata.find_all('link'):
    pass

In [None]:

if 'author' in link.attrs:
    k, v = link.attrs['title'], link.attrs['href']
    d_links[k] = v

In [None]:
link.attrs

In [None]:
v

In [None]:
link

In [None]:
link.attrs

In [None]:
article_metadata.find_all('link')[0].attrs

In [None]:

summary = ''.join(soup.find('summary').contents).strip().replace('\n', ' ')
title = ''.join(soup.find('title').contents).strip().replace('\n', ' ')
title

In [None]:

soup.find_all('title')

In [None]:
'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10'

In [None]:
import xml

In [None]:
e = xml.etree.ElementTree.parse('xml.xml')

In [None]:
e.findall('title')

In [None]:
bf.data(r.text)

In [None]:
r.text

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is loaded
import urllib.request

# Query the arXiv API for the first 10 results matching the search term in any field.
base = 'http://export.arxiv.org/api/'
method = 'query'
search_term = 'electron'
# The original had a stray bare `parameters` expression and a broken string literal here.
parameters = 'search_query=all:' + search_term + '&start=0&max_results=10'
query = base + method + '?' + parameters
data = urllib.request.urlopen(query).read()
print(data)

In [None]:
data.