# Fetch articles from PMID list

This notebook fetches all of the raw xml records for each pmid in the list of pmids that was generated by Caroline.

This outputs a bunch of files to the ./data/out/pmid_xmls folder in the format of PMID.xml (eg, '19297575.xml')

This takes a while to run, about 15 minutes to pull down at most 7206 pmid files.

In [1]:
import pandas as pd
import sys
sys.path.append('../')
from team1Python import eutil
from lxml import etree
import xmltodict
import json

In [7]:
eutils = eutil.EUtils(
        '8d4c4f67f2a663e9d0ef6ed4d60a4eedd609',  # API key
        'brian.lee@cdc.gov',  # Email address - unused
        20,  # API calls per second
        'https://eutilspreview.ncbi.nlm.nih.gov/entrez'
        # URL prefix for preview - normally not needed
    )


In [9]:
# taken from https://github.com/NCBI-Codeathons/pubmed-codeathon-team4/blob/main/bmcodeathon/team4/diagnose.py
def print_element(root, pretty_print=True):
    buf = etree.tostring(
        root,
        encoding='utf-8',
        pretty_print=pretty_print
    )
    return buf.decode('utf-8')

In [27]:
pmids = pd.read_csv('../data/out/pmids.csv')['pmid']

In [28]:
pmids

0       19297575
1       29911252
2       27960175
3       27549666
4       28381201
          ...   
7201    33988094
7202    35608744
7203    35603494
7204    35597972
7205    35596530
Name: pmid, Length: 7206, dtype: int64

In [33]:
for pmid in pmids:
    r = eutils.efetch('pubmed', pmid, rettype='xml')
    r.raise_for_status()
    assert r.headers['Content-Type'].startswith('text/xml')
    doc = r.xml()
    doc_pp = print_element(doc)
    f = open('../data/out/pmid_xmls/'+str(pmid)+'.xml', 'a')
    f.write(doc_pp)
    f.close()
    print(pmid)

19297575
29911252
27960175
27549666
28381201
30424960
20924291
27386589
33567654
31066331
33762446
30347298
18232331
34469866
32643683
16941427
12760172
33539822
3048657
32030929
35500358
35413541
35512575
35502663
35430511
35427840
35367845
35311740
34780101
34716882
35605621
35588715
35595656
35578209
34744103
35569986
35567597
35545931
35543128
35333329
30695970
19878403
34755952
33435739
33342168
27789971
31750635
28392653
29467580
34917368
29706457
29641827
18318880
31836115
30446193
17367622
17052510
16489838
29853755
17052516
35076344
34917368
33435739
35130384
34755952
33986606
33869705
33781159
33342168
32845296
32845295
32436416
32104041
31836115
31295443
31750635
31597359
31108976
30709965
29706457
30556770
34036632
33161603
25686105
27767278
21596182
29773275
32246440
31077002
21967869
32546426
31057541
28631068
29742056
27707729
28583784
28686751
25614348
29382382
27435956
35574994
35548670
35490273
35396080
35354288
35039323
35314925
35229164
35262642
35281325
35339140
35

35446392
20301288
35426432
35420598
35571990
26389342
35484894
35393544
31775340
34160155
25391139
35052414
33549200
26465709
18833216
17915571
22161416
26559449
25028051
25914343
34269712
32735634
30074569
24282836
30534954
7662715
18425888
21154353
35576861
35174518
35150142
35486202
35563042
35464873
35439216
35454873
34739082
35192358
35089317
34782065
35465342
35351818
35338174
35022222
35320153
35085960
34898394
35444719
17604717
15784165
32502837
16603397
30741586
16288295
20515819
11882383
24794538
33019927
26071042
24800899
15569636
18288941
23297823
14619975
26348913
23971717
32271398
33113244
35163623
34848462
34772336
34711805
34407201
34157521
34174504
34161152
33960434
32160078
33785835
33741019
33690729
33658617
33113244
33385405
33393983
33615993
33531984
33273014
21892142
28232179
28407774
19629071
26616193
31195981
28622524
35196887
28408430
31747940
30304866
34547233
29592806
27919039
30867601
26303471
34517004
22369257
22186033
23080146
35531887
35416120
34860639
35

35037787
35034396
34999600
35605143
35134489
35598279
35596256
35595719
35593522
35585582
35579856
35575880
35591890
35570406
35572462
31321757
29631795
223716
16776222
10332384
34404673
33143829
2008821
33761983
12430572
8438210
15950720
13032539
14777902
18127915
18103553
35100642
17425469
21167251
14640913
35606066
35600201
35290880
35275325
34851495
35606028
35585293
35254428
35226748
33760534
28723023
31424791
35576751
34841515
30252248
35170268
34821121
35597068
35028661
35436330
31650376
30713326
31791461
34135159
28365915
31905944
27294040
32805624
32618613
25799073
31712867
33833754
24193862
34280553
18206818
17490850
17541445
34755227
34321847
17701958
35586678
35257202
35189464
35175447
33820756
35596263
35576564
34744111
34670874
35526270
35527030
35505465
35379557
35303492
35486167
35558377
35487851
35548476
35465817
35435857
14751044
19088156
19088156
14751044
25408753
28271702
22819144
29420391
24366116
15541453
31069056
31488888
31812704
24393293
20801404
27165051
33640

32472450
34043195
32770451
32822773
29706611
35142659
35462297
35483214
35462165
35525236
35417751
35405260
35398340
35301029
34916419
35358621
35430448
35413535
35403008
35526456
35490443
35283326
35150691
35280925
35603052
30173207
28504679
34182088
31127258
31222186
32747763
34259824
30359598
27083772
33594012
31277860
23394773
33421491
25082707
34366798
33186530
32916145
34764308
33404500
33687961
35579602
35513229
35358499
35301029
35134251
35259414
35526456
35346832
35601692
35466006
35347633
35301086
35219855
35199098
35189258
35164969
35152448
35119605
34919273
34628661
25646382
18284371
34000560
32697942
28590682
32839445
34125337
27023731
26247860
31408264
33870125
27094096
34861597
26969452
30911185
24917300
30216605
24183026
24366126
21867877
35603057
35603052
35603046
35602445
35602439
34629508
35604244
35600693
35600222
35548881
35578732
35576648
35394755
35524981
35507662
35540110
35442617
35388840
35586491
35530119
30359598
32859716
32208366
28528964
29115042
34502208
1

26960936
22876388
32632968
25091797
35109781
35526650
35367821
35399702
35603471
35602933
35586893
33620821
35570091
35523788
35504863
35513284
35565326
35451372
35438788
35412885
35430731
35492620
35432336
35450328
29601862
33439750
33868253
18684880
8699856
356692
29275187
34148453
26149496
7013670
20936972
1974406
25319336
24445266
9515197
20627672
18799934
10780050
6390521
1601328
35600968
35412419
35363114
35352622
35306933
35254202
35244505
35232327
35225122
35188856
35188021
35180021
35166645
35125072
35119317
35118215
35030980
34978237
34967692
34964421
29935338
25327200
26195305
34490836
26542035
19654018
21640849
33174903
34096779
19729033
16318691
32853741
19824790
20399814
16108911
18375178
11951140
17661631
29145870
19720074
34842509
35452990
35499330
35590480
35575735
35304125
29999865
35552506
35549771
35546539
35569405
29939550
35306427
35114377
34342700
35591130
35565571
35475639
35138121
35454225
32819774
20973657
34486526
31885244
20042306
33207065
29519505
29664374


31201126
34130146
25695155
25714655
33495273
12201574
21485383
35540704
35142079
28613631
35529481
35431076
35431064
35490957
35285700
35467375
35323021
35378507
35409212
35320633
35292596
35293560
35323561
35384427
35235094
34462848
33667026
25208300
28544285
27193549
34453548
29785477
32155148
27573929
34582469
17582565
29616895
33079054
32146451
34370659
23166855
33651791
28859315
32699118
21758008
32253142
20551214
35537043
35325722
35364283
35338009
35212491
35608808
35608633
35489061
35591750
35587200
35579463
35579416
35597090
35568365
35558860
29262003
32491445
35491421
35138654
35129256
32663462
26068698
32880768
29691072
29501823
33569724
329063
32031907
28649137
28513559
27494616
16999876
31376513
34280580
30261194
26403725
22627317
28333314
27489797
29879965
35045788
35608550
35435576
35093552
35546037
35527016
35482608
35488539
35422342
35118580
35378792
35318352
34978459
35044573
35193669
35206082
35208889
33993224
35215176
35135613
9709046
25566513
27370344
34678970
3392

32294334
31156186
32081239
23594017
26719778
32646564
28325633
30631965
35600979
35415293
35415292
35310360
35393084
35503933
35365291
35351286
35314880
35241329
35194977
35085159
35015058
34761665
34629285
34629284
34622706
34615426
34380958
34350789
32534032
25589433
34059461
10593029
32281637
23752503
30910872
33989509
30586662
1683448
31420501
21915580
34623967
30763471
22998085
31937342
10469100
28235838
16875143
7050644
35435152
35230221
35098828
34962225
35259847
35077988
35595360
35526512
35497489
35421740
35345466
35314292
35310386
35304684
35257694
35234322
35230136
35219023
35166879
35072299
27865321
31582705
31454047
32965366
31154334
31153997
20152201
21670651
35080596
23571188
27865328
23206714
10148363
32571367
33382075
23765922
33184118
28549696
9526383
20619231
35600275
35430716
35591761
35599993
35597261
35581627
35527459
35533490
35513808
35468017
35076728
34896096
34730417
34426075
32310548
30570987
35459195
35469265
35467819
35428268
27739401
3133037
7600367
249696

27758061
32900158
32300053
31205192
32981026
31571717
35524620
35366598
35243620
35124852
35094290
35585125
35416690
35600754
35575633
30000146
35545799
35537706
35535441
35545157
35137229
35510505
35501478
35466847
29083706
35579871
31643776
30601610
30765911
17199032
16953653
17728846
29691835
32491527
32103957
33179247
29313492
21351809
22332960
20233912
33231110
29957667
21348771
34234679
17696794
34273668
35596110
35597399
35560228
35286772
35062045
32491527
35487871
35392704
35444497
35180640
35063442
34590711
35213526
20301494
34605857
34596412
34273668
34161904
34486101
34509883
30000710
31643998
23992601
28176222
29947099
30992195
32990096
19791828
28447181
26403305
27282159
31629700
28884600
21802144
29787616
28903775
28984487
26923222
20402634
23137182
35581617
35578028
35182920
35603648
34728339
35468904
35434817
35388920
35180646
34779013
35247279
35181219
34922031
35227036
35187695
35203674
34962286
34260107
33961299
33078683
29763510
911000
6741602
6944087
9005879
324433

## Below this is debug stuff, so ignore what happens

In [13]:
jsonString = json.dumps(xmltodict.parse(doc_pp))

In [14]:
json_obj = json.loads(jsonString)

In [15]:
json_obj['PubmedArticleSet']['PubmedArticle']

{'MedlineCitation': {'@Status': 'MEDLINE',
  '@Owner': 'NLM',
  'PMID': {'@Version': '1', '#text': '14630660'},
  'DateCompleted': {'Year': '2004', 'Month': '07', 'Day': '22'},
  'DateRevised': {'Year': '2019', 'Month': '12', 'Day': '10'},
  'Article': {'@PubModel': 'Print',
   'Journal': {'ISSN': {'@IssnType': 'Print', '#text': '1367-4803'},
    'JournalIssue': {'@CitedMedium': 'Print',
     'Volume': '19',
     'Issue': '17',
     'PubDate': {'Year': '2003', 'Month': 'Nov', 'Day': '22'}},
    'Title': 'Bioinformatics (Oxford, England)',
    'ISOAbbreviation': 'Bioinformatics'},
   'ArticleTitle': 'PDB file parser and structure class implemented in Python.',
   'Pagination': {'StartPage': '2308',
    'EndPage': '2310',
    'MedlinePgn': '2308-10'},
   'Abstract': {'AbstractText': [{'@Label': 'UNLABELLED',
      '#text': 'The biopython project provides a set of bioinformatics tools implemented in Python. Recently, biopython was extended with a set of modules that deal with macromolecul

In [34]:
doc

<lxml.etree._ElementTree at 0x7f86c0099a80>

In [35]:
import pubmed_parser as pp

In [36]:
pp.parse_article_meta(doc)

AttributeError: module 'pubmed_parser' has no attribute 'parse_article_meta'