-
Notifications
You must be signed in to change notification settings - Fork 0
/
pubmed.py
66 lines (59 loc) · 2.03 KB
/
pubmed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'''
Application Interface for querying the pubmed database
Future Direction: This just caches terms but it would be even better to cache word frequencies (processed);
this will require some reorganization of the code, however, as pubmed_query would need to return processed
results rather than a big word list.
'''
import itertools as it
import urllib.parse
import urllib.request

import pymysql
from bs4 import BeautifulSoup

from config import config
def pubmed_cache(pmid):
    '''Return the search terms for a PubMed id, caching results in MySQL.

    Looks up *pmid* in the ``pubmed_cache`` table; on a hit the stored
    terms are returned directly.  On a miss the terms are fetched via
    ``pubmed_query(search=False, id=pmid)`` and inserted into the cache
    best-effort (an insert failure is ignored, the terms are still
    returned).

    :param pmid: PubMed article id to look up.
    :return: space-joined terms string (may be '' if the fetch failed).
    '''
    con = pymysql.connect(**config['database'])
    try:
        cur = con.cursor()
        # Parameters must be a sequence: (pmid,) — the original (pmid) was
        # just a parenthesized scalar, not a one-element tuple.
        cur.execute('select terms from pubmed_cache where pmid=%s', (pmid,))
        res = cur.fetchall()
        if res:
            print('h', res)  # cache hit
            # rows index by column name — presumably a DictCursor is
            # configured via config['database']; confirm against config
            terms = ' '.join(t['terms'] for t in res)
        else:
            print('m')  # cache miss: fetch from PubMed, then try to cache
            terms = pubmed_query(search=False, id=pmid)
            try:
                cur.execute('insert into pubmed_cache values (%s, %s)',
                            (pmid, terms))
                con.commit()
            except pymysql.Error:
                # Best-effort cache write: some term strings contain
                # characters the table encoding rejects.
                # TODO: eliminate non-standard characters in terms so we
                # can cache these
                pass
        return terms
    finally:
        # Always release the connection (the original leaked it).
        con.close()
def pubmed_query(search=True, **kwargs):
    '''Query the PubMed E-utilities API via HTTP GET.

    With ``search=True`` runs an *esearch* and returns the space-joined
    cached terms for every matching article id; with ``search=False`` runs
    an *efetch* and returns the text of the title/abstract sections.

    :param search: choose the esearch (True) or efetch (False) endpoint.
    :param kwargs: extra GET parameters (e.g. ``term=...`` or ``id=...``),
        overriding the defaults below.
    :return: result text, or '' on any failure (deliberate best-effort).
    '''
    try:
        # default parameters, overridden by **kwargs
        params = dict({'retmode': 'xml',
                       'rettype': 'Abstract',
                       'db': 'pubmed',
                       'retmax': config['pubmed_query_limit']},
                      **kwargs)
        # urlencode percent-escapes values (spaces, quotes, brackets in
        # search terms); the original '='.join serialization produced
        # invalid URLs for multi-word queries.  NCBI requires https.
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/e%s.fcgi?%s' % (
            'search' if search else 'fetch',
            urllib.parse.urlencode(params))
        # open (and close) the connection, parse the XML response
        with urllib.request.urlopen(url) as xml:
            soup = BeautifulSoup(xml, 'xml')
        # search or fetch
        if search:
            # look up (cached) terms for each matching PubMed id
            return ' '.join(pubmed_cache(i.getText())
                            for i in soup.findAll('Id'))
        else:
            # return the text in the title/abstract
            return ' '.join(res.getText()
                            for section in ['ArticleTitle', 'AbstractText']
                            for res in soup.findAll(section))
    except Exception:
        # Best-effort contract: any failure (network, parse, bad kwargs)
        # yields '' rather than propagating.  Narrowed from a bare except
        # so KeyboardInterrupt/SystemExit still propagate.
        return ''