-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-large-abstracts.py
40 lines (30 loc) · 1.47 KB
/
extract-large-abstracts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from Bio import Entrez
import sys
import csv
# Contact e-mail set on the Entrez module before any request is made.
# NOTE(review): this looks like a placeholder address — presumably it should
# be replaced with a real contact before heavy use; confirm.
Entrez.email = 'anonymous@gmail.com'
def fetch_abstracts(pub_ids, retmax=1000, output_file='abstracts.csv'):
    """Download PubMed abstracts in batches and append them to a TSV file.

    The IDs are requested from NCBI in slices of at most ``retmax`` so that
    no single request grows too large.  Every article returned is written as
    one tab-separated row with the columns ``pub_id`` and ``abstract``.

    Args:
        pub_ids: Sequence of PubMed IDs given as strings.
        retmax: Maximum number of IDs sent to NCBI per request.
        output_file: Path of the tab-delimited file to append to.  The header
            row is written only while the file is still empty, so repeated
            runs do not scatter extra headers through the data.

    Returns:
        None.  Results are written to ``output_file``.

    Notes:
        * Rows are keyed by the PMID stored in each returned record instead
          of by position in ``pub_ids``: only the ``PubmedArticle`` entries
          of the response are used, so any requested ID without such an
          entry would shift every later row under positional pairing.
        * All sections of a structured abstract are joined with a single
          space; articles without abstract text fall back to their title.
    """
    fieldnames = ['pub_id', 'abstract']

    # Make sure requests to NCBI are not too big
    for i in range(0, len(pub_ids), retmax):
        j = min(i + retmax, len(pub_ids))
        batch = pub_ids[i:j]
        print(f"Fetching abstracts from {i} to {j}.")

        # PubMed XML is requested with retmode="xml"; Entrez.read parses
        # that XML stream.
        handle = Entrez.efetch(db="pubmed", id=','.join(batch),
                               rettype="xml", retmode="xml", retmax=retmax)
        try:
            records = Entrez.read(handle)
        finally:
            # Release the HTTP connection even when parsing fails.
            handle.close()

        rows = []
        for pubmed_article in records['PubmedArticle']:
            citation = pubmed_article['MedlineCitation']
            article = citation['Article']

            sections = []
            if 'Abstract' in article:
                sections = article['Abstract']['AbstractText']
            # Structured abstracts arrive as several sections; keep them all.
            text = ' '.join(str(section) for section in sections).strip()
            if not text:
                text = str(article['ArticleTitle'])

            rows.append({'pub_id': str(citation['PMID']), 'abstract': text})

        if len(rows) != len(batch):
            print(f"Warning: requested {len(batch)} IDs but received "
                  f"{len(rows)} articles.")

        # Append after every batch so finished work survives a later failure.
        with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                    delimiter='\t')
            # In append mode the stream starts at the end of the file, so a
            # position of 0 means the file is empty and still needs a header.
            if csvfile.tell() == 0:
                writer.writeheader()
            writer.writerows(rows)
if __name__ == '__main__':
    # Script entry point: the first command-line argument names a text file
    # holding one PubMed ID per line.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <file-with-one-pubmed-id-per-line>")
    filename = sys.argv[1]
    # The context manager closes the file; blank lines and surrounding
    # whitespace are dropped so they are not passed on as IDs.
    with open(filename, "r") as id_file:
        pub_ids = [line.strip() for line in id_file if line.strip()]
    fetch_abstracts(pub_ids)