## Imports and constant inits

In [1]:
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup

In [2]:
# Browser identification strings sent with the HTTP requests.
# Desktop variant (Chrome 81 on Windows 10).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/81.0.4044.129 Safari/537.36"
)
# Mobile variant (Chrome 59 on Android 7.0).
MOBILE_USER_AGENT = (
    "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/59.0.3071.125 Mobile Safari/537.36"
)

In [16]:
# Shared settings for the NCBI E-utilities requests made in this notebook.

# Target database and the search terms.
db = 'pubmed'
term = 'asthma'  # single term used by the demo cells
terms = [  # terms iterated over by the bulk fetch loop
    'cystic fibrosis',
    'acute bronchitis',
    'asthma',
    'lung cancer',
    'COPD',
    'flu',
    'pneumonia',
]

# E-utilities endpoints: ESearch runs the query, EFetch downloads the records.
base_url_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
base_url_fetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

# HTTP headers sent with every request (desktop browser user-agent).
headers = {
    "user-agent": USER_AGENT,
}

# Demo

## Search query

In [4]:
# Build the ESearch URL for the demo term.
# usehistory=y asks NCBI to keep the result set on its history server and to
# return a WebEnv/QueryKey pair for the follow-up EFetch call. No WebEnv is
# sent here: the previous literal "<webenv string>" was a documentation
# placeholder, not a real session id, and omitting the parameter lets ESearch
# create a fresh one.
# The term is URL-encoded so multi-word or punctuated terms stay valid.
search_query = base_url_search + f'?db={db}&term={urllib.parse.quote_plus(term)}&usehistory=y'

In [None]:
# Send the ESearch request. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors (4xx/5xx)
# here instead of as a confusing parsing failure in a later cell.
search_response = requests.get(search_query, headers=headers, timeout=60)
search_response.raise_for_status()

In [None]:
# Parse the raw ESearch reply with BeautifulSoup's XML parser.
search_soup = BeautifulSoup(markup=search_response.content, features='xml')

In [5]:
# Read the two values that the fetch URL needs from the parsed ESearch reply.
webenv_node = search_soup.find('WebEnv')
WebEnv = webenv_node.get_text()
querykey_node = search_soup.find('QueryKey')
QueryKey = querykey_node.get_text()
(WebEnv, QueryKey)

## Fetch query

In [11]:
# Assemble the EFetch URL from ordered (name, value) pairs; the values from
# the search step select the stored result set, retmax caps the demo at 10.
fetch_params = (
    ('db', db),
    ('query_key', QueryKey),
    ('WebEnv', WebEnv),
    ('retmax', 10),
    ('retmode', 'XML'),
    ('rettype', 'Abstract'),
)
fetch_query = base_url_fetch + '?' + '&'.join(
    f'{name}={value}' for name, value in fetch_params
)

In [12]:
# Send the EFetch request. The timeout prevents an indefinite hang, and
# raise_for_status() makes HTTP errors (4xx/5xx) fail in this cell rather than
# producing an error page that gets parsed and saved as if it were article data.
query_response = requests.get(fetch_query, headers=headers, timeout=60)
query_response.raise_for_status()

In [13]:
# Parse the EFetch reply as XML. The variable name (sic) is kept because the
# following cells refer to it.
atricles = BeautifulSoup(markup=query_response.content, features='xml')

In [23]:
# Save the pretty-printed XML of the demo fetch to disk as bytes.
with open("./api_data/asthma_articles.xml", mode="wb") as file:
    xml_bytes = atricles.prettify().encode()
    file.write(xml_bytes)

## Demo data extraction

In [15]:
# Take the first <PubmedArticle> element of the fetched document as the sample.
pubmed_articles = atricles.find_all('PubmedArticle')
atricle1 = pubmed_articles[0]

### Title

In [16]:
# Text content of the sample article's <ArticleTitle> element.
title_node = atricle1.find("ArticleTitle")
atricle1_title = title_node.get_text()
atricle1_title

'Co-presentation of pulmonary arteriovenous malformation (PAVM) and multifocal benign metastasizing leiomyoma (BML) of lungs: Diagnostic dilemma and successful endovascular treatment.'

### Abstract

In [17]:
# Text content of the sample article's <Abstract> element, exactly as
# returned by get_text() (no whitespace stripping).
abstract_node = atricle1.find("Abstract")
atricle1_abstract = abstract_node.get_text()
atricle1_abstract

"\nA 50-year-old female with a 20-year history of multifocal pulmonary benign metastasizing leiomyoma (BML), and asthma presented with subacute worsening of chronic dyspnea. A contrast-enhanced computerized tomography of the chest showed a single 1.4\xa0×\xa01.5-cm contrast-enhancing mass in the right lower lobe among numerous non-enhancing bilateral pulmonary BML lesions. Pulmonary angiogram was not performed at that time due to clinical improvement. Four years later, the patient presented with refractory subacute worsening of her chronic dyspnea and was referred for embolization of the pulmonary arteriovenous malformation (PAVM). Two feeder arteries to the PAVM were embolized; each with a 6-mm Amplatzer-IV vascular plug and a 4-mm Nester coil. Follow-up angiograms demonstrated no flow through the PAVM. The patient's dyspnea resolved and she remained asymptomatic at one-year follow-up.\nCopyright © 2020 Elsevier Inc. All rights reserved.\n"

### Keywords

In [18]:
# Flatten the <KeywordList> element to text, split it on newlines, and drop
# the empty strings so only the keywords remain.
atricle1_keyword_list = atricle1.find("KeywordList").get_text()
keyword_list = list(filter(None, atricle1_keyword_list.split('\n')))
keyword_list

['Benign metastasizing leiomyoma',
 'Embolization',
 'Pulmonary arteriovenous malformation']

# Automated fetch

In [15]:
# For every entry in `terms`: run an ESearch, fetch up to 1000 matching
# records with EFetch, and save the pretty-printed XML to
# ./api_data/<term without spaces>_articles.xml.
# NOTE(review): NCBI asks clients without an API key to stay at or below
# roughly 3 requests per second - consider throttling if `terms` grows.
for term_iter in terms:
    # URL-encode the term: spaces become '+', reserved characters are escaped.
    search_term = urllib.parse.quote_plus(term_iter)

    # Search for article ids. usehistory=y keeps the result set on the NCBI
    # history server; no WebEnv is sent (the old "<webenv string>" value was a
    # documentation placeholder), so ESearch creates a fresh one per term.
    search_query = base_url_search + f'?db={db}&term={search_term}&usehistory=y'
    search_response = requests.get(search_query, headers=headers, timeout=60)
    search_response.raise_for_status()  # fail here on HTTP 4xx/5xx
    search_soup = BeautifulSoup(search_response.content, 'xml')

    # Extract fetch infos; report a clear error if the reply lacks them
    # instead of failing with AttributeError on None.
    webenv_node = search_soup.find('WebEnv')
    querykey_node = search_soup.find('QueryKey')
    if webenv_node is None or querykey_node is None:
        raise RuntimeError(
            f"ESearch reply for {term_iter!r} contains no WebEnv/QueryKey"
        )
    WebEnv = webenv_node.get_text()
    QueryKey = querykey_node.get_text()

    # Fetch articles
    fetch_query = base_url_fetch + f'?db={db}&query_key={QueryKey}&WebEnv={WebEnv}&retmax=1000&retmode=XML&rettype=Abstract'
    query_response = requests.get(fetch_query, headers=headers, timeout=60)
    query_response.raise_for_status()  # do not save an error page as data
    atricles = BeautifulSoup(query_response.content, 'xml')

    # Save fetch result (file name is the term with spaces removed)
    file_name = term_iter.replace(' ', '')
    with open(f"./api_data/{file_name}_articles.xml", "wb") as file:
        file.write(atricles.prettify().encode())