<a href="https://colab.research.google.com/github/cheung0/Download-PubMed-Abstracts-with-Python-Tool/blob/main/Download_PubMed_abstracts_with_Python_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download PubMed abstracts with Python Tool

If you're a medical student, or doctor, or are trying to read PubMed articles relating to your medicine or medical condition, this Python tool's for you. It downloads PubMed abstracts in a text file, allowing you to read them faster. It helps you save time!

By: [Michael Cheung](https://www.linkedin.com/in/michael-cheung0/)

Credits:
[GitHub Repo](https://github.com/erilu/pubmed-abstract-compiler)

**Import packages**

In [1]:
import csv
import re
import urllib
from time import sleep
import requests
from bs4 import BeautifulSoup

**Specify search query**

In [2]:
# Specify your search query here. Works on single words or multiple words.
# query = 'P2RY8'
query = 'Intelligence'

# Formats query in correct format
def format_query(search_query):
    if ' ' not in search_query:
        query = search_query
    else:
        query = '"' + '+'.join(search_query.split()) + '"'
    return query

query = format_query(query)
print("Query: " + query)

Query: Intelligence


**Url with abstract ids**

In [3]:
# common settings between esearch and efetch
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'db=pubmed'

# esearch specific settings
search_eutil = 'esearch.fcgi?'
search_term = '&term=' + query
search_usehistory = '&usehistory=y'
search_rettype = '&rettype=json'

search_url = base_url+search_eutil+db+search_term+search_usehistory+search_rettype
print(search_url)

f = urllib.request.urlopen(search_url)
search_data = f.read().decode('utf-8')

# obtain total abstract count
total_abstract_count = int(re.findall("<Count>(\d+?)</Count>",search_data)[0])

# obtain webenv and querykey settings for efetch command
fetch_webenv = "&WebEnv=" + re.findall ("<WebEnv>(\S+)<\/WebEnv>", search_data)[0]
fetch_querykey = "&query_key=" + re.findall("<QueryKey>(\d+?)</QueryKey>",search_data)[0]

http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=Intelligence&usehistory=y&rettype=json


**Url with abstract summaries**

You can further filter results by changing optional values. For example, change retmax (return max) to limit amount of abstracts.

In [28]:
# other efetch settings
fetch_eutil = 'efetch.fcgi?'
retmax = 1
retstart = 50
fetch_retstart = "&retstart=" + str(retstart)
fetch_retmax = "&retmax=" + str(retmax)
fetch_retmode = "&retmode=text"
fetch_rettype = "&rettype=abstract"

fetch_url = base_url+fetch_eutil+db+fetch_querykey+fetch_webenv+fetch_retstart+fetch_retmax+fetch_retmode+fetch_rettype
print(fetch_url)

http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_65a2f51e0fc0bd750077b2ca&retstart=50&retmax=1&retmode=text&rettype=abstract


**Download the abstracts into a text file**

In [29]:
def download_webpage(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text()
        return text
    else:
        print("Failed to download.")
        return None

def save_text_to_file(text, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
    print("Text saved to", filename)

# Example usage:
url = fetch_url
filename = "abstracts.txt"

webpage_text = download_webpage(url)
if webpage_text:
    save_text_to_file(webpage_text, filename)

Text saved to abstracts.txt


In [30]:
import json
import re

def text_to_json(text):
    # Extracting PMID
    pmid_match = re.search(r'PMID:\s*(\d+)', text)
    pmid = pmid_match.group(1) if pmid_match else None

    # Extracting title
    title_match = re.search(r'^([\s\S]+?)\.\s*\d{4}\s+[A-Za-z]+\s+\d+\s*;\d+\(.+?\):', text)
    title = title_match.group(1).strip() if title_match else None

    # Extracting abstract
    abstract_start = re.search(r'(?<=\n\n)[\s\S]+?(?=\n\n)', text)
    abstract = abstract_start.group(0).strip() if abstract_start else None

    # Creating JSON structure
    data = {
        "PMID": pmid,
        "title": title,
        "abstract": abstract
    }

    return data

def save_to_json(data, output_file):
    with open(output_file, 'w') as json_file:
        json.dump(data, json_file, indent=2)

if __name__ == "__main__":
    # Read text from a file
    input_file = "abstracts.txt"

    with open(input_file, 'r') as file:
        text = file.read()

    # Convert text to JSON
    pubmed_data = text_to_json(text)

    # Save JSON data to a file
    output_file = "pubmed_data.json"
    save_to_json(pubmed_data, output_file)

    print(f"JSON data has been written to {output_file}")



JSON data has been written to pubmed_data.json


In [31]:
import requests
import xml.etree.ElementTree as ET

# Set up your E-utilities parameters
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'pubmed'
fetch_eutil = 'efetch.fcgi?'
retmax = 1
retstart = 50
fetch_retstart = "&retstart=" + str(retstart)
fetch_retmax = "&retmax=" + str(retmax)
fetch_retmode = "&retmode=xml"  # Change to "&retmode=text" if you prefer plain text
fetch_rettype = "&rettype=abstract"

fetch_url = base_url + fetch_eutil + db + fetch_retstart + fetch_retmax + fetch_retmode + fetch_rettype
print(fetch_url)

# Make the request
response = requests.get(fetch_url)

# Parse the XML response
tree = ET.fromstring(response.content)

# Extract PMID, title, and abstract
pmid = tree.find(".//PMID").text if tree.find(".//PMID") is not None else None
title = tree.find(".//ArticleTitle").text if tree.find(".//ArticleTitle") is not None else None
abstract = tree.find(".//AbstractText").text if tree.find(".//AbstractText") is not None else None

# Creating JSON structure with only PMID, title, and abstract
data = {
    "PMID": pmid,
    "title": title,
    "abstract": abstract
}

# Save JSON data to a file
output_file = "pubmed_data.json"
with open(output_file, 'w') as json_file:
    json.dump(data, json_file, indent=2)

print(f"JSON data has been written to {output_file}")


https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?pubmed&retstart=50&retmax=1&retmode=xml&rettype=abstract
JSON data has been written to pubmed_data.json


In [32]:
import re
import requests
import urllib.request
from bs4 import BeautifulSoup

# Specify your search query here. Works on single words or multiple words.
# query = 'P2RY8'
query = 'Intelligence'

# Formats query in correct format
def format_query(search_query):
    if ' ' not in search_query:
        query = search_query
    else:
        query = '"' + '+'.join(search_query.split()) + '"'
    return query

query = format_query(query)
print("Query: " + query)

# common settings between esearch and efetch
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'db=pubmed'

# esearch specific settings
search_eutil = 'esearch.fcgi?'
search_term = '&term=' + query
search_usehistory = '&usehistory=y'
search_rettype = '&rettype=json'

search_url = base_url + search_eutil + db + search_term + search_usehistory + search_rettype
print(search_url)

f = urllib.request.urlopen(search_url)
search_data = f.read().decode('utf-8')

# obtain total abstract count
total_abstract_count = int(re.findall("<Count>(\d+?)</Count>", search_data)[0])

# obtain webenv and querykey settings for efetch command
fetch_webenv = "&WebEnv=" + re.findall("<WebEnv>(\S+)<\/WebEnv>", search_data)[0]
fetch_querykey = "&query_key=" + re.findall("<QueryKey>(\d+?)</QueryKey>", search_data)[0]

# other efetch settings
fetch_eutil = 'efetch.fcgi?'
retmax = 1
retstart = 50
fetch_retstart = "&retstart=" + str(retstart)
fetch_retmax = "&retmax=" + str(retmax)
fetch_retmode = "&retmode=xml"  # Use XML for structured data
fetch_rettype = "&rettype=abstract"

fetch_url = base_url + fetch_eutil + db + fetch_querykey + fetch_webenv + fetch_retstart + fetch_retmax + fetch_retmode + fetch_rettype
print(fetch_url)

# Make the request to fetch the XML response
response = requests.get(fetch_url)

# Parse the XML response
soup = BeautifulSoup(response.text, 'xml')

# Extract PMID, title, and abstract
pmid = soup.find("PMID").text if soup.find("PMID") else None
title = soup.find("ArticleTitle").text if soup.find("ArticleTitle") else None
abstract_tag = soup.find("AbstractText")
abstract = abstract_tag.text if abstract_tag else None

# Creating JSON structure with only PMID, title, and abstract
data = {
    "PMID": pmid,
    "title": title,
    "abstract": abstract
}

# Save JSON data to a file
output_file = "pubmed_data.json"
with open(output_file, 'w') as json_file:
    json.dump(data, json_file, indent=2)

print(f"JSON data has been written to {output_file}")


Query: Intelligence
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=Intelligence&usehistory=y&rettype=json
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_65a307e1126d074028431d80&retstart=50&retmax=1&retmode=xml&rettype=abstract
JSON data has been written to pubmed_data.json
