In [2]:
# Install and Import Packages.
import sys
from Bio import Entrez as ez
import pandas as pd
from dotenv import load_dotenv
import os
from time import sleep

In [3]:

ez_functions = [func for func in dir(ez) if callable(getattr(ez, func)) and not func.startswith('_')]

# Loop through each function name in your list
for func_name in ez_functions:
    # Get the actual function object from the 'ez' module
    function_object = getattr(ez, func_name)
    
    # Get the docstring, handle cases where it might be empty
    docstring = function_object.__doc__
    if docstring:
        # Split the docstring into lines and take the first non-empty one
        first_line = docstring.strip().split('\n')[0]
        print(f"{func_name}: {first_line}")
    else:
        print(f"{func_name}: No description available.")

HTTPError: Raised when HTTP error occurs, but also acts like non-error return
Request: No description available.
URLError: No description available.
ecitmatch: Retrieve PMIDs for input citation strings, returned as a handle.
efetch: Fetch Entrez results which are returned as a handle.
egquery: Provide Entrez database counts for a global search (DEPRECATED).
einfo: Return a summary of the Entrez databases as a results handle.
elink: Check for linked external articles and return a handle.
epost: Post a file of identifiers for future use.
esearch: Run an Entrez search and return a handle to the results.
espell: Retrieve spelling suggestions as a results handle.
esummary: Retrieve document summaries as a results handle.
function_with_previous: Decorate a function as having an attribute named 'previous'.
parse: Parse an XML file from the NCBI Entrez Utilities into python objects.
read: Parse an XML file from the NCBI Entrez Utilities into python objects.
urlencode: Encode a dict or sequence

In [4]:
# Load environment variables from .env file
load_dotenv()

ez.email = os.environ.get('EMAIL')
# ez.api_key = os.environ.get('API_KEY')
SEARCH_QUERY = os.environ.get('SEARCH_QUERY')
DATABASE = os.environ.get('DATABASE')
RETTYPE = os.environ.get('RETTYPE')
RETMODE = os.environ.get('RETMODE')

print(SEARCH_QUERY)
print(ez.email)


climate+change
gnasser@charlotte.edu


In [6]:

print(SEARCH_QUERY)

print("Running esearch...")
handle = ez.esearch(db=DATABASE,
                        term=SEARCH_QUERY,
                        usehistory="y") # IMPORTANT: Use the history server

search_results = ez.read(handle)
handle.close()

# Get the total count and the history server identifiers
count = int(search_results["Count"])
webenv = search_results["WebEnv"]
query_key = search_results["QueryKey"]

print(f"Found {count} results.")

climate+change
Running esearch...
Found 116586 results.


In [11]:
batch_size = 100
all_records = []

print("Fetching records in batches...")
for start in range(0, count, batch_size):
    end = min(count, start + batch_size)
    print(f"Fetching records from {start+1} to {end}")
    
    fetch_handle = ez.efetch(db = DATABASE,
                                 rettype = RETTYPE, # Request XML format for full data
                                 retmode= RETMODE,
                                 retstart=start,
                                 retmax=batch_size,
                                 webenv=webenv, # Use the history server identifiers
                                 query_key=query_key)
    
    # Biopython's ez.read can parse the XML into a structured Python object
    records = ez.read(fetch_handle)
    fetch_handle.close()
    
    # The actual articles are in the 'PubmedArticle' list
    all_records.extend(records['PubmedArticle'])
    
    # Be nice to the server! A short delay between requests.
    sleep(0.3)

print("Finished fetching all records.")

Fetching records in batches...
Fetching records from 1 to 100
Fetching records from 101 to 200


KeyboardInterrupt: 

In [15]:
import pandas as pd
# Assuming 'all_records' is a list of dictionaries you've already loaded.
# all_records = ... 

parsed_articles = []

for article in all_records:
    citation = article['MedlineCitation']
    
    # FIX: Define article_info here to access the nested 'Article' dictionary
    article_info = citation.get('Article', {})

    # Use article_info to access all subsequent fields
    title = article_info.get('ArticleTitle', 'No Title Found')

    abstract_parts = article_info.get('Abstract', {}).get('AbstractText', [])
    abstract = ' '.join(abstract_parts) if abstract_parts else 'No Abstract Found'

    journal_info = article_info.get('Journal', {})
    journal_name = journal_info.get('Title', 'No Journal Found')
    
    pub_date = journal_info.get('JournalIssue', {}).get('PubDate', {})
    year = pub_date.get('Year', pub_date.get('MedlineDate', 'No Year Found'))

    pmid = citation.get('PMID', '')

    # Now this line will work correctly because article_info is defined
    author_list = article_info.get('AuthorList', [])
    authors = []
    for author_data in author_list:
        fore_name = author_data.get('ForeName', '')
        last_name = author_data.get('LastName', '')
        if fore_name and last_name:
            authors.append(f"{fore_name} {last_name}")
    
    authors_str = '; '.join(authors) if authors else 'No Authors Found'

    parsed_articles.append({
        'PMID': str(pmid),
        'Title': title,
        'Authors': authors_str,
        'Abstract': abstract,
        'Journal': journal_name,
        'Year': year
    })

df = pd.DataFrame(parsed_articles)

# Optional: Reorder columns for a more logical layout in the CSV
if not df.empty:
    df = df[['PMID', 'Title', 'Abstract', 'Year', 'Authors', 'Journal']]
    
df.to_csv("systematic_review_results.csv", index=False)

print("Successfully parsed and saved results to systematic_review_results.csv")
print(df.head())

Successfully parsed and saved results to systematic_review_results.csv
       PMID                                              Title  \
0  40974187  Exploring Multi-Omics Tools and Their Advancem...   
1  40974145  Selenium-Containing Peptides from Foodstuff: P...   
2  40974075  The utility of artificial intelligence in the ...   
3  40974058  Complex effects of climatic variation on bumbl...   
4  40974010  Genome-Scale Metabolic Models in Plant Stress ...   

                                            Abstract  Year  \
0  Climate change poses significant threats to op...  2025   
1  Selenium-containing peptides (SePPs) are bioac...  2025   
2  Dengue fever remains a significant public heal...  2025   
3  Climate change is a global biodiversity threat...  2025   
4  Global climate change will result in plants be...  2025   

                                             Authors  \
0  Kaberi Sonowal; Sonal Sharma; Gokul Anil Kumar...   
1  Lingyun Gu; Wenzhu Zhao; Jucheng Hu; Mok Wen