In [1]:
# Install and Import Packages.
import sys
from Bio import Entrez as ez
import pandas as pd
from dotenv import load_dotenv
import os
from time import sleep

In [2]:

ez_functions = [func for func in dir(ez) if callable(getattr(ez, func)) and not func.startswith('_')]

# Loop through each function name in your list
for func_name in ez_functions:
    # Get the actual function object from the 'ez' module
    function_object = getattr(ez, func_name)
    
    # Get the docstring, handle cases where it might be empty
    docstring = function_object.__doc__
    if docstring:
        # Split the docstring into lines and take the first non-empty one
        first_line = docstring.strip().split('\n')[0]
        print(f"{func_name}: {first_line}")
    else:
        print(f"{func_name}: No description available.")

HTTPError: Raised when HTTP error occurs, but also acts like non-error return
Request: No description available.
URLError: No description available.
ecitmatch: Retrieve PMIDs for input citation strings, returned as a handle.
efetch: Fetch Entrez results which are returned as a handle.
egquery: Provide Entrez database counts for a global search (DEPRECATED).
einfo: Return a summary of the Entrez databases as a results handle.
elink: Check for linked external articles and return a handle.
epost: Post a file of identifiers for future use.
esearch: Run an Entrez search and return a handle to the results.
espell: Retrieve spelling suggestions as a results handle.
esummary: Retrieve document summaries as a results handle.
function_with_previous: Decorate a function as having an attribute named 'previous'.
parse: Parse an XML file from the NCBI Entrez Utilities into python objects.
read: Parse an XML file from the NCBI Entrez Utilities into python objects.
urlencode: Encode a dict or sequence

In [3]:
# Load environment variables from .env file
load_dotenv()

ez.email = os.environ.get('EMAIL')
# ez.api_key = os.environ.get('API_KEY')
SEARCH_QUERY = os.environ.get('SEARCH_QUERY')
DATABASE = os.environ.get('DATABASE')
RETTYPE = os.environ.get('RETTYPE')
RETMODE = os.environ.get('RETMODE')

print(SEARCH_QUERY)
print(ez.email)


climate+change
gerard.j.nasser@gmail.com


In [7]:
# --- Test with a hard-coded, guaranteed-valid search term ---
print("Testing with a simple, hard-coded search term...")

try:
    # A simple, valid PubMed query
    simple_term = "BRCA1 AND human[organism]"
    handle = ez.esearch(db="pubmed",
                        term=simple_term,
                        usehistory="y")
    search_results = ez.read(handle)
    handle.close()
    print("✅ SUCCESS: The search with the simple term worked!")
    print(f"Found {search_results['Count']} results.")

except Exception as e:
    print(f"❌ FAILURE: The search failed even with a simple term. Error: {e}")


# --- Now, print your actual search query for inspection ---
# This line assumes SEARCH_QUERY was loaded in your setup cell.
print("\n--- For debugging, here is the value of your SEARCH_QUERY variable: ---")
print(f"'{SEARCH_QUERY}'")

Testing with a simple, hard-coded search term...
✅ SUCCESS: The search with the simple term worked!
Found 20856 results.

--- For debugging, here is the value of your SEARCH_QUERY variable: ---
'climate+change'


In [5]:

print(SEARCH_QUERY)

print("Running esearch...")
handle = ez.esearch(db=DATABASE,
                        term=SEARCH_QUERY,
                        usehistory="y") # IMPORTANT: Use the history server

search_results = ez.read(handle)
handle.close()

# Get the total count and the history server identifiers
count = int(search_results["Count"])
webenv = search_results["WebEnv"]
query_key = search_results["QueryKey"]

print(f"Found {count} results.")

climate+change
Running esearch...
Found 116550 results.


In [6]:
batch_size = 100
all_records = []

print("Fetching records in batches...")
for start in range(0, count, batch_size):
    end = min(count, start + batch_size)
    print(f"Fetching records from {start+1} to {end}")
    
    fetch_handle = ez.efetch(db = DATABASE,
                                 rettype = RETTYPE, # Request XML format for full data
                                 retmode= RETMODE,
                                 retstart=start,
                                 retmax=batch_size,
                                 webenv=webenv, # Use the history server identifiers
                                 query_key=query_key)
    
    # Biopython's ez.read can parse the XML into a structured Python object
    records = ez.read(fetch_handle)
    fetch_handle.close()
    
    # The actual articles are in the 'PubmedArticle' list
    all_records.extend(records['PubmedArticle'])
    
    # Be nice to the server! A short delay between requests.
    sleep(0.3)

print("Finished fetching all records.")

Fetching records in batches...
Fetching records from 1 to 100
Fetching records from 101 to 200
Fetching records from 201 to 300
Fetching records from 301 to 400
Fetching records from 401 to 500
Fetching records from 501 to 600
Fetching records from 601 to 700
Fetching records from 701 to 800
Fetching records from 801 to 900


KeyboardInterrupt: 

In [None]:
parsed_articles = []
for article in all_records:
    citation = article['MedlineCitation']

    # Use .get() to avoid errors if a key is missing
    title = citation['Article'].get('ArticleTitle', 'No Title Found')

    abstract_parts = citation['Article'].get('Abstract', {}).get('AbstractText', [])
    abstract = ' '.join(abstract_parts) if abstract_parts else 'No Abstract Found'

    journal_info = citation['Article'].get('Journal', {})
    journal_name = journal_info.get('Title', 'No Journal Found')
    
    pub_date = journal_info.get('JournalIssue', {}).get('PubDate', {})
    year = pub_date.get('Year', pub_date.get('MedlineDate', 'No Year Found'))

    pmid = citation.get('PMID', '')

    parsed_articles.append({
        'PMID': str(pmid),
        'Title': title,
        'Abstract': abstract,
        'Journal': journal_name,
        'Year': year
    })

df = pd.DataFrame(parsed_articles)
df.to_csv("systematic_review_results.csv", index=False)

print("\nSuccessfully parsed and saved results to systematic_review_results.csv")
print(df.head()) # Display the first few rows