#### TODO
- Check whether (and how) the 10,000-article limit can be worked around
- Can the history server be used for other purposes?
- What are the advantages of E-utilities vs. EDirect, and what are the use cases for each?

In [10]:
from Bio import Entrez
import time

In [11]:
def build_query(searchterm=None):
    """Build a PubMed query limited to a fixed set of publication types.

    Args:
        searchterm: Term to search for. When ``None`` (the default) the user
            is prompted for it on standard input, so existing zero-argument
            calls behave as before.

    Returns:
        The search term immediately followed by the ``[IT]`` field tag and an
        ``AND``-ed group of ``[PT]`` (publication type) filters.

    Raises:
        ValueError: If the search term is empty or consists only of whitespace.
    """
    if searchterm is None:
        searchterm = input("Enter query search term: ")

    # Surrounding whitespace would otherwise end up between the term and the
    # field tag that is glued onto it below.
    searchterm = searchterm.strip()
    if not searchterm:
        # Without a term the query would start with a dangling "[IT] AND (...)".
        raise ValueError("search term must not be empty")

    # NOTE(review): the "[IT]" tag is kept exactly as in the original; confirm
    # it is the intended PubMed field tag (e.g. versus "[TI]" / "[TIAB]").
    article_type = ('[IT] AND ("Clinical Trial"[PT] OR "Randomized Controlled Trial"[PT] OR "Meta-Analysis"[PT] '
                    'OR "Systematic Review"[PT] OR "Comparative Study"[PT] OR "Observational Study"[PT] '
                    'OR "Validation Study"[PT] OR "Case Reports"[PT] OR "Review"[PT])')

    return searchterm + article_type


In [12]:
def history(full_query):
    """Run a PubMed search and keep its result set on the Entrez history server.

    The user is prompted for an NCBI API key, which is passed to the search
    request and returned so that later requests can reuse it.

    Args:
        full_query: The complete PubMed search expression to submit.

    Returns:
        A tuple ``(webenv, query_key, count, API_key)``: the ``WebEnv`` and
        ``QueryKey`` values from the search result, the total number of
        matching records as an ``int``, and the API key that was entered
        (``None`` when the prompt was left blank).
    """
    # A blank answer becomes None instead of an empty string.
    # NOTE(review): relies on Bio.Entrez omitting None-valued parameters from
    # the request - confirm against the installed Biopython version.
    API_key = input("Enter API_key:").strip() or None

    # retmax=1: only the hit count and the history identifiers are needed here.
    # The search uses the `full_query` argument; the original read a global
    # named `query`, which only worked when the caller happened to define one.
    with Entrez.esearch(db="pubmed", term=full_query, retmax=1,
                        api_key=API_key, usehistory="y") as handle:
        results = Entrez.read(handle)

    count = int(results["Count"])
    webenv = results["WebEnv"]
    query_key = results["QueryKey"]

    print(f"Total articles found: {count}")
    print(f"WebEnv: {webenv}, QueryKey: {query_key}")

    return webenv, query_key, count, API_key


In [19]:
def fetch_data(webenv, query_key, count, API_key, batchsize=5000, delay=10):
    """Download the records of a stored PubMed search in batches.

    Each batch is requested from the history server, appended to the returned
    list and also written to ``pubmed_data_<first>_<last>.xml`` in the current
    working directory.

    Args:
        webenv: ``WebEnv`` value identifying the history-server session.
        query_key: ``QueryKey`` value identifying the stored search.
        count: Total number of records matching the search.
        API_key: NCBI API key forwarded to every request.
        batchsize: Maximum number of records requested per call.
        delay: Seconds to wait between two consecutive requests.

    Returns:
        A list with the raw payload of every batch, in request order.

    Raises:
        ValueError: If ``batchsize`` is not positive.
    """
    if batchsize <= 0:
        # A negative step used to yield an empty range, i.e. "success" with no data.
        raise ValueError("batchsize must be a positive integer")

    # Only the first 10,000 records of a PubMed result set are requested.
    pubmed_limit = 10000
    max_records = min(count, pubmed_limit)
    article_data = []

    for start in range(0, max_records, batchsize):
        # Clamp the last batch so that the request, the progress message and
        # the file name all describe the same range of records.
        end = min(start + batchsize, max_records)
        print(f"Records {start + 1} to {end}...")

        with Entrez.efetch(db="pubmed", query_key=query_key, webenv=webenv, retstart=start,
                           retmax=end - start, rettype="xml", api_key=API_key) as handle:
            data = handle.read()

        article_data.append(data)

        # Written in binary mode, so the payload is expected to be bytes.
        filename = f"pubmed_data_{start + 1}_{end}.xml"
        with open(filename, 'wb') as file:
            file.write(data)

        # Pause only when another request follows.
        if end < max_records:
            time.sleep(delay)

    print("Data fetching complete.")
    return article_data

In [None]:
import xml.etree.ElementTree as ET
import csv


def _element_text(parent, path):
    """Return the text of the first element under *parent* matching *path*.

    Returns '' when no element matches. The lookup result is compared against
    None on purpose: an Element without child elements is falsy, so a plain
    truthiness test would discard every simple text element.
    Text inside nested markup of the matched element is included.
    """
    node = parent.find(path)
    if node is None:
        return ''
    return ''.join(node.itertext())


# Load the XML file
tree = ET.parse('magnesium_pubmed_data.xml')  # Replace with your XML file path
root = tree.getroot()

# Prepare the CSV file
csv_filename = 'output.csv'  # Specify the output CSV file name
csv_columns = ['Title', 'Author', 'Journal', 'PubDate', 'DOI']  # Example columns, adjust to your XML structure

# Open the CSV file in write mode
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()

    # Iterate through each PubMed article in the XML
    for article in root.findall('.//PubmedArticle'):
        # Extract relevant fields from the XML structure (adjust based on your XML structure).
        # Each lookup takes the first matching element only, so 'Author' holds
        # the last name of the first listed author.
        title = _element_text(article, './/ArticleTitle')
        author = _element_text(article, './/AuthorList/Author/LastName')
        journal = _element_text(article, './/Journal/Title')
        pub_date = _element_text(article, './/PubDate/Year')
        doi = _element_text(article, './/PubmedData/ArticleIdList/ArticleId[@IdType="doi"]')

        # Write the data to the CSV file
        writer.writerow({
            'Title': title,
            'Author': author,
            'Journal': journal,
            'PubDate': pub_date,
            'DOI': doi
        })

print(f"Data has been written to {csv_filename}")


In [None]:
# Interactive driver for the cells above: set the e-mail address on the Entrez
# module, build the query string, run the search with history enabled, then
# fetch the matching records in batches.
Entrez.email = input("Enter emailadres:")
query = build_query()
# history() also prompts for the API key and hands it back for the fetch step.
webenv, query_key, count, API_key = history(query)
# `articles` holds one raw payload per fetched batch.
articles = fetch_data(webenv, query_key, count, API_key)

In [None]:
# Dump the raw payloads returned by fetch_data (one list element per batch).
print(articles)

In [5]:
"""
    To retrieve more than 10,000 UIDs from databases other than PubMed, 
    submit multiple esearch requests while incrementing the value of retstart (see Application 3). 
    For PubMed, ESearch can only retrieve the first 10,000 records matching the query. 
    To obtain more than 10,000 PubMed records, consider using <EDirect> that contains additional logic
    to batch PubMed search results automatically so that an arbitrary number can be retrieved.
    For details see https://www.ncbi.nlm.nih.gov/books/NBK25499/
"""

'\n    To retrieve more than 10,000 UIDs from databases other than PubMed, \n    submit multiple esearch requests while incrementing the value of retstart (see Application 3). \n    For PubMed, ESearch can only retrieve the first 10,000 records matching the query. \n    To obtain more than 10,000 PubMed records, consider using <EDirect> that contains additional logic\n    to batch PubMed search results automatically so that an arbitrary number can be retrieved.\n    For details see https://www.ncbi.nlm.nih.gov/books/NBK25499/\n'