In [1]:
from Bio import Entrez
import pandas as pd
import time

In [2]:
def search(query, retstart, retmax):
    Entrez.email = "sandra_friebolin@proton.me"
    handle = Entrez.esearch(db='pubmed', 
                            retstart=retstart, 
                            retmax=retmax, 
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results


def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = "sandra_friebolin@proton.me"
    handle = Entrez.efetch(db='pubmed',
                            retmode='xml',
                            id=ids)
    results = Entrez.read(handle)
    return results

In [3]:
def save_data_to_lists(papers, pmid_list, title_list, abstract_list, author_list, date_list, doi_list):
    for paper in papers['PubmedArticle']:
        article = paper['MedlineCitation']['Article']

        # Check if Abstract is present
        if article.get("Abstract") is not None and paper['MedlineCitation']['PMID'] not in pmid_list:
            abstract_texts = article['Abstract']['AbstractText']
            full_abstract = ' '.join([str(text) for text in abstract_texts])

            title_list.append(article['ArticleTitle'])
            pmid_list.append(paper['MedlineCitation']['PMID'])

            # Fetch authors
            if 'AuthorList' in article:
                authors = article['AuthorList']
                author_names = [author.get('ForeName') + " " + author.get('LastName') \
                                    if author.get('ForeName') else author.get('LastName') \
                                for author in authors if 'LastName' in author]
                author_list.append("; ".join(author_names))
            else:
                author_list.append("")

            # Fetch Publication Date
            medline_citation = paper.get('MedlineCitation', {})
            article = medline_citation.get('Article', {})
            pub_date = None

            # Check various fields for publication date
            if 'ArticleDate' in article:
                pub_date = article['ArticleDate']
            elif 'PubDate' in article:
                pub_date = article['PubDate']
            elif 'DateCompleted' in medline_citation:
                pub_date = medline_citation['DateCompleted']
            elif 'DateRevised' in medline_citation:
                pub_date = medline_citation['DateRevised']

            # Format the publication date
            if pub_date:
                date_str = f"{pub_date[0]['Year']}-{pub_date[0].get('Month', '01')}-{pub_date[0].get('Day', '01')}"
            else:
                date_str = ""

            date_list.append(date_str)

            # Fetch DOI
            article_id_list = paper.get('PubmedData', {}).get('ArticleIdList', [])
            doi = next((id_ for id_ in article_id_list if id_.attributes.get('IdType') == 'doi'), None)
            doi_list.append(doi if doi is not None else "")

            # Append Abstract
            abstract_list.append(full_abstract)

In [4]:

pmid_list = []
title_list = []
abstract_list =[]
author_list = []
date_list = []
doi_list = []
saved_data_cnt = 0
for year in range(2013, 2024):
    for quartal in range(4):
        month_start, month_end = (quartal) * 3 + 1, ((quartal)) * 3 + 3
        query = f"intelligence[Title/Abstract] AND (\"{year}/{month_start}\"[Date - Publication] : \"{year}/{month_end}\"[Date - Publication])"
        handle = Entrez.esearch(db='pubmed', retmax=10000, retmode='xml', term=query)
        studies = Entrez.read(handle)
        print(f"{studies['Count']} data for {month_start}/{year}-{month_end}/{year}")
        studiesIdList = studies['IdList']
        papers = fetch_details(studiesIdList)
        save_data_to_lists(papers, pmid_list, title_list, abstract_list, author_list, date_list, doi_list)
        print(f"Newly saved data: {len(pmid_list)-saved_data_cnt}; Total saved data: {len(pmid_list)}")
        saved_data_cnt = len(pmid_list)

            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


642 data for 1/2013-3/2013
Newly saved data: 625; Total saved data: 625
448 data for 4/2013-6/2013
Newly saved data: 349; Total saved data: 974
513 data for 7/2013-9/2013
Newly saved data: 361; Total saved data: 1335
513 data for 10/2013-12/2013
Newly saved data: 285; Total saved data: 1620
789 data for 1/2014-3/2014
Newly saved data: 599; Total saved data: 2219
469 data for 4/2014-6/2014
Newly saved data: 295; Total saved data: 2514
484 data for 7/2014-9/2014
Newly saved data: 302; Total saved data: 2816
512 data for 10/2014-12/2014
Newly saved data: 307; Total saved data: 3123
857 data for 1/2015-3/2015
Newly saved data: 658; Total saved data: 3781
539 data for 4/2015-6/2015
Newly saved data: 314; Total saved data: 4095
546 data for 7/2015-9/2015
Newly saved data: 323; Total saved data: 4418
524 data for 10/2015-12/2015
Newly saved data: 320; Total saved data: 4738
827 data for 1/2016-3/2016
Newly saved data: 620; Total saved data: 5358
544 data for 4/2016-6/2016
Newly saved data: 31

In [5]:
# Create DataFrame
df = pd.DataFrame({
    'PMID': pmid_list,
    'Title': title_list,
    'Abstract': abstract_list,
    'Authors': author_list,
    'Publication Date': date_list,
    'DOI': doi_list
})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58839 entries, 0 to 58838
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PMID              58839 non-null  object
 1   Title             58839 non-null  object
 2   Abstract          58839 non-null  object
 3   Authors           58839 non-null  object
 4   Publication Date  58839 non-null  object
 5   DOI               58839 non-null  object
dtypes: object(6)
memory usage: 2.7+ MB


In [7]:
df.head()

Unnamed: 0,PMID,Title,Abstract,Authors,Publication Date,DOI
0,24645995,α-1 antitrypsin and chronic fatigue syndrome: ...,SUMMARY Several lines of evidence support the...,José Alegre; Sandra Camprubí; Ana García-Quintana,,10.2217/pmt.12.84
1,24565439,A data-driven acute inflammation therapy.,Acute inflammation is a severe medical conditi...,Vladan Radosavljevic; Kosta Ristovski; Zoran O...,2013-11-11,10.1186/1755-8794-6-S3-S7
2,24505723,Voxelwise spectral diffusional connectivity an...,Human brain connectivity can be studied using ...,Junning Li; Yan Jin; Yonggang Shi; Ivo D Dinov...,,10.1007/978-3-642-40811-3_82
3,24472488,Systems integrity in health and aging - an ani...,Human lifespan is positively correlated with c...,Marije Oostindjer; Gro V Amdam,2013-01-07,10.1186/2046-2395-2-2
4,24460364,Multi-agent systems: effective approach for ca...,"Physicians, in order to study the causes of ca...",Niloofar Mohammadzadeh; Reza Safdari; Azin Rahimi,,10.7314/apjcp.2013.14.12.7757


In [8]:
# Save the data to a CSV file
df.to_csv('pubmed_data.csv', index=False)