In [1]:
from Bio import Entrez

In [2]:
import pandas as pd
import numpy as np


In [45]:
def search(query, mindate, maxdate):
    #docs: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Entrez.email = 'example@email.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='10000',
                            retmode='xml',
                            term=query, 
                            mindate=mindate, 
                            maxdate=maxdate)
    results = Entrez.read(handle)
    return results

In [113]:
from datetime import datetime, timedelta
import time
delay_seconds = 1
result_dicts = {}
start_date = datetime(2013, 1, 1)  # Start date (January 1, 2023)
end_date = datetime(2014, 1, 1)
window_duration = timedelta(days=30) #assuming 30 days per month and 2 months to be fine
current_date = start_date

# Loop over time windows of 2 months
while current_date < end_date:
    # Calculate the end of the 1-month window
    window_end = current_date + window_duration
    try:
        returned_dicts = search('Intelligence', current_date.strftime('%Y/%m/%d'), window_end.strftime('%Y/%m/%d'))
    except: 
        print(f"Error: query unsuccessful. currdate = {current_date}, window = {window_end}")
        break
        
    #accumulate dictionary values
    for key, value in returned_dicts.items():
        if key in result_dicts:
            if isinstance(value, list):
                if isinstance(result_dicts[key], list):
                    # If both are lists, extend the existing list with the new list
                    result_dicts[key].extend(value)
                else:
                    # If the existing value is not a list, create a new list with both values
                    result_dicts[key] = [result_dicts[key]] + value
            else:
                if isinstance(result_dicts[key], list):
                    # If the existing value is a list, append the new value to it
                    result_dicts[key].append(value)
                else:
                    # If neither is a list, create a list with both values
                    result_dicts[key] = [result_dicts[key], value]
        else:
            # Add the key-value pair to result_dicts
            result_dicts[key] = value
    print(f"current date processed:{current_date}")
    current_date = window_end
    time.sleep(delay_seconds)

current date processed:2013-01-01 00:00:00
current date processed:2013-01-31 00:00:00
current date processed:2013-03-02 00:00:00
current date processed:2013-04-01 00:00:00
current date processed:2013-05-01 00:00:00
current date processed:2013-05-31 00:00:00
current date processed:2013-06-30 00:00:00
current date processed:2013-07-30 00:00:00
current date processed:2013-08-29 00:00:00
current date processed:2013-09-28 00:00:00
current date processed:2013-10-28 00:00:00
current date processed:2013-11-27 00:00:00
current date processed:2013-12-27 00:00:00


In [141]:
studiesIdList

[1m[[0m[32m'23312647'[0m, [32m'23336477'[0m, [32m'23360850'[0m, [32m'23276245'[0m, [32m'23300215'[0m, [32m'24868944'[0m, [32m'23288323'[0m, [32m'23349505'[0m, [32m'24769751'[0m, [32m'23335577'[0m, [32m'23317829'[0m, [32m'24601069'[0m, [32m'23346027'[0m, [32m'23297290'[0m, [32m'23302840'[0m, [32m'23342456'[0m, [32m'23317430'[0m, [32m'23351938'[0m, [32m'23314722'[0m, [32m'23305843'[0m, [32m'23288854'[0m, [32m'23328837'[0m, [32m'23294010'[0m, [32m'23313760'[0m, [32m'23333263'[0m, [32m'23349069'[0m, [32m'23300958'[0m, [32m'23314723'[0m, [32m'23333868'[0m, [32m'23317832'[0m, [32m'23358156'[0m, [32m'23314526'[0m, [32m'23360858'[0m, [32m'23323782'[0m, [32m'23311179'[0m, [32m'22374769'[0m, [32m'23275397'[0m, [32m'24837342'[0m, [32m'23278342'[0m, [32m'23298794'[0m, [32m'23282131'[0m, [32m'23338769'[0m, [32m'23327297'[0m, [32m'23291049'[0m, [32m'23276989'[0m, [32m'23279414'[0m, [32m'23357078'[0m, [32

In [123]:
studiesIdList = result_dicts['IdList']

In [118]:
def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'email@example.com'
    handle = Entrez.efetch(db='pubmed',
    retmode='xml',
    id=ids)
    results = Entrez.read(handle) #dictionary 
    return results

In [143]:
studies.keys()

[1;35mdict_keys[0m[1m([0m[1m[[0m[32m'PubmedArticle'[0m, [32m'PubmedBookArticle'[0m[1m][0m[1m)[0m

In [124]:
title_list = []
authors_list = []
affiliation_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []
major_descriptor_list = []
descriptor_list = []
major_qualifier_list = []
qualifier_list = []

studies = fetch_details(studiesIdList)
chunk_size = 500 #reduce chunksize to not exceed request limits
for chunk_i in range(0, len(studiesIdList), chunk_size):
    chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
    papers = fetch_details(chunk)
    for i, paper in enumerate (papers['PubmedArticle']):
        title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
        try:
            abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
        except:
            abstract_list.append('NA')
        try: 
            journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
        except: 
            journal_list.append('NA')
        try:
            language_list.append(paper['MedlineCitation']['Article']['Language'][0])
        except: 
            language_list.append('NA')
        try: 
            descr = [descriptor['DescriptorName'] for descriptor in paper['MedlineCitation']['MeshHeadingList']] 
            descriptor_list.append(descr)
        except: 
            descriptor_list.append("No Data")
        try:
            mdescr = [descriptor['DescriptorName'] for descriptor in paper['MedlineCitation']['MeshHeadingList'] if descriptor['DescriptorName'].attributes.get('MajorTopicYN') == 'Y']
            major_descriptor_list.append(mdescr)
        except: 
            major_descriptor_list.append('NA')
        try: 
            qualif = [str(descriptor['QualifierName'][0]) for descriptor in paper['MedlineCitation']['MeshHeadingList'] if descriptor['QualifierName']]
            qualifier_list.append(list(set(qualif))) #append only unique qualifiers
        except: 
            qualifier_list.append('NA')
        try: 
            maj_qualif = [str(descriptor['QualifierName'][0]) for descriptor in paper['MedlineCitation']['MeshHeadingList'] if descriptor['QualifierName'] and descriptor['QualifierName'].attributes.get('MajorTopicYN') == 'Y']
            major_qualifier_list.append(list(set(maj_qualif))) #only unique
        except: 
            major_qualifier_list.append('NA')
        try: 
            authors_list.append([", ".join([author.get('LastName'), author.get('ForeName')]) for author in paper['MedlineCitation']['Article']['AuthorList']])
        except: 
            authors_list.append('NA')
        try: 
            affiliation_lst = []
            for i, author in enumerate(paper['MedlineCitation']['Article']['AuthorList']):
                try:
                    affiliation_lst.append([affiliation.get('Affiliation', '') for affiliation in author.get('AffiliationInfo')][0])
                except: continue
            affiliation_list.append(affiliation_lst)
        except:
            affiliation_list.append('NA')
        try:
            pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
        except:
            pubdate_year_list.append('NA')
        try:
            pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
        except:
            pubdate_month_list.append('NA')
    print(f"percentage fetched: {chunk_i/studiesIdList}")
            

In [136]:
print(f"percentage fetched: {chunk_i/len(studiesIdList)}")

percentage fetched: 0.9869503235003838


In [140]:
datetime(2014, 1, 1).strftime('%Y/%m/%d')

[32m'2014/01/01'[0m

In [125]:
df = pd.DataFrame(list(zip(
            title_list, authors_list, affiliation_list, qualifier_list, major_qualifier_list, descriptor_list, major_descriptor_list, abstract_list, journal_list, language_list, pubdate_year_list, pubdate_month_list
            )),
            columns=[
            'Title', 'Authors', 'Affiliations', 'Qualifier', 'Major Qualifier', 'Descriptor', 'Major Descriptor','Abstract', 'Journal', 'Language', 'Year','Month'
            ])

In [126]:
df

Unnamed: 0,Title,Authors,Affiliations,Qualifier,Major Qualifier,Descriptor,Major Descriptor,Abstract,Journal,Language,Year,Month
0,Disorders of communication: dysarthria.,"[Enderby, Pam]",[Department of Rehabilitation and Assistive Te...,"[physiology, classification]",,"[Dysarthria, Humans, Speech]",[Dysarthria],Dysarthria is a motor speech disorder which ca...,Handbook of clinical neurology,eng,2013,
1,Effectiveness of empathy in general practice: ...,"[Derksen, Frans, Bensing, Jozien, Lagro-Jansse...","[Radboud University Nijmegen Medical Center, T...","[psychology, etiology, epidemiology]",,"[Anxiety, Communication, Empathy, Female, Gene...","[Empathy, General Practice, Physician-Patient ...",Empathy as a characteristic of patient-physici...,The British journal of general practice : the ...,eng,2013,Jan
2,Incivility.,"[Witt, Catherine L]",[],"[psychology, etiology, complications]",,"[Burnout, Professional, Emotional Intelligence...","[Emotional Intelligence, Ethics, Nursing, Inte...",,Advances in neonatal care : official journal o...,eng,2013,Feb
3,Transformative learning.,"[Nemec, Patricia B]","[Nemec Consulting, Warner, NH, USA. patnemecc@...","[methods, rehabilitation, education]",,"[Education, Humans, Mental Disorders, Motivati...",[],The whole point of teaching and training is to...,Psychiatric rehabilitation journal,eng,2012,Dec
4,The cooperative brain.,"[Stallen, Mirre, Sanfey, Alan G]","[Rotterdam School of Management, Erasmus Unive...",[physiology],,"[Animals, Brain, Cooperative Behavior, Game Th...",[Cooperative Behavior],Cooperation is essential for the functioning o...,The Neuroscientist : a review journal bringing...,eng,2013,Jun
...,...,...,...,...,...,...,...,...,...,...,...,...
9111,Burden of encephalitis-associated hospitalizat...,"[Vora, Neil M, Holman, Robert C, Mehal, Jason ...",[From the Epidemic Intelligence Service (N.M.V...,"[economics, trends, diagnosis, epidemiology]",,"[Adolescent, Adult, Aged, Child, Child, Presch...",[Cost of Illness],To estimate the burden of encephalitis-associa...,Neurology,eng,2014,Feb
9112,Occurrence of 3 Bordetella species during an o...,"[Spicer, Kevin B, Salamon, Doug, Cummins, Caro...","[From the *Section of Infectious Diseases, Nat...","[epidemiology, microbiology, pharmacology, cla...",,"[Adolescent, Anti-Bacterial Agents, Bordetella...",[Disease Outbreaks],An increase in laboratory diagnosis of pertuss...,The Pediatric infectious disease journal,eng,2014,Jul
9113,Hepatitis B vaccine immunogenicity among adult...,"[Bender, Thomas John, Sharapov, Umid, Utah, Ok...","[Epidemic Intelligence Service Program, The Of...","[immunology, blood, therapeutic use, preventio...",,"[Adult, Aged, Aged, 80 and over, Assisted Livi...",[Assisted Living Facilities],Failure to adhere to infection control guideli...,Vaccine,eng,2014,Feb
9114,Comparative virulence of reproductive diseases...,"[Han, K, Seo, H W, Park, C, Kang, I, Youn, S-K...","[Department of Veterinary Pathology, College o...","[pathology, genetics]",,"[Animals, DNA, Viral, Female, Genotype, Porcin...",[],The aim of this study was to compare the virul...,Journal of comparative pathology,eng,2014,


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9116 entries, 0 to 9115
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             9116 non-null   object
 1   Authors           9116 non-null   object
 2   Affiliations      9116 non-null   object
 3   Qualifier         9116 non-null   object
 4   Major Qualifier   9116 non-null   object
 5   Descriptor        9116 non-null   object
 6   Major Descriptor  9116 non-null   object
 7   Abstract          9116 non-null   object
 8   Journal           9116 non-null   object
 9   Language          9116 non-null   object
 10  Year              9116 non-null   object
 11  Month             9116 non-null   object
dtypes: object(12)
memory usage: 854.8+ KB


In [129]:
df.to_csv("/Users/Kenneth/PycharmProjects/pubMedNLP/kedronlp/data/01_raw/extract_test.csv", index=False)

In [133]:
(190000/9100)*50

[1;36m1043.956043956044[0m

## efetch return
Here's an overview of the structure of an efetch return and some key information contained within it:

- The outermost structure appears to be a dictionary with several key-value pairs.

- There are two main keys within this dictionary: 'PubmedBookArticle' and 'PubmedArticle'. In this case, the focus is on the 'PubmedArticle' key, which is associated with a list of articles or papers.

- The 'PubmedArticle' key maps to a list of dictionaries, where each dictionary represents information about a specific PubMed article.

- Within each article dictionary, there are various keys and sub-dictionaries that contain information about the article. Some of the key-value pairs include:

- 'MedlineCitation': This key contains a dictionary that provides detailed information about the article. It includes information such as the title, abstract, authors, journal details, publication date, and more.

- 'PubmedData': This key contains additional data related to the PubMed article, including references and publication history.