In [172]:
from Bio import Entrez

In [173]:
import pandas as pd
import numpy as np


In [174]:
def search(query):
    #docs: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Entrez.email = 'email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='1',
                            retmode='xml',
                            term=query, 
                            mindate=2013, 
                            maxdate=2023)
    results = Entrez.read(handle)
    return results

In [175]:
studies = search('Intelligence')

In [176]:
studiesIdList = studies['IdList']

In [177]:
studiesIdList

[1m[[0m[32m'28728020'[0m[1m][0m

In [178]:
def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'email@example.com'
    handle = Entrez.efetch(db='pubmed',
    retmode='xml',
    id=ids)
    results = Entrez.read(handle) #dictionary 
    return results

In [179]:
title_list = []
authors_list = []
affiliation_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []
major_descriptor_list = []
descriptor_list = []
major_qualifier_list = []
qualifier_list = []

studies = fetch_details(studiesIdList)
chunk_size = 500 #reduce chunksize to not exceed request limits
for chunk_i in range(0, len(studiesIdList), chunk_size):
    chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
    papers = fetch_details(chunk)
    for i, paper in enumerate (papers['PubmedArticle']):
        title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
        try:
            abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
        except:
            abstract_list.append('NA')
        try: 
            journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
        except: 
            journal_list.append('NA')
        try:
            language_list.append(paper['MedlineCitation']['Article']['Language'][0])
        except: 
            language_list.append('NA')
        try: 
            descr = [descriptor['DescriptorName'] for descriptor in paper['MedlineCitation']['MeshHeadingList']] 
            descriptor_list.append(descr)
        except: 
            descriptor_list.append("No Data")
        try:
            mdescr = [descriptor['DescriptorName'] for descriptor in paper['MedlineCitation']['MeshHeadingList'] if descriptor['DescriptorName'].attributes.get('MajorTopicYN') == 'Y']
            major_descriptor_list.append(mdescr)
        except: 
            major_descriptor_list.append('NA')
        try: 
            qualif = [str(descriptor['QualifierName'][0]) for descriptor in paper['MedlineCitation']['MeshHeadingList'] if descriptor['QualifierName']]
            qualifier_list.append(list(set(qualif))) #append only unique qualifiers
        except: 
            qualifier_list.append('NA')
        try: 
            maj_qualif = [str(descriptor['QualifierName'][0]) for descriptor in paper['MedlineCitation']['MeshHeadingList'] if descriptor['QualifierName'] and descriptor['QualifierName'].attributes.get('MajorTopicYN') == 'Y']
            major_qualifier_list.append(list(set(maj_qualif))) #only unique
        except: 
            major_qualifier_list.append('NA')
        try: 
            authors_list.append([", ".join([author.get('LastName'), author.get('ForeName')]) for author in paper['MedlineCitation']['Article']['AuthorList']])
        except: 
            authors_list.append('NA')
        try: 
            affiliation_lst = []
            for i, author in enumerate(paper['MedlineCitation']['Article']['AuthorList']):
                try:
                    affiliation_lst.append([affiliation.get('Affiliation', '') for affiliation in author.get('AffiliationInfo')][0])
                except: continue
            affiliation_list.append(affiliation_lst)
        except:
            affiliation_list.append('NA')
        try:
            pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
        except:
            pubdate_year_list.append('NA')
        try:
            pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
        except:
            pubdate_month_list.append('NA')
            

In [171]:
[", ".join([author.get('LastName'), author.get('ForeName')]) for author in paper['MedlineCitation']['Article']['AuthorList']]

[1m[[0m[32m'Hassabis, Demis'[0m, [32m'Kumaran, Dharshan'[0m, [32m'Summerfield, Christopher'[0m, [32m'Botvinick, Matthew'[0m[1m][0m

In [169]:
affiliation_lst = []
for i, author in enumerate(paper['MedlineCitation']['Article']['AuthorList']):
    try:
        affiliation_lst.append([affiliation.get('Affiliation', '') for affiliation in author.get('AffiliationInfo')][0])
    except: continue

In [170]:
affiliation_lst


[1m[[0m
    [32m'DeepMind, 5 New Street Square, London, UK; Gatsby Computational Neuroscience Unit, 25 Howland Street, London, UK. Electronic address: dhcontact@google.com.'[0m,
    [32m'DeepMind, 5 New Street Square, London, UK; Institute of Cognitive Neuroscience, University College London, 17 Queen Square, London, UK.'[0m,
    [32m'DeepMind, 5 New Street Square, London, UK; Department of Experimental Psychology, University of Oxford, Oxford, UK.'[0m,
    [32m'DeepMind, 5 New Street Square, London, UK; Gatsby Computational Neuroscience Unit, 25 Howland Street, London, UK.'[0m
[1m][0m

In [138]:
[str(descriptor['QualifierName'][0]) for descriptor in paper['MedlineCitation']['MeshHeadingList'] if descriptor['QualifierName']]

[1m[[0m[32m'physiology'[0m, [32m'physiology'[0m, [32m'physiology'[0m[1m][0m

In [129]:
qualifier_list


[1m[[0m
    [1m[[0m
        [1m[[0m[1m][0m,
        [1m[[0m[1m][0m,
        [1m[[0m[1;35mStringElement[0m[1m([0m[32m'physiology'[0m, [33mattributes[0m=[1m{[0m[32m'UI'[0m: [32m'Q000502'[0m, [32m'MajorTopicYN'[0m: [32m'Y'[0m[1m}[0m[1m)[0m[1m][0m,
        [1m[[0m[1m][0m,
        [1m[[0m[1;35mStringElement[0m[1m([0m[32m'physiology'[0m, [33mattributes[0m=[1m{[0m[32m'UI'[0m: [32m'Q000502'[0m, [32m'MajorTopicYN'[0m: [32m'Y'[0m[1m}[0m[1m)[0m[1m][0m,
        [1m[[0m[1;35mStringElement[0m[1m([0m[32m'physiology'[0m, [33mattributes[0m=[1m{[0m[32m'UI'[0m: [32m'Q000502'[0m, [32m'MajorTopicYN'[0m: [32m'Y'[0m[1m}[0m[1m)[0m[1m][0m,
        [1m[[0m[1m][0m,
        [1m[[0m[1m][0m
    [1m][0m
[1m][0m

In [180]:
df = pd.DataFrame(list(zip(
            title_list, authors_list, affiliation_list, qualifier_list, major_qualifier_list, descriptor_list, major_descriptor_list, abstract_list, journal_list, language_list, pubdate_year_list, pubdate_month_list
            )),
            columns=[
            'Title', 'Authors', 'Affiliations', 'Qualifier', 'Major Qualifier', 'Descriptor', 'Major Descriptor','Abstract', 'Journal', 'Language', 'Year','Month'
            ])

In [181]:
df

Unnamed: 0,Title,Authors,Affiliations,Qualifier,Major Qualifier,Descriptor,Major Descriptor,Abstract,Journal,Language,Year,Month
0,Neuroscience-Inspired Artificial Intelligence.,"[Hassabis, Demis, Kumaran, Dharshan, Summerfie...","[DeepMind, 5 New Street Square, London, UK; Ga...",[physiology],,"[Animals, Artificial Intelligence, Brain, Huma...","[Artificial Intelligence, Neural Networks, Com...",The fields of neuroscience and artificial inte...,Neuron,eng,2017,Jul


## efetch return
Here's an overview of the structure of an efetch return and some key information contained within it:

- The outermost structure appears to be a dictionary with several key-value pairs.

- There are two main keys within this dictionary: 'PubmedBookArticle' and 'PubmedArticle'. In this case, the focus is on the 'PubmedArticle' key, which is associated with a list of articles or papers.

- The 'PubmedArticle' key maps to a list of dictionaries, where each dictionary represents information about a specific PubMed article.

- Within each article dictionary, there are various keys and sub-dictionaries that contain information about the article. Some of the key-value pairs include:

- 'MedlineCitation': This key contains a dictionary that provides detailed information about the article. It includes information such as the title, abstract, authors, journal details, publication date, and more.

- 'PubmedData': This key contains additional data related to the PubMed article, including references and publication history.