In [1]:
import logging
from typing import  Iterable

import pandas as pd

from pymedx_custom import PubMedArticle, PubMed


#logging.basicConfig(level=logging.DEBUG,datefmt='%H:%M:%S' , format='%(asctime)s.%(msecs)03d - %(module)s/%(funcName)s:%(lineno)d - %(levelname)s - %(message)s')
logging.basicConfig(level=logging.INFO,datefmt='%H:%M:%S' , format='%(asctime)s.%(msecs)03d - %(module)s/%(funcName)s:%(lineno)d - %(levelname)s - %(message)s')


pubmed = PubMed()

Helper functions

In [2]:
def get_df(articles: Iterable[PubMedArticle]) -> pd.DataFrame:
    "Construct a DataFrame from a list of PubMedArticle objects"
    
    dict_articles = [x.toDict() for x in articles]
    print(f"Total articles: {len(dict_articles)}")
    return pd.DataFrame(dict_articles)

Base list of articles is a set of articles where:
 - Author: Kabashin 
 - or Author: Barcikowski 
 - or Keywords: "pulsed laser ablation in liquids" 
 - or Keywords: "laser ablation in liquids"
 - or Keywords: "nanoparticles" AND "laser ablation"
 - or Keywords: "laser fragmentation in liquids" 
 - Articles published after 1992

In [None]:
author = "Kabashin[AU]"
base = get_df(pubmed.query(author))

author = "Barcikowski[AU]"
base = pd.concat([base, get_df(pubmed.query(author))])

keywords = "pulsed laser ablation in liquids[OT]"
base = pd.concat([base, get_df(pubmed.query(keywords))])

keywords = "laser ablation in liquids[OT]"
base = pd.concat([base, get_df(pubmed.query(keywords))])

keywords = "laser ablation[OT] AND nanoparticles[OT]"
base = pd.concat([base, get_df(pubmed.query(keywords))])

keywords = "laser fragmentation in liquids[OT]"
base = pd.concat([base, get_df(pubmed.query(keywords))])

# Drop duplicates
base.drop_duplicates(subset='doi', inplace=True)

# Drop too old
base = base[base['publication_date'] > '1993-01-01']

Save base articles

In [None]:
base.to_csv('base_articles.csv', index=False)

Get list of articles, which cite base list

In [None]:
citing = pd.DataFrame(columns=base.columns)

tot_articles = len(base)
for i, pmid in enumerate(base['pmid']):
    logging.info(f'Start fetching article {i+1}/{tot_articles} - {pmid}')
    articles = pubmed.getCitingArticles(pmid)
    dict_articles = [x.toDict() for x in articles]
    citing = pd.concat([citing, pd.DataFrame(dict_articles)], ignore_index=True)

# Drop duplicates
citing.drop_duplicates(subset='doi', inplace=True)

# Drop articles from base list
citing = citing[~citing.doi.isin(base.doi)] 

Get references of base list

In [None]:
refs = pd.DataFrame(columns=base.columns)

tot_articles = len(base)
for i, ref_list in enumerate(base['references'].dropna()):
    logging.info(f'Start fetching refernces for article {i+1}/{tot_articles} - {ref_list}')
    if ref_list:
        ref_list = eval(ref_list)
        ids = [ref['pmid'] for ref in ref_list if ref['pmid'] is not None]
        logging.info(f'Fetching {len(ids)} references...')
        articles = list(pubmed.getArticles(ids))
        logging.info(f'Fetched {len(articles)} references')
        dict_articles = [x.toDict() for x in articles]
        refs = pd.concat([refs, pd.DataFrame(dict_articles)], ignore_index=True)

# Drop duplicates
refs.drop_duplicates(subset='doi', inplace=True)

# Drop articles already in the base
refs = refs[~refs.doi.isin(base.doi)]

Combine all lists

In [20]:
all_articles = pd.concat([base, citing, refs], ignore_index=True)
all_articles.drop_duplicates(subset='doi', inplace=True)
all_articles = all_articles[all_articles['publication_date'] > '1993-01-01']
all_articles.to_csv('all_articles.csv', index=False)

Get authors statistics

In [22]:
from collections import Counter

authors = []
for authorlist in all_articles['authors']:
    authorlist = eval(authorlist)
    authors.extend([x['lastname'].lower() for x in authorlist if x['lastname'] is not None])

auth_counter = Counter(authors)
auth_counter.most_common(10)

[('wang', 774),
 ('zhang', 606),
 ('li', 597),
 ('liu', 517),
 ('chen', 437),
 ('yang', 311),
 ('kim', 296),
 ('lee', 235),
 ('wu', 223),
 ('huang', 204)]

Get statistics on keywords

In [24]:
from collections import Counter

kw = Counter()
for kwlist in all_articles['keywords'].dropna():
    kwlist = eval(kwlist)
    kw.update([kw.lower() for kw in kwlist if kw is not None])
kw.most_common(20)

[('nanoparticles', 323),
 ('gold nanoparticles', 161),
 ('silver nanoparticles', 148),
 ('laser ablation', 136),
 ('drug delivery', 92),
 ('nanomedicine', 71),
 ('cytotoxicity', 67),
 ('nanotechnology', 67),
 ('surface plasmon resonance', 65),
 ('cancer', 62),
 ('toxicity', 60),
 ('nanomaterials', 59),
 ('plasmonics', 54),
 ('photothermal therapy', 51),
 ('antibacterial activity', 51),
 ('sers', 49),
 ('antibacterial', 46),
 ('oxidative stress', 46),
 ('wound healing', 46),
 ('green synthesis', 45)]