In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd
import os
# SECURITY: the NCBI API key must not be stored in the notebook. Export NCBI_API_KEY in the
# shell (or load it from an untracked .env file) before starting the kernel. The key that was
# previously committed on this line should be treated as leaked and revoked.
# This check stays ahead of the `metapub` import because the original cell set the variable
# before importing it — presumably metapub reads it at import time (TODO confirm).
if not os.environ.get('NCBI_API_KEY'):
    warnings.warn(
        'NCBI_API_KEY is not set; PubMed requests will be sent without an API key '
        'and may be rate-limited.',
        stacklevel=2,
    )
from metapub import PubMedFetcher
from tqdm.notebook import tqdm
from src import data
import polars as pl

In [3]:
data_directory_openalex = '../../../../data/03_openalex'

# os.listdir returns every directory entry in arbitrary order, so keep only the
# '<subject>_openalex.csv' files and sort them to get a reproducible subject order.
files = sorted(
    file for file in os.listdir(data_directory_openalex)
    if file.endswith('_openalex.csv')
)
# The subject is the part of the file name in front of '_openalex'.
subjects = [file.split('_openalex')[0] for file in files]

In [4]:
files

['adhd_openalex.csv',
 'animal_depression_openalex.csv',
 'atypical_antipsychotics_openalex.csv',
 'calcium_channel_blockers_openalex.csv',
 'oral_hypoglycemics_openalex.csv',
 'pancreatic_surgery_openalex.csv']

In [5]:
subjects

['adhd',
 'animal_depression',
 'atypical_antipsychotics',
 'calcium_channel_blockers',
 'oral_hypoglycemics',
 'pancreatic_surgery']

In [6]:
# One pandas frame per subject; `subjects` and `files` are parallel lists, so pair them directly.
openalex_datasets = {
    subject: pd.read_csv(f'{data_directory_openalex}/{file}')
    for subject, file in zip(subjects, files)
}

In [7]:
# Load the same directory through the project helper, this time as polars frames.
# The result is used below as a mapping from subject name to frame
# (presumably keyed like `subjects` — verify against src.data).
pl_datasets = data.dict_from_directory(
    data_directory_openalex,
    separator=',',
    type='polars',
)

In [8]:
# Every dataset except pancreatic_surgery (reshaped separately in the following cells)
# gets its id column renamed to 'pubmed_id' plus five all-null placeholder columns
# inserted at positions 6..10.
for subject, dataset in pl_datasets.items():
    if subject == 'pancreatic_surgery':
        continue

    # rename() returns a new frame; the insert_column() calls below then modify that
    # stored frame in place (their return value is not needed).
    renamed = dataset.rename({'literature_id': 'pubmed_id'})
    pl_datasets[subject] = renamed

    row_count = len(dataset)
    for position, column_name in enumerate(('wos_id', 'central_id', 'authors', 'journal', 'year'), start=6):
        renamed.insert_column(
            index=position,
            column=pl.Series(name=column_name, values=[None] * row_count),
        )

In [9]:
pl_datasets['pancreatic_surgery']

include,title,abstract,doi,literature_id,openalex_id
bool,str,str,str,str,str
false,"""Reflections and proposals for …","""In this paper, I consider: the…","""10.1053/ejso.1999.0731""","""10718171""",
false,"""Diagnostic endoscopic retrogra…","""The importance of diagnostic e…","""10.1055/s-2000-95""","""10718385""",
false,"""Therapeutic pancreatic endosco…","""A number of endoscopic interve…","""10.1055/s-2000-94""","""10718387""",
false,"""Cephalic phase of lipolysis is…","""BACKGROUND: Gastric lipase con…","""10.1080/003655200750024407""","""10720121""",
false,"""Ischemia/Reperfusion-Induced p…","""BACKGROUND/AIM: The pancreas i…","""10.1159/000018793""","""10720825""",
…,…,…,…,…,…
true,"""Quality of life after open ver…",,"""10.1093/bjsopen/zrad002""","""36893287""",
true,"""A randomized controlled trial …",,"""10.1007/s00423-023-02873-w""","""37010643""",
true,"""Minimally invasive versus open…",,"""10.1016/j.lanepe.2023.100673""","""37457332""",
true,"""A prospective randomized contr…",,"""10.1016/j.amjsurg.2009.04.017""","""20074698""",


In [10]:
# Route each literature_id into the id column matching its format ('CN-…' -> central_id,
# 'WOS:…' -> wos_id, digits only -> pubmed_id); rows that do not match a rule get null
# in that column. The original literature_id column is dropped afterwards.
surgery_literature_id = pl.col('literature_id')

pl_datasets['pancreatic_surgery'] = (
    pl_datasets['pancreatic_surgery']
    .with_columns(
        pl.when(surgery_literature_id.str.starts_with('CN-'))
        .then(surgery_literature_id)
        .alias('central_id'),
        pl.when(surgery_literature_id.str.starts_with('WOS:'))
        .then(surgery_literature_id)
        .alias('wos_id'),
        pl.when(surgery_literature_id.str.contains("^(\\d)*$"))
        .then(surgery_literature_id)
        .alias('pubmed_id'),
    )
    .drop('literature_id')
)

In [11]:
# Add the all-null 'authors', 'journal' and 'year' placeholders at positions 7, 8 and 9.
surgery_frame = pl_datasets['pancreatic_surgery']
for position, column_name in enumerate(('authors', 'journal', 'year'), start=7):
    surgery_frame = surgery_frame.insert_column(
        index=position,
        column=pl.Series(name=column_name, values=[None] * len(surgery_frame)),
    )
pl_datasets['pancreatic_surgery'] = surgery_frame

In [12]:
# Keep a handle on the reshaped pancreatic_surgery frame and display it.
# NOTE(review): `a` is not referenced again in the visible cells — confirm before renaming or removing it.
a = pl_datasets['pancreatic_surgery']
a

include,title,abstract,doi,openalex_id,central_id,wos_id,authors,journal,year,pubmed_id
bool,str,str,str,str,str,str,null,null,null,str
false,"""Reflections and proposals for …","""In this paper, I consider: the…","""10.1053/ejso.1999.0731""",,,,,,,"""10718171"""
false,"""Diagnostic endoscopic retrogra…","""The importance of diagnostic e…","""10.1055/s-2000-95""",,,,,,,"""10718385"""
false,"""Therapeutic pancreatic endosco…","""A number of endoscopic interve…","""10.1055/s-2000-94""",,,,,,,"""10718387"""
false,"""Cephalic phase of lipolysis is…","""BACKGROUND: Gastric lipase con…","""10.1080/003655200750024407""",,,,,,,"""10720121"""
false,"""Ischemia/Reperfusion-Induced p…","""BACKGROUND/AIM: The pancreas i…","""10.1159/000018793""",,,,,,,"""10720825"""
…,…,…,…,…,…,…,…,…,…,…
true,"""Quality of life after open ver…",,"""10.1093/bjsopen/zrad002""",,,,,,,"""36893287"""
true,"""A randomized controlled trial …",,"""10.1007/s00423-023-02873-w""",,,,,,,"""37010643"""
true,"""Minimally invasive versus open…",,"""10.1016/j.lanepe.2023.100673""",,,,,,,"""37457332"""
true,"""A prospective randomized contr…",,"""10.1016/j.amjsurg.2009.04.017""",,,,,,,"""20074698"""


In [13]:
# Rows of the pancreatic_surgery data that have no abstract yet.
abstract_is_missing = pl.col('abstract').is_null()
missing_abstracts = pl_datasets['pancreatic_surgery'].filter(abstract_is_missing)

missing_abstracts.head(10)

include,title,abstract,doi,openalex_id,central_id,wos_id,authors,journal,year,pubmed_id
bool,str,str,str,str,str,str,null,null,null,str
False,"""Study Protocol of the PreFiPS …",,"""10.3389/fmed.2020.00488""",,,,,,,"""33521003"""
False,"""(Neo)adjuvant treatment in pan…",,"""10.1053/ejso.1998.0614""",,,,,,,"""10218453"""
False,"""[Adjuvant and palliative treat…",,,,,,,,,"""10638047"""
True,"""Laparoscopic versus open pancr…",,"""10.17235/reed.2019.6343/2019""",,,,,,,"""31823640"""
True,"""Comparison of patient outcomes…",,"""10.1177/0300060517717400""",,,,,,,"""28718685"""
False,"""[How to improve treatment of r…",,"""10.1016/s0399-8320(04)95185-8""",,,,,,,"""15657530"""
False,"""[Adjuvant chemotherapy for pan…",,,,,,,,,"""16457253"""
False,"""Cephalic phase of pancreatic s…",,"""10.1006/appe.1999.0281""",,,,,,,"""10744910"""
False,"""Variation in Serious Illness C…",,"""10.1089/jpm.2019.0268""",,,,,,,"""31580763"""
False,"""Assessment of Response to Neoa…",,"""10.2214/AJR.19.21152""",,,,,,,"""31799875"""


# Previous Code (pandas-based PubMed download helper, still used by the download loop below)

In [14]:
def fill_missing_from_pubmed(dataframe: pd.DataFrame, subject: str) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with article metadata fetched from PubMed.

    For every row whose ``pubmed_id`` is not null, the article is looked up with
    ``PubMedFetcher.article_by_pmid`` and the row's ``title``, ``abstract``,
    ``authors``, ``journal`` and ``year`` cells are overwritten with the fetched
    values (existing titles and abstracts are replaced too, not only missing ones).
    Rows without a ``pubmed_id`` and rows whose lookup fails are left unchanged.

    Args:
        dataframe: Table with a ``pubmed_id`` column; it is not modified.
        subject: Label shown on the progress bar and in the failure summary.

    Returns:
        The updated copy. The five metadata columns have object dtype.

    Raises:
        KeyError: If ``dataframe`` has no ``pubmed_id`` column.
    """
    fields = ('title', 'abstract', 'authors', 'journal', 'year')

    df = dataframe.copy()
    for field in fields:
        # Object dtype lets a single cell hold whatever the fetcher returns
        # (including list values) without a dtype error.
        if field in df.columns:
            df[field] = df[field].astype(object)
        else:
            df[field] = None

    # One fetcher for the whole run instead of constructing a new one per row.
    fetcher = PubMedFetcher()
    failures = []

    for index, pmid in tqdm(df['pubmed_id'].items(), total=df.shape[0], desc=subject, leave=True):
        if pd.isnull(pmid):
            continue

        try:
            metadata = fetcher.article_by_pmid(pmid)
            fetched = {field: getattr(metadata, field) for field in fields}
        except Exception as error:
            # Best effort: a failed lookup must not abort the whole download.
            # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            failures.append((pmid, error))
            continue

        for field, value in fetched.items():
            # .at writes exactly one cell, so a list-valued field is stored as-is
            # instead of being interpreted as several values to broadcast.
            df.at[index, field] = value

    if failures:
        # Surface what was previously swallowed silently.
        first_pmid, first_error = failures[0]
        warnings.warn(
            f'{subject}: PubMed lookup failed for {len(failures)} id(s); '
            f'first failure {first_pmid!r}: {first_error!r}',
            stacklevel=2,
        )

    return df

# Download and Export

In [15]:
# Subject name -> downloaded pandas frame; filled by the download loop.
downloaded_datasets = dict()

In [16]:
data_directory_uniform = '../../../../data/04_pubmed'

# Create the export folder up front: if it were missing, the first to_csv would only
# fail after that subject's whole PubMed download had already finished.
os.makedirs(data_directory_uniform, exist_ok=True)

for subject, dataset in pl_datasets.items():
    # download metadata from pubmed eutils
    downloaded_df = fill_missing_from_pubmed(dataset.to_pandas(), subject)

    # add to the dictionary to access later
    downloaded_datasets[subject] = downloaded_df

    # save directly to csv
    downloaded_df.to_csv(f'{data_directory_uniform}/{subject}_pubmed.csv', index=False)
    

adhd:   0%|          | 0/851 [00:00<?, ?it/s]

animal_depression:   0%|          | 0/1993 [00:00<?, ?it/s]

atypical_antipsychotics:   0%|          | 0/1120 [00:00<?, ?it/s]

calcium_channel_blockers:   0%|          | 0/1218 [00:00<?, ?it/s]

oral_hypoglycemics:   0%|          | 0/503 [00:00<?, ?it/s]

pancreatic_surgery:   0%|          | 0/34206 [00:00<?, ?it/s]

In [17]:
#filled_datasets = {subject: fill_missing_from_pubmed( #ignore
#    dataset.to_pandas(), subject) for subject, dataset in pl_datasets.items()} #ignore

## Export

In [18]:
#data_directory_uniform = '../../../../data/04_missing'
#
#[dataframe.to_csv(f'{data_directory_uniform}/{subject}_pubmed.csv', index=False)
# for subject, dataframe in filled_datasets.items()];

# Others (disabled scratch cells: looking up a single article on PubMed)

In [19]:
#row = missing_abstracts[5]
#id = row.select(pl.col('pubmed_id')).item()
#doi = row.select(pl.col('doi')).item()
#print(id, doi)

In [20]:
#metadata = PubMedFetcher().article_by_pmid(id)
#print(metadata.title, end='\n')
#print(metadata.authors)
#print(metadata.journal, end='\n')
#print(metadata.year, end='\n')
#print(metadata.doi, end='\n')
#print(metadata.pmid, end='\n')
#print(metadata.citation, end='\n')
#print(metadata.abstract, end='\n')
