# PubMed

Each dataset contains entries with missing text data. 

Since an entry without any text is unusable for classification, the goal is to retrieve as many texts as possible.

This notebook therefore retrieves the missing text for every article that lacks text but has an associated PubMed ID.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import os # file system operations
from metapub import PubMedFetcher # pip install metapub
from tqdm.notebook import tqdm # progress bars
from src import data # helper functions
import pandas as pd, polars as pl # dataframes

## PubMedFetcher

The `PubMedFetcher` class from the `metapub` library makes it possible to retrieve article data from PubMed through the NCBI repository.

Without an api key, the interface is limited to three queries per second.
We therefore set an api key as an environment variable.

If you do not have an api key yourself, grab one from here:
https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/

In [3]:
import dotenv # read .env files

# Read the NCBI API key from a local .env file (variable name: NCBI).
dotenv.load_dotenv()
ncbi_api_key = os.getenv('NCBI')

if ncbi_api_key:
    # Expose the key under NCBI_API_KEY to raise the rate limit.
    os.environ['NCBI_API_KEY'] = ncbi_api_key
else:
    # os.environ only accepts strings; assigning None would raise a TypeError.
    # Continue without a key instead (lower rate limit) and tell the user why.
    warnings.warn(
        "No 'NCBI' entry found in the environment or .env file; "
        "continuing without an API key at the reduced NCBI rate limit."
    )

# Load Datasets

In [4]:
# Directory holding the comma-separated datasets produced by the OpenAlex step.
data_directory_openalex = '../../../../data/datasets/02_openalex'

# Read every dataset in that directory into a dict of polars dataframes.
pl_datasets = data.dict_from_directory(
    data_directory_openalex,
    type='polars',
    separator=',',
)

# Rename Literature IDs
## SYNERGY
The SYNERGY datasets refer to ids for PubMed as 'literature_id'. 

Rename to 'pubmed_id' for consistency:

In [5]:
# Every dataset except 'pancreatic_surgery' stores PubMed ids as 'literature_id'.
for subject, dataset in pl_datasets.items():
    if subject == 'pancreatic_surgery':
        # handled separately: its ids come from several sources
        continue
    renamed = dataset.rename({'literature_id': 'pubmed_id'})
    pl_datasets[subject] = renamed

## EVIglance
The EVIglance dataset contains ids from PubMed, Cochrane CENTRAL, Web of Science and others.
Detect three id types by their format (`CN-` prefix for CENTRAL, `WOS:` prefix for Web of Science, digits only for PubMed) and assign each id to the matching column.

In [6]:
literature_id = pl.col('literature_id')

# One new column per recognised id format; rows that do not match stay null.
pancreatic_surgery_ids = pl_datasets['pancreatic_surgery'].with_columns(
    pl.when(literature_id.str.starts_with('CN-'))
    .then(literature_id)
    .alias('central_id'),
    pl.when(literature_id.str.starts_with('WOS:'))
    .then(literature_id)
    .alias('webofscience_id'),
    pl.when(literature_id.str.contains("^(\\d)*$"))
    .then(literature_id)
    .alias('pubmed_id'),
)

# The mixed-source column is no longer needed once the ids are split up.
pl_datasets['pancreatic_surgery'] = pancreatic_surgery_ids.drop('literature_id')

# Type Casting
The PubMed ID is incorrectly typed as Float64 in the animal depression dataset and as String in the pancreatic surgery dataset.
Cast both to Int64:

In [7]:
# Cast mapping per dataset: all Float64 columns for animal_depression,
# only the 'pubmed_id' column for pancreatic_surgery.
cast_rules = (
    ('animal_depression', {pl.Float64: pl.Int64}),
    ('pancreatic_surgery', {'pubmed_id': pl.Int64}),
)

for subject, mapping in cast_rules:
    pl_datasets[subject] = pl_datasets[subject].cast(mapping)

# Uniform Schema

## Definition
Define a uniform schema that all datasets will use from now on:

In [8]:
# Target schema shared by all datasets: column name -> polars dtype.
# The key order is the column order of the dataframes built from this schema.
# NOTE(review): the trailing '#' markers on some entries carry no text; they
# presumably flag columns that not every source dataset provides - confirm.
schema = pl.Schema({
    "include": pl.Boolean,
    "title": pl.String,
    "abstract": pl.String,
    "first_author": pl.String,#
    "year": pl.Int16,#
    "journal": pl.String,#
    "doi": pl.String,
    "pubmed_id": pl.Int64,
    "authors": pl.String,#
    "pubmed_type": pl.String,#
    "publication_types": pl.String,#
    "mesh": pl.String,#
    "webofscience_id": pl.String,#
    "central_id": pl.String,#
    "openalex_id": pl.String,
})

## Assignment
Assign the schema to all datasets:

In [9]:
# Empty dataframe carrying the uniform schema; built once and reused for every
# dataset instead of being re-created (and partly discarded) per iteration.
schema_template = pl.DataFrame(schema=schema)

for subject, dataset in pl_datasets.items():
    # A diagonal concat with the empty template adds every schema column the
    # dataset lacks (filled with nulls) and puts the schema columns first.
    pl_datasets[subject] = pl.concat(items=[schema_template, dataset], how='diagonal')

# Download Function

Define a function which retrieves missing data from PubMed in the following way:

- Download article data for each article that contains a PubMed-ID
- Fill each empty field in the schema with the downloaded data 

In [10]:
def fill_missing_from_pubmed(dataframe: pd.DataFrame, subject: str) -> pd.DataFrame:
    """Fill empty metadata fields of a dataset with data downloaded from PubMed.

    For every row that has a 'pubmed_id', the article is fetched via
    ``PubMedFetcher.article_by_pmid`` and each field that is currently null
    is filled from the downloaded metadata. Existing values are never
    overwritten, and rows without a 'pubmed_id' are left untouched.

    Args:
        dataframe: Dataset containing at least the columns 'pubmed_id',
            'title', 'abstract', 'first_author', 'year', 'journal', 'doi',
            'authors', 'pubmed_type', 'publication_types' and 'mesh'.
        subject: Name of the dataset; used as the progress bar label and in
            the failure summary.

    Returns:
        A copy of ``dataframe`` with the missing fields filled where possible.

    Warns:
        UserWarning: once per call if any row could not be downloaded or
            processed; such rows are skipped (best effort).
    """
    import warnings  # local import keeps this cell self-contained

    def _int_if_possible(value):
        # The 'year' column is an integer in the target schema; coerce the
        # downloaded value when it is numeric, otherwise pass it through.
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    # Column name -> function extracting the value from the fetched article.
    # Extractors only run for fields that are actually missing.
    extractors = {
        'title': lambda m: m.title,
        'abstract': lambda m: m.abstract,
        'first_author': lambda m: m.author1_last_fm,
        'year': lambda m: _int_if_possible(m.year),
        'journal': lambda m: m.journal,
        'doi': lambda m: m.doi,
        'authors': lambda m: m.authors_str,
        'pubmed_type': lambda m: m.pubmed_type,
        # `or {}` guards against a missing (None) mapping
        'publication_types': lambda m: '; '.join(
            f'{key}: {value}'
            for key, value in (m.publication_types or {}).items()
        ),
        'mesh': lambda m: '; '.join(
            f"{key}: {value['descriptor_name']}"
            for key, value in (m.mesh or {}).items()
        ),
    }

    df = dataframe.copy()
    fetcher = PubMedFetcher()  # one fetcher for the whole dataset
    failed_ids = []

    for INDEX, row in tqdm(
        df.iterrows(),
        total=df.shape[0],
        desc=subject,
        leave=True
    ):
        if pd.isnull(row['pubmed_id']):
            continue

        try:
            # pandas stores nullable integer columns as floats (e.g. 12345.0);
            # convert back so the fetcher receives a clean integer id.
            metadata = fetcher.article_by_pmid(int(row['pubmed_id']))

            # fill columns if they are missing
            for column, extract in extractors.items():
                if pd.isnull(row[column]):
                    df.at[INDEX, column] = extract(metadata)
        except Exception:
            # Best effort: remember the failure and go on with the next row.
            # KeyboardInterrupt is not an Exception and still stops the run.
            failed_ids.append(row['pubmed_id'])

    if failed_ids:
        preview = ', '.join(str(pmid) for pmid in failed_ids[:10])
        warnings.warn(
            f'{subject}: could not retrieve PubMed data for '
            f'{len(failed_ids)} article(s), e.g. {preview}',
            stacklevel=2,
        )

    return df

# Download and Export
Apply the aforementioned function to retrieve missing data.

Save the extended dataframes to .csv files:

In [None]:
data_directory_uniform =  '../../../../data/datasets/03_pubmed'

# create the output directory if it does not exist yet
os.makedirs(data_directory_uniform, exist_ok=True)

for subject, dataset in pl_datasets.items():
    # download metadata from pubmed eutils
    downloaded_df = fill_missing_from_pubmed(dataset.to_pandas(), subject)

    # transform the dataframe to polars
    polars_df = pl.DataFrame(data=downloaded_df, schema=schema)

    # save directly to csv; polars has no row index, so write_csv takes no
    # pandas-style `index` argument (passing one raises a TypeError)
    polars_df.write_csv(f'{data_directory_uniform}/{subject}_pubmed.csv')
    