In [7]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [8]:
import pandas as pd
import os
from metapub import PubMedFetcher
from tqdm.notebook import tqdm
from src import data
import polars as pl

#TODO: move to .env file
os.environ['NCBI_API_KEY'] = '9b3142c8c09a8527c8d9bc616f5ff2813d08'

# Load Datasets

In [2]:
import sys

print(sys.path)

['d:\\automated_title_abstract_screening', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening\\python312.zip', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening\\DLLs', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening\\Lib', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening', '', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening\\Lib\\site-packages', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening\\Lib\\site-packages\\win32', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\mfaig\\miniforge3\\envs\\automated_screening\\Lib\\site-packages\\Pythonwin']


In [6]:
data_directory_openalex = '../../../../data/03_openalex'
pl_datasets = data.dict_from_directory(data_directory_openalex, separator=',', type='polars')

FileNotFoundError: [WinError 3] Das System kann den angegebenen Pfad nicht finden: '../../../../data/03_openalex'

# Rename Literature IDs
## SYNERGY
SYNERGY only has pubmed_ids, rename accordingly.

In [10]:
for subject, dataset in pl_datasets.items():
    if subject != 'pancreatic_surgery':
        pl_datasets[subject] = dataset.rename({'literature_id': 'pubmed_id'})

## EVIglance
The EVIglance dataset contains ids from pubmed, cochrane central, web of science and others. 
Detect the type of the first three ids and assign them accordingly.

In [11]:
pl_datasets['pancreatic_surgery'] = pl_datasets['pancreatic_surgery'].with_columns(
    pl.when(pl.col('literature_id').str.starts_with('CN-'))
    .then(pl.col('literature_id').alias('central_id')),
    pl.when(pl.col('literature_id').str.starts_with('WOS:'))
    .then(pl.col('literature_id').alias('webofscience_id')),
    pl.when(pl.col('literature_id').str.contains("^(\\d)*$"))
    .then(pl.col('literature_id').alias('pubmed_id')),
).select(pl.all().exclude('literature_id'))

# Type Casting
The PubMed ID is wrongfully formatted as Float64 in the animal depression dataset and as String in the pancreatic surgery dataset. 
Cast both to Integer:

In [12]:
pl_datasets['animal_depression'] = pl_datasets['animal_depression'].cast({pl.Float64: pl.Int64})
pl_datasets['pancreatic_surgery'] = pl_datasets['pancreatic_surgery'].cast({'pubmed_id': pl.Int64})

# Uniform Schema

## Definition
Define a uniform schema that all datasets will use from now on:

In [13]:
schema = pl.Schema({
    "include": pl.Boolean,
    "title": pl.String,
    "abstract": pl.String,
    "first_author": pl.String,
    "year": pl.Int16,
    "journal": pl.String,
    "doi": pl.String,
    "pubmed_id": pl.Int64,
    "authors": pl.String,
    "pubmed_type": pl.String,
    "publication_types": pl.String,
    "mesh": pl.String,
    "webofscience_id": pl.String,
    "central_id": pl.String,
    "openalex_id": pl.String,
})

## Assignment

In [14]:
for subject, dataset in pl_datasets.items():
    df = pl.DataFrame(schema=schema)
    pl_datasets[subject] = pl.concat(items=[pl.DataFrame(schema=schema), dataset], how='diagonal')

## Export
Export the data in this format if the target folder is empty

In [17]:
data_directory_schema = '../../../../data/031_schema'

# save the data in the provided schema 

[dataset.write_csv(f'{data_directory_schema}/{subject}_schema.csv') for subject, dataset in pl_datasets.items()];

# Download Function

In [10]:
def fill_missing_from_pubmed(dataframe: pd.DataFrame, subject: str) -> pd.DataFrame:

    df = dataframe.copy()

    for INDEX, row in tqdm(df.iterrows(), total=df.shape[0], desc=subject, leave=True):

        if not pd.isnull(row['pubmed_id']):
            try:
                metadata = PubMedFetcher().article_by_pmid(row['pubmed_id'])

                # fill columns if they are missing
                df.at[INDEX, 'title'] = metadata.title if pd.isnull(row['title']) else row['title']
                df.at[INDEX, 'abstract'] = metadata.abstract if pd.isnull(row['abstract']) else row['abstract']
                df.at[INDEX, 'first_author'] = metadata.author1_last_fm if pd.isnull(row['first_author']) else row['first_author']
                df.at[INDEX, 'year'] = metadata.year if pd.isnull(row['year']) else row['year']
                df.at[INDEX, 'journal'] = metadata.journal if pd.isnull(row['journal']) else row['journal']
                df.at[INDEX, 'doi'] = metadata.doi if pd.isnull(row['doi']) else row['doi']
                df.at[INDEX, 'pubmed_id'] = metadata.pmid if pd.isnull(row['pubmed_id']) else row['pubmed_id']
                df.at[INDEX, 'authors'] = metadata.authors_str if pd.isnull(row['authors']) else row['authors']
                df.at[INDEX, 'pubmed_type'] = metadata.pubmed_type if pd.isnull(row['pubmed_type']) else row['pubmed_type']
                df.at[INDEX, 'publication_types'] = '; '.join([f'{key}: {value}' for key, value in metadata.publication_types.items()]) if pd.isnull(row['publication_types']) else row['publication_types']
                df.at[INDEX, 'mesh'] = '; '.join([f'{key}: {value['descriptor_name']}' for key, value in metadata.mesh.items()]) if pd.isnull(row['mesh']) else row['mesh']
                df.at[INDEX, 'webofscience_id'] = row['webofscience_id']
                df.at[INDEX, 'central_id'] = row['central_id']
                df.at[INDEX, 'openalex_id'] = row['openalex_id']
            except:
                pass

    return df

# Download and Export

In [11]:
downloaded_datasets = {}

In [None]:
data_directory_uniform = '../../../../data/03_pubmed'

for subject, dataset in pl_datasets.items():
    # download metadata from pubmed eutils
    downloaded_df = fill_missing_from_pubmed(dataset.to_pandas(), subject)

    # transform the dataframe to polars
    polars_df = pl.DataFrame(data=downloaded_df, schema=schema)
    
    # add to the dictionary to access later
    downloaded_datasets[subject] = polars_df
    
    # save directly to csv 
    polars_df.write_csv(f'{data_directory_uniform}/{subject}_pubmed.csv')
    

In [None]:
polars_df = pl.DataFrame(data=downloaded_df.astype({'pubmed_id': 'Int64'}), schema=schema)

In [25]:
downloaded_df.to_csv(f'{data_directory_uniform}/{subject}_pubmed.csv', index=False)