# Preprocessing the Datasets

## Uniformization

In [1]:
import re # regular expressions
import os # file system
import pandas as pd # dataframes
import numpy as np # numerical operations

### Loading

The raw datasets all sit in the same directory:

In [2]:
# Relative path of the directory that holds all raw dataset files
data_directory_raw = '../../../data/01_original'

In [3]:
# os.listdir returns entries in arbitrary order and lists everything in the
# directory. Keep only the delimited data files and sort them, so that the
# file list (and everything derived from it) is the same on every run and
# stray entries such as editor or checkpoint files are never parsed as data.
files = sorted(
    name for name in os.listdir(data_directory_raw)
    if name.endswith(('.csv', '.tsv'))
)
files

['adhd_raw.csv',
 'animal_depression_raw.csv',
 'atypical_antipsychotics_raw.csv',
 'calcium_channel_blockers_raw.csv',
 'oral_hypoglycemics_raw.csv',
 'pancreatic_surgery_raw.tsv']

In [4]:
# The subject name is whatever precedes the first '_raw' in each filename
subjects = [filename.partition('_raw')[0] for filename in files]
subjects

['adhd',
 'animal_depression',
 'atypical_antipsychotics',
 'calcium_channel_blockers',
 'oral_hypoglycemics',
 'pancreatic_surgery']

Import the raw datasets:

In [5]:
# Read every raw file into a dataframe, keyed by its subject name.
# Files ending in '.tsv' are parsed as tab-separated, all others as comma-separated.
# The 'synergy' flag is set for exactly the files ending in '.csv'.
raw_dataframes = {
    subject: {
        'dataframe': pd.read_csv(
            f'{data_directory_raw}/{filename}',
            sep=',' if not filename.endswith('.tsv') else '\t',
        ),
        'synergy': filename.endswith('.csv'),
    }
    for subject, filename in zip(subjects, files)
}

### Inspection

In [6]:
# Preview the first rows of the raw pancreatic_surgery dataframe (the dataset loaded from the .tsv file)
raw_dataframes['pancreatic_surgery']['dataframe'].head()

Unnamed: 0,State,StudyType,Abstract,Title,LiteratureId,ArticleUrl,FirstAuthor,Doi
0,3,7,"In this paper, I consider: the value of variou...",Reflections and proposals for the standardizat...,10718171,,Elias,10.1053/ejso.1999.0731
1,3,7,The importance of diagnostic endoscopic retrog...,Diagnostic endoscopic retrograde cholangiopanc...,10718385,,Ponchon,10.1055/s-2000-95
2,3,7,A number of endoscopic interventions have expa...,Therapeutic pancreatic endoscopy.,10718387,,Neuhaus,10.1055/s-2000-94
3,3,7,BACKGROUND: Gastric lipase contributes signifi...,Cephalic phase of lipolysis is impaired in pan...,10720121,,Wøjdemann,10.1080/003655200750024407
4,3,7,BACKGROUND/AIM: The pancreas is an organ highl...,Ischemia/Reperfusion-Induced pancreatitis.,10720825,,Sakorafas,10.1159/000018793


In [7]:
# Preview the first rows of the raw animal_depression dataframe (loaded from a .csv file)
raw_dataframes['animal_depression']['dataframe'].head()

Unnamed: 0,doi,pmid,openalex_id,label_included,method
0,https://doi.org/10.1042/bj1300919,https://pubmed.ncbi.nlm.nih.gov/4656804,https://openalex.org/W2401025235,0,id_retrieval_pmid
1,,https://pubmed.ncbi.nlm.nih.gov/6542443,https://openalex.org/W2410512259,0,id_retrieval_pmid
2,,,https://openalex.org/W2418079034,0,search_title
3,https://doi.org/10.1111/ejn.12410,https://pubmed.ncbi.nlm.nih.gov/24188077,https://openalex.org/W2017388204,1,id_retrieval_pmid
4,https://doi.org/10.1097/00003246-200106000-00024,https://pubmed.ncbi.nlm.nih.gov/11395604,https://openalex.org/W1995720522,0,id_retrieval_pmid


In [8]:
# Preview the first rows of the raw adhd dataframe (loaded from a .csv file)
raw_dataframes['adhd']['dataframe'].head()

Unnamed: 0,pmid,doi,openalex_id,label_included
0,https://pubmed.ncbi.nlm.nih.gov/10051933,https://doi.org/10.1007/bf03012457,https://openalex.org/W2082613933,0
1,https://pubmed.ncbi.nlm.nih.gov/10053177,https://doi.org/10.1056/nejm199903043400903,https://openalex.org/W2312609348,0
2,https://pubmed.ncbi.nlm.nih.gov/10066996,https://doi.org/10.1037/0021-843x.108.1.90,https://openalex.org/W2022904832,0
3,https://pubmed.ncbi.nlm.nih.gov/10072008,https://doi.org/10.1097/00000539-199903000-00020,https://openalex.org/W2021097359,0
4,https://pubmed.ncbi.nlm.nih.gov/10072410,https://doi.org/10.1056/nejm199903113401003,https://openalex.org/W4239283954,0


### Mapping

Define a function that transforms the dataframes to the uniform format

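| include | title | abstract | doi | literature_id | openalex_id |
|---------|-------|----------|-----|---------------|-------------|
| bool    | str   | str      | str | str           | str         |

Note: the `uniformize` function defined below produces the columns `include`, `title`, `abstract`, `doi` and `literature_id`; it does not populate `openalex_id`.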

In [9]:
# column names and values differ between SYNERGY and non-SYNERGY datasets
def uniformize(dataframe: pd.DataFrame, synergy: bool) -> pd.DataFrame:
    """Convert a raw dataset into the uniform column layout.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw dataset. With ``synergy=True`` the columns ``label_included``,
        ``doi`` and ``pmid`` are read; otherwise ``State``, ``Doi``,
        ``LiteratureId``, ``Title`` and ``Abstract`` are read.
    synergy : bool
        Selects which of the two raw column layouts ``dataframe`` uses.

    Returns
    -------
    pd.DataFrame
        A new dataframe with the columns ``include``, ``title``, ``abstract``,
        ``doi`` and ``literature_id``. ``title`` and ``abstract`` are missing
        (``pd.NA``) when ``synergy`` is True. ``literature_id`` is ``pd.NA``
        wherever the raw identifier is missing or does not end in a
        recognisable identifier.

    Raises
    ------
    KeyError
        If one of the columns required for the selected layout is absent.
    """
    # the datasets differ in column names for labels, doi, and identifiers
    label_column = 'label_included' if synergy else 'State'
    doi_column = 'doi' if synergy else 'Doi'
    id_column = 'pmid' if synergy else 'LiteratureId'

    # label value that marks an excluded record in the respective layout
    exclude_label = 0 if synergy else 3

    # An identifier is a trailing run of capital letters/digits, optionally
    # preceded by a source prefix: Web of Science (WOS:), Cochrane Central (CN-)
    # or hand-signed (HS-). Identifiers without a prefix (e.g. PubMed) match too.
    id_pattern = re.compile(r'(?:WOS:|CN-|HS-)*[A-Z\d]+$')

    def extract_literature_id(raw_id):
        """Return the identifier at the end of ``raw_id``, or ``pd.NA`` if there is none."""
        # pd.isna catches every missing-value flavour (None, any NaN object, pd.NA),
        # unlike an identity comparison against the np.nan singleton.
        if pd.isna(raw_id):
            return pd.NA
        # numeric columns yield ints/floats; render integral floats without '.0'
        if isinstance(raw_id, float) and raw_id.is_integer():
            raw_id = int(raw_id)
        text = str(raw_id).strip()
        # drop a float artefact such as '.../17522102.0'; otherwise the
        # end-anchored pattern would only capture the final '0'
        text = re.sub(r'\.0$', '', text)
        match = id_pattern.search(text)
        # no recognisable identifier -> missing value instead of an AttributeError
        return match.group() if match else pd.NA

    literature_ids = [extract_literature_id(raw_id) for raw_id in dataframe[id_column]]

    return pd.DataFrame(
        data={
            # everything that is not the exclude label (a missing label included)
            # counts as included
            'include': dataframe[label_column].ne(exclude_label),
            'title': pd.NA if synergy else dataframe['Title'],
            'abstract': pd.NA if synergy else dataframe['Abstract'],
            'doi': dataframe[doi_column],
            'literature_id': literature_ids,
        }
    )

Create one dictionary that contains all uniformized dataframes:

In [10]:
# Apply uniformize to every raw dataset, keeping the subject names as keys
uniform_datasets = {
    subject: uniformize(dataframe=entry['dataframe'], synergy=entry['synergy'])
    for subject, entry in raw_dataframes.items()
}

Show a few sample rows from each of the uniformized dataframes:

In [11]:
# Fixed random_state so the displayed sample is identical on every run
uniform_datasets['pancreatic_surgery'].sample(5, random_state=42)

Unnamed: 0,include,title,abstract,doi,literature_id
7613,False,[Pancreas and islet transplantation. The role ...,Diabetes mellitus is a chronic disease often l...,10.1007/s00104-008-1633-4,19224184
6819,False,Intervention in necrotizing pancreatitis: an e...,Interventional therapy in necrotizing pancreat...,10.1007/s11605-007-0445-z,18172609
20298,False,Safety and efficacy of TRIANGLE operation appl...,BACKGROUND: Pancreatic surgery is regarded as ...,10.1136/bmjopen-2021-059977,36691122
29129,False,False-Negative Rate of Endoscopic Ultrasound-G...,BACKGROUND The diagnosis of pancreatic tumors ...,10.1002/cncy.21299,WOS:000323036800008
8786,False,Metastatic tumors in the pancreas in the moder...,BACKGROUND: Tumors metastasizing to the pancre...,10.1016/j.jamcollsurg.2010.08.017,21109158


In [12]:
# Fixed random_state so the displayed sample is identical on every run
uniform_datasets['animal_depression'].sample(5, random_state=42)

Unnamed: 0,include,title,abstract,doi,literature_id
789,False,,,,
1471,False,,,https://doi.org/10.1136/jnnp.2006.104570,17522102.0
700,False,,,https://doi.org/10.1016/s0924-977x(10)70051-1,
956,False,,,https://doi.org/10.1085/jgp.37.3.335,13118104.0
1223,False,,,https://doi.org/10.1530/acta.0.0770636,4479701.0


In [13]:
# Fixed random_state so the displayed sample is identical on every run
uniform_datasets['adhd'].sample(5, random_state=42)

Unnamed: 0,include,title,abstract,doi,literature_id
250,False,,,https://doi.org/10.1001/archpsyc.58.12.1172,11735847
448,False,,,https://doi.org/10.1007/bf03040274,12658913
777,False,,,,9564199
125,False,,,https://doi.org/10.1097/00004714-200008000-00013,10917410
378,False,,,https://doi.org/10.1097/00002826-200209000-00007,12410058


### Export

In [14]:
data_directory_uniform = '../../../data/02_uniform'

# to_csv does not create missing directories, so make sure the target exists
os.makedirs(data_directory_uniform, exist_ok=True)

# Write one CSV per subject. A plain loop is used because the writes are
# side effects; there is no result worth collecting in a list.
for subject, dataframe in uniform_datasets.items():
    dataframe.to_csv(f'{data_directory_uniform}/{subject}_uniform.csv', index=False)