# Preprocessing the Datasets

## Uniformization

In [1]:
import re # regular expressions
import os # file system
import pandas as pd # dataframes
import numpy as np # numerical operations

### Loading

The raw datasets all sit in the same directory:

In [2]:
# folder holding the raw dataset files, given relative to the notebook's working directory
data_directory_raw = '../../../../data/01_original'

In [3]:
# os.listdir returns entries in arbitrary order and also lists stray entries
# (hidden OS files, checkpoint folders); keep only the delimited data files
# and sort them so the dataset order is reproducible across runs and machines
files = sorted(entry for entry in os.listdir(data_directory_raw)
               if entry.endswith(('.csv', '.tsv')))
files

['adhd_raw.csv',
 'animal_depression_raw.csv',
 'atypical_antipsychotics_raw.csv',
 'calcium_channel_blockers_raw.csv',
 'oral_hypoglycemics_raw.csv',
 'pancreatic_surgery_raw.tsv']

In [4]:
# the subject name is everything in front of the first '_raw' in the file name
subjects = [filename.partition('_raw')[0] for filename in files]
subjects

['adhd',
 'animal_depression',
 'atypical_antipsychotics',
 'calcium_channel_blockers',
 'oral_hypoglycemics',
 'pancreatic_surgery']

Import the raw datasets:

In [5]:
# one entry per subject: the loaded dataframe plus a flag telling whether the
# file is a SYNERGY dataset (the .csv files) or not (the .tsv file)
raw_dataframes = {
    subject: {
        'dataframe': pd.read_csv(
            f'{data_directory_raw}/{filename}',
            # tab-separated for .tsv, comma-separated otherwise
            sep='\t' if filename.endswith('.tsv') else ',',
        ),
        'synergy': filename.endswith('.csv'),
    }
    for subject, filename in zip(subjects, files)
}

### Inspection

In [6]:
# preview the first rows of the pancreatic surgery dataset (loaded from the .tsv file)
raw_dataframes['pancreatic_surgery']['dataframe'].head()

Unnamed: 0,State,StudyType,Abstract,Title,LiteratureId,ArticleUrl,FirstAuthor,Doi
0,3,7,"In this paper, I consider: the value of variou...",Reflections and proposals for the standardizat...,10718171,,Elias,10.1053/ejso.1999.0731
1,3,7,The importance of diagnostic endoscopic retrog...,Diagnostic endoscopic retrograde cholangiopanc...,10718385,,Ponchon,10.1055/s-2000-95
2,3,7,A number of endoscopic interventions have expa...,Therapeutic pancreatic endoscopy.,10718387,,Neuhaus,10.1055/s-2000-94
3,3,7,BACKGROUND: Gastric lipase contributes signifi...,Cephalic phase of lipolysis is impaired in pan...,10720121,,Wøjdemann,10.1080/003655200750024407
4,3,7,BACKGROUND/AIM: The pancreas is an organ highl...,Ischemia/Reperfusion-Induced pancreatitis.,10720825,,Sakorafas,10.1159/000018793


In [7]:
# preview the first rows of the animal depression dataset (loaded from a .csv file)
raw_dataframes['animal_depression']['dataframe'].head()

Unnamed: 0,doi,pmid,openalex_id,label_included,method
0,https://doi.org/10.1042/bj1300919,https://pubmed.ncbi.nlm.nih.gov/4656804,https://openalex.org/W2401025235,0,id_retrieval_pmid
1,,https://pubmed.ncbi.nlm.nih.gov/6542443,https://openalex.org/W2410512259,0,id_retrieval_pmid
2,,,https://openalex.org/W2418079034,0,search_title
3,https://doi.org/10.1111/ejn.12410,https://pubmed.ncbi.nlm.nih.gov/24188077,https://openalex.org/W2017388204,1,id_retrieval_pmid
4,https://doi.org/10.1097/00003246-200106000-00024,https://pubmed.ncbi.nlm.nih.gov/11395604,https://openalex.org/W1995720522,0,id_retrieval_pmid


In [8]:
# preview the first rows of the ADHD dataset (loaded from a .csv file)
raw_dataframes['adhd']['dataframe'].head()

Unnamed: 0,pmid,doi,openalex_id,label_included
0,https://pubmed.ncbi.nlm.nih.gov/10051933,https://doi.org/10.1007/bf03012457,https://openalex.org/W2082613933,0
1,https://pubmed.ncbi.nlm.nih.gov/10053177,https://doi.org/10.1056/nejm199903043400903,https://openalex.org/W2312609348,0
2,https://pubmed.ncbi.nlm.nih.gov/10066996,https://doi.org/10.1037/0021-843x.108.1.90,https://openalex.org/W2022904832,0
3,https://pubmed.ncbi.nlm.nih.gov/10072008,https://doi.org/10.1097/00000539-199903000-00020,https://openalex.org/W2021097359,0
4,https://pubmed.ncbi.nlm.nih.gov/10072410,https://doi.org/10.1056/nejm199903113401003,https://openalex.org/W4239283954,0


### Mapping

Define a function that transforms the dataframes into the uniform format:

| include 	| title 	| abstract 	| doi 	| literature_id 	| openalex_id 	|
|---------	|-------	|----------	|-----	|------	|-------------	|
| bool    	| str   	| str      	| str 	| str  	| str         	|

In [9]:
# column names and values differ between SYNERGY and non-SYNERGY datasets
def uniformize(dataframe: pd.DataFrame, synergy: bool) -> pd.DataFrame:
    """Convert a raw dataset into the uniform column layout.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw dataset. With ``synergy=True`` the columns ``label_included``,
        ``doi``, ``pmid`` and ``openalex_id`` are read; otherwise the columns
        ``State``, ``Doi``, ``LiteratureId``, ``Title`` and ``Abstract``.
    synergy : bool
        Whether ``dataframe`` follows the SYNERGY column layout.

    Returns
    -------
    pd.DataFrame
        A new dataframe with the columns ``include``, ``title``, ``abstract``,
        ``doi``, ``literature_id`` and ``openalex_id``. Columns the source
        layout does not provide are filled with ``pd.NA``. The ``doi`` values
        are copied unchanged from the source column.
    """
    # the datasets differ in column names for labels, doi, and identifiers
    label_column = 'label_included' if synergy else 'State'
    doi_column = 'doi' if synergy else 'Doi'
    id_column = 'pmid' if synergy else 'LiteratureId'

    # every label other than the exclude label counts as an inclusion
    exclude_label = 0 if synergy else 3

    # identifiers have different formats: Web of Science (WOS:), Cochrane
    # Central (CN-), hand-signed (HS-) and PubMed (no prefix); the pattern
    # picks the identifier off the end of the raw value (e.g. a PubMed URL)
    id_formats = re.compile(r'(WOS:|CN-|HS-)*([A-Z]|\d)+$')

    def _extract_literature_id(raw_id: object) -> object:
        """Return the trailing identifier of ``raw_id`` or ``pd.NA``."""
        # pd.isna recognises every missing-value flavour (None, pd.NA, any
        # float NaN), unlike an identity comparison against np.nan
        if pd.isna(raw_id):
            return pd.NA
        # numeric columns containing gaps are read as floats; drop the
        # artificial '.0' so that 123.0 yields '123' instead of '0'
        if isinstance(raw_id, float) and raw_id.is_integer():
            raw_id = int(raw_id)
        # str() keeps purely numeric identifier columns from breaking the regex
        match = id_formats.search(str(raw_id))
        # an unparseable identifier becomes missing instead of raising
        return match.group() if match else pd.NA

    literature_ids = [_extract_literature_id(raw_id)
                      for raw_id in dataframe[id_column]]

    return pd.DataFrame(
        data={
            'include': dataframe[label_column].ne(exclude_label),
            'title': pd.NA if synergy else dataframe['Title'],
            'abstract': pd.NA if synergy else dataframe['Abstract'],
            'doi': dataframe[doi_column],
            'literature_id': literature_ids,
            'openalex_id': dataframe['openalex_id'] if synergy else pd.NA,
        }
    )

Create one dictionary that contains all uniformized dataframes:

In [10]:
# apply the uniformization to every loaded dataset, keyed by subject
uniform_datasets = {
    subject: uniformize(entry['dataframe'], entry['synergy'])
    for subject, entry in raw_dataframes.items()
}

Show a few sample rows from the uniformized dataframes:

In [11]:
# fixed seed so the displayed sample stays the same across re-runs
uniform_datasets['pancreatic_surgery'].sample(5, random_state=42)

Unnamed: 0,include,title,abstract,doi,literature_id,openalex_id
22311,False,Prognostic significance of DNA ploidy in adeno...,BACKGROUND: The prognostic significance of tum...,10.1002/1097-0142(19930615)71:12<3846::aid-cnc...,8508352,
16001,False,[Duodenal leiomyoma - a rare case report].,Neoplasms of the small bowel are rare and comp...,10.1556/1046.70.2017.3.3,28876117,
23208,False,"Chemotherapy in cancer of the liver, pancreas ...",,10.1080/110241598750004878,9537701,
29169,False,Surgical management in patients with pancreati...,BackgroundLittle has been published regarding ...,10.1111/j.1445-2197.2012.06312.x,WOS:000326237400017,
22659,False,"""Osteoclastic"" giant cell carcinoma of the pan...",BACKGROUND: Osteoclastic giant cell carcinoma ...,10.1159/000334012,8842177,


In [12]:
# fixed seed so the displayed sample stays the same across re-runs
uniform_datasets['animal_depression'].sample(5, random_state=42)

Unnamed: 0,include,title,abstract,doi,literature_id,openalex_id
1791,False,,,https://doi.org/10.1007/bf00247505,2844579.0,https://openalex.org/W1997014842
610,False,,,https://doi.org/10.2174/1566524016666151222143609,26695696.0,https://openalex.org/W2198649099
1045,False,,,https://doi.org/10.1002/(sici)1521-4184(199805...,,https://openalex.org/W2002420245
1505,False,,,https://doi.org/10.1016/j.jinsphys.2008.12.013,19200436.0,https://openalex.org/W2135452288
1635,False,,,https://doi.org/10.1358/dof.2007.032.09.1135518,,https://openalex.org/W4238522086


In [13]:
# fixed seed so the displayed sample stays the same across re-runs
uniform_datasets['adhd'].sample(5, random_state=42)

Unnamed: 0,include,title,abstract,doi,literature_id,openalex_id
232,False,,,https://doi.org/10.1016/s0140-6736(01)06128-1,11586978,https://openalex.org/W1992787738
619,False,,,https://doi.org/10.1097/00000539-199602000-00008,8561324,https://openalex.org/W2015741539
552,False,,,https://doi.org/10.1016/0165-1781(94)90007-8,7761553,https://openalex.org/W2010756522
566,False,,,https://doi.org/10.1136/bjo.78.1.30,7906539,https://openalex.org/W2099866489
392,False,,,https://doi.org/10.1177/0091270002042012005,12463729,https://openalex.org/W2075009263


### Export

In [14]:
# NOTE(review): this path goes three levels up, while the raw data directory
# defined earlier in this notebook goes four levels up — confirm both are intended
data_directory_uniform = '../../../data/02_uniform'

# write each dataset to '<subject>_uniform.csv', leaving out the row index
for subject, dataframe in uniform_datasets.items():
    dataframe.to_csv(f'{data_directory_uniform}/{subject}_uniform.csv', index=False)