In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_rows = 300
import math
import datetime as dt
import itertools
import random
import scipy.stats as stats
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
from sklearn.preprocessing import scale

# Data Synthesis

## eICU

In [None]:
# eICU: read the per-stay group assignment and keep one row per
# (stay_id, groupHPD) pair. groupHPD is presumably the trajectory-cluster
# label produced upstream -- TODO confirm.
df_type = pd.read_csv("../../03.eICU_SAKI_trajCluster/df_mixAK_fea4_C3_eicu.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()
all_lst = df_type_filt["stay_id"].unique().tolist()
print("Number of included patients:",len(all_lst))
# Not the last expression of the cell, so this table is computed but not shown.
df_type_filt["groupHPD"].value_counts()

# Read the eICU feature table, keep only stays that have a group label
# (inner join on stay_id) and tag every row with its source database.
df_fea_eicu = (
    pd.read_csv("../../03.eICU_SAKI_trajCluster/sk_feature_timescale_Fb2_eicu.csv")
    .merge(df_type_filt, how="inner", on="stay_id")
    .assign(dataset="eicu")
)
df_fea_eicu.head(1)

In [3]:
# df_fea_eicu["temperature"].hist()
# plt.show()

## aumc

In [None]:
# AUMCdb: read the per-stay group assignment and keep one row per
# (stay_id, groupHPD) pair. `all_lst` is reused by the bilirubin cell below
# to restrict the SA-AKI onset table to these stays.
df_type = pd.read_csv("../../02.AUMCdb_SAKI_trajCluster/df_mixAK_fea3_C3_aumc.csv")
df_type_filt = df_type.loc[:, ["stay_id", "groupHPD"]].drop_duplicates()
all_lst = df_type_filt["stay_id"].unique().tolist()
print("Number of included patients:",len(all_lst))
# Not the last expression of the cell, so this table is computed but not shown.
df_type_filt["groupHPD"].value_counts()

df_fea_aumc = pd.read_csv("../../02.AUMCdb_SAKI_trajCluster/sk_feature_timescale_Fb2_aumc.csv")

# Blank out implausible readings before filling (thresholds as chosen by the
# original author; units are not visible here -- TODO confirm).
df_fea_aumc.loc[df_fea_aumc["hemoglobin"] > 1000, "hemoglobin"] = np.nan
df_fea_aumc.loc[df_fea_aumc["calcium"] > 25, "calcium"] = np.nan

# Calcium: forward- then backward-fill *within each stay*.
# The previous form, groupby(...).fillna('ffill').fillna('bfill'), applied the
# backward fill to the ungrouped Series, so a stay without any calcium value
# silently inherited the first value of the next stay in row order. Stays
# with no calcium at all now remain NaN and are handled by the later
# imputation step instead.
df_fea_aumc["calcium"] = (
    df_fea_aumc.groupby("stay_id")["calcium"].transform(lambda s: s.ffill().bfill())
)
# Hemoglobin: forward-fill only, within each stay (leading gaps stay NaN,
# exactly as before).
df_fea_aumc["hemoglobin"] = df_fea_aumc.groupby("stay_id")["hemoglobin"].ffill()

# The merge with the group labels is deliberately deferred until bilirubin
# has been added and imputed; the original in-place version is kept for reference.
# df_fea_aumc = pd.merge(df_fea_aumc,df_type_filt, how="inner",on="stay_id")
# df_fea_aumc["dataset"] = "aumcdb"
# df_fea_aumc.head(1)

In [None]:
### Add bilirubin
# SA-AKI onset time for the included AUMCdb stays (`all_lst` comes from the
# AUMCdb loading cell).
df_saki = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_aumc/disease_definition/AKI/aumcdb_sk_event_time.csv")
df_saki = df_saki.loc[df_saki["stay_id"].isin(all_lst), ["stay_id", "saki_onset"]]

# Raw bilirubin measurements, renamed to the notebook's column conventions.
df_bi = pd.read_csv("/public/hanl/jupyter_dir/database/AMUCdb/raw/feature/bilirubin.csv")
df_bi = df_bi[["admissionid", "measuredat", "value"]].rename(
    columns={"admissionid": "stay_id", "value": "bilirubin"}
)
# measuredat / (1000*60*60): presumably milliseconds -> hours, matching the
# unit of saki_onset -- TODO confirm.
df_bi["charttime"] = df_bi["measuredat"] / (1000 * 60 * 60)

# Bin each measurement into 6-unit windows relative to SA-AKI onset.
# Bins at or after onset are shifted by +1, so the sequence runs
# ..., -2, -1, 1, 2, ... and no bin is numbered 0.
df_bi = df_bi.merge(df_saki, how="inner", on="stay_id")
onset_bin = (df_bi["charttime"] - df_bi["saki_onset"]) // 6
df_bi["time"] = onset_bin.where(onset_bin < 0, onset_bin + 1)
df_bi = df_bi.drop(columns=["charttime", "saki_onset", "measuredat"])
# Average repeated measurements within a bin, then keep bins -2 .. 28 (inclusive).
df_bi = df_bi.groupby(["stay_id", "time"]).agg("mean").reset_index()
df_bi = df_bi[df_bi["time"].between(-2, 28)]

# Interpolation
# Outer join keeps bins present in only one of the two tables; gaps are then
# forward- and backward-filled within each stay. Grouping on a copy of
# stay_id keeps the real stay_id column in the transform output.
df_m = (
    df_fea_aumc
    .merge(df_bi, how="outer", on=["stay_id", "time"])
    .sort_values(["stay_id", "time"])
)
df_m["stay_id_copy"] = df_m["stay_id"]
df_m = df_m.groupby(['stay_id_copy']).transform(lambda col: col.ffill().bfill())

# Multiple interpolation
# Whatever is still missing after the within-stay fill is imputed by MICE;
# three datasets are generated and dataset index 1 is the one carried forward.
import miceforest as mf
kernel = mf.ImputationKernel(
    df_m,
    datasets=3,
    save_all_iterations=True,
    random_state=10
)
kernel.mice(iterations = 3, n_jobs=-1)
df_m2 = kernel.complete_data(dataset=1)


In [6]:
# Attach the group labels to the imputed AUMCdb features (inner join on
# stay_id) and tag the rows with their source database. `df_type_filt` must
# still be the AUMCdb label table at this point.
df_fea_aumc = (
    df_m2
    .merge(df_type_filt, how="inner", on="stay_id")
    .assign(dataset="aumcdb")
)
df_fea_aumc.head(1)

Unnamed: 0,stay_id,time,baseexcess,wbc,hematocrit,hemoglobin,pt,ptt,dbp,mbp,...,chloride,creatinine,po2,pco2,fio2,urineoutput,crea_divide_basecrea,bilirubin,groupHPD,dataset
0,5,-1.0,4.2,20.600001,0.443333,14.28686,10.3,34.0,70.2341,84.676544,...,105.0,0.6554,162.0,42.0,41.0,20.0,1.03,5.0,2,aumcdb


In [7]:
# df_fea_aumc["temperature"].hist()
# plt.show()

## MIMIC 

In [None]:
# MIMIC-IV: read the per-stay group assignment and keep one row per
# (stay_id, groupHPD) pair.
df_type = pd.read_csv("../../01.MIMICIV_SAKI_trajCluster/df_mixAK_fea4_C3.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()
all_lst = df_type_filt["stay_id"].unique().tolist()
print("Number of included patients:",len(all_lst))
# Not the last expression of the cell, so this table is computed but not shown.
df_type_filt["groupHPD"].value_counts()

df_fea_mimic = pd.read_csv("../../01.MIMICIV_SAKI_trajCluster/sk_feature_timescale_Fb2.csv")
# Temperatures above 1000 are treated as invalid readings and blanked.
df_fea_mimic.loc[df_fea_mimic["temperature"] > 1000, "temperature"] = np.nan
# Keep only stays that have a group label and tag the source database.
df_fea_mimic = (
    df_fea_mimic
    .merge(df_type_filt, how="inner", on="stay_id")
    .assign(dataset="mimic")
)
df_fea_mimic.head(1)

In [9]:
# df_fea_mimic["temperature"].hist()
# plt.show()

## Merge: variables common to all three databases

In [None]:
# Align the MIMIC column names with the other two databases so they can be
# matched by name. Renaming is a no-op if the columns were already renamed.
df_fea_mimic = df_fea_mimic.rename(columns={'bilirubin_total':'bilirubin','platelet':'platelets'})
# merge
fea1st1 = df_fea_mimic.columns.tolist()
fea1st2 = df_fea_aumc.columns.tolist()
fea1st3 = df_fea_eicu.columns.tolist()

# Columns present in all three databases. The selection follows the MIMIC
# column order rather than `list(set(...))`: set iteration order for strings
# changes between interpreter sessions, which made the column order of the
# merged frame (and of the CSV written from it) differ from run to run.
common_cols = set(fea1st1).intersection(fea1st2, fea1st3)
fea_select = [col for col in dict.fromkeys(fea1st1) if col in common_cols]

print("Common and unique characteristics:",fea_select)
print("The unique features of MIMIC:",[col for col in dict.fromkeys(fea1st1) if col not in common_cols])
print("Unique features of AUMCdb:",[col for col in dict.fromkeys(fea1st2) if col not in common_cols])
print("The unique characteristics of eICU:",[col for col in dict.fromkeys(fea1st3) if col not in common_cols])

# Stack the three databases row-wise and keep only the shared columns.
df_merge = pd.concat([df_fea_aumc,df_fea_mimic,df_fea_eicu],axis=0)
df_merge = df_merge[fea_select]
all_lst = df_merge.stay_id.unique().tolist()
print(len(all_lst))

In [None]:
# Build a complete time grid per stay and fill the gaps.
# Author's note: 16 stays have no rows at time = -1 / -2; after the grid is
# built those rows are back-filled from the stay's first available bin.
min_t = -2

# Last observed time bin per stay, computed in a single grouped pass
# (the previous loop re-filtered the whole frame once per stay).
# sort=False keeps the stays in order of first appearance.
last_bin_by_stay = df_merge.groupby("stay_id", sort=False)["time"].max()
com_lst = [
    [stay, t]
    for stay, last_bin in last_bin_by_stay.items()
    for t in range(min_t, int(last_bin) + 1)
]
df_complete = pd.DataFrame(com_lst, columns=["stay_id", "time"])

# Left join: every grid row is kept; bins without data get NaN features.
df_fea = pd.merge(df_complete, df_merge, how="left", on=["stay_id", "time"])
# The grid contains 0, but bin 0 is dropped here (presumably because the
# binning scheme skips it -- TODO confirm for all three databases).
df_fea = df_fea[df_fea["time"] != 0]

# Forward- then backward-fill within each stay. Grouping on a copy of stay_id
# keeps the real stay_id column in the transform output. ffill()/bfill() are
# the equivalents of the deprecated fillna(method=...).
df_fea = df_fea.sort_values(["stay_id", "time"])
df_fea["stay_id_copy"] = df_fea["stay_id"]
df_fea = df_fea.groupby(['stay_id_copy']).transform(lambda col: col.ffill().bfill())
df_fea.to_csv("df_saki_timeseries_feature.csv",index=False) # Characteristics of the Intersection of three databases

## Merge: all variables

In [None]:
# Align the MIMIC column names with the other two databases so they can be
# matched by name. Renaming is a no-op if the columns were already renamed.
df_fea_mimic = df_fea_mimic.rename(columns={'bilirubin_total':'bilirubin','platelet':'platelets'})
# merge
fea1st1 = df_fea_mimic.columns.tolist()
fea1st2 = df_fea_aumc.columns.tolist()
fea1st3 = df_fea_eicu.columns.tolist()

# Columns present in all three databases, in MIMIC column order.
# `list(set(...))` was avoided because set iteration order for strings changes
# between interpreter sessions, which made the output column order unstable.
common_cols = set(fea1st1).intersection(fea1st2, fea1st3)
fea_select = [col for col in dict.fromkeys(fea1st1) if col in common_cols]
# Extra variables to keep even when they are not shared by all three databases.
add_feature = ['bilirubin', 'platelets','inr','pt', 'ptt' ,'alp', 'ast','alt','bun']

print("Common and unique characteristics:",fea_select)
print("The unique features of MIMIC:",[col for col in dict.fromkeys(fea1st1) if col not in common_cols])
print("Unique features of AUMCdb:",[col for col in dict.fromkeys(fea1st2) if col not in common_cols])
print("The unique characteristics of eICU:",[col for col in dict.fromkeys(fea1st3) if col not in common_cols])

# Append only the extra variables that are not already selected. A plain
# extend() would list a shared variable twice and `df_merge[fea_select]`
# would then return duplicate columns.
fea_select.extend(col for col in add_feature if col not in common_cols)
df_merge = pd.concat([df_fea_aumc,df_fea_mimic,df_fea_eicu],axis=0)
df_merge = df_merge[fea_select]
all_lst = df_merge.stay_id.unique().tolist()
print(len(all_lst))

In [13]:
# One row per distinct (dataset, stay_id) pair in the merged frame.
df_merge.loc[:, ["dataset", "stay_id"]].drop_duplicates()

Unnamed: 0,dataset,stay_id
0,aumcdb,5
12,aumcdb,15
41,aumcdb,20
70,aumcdb,35
97,aumcdb,38
...,...,...
29139,eicu,3247421
29159,eicu,3329427
29179,eicu,3338104
29189,eicu,3340575


In [None]:
# Build a complete time grid per stay and fill the gaps.
# Author's note: 16 stays have no rows at time = -1 / -2; after the grid is
# built those rows are back-filled from the stay's first available bin.
min_t = -2

# Last observed time bin per stay, computed in a single grouped pass
# (the previous loop re-filtered the whole frame once per stay).
# sort=False keeps the stays in order of first appearance.
last_bin_by_stay = df_merge.groupby("stay_id", sort=False)["time"].max()
com_lst = [
    [stay, t]
    for stay, last_bin in last_bin_by_stay.items()
    for t in range(min_t, int(last_bin) + 1)
]
df_complete = pd.DataFrame(com_lst, columns=["stay_id", "time"])

# Left join: every grid row is kept; bins without data get NaN features.
df_fea = pd.merge(df_complete, df_merge, how="left", on=["stay_id", "time"])
# The grid contains 0, but bin 0 is dropped here (presumably because the
# binning scheme skips it -- TODO confirm for all three databases).
df_fea = df_fea[df_fea["time"] != 0]

# Forward- then backward-fill within each stay. Grouping on a copy of stay_id
# keeps the real stay_id column in the transform output. ffill()/bfill() are
# the equivalents of the deprecated fillna(method=...).
df_fea = df_fea.sort_values(["stay_id", "time"])
df_fea["stay_id_copy"] = df_fea["stay_id"]
df_fea = df_fea.groupby(['stay_id_copy']).transform(lambda col: col.ffill().bfill())
df_fea.to_csv("df_saki_timeseries_feature_all.csv",index=False)
# Sanity check: one row per distinct (dataset, stay_id) pair after filling.
df_fea[["dataset","stay_id"]].drop_duplicates()

Unnamed: 0,dataset,stay_id
0,aumcdb,5
14,aumcdb,15
45,aumcdb,20
76,aumcdb,35
105,aumcdb,38
...,...,...
141975,mimic,39985287
142003,mimic,39989733
142020,mimic,39992167
142051,mimic,39993683
