In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_rows = 300
import math
import datetime as dt
import itertools
import random
import scipy.stats as stats
import warnings
# Silence pandas' SettingWithCopyWarning for the whole notebook.
# The class moved to `pandas.errors` in pandas 1.5 and is no longer importable
# from `pandas.core.common` on pandas 2.x, so try the public location first and
# fall back to the legacy one. If neither exists there is nothing to filter.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
from sklearn.preprocessing import scale

# Data Synthesis

## eICU

In [None]:
# Load the eICU trajectory-cluster table and reduce it to the distinct
# (stay_id, groupHPD) pairs.
df_type = pd.read_csv("../../03.eICU_SAKI_trajCluster/df_mixAK_fea4_C3_eicu.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()

# Stay ids of the cohort; used by the following cells to filter the raw tables.
all_lst = df_type_filt["stay_id"].unique().tolist()
print("Number of included patients:", len(all_lst))

# Cell output: number of rows per groupHPD value.
df_type_filt.groupHPD.value_counts()

In [3]:
# Read the eICU SAKI event-time table, then keep only cohort stays and the
# two columns needed for onset alignment.
df_time = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_eicu//disease_definition/AKI/eicu_saki_event_time.csv")
df_time = df_time.loc[df_time["stay_id"].isin(all_lst), ["stay_id", "saki_onset"]]

In [4]:
# Build the per-stay, per-day eICU feature table aligned to SAKI onset.
data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_eicu/feature_data/"
infile = data_dir + "eicu_data_merge.csv"
df_fea = pd.read_csv(infile)
# .copy() so the column assignment below writes to an independent frame rather
# than to a filtered slice of the raw table.
df_fea = df_fea[df_fea["stay_id"].isin(all_lst)].copy()
# NOTE(review): presumably converts charttime from minutes to hours so it matches
# the unit of saki_onset -- confirm against the source tables.
df_fea["charttime"] = df_fea["charttime"]/60

df_fea_add = pd.merge(df_fea, df_time, how="inner", on="stay_id")
# Day bin relative to onset: floor((charttime - onset) / 24), then shift the
# non-negative bins up by one so the bins run ..., -2, -1, 1, 2, ... (no day 0).
day_bin = np.floor((df_fea_add["charttime"] - df_fea_add["saki_onset"]) / 24)
df_fea_add["time"] = np.where(day_bin >= 0, day_bin + 1, day_bin)
# Keep the window from 2 days before onset to day 7 (both inclusive).
df_fea_add = df_fea_add[df_fea_add["time"].between(-2, 7)]

df_fea_add = df_fea_add.drop(["charttime", "saki_onset"], axis=1)
daily = df_fea_add.groupby(["stay_id", "time"])
# Every feature is averaged per day, except urine output, which is totalled.
df_fea_add_1 = daily.mean().drop(["urineoutput"], axis=1).reset_index()
# min_count=1: a day with no urine-output reading stays NaN (and is imputed
# later) instead of being recorded as a total of 0.
df_fea_add_2 = daily["urineoutput"].sum(min_count=1).reset_index()
df_fea_add = pd.merge(df_fea_add_1, df_fea_add_2, how="inner", on=["stay_id", "time"])

In [None]:
import miceforest as mf

# Interpolation 1: forward-fill missing values within each stay, in time order.
# A grouped ffill replaces the per-stay loop that re-concatenated the growing
# frame on every iteration (quadratic) and the deprecated fillna(method=...).
df_fea_add = df_fea_add.sort_values(by=["stay_id", "time"])
df_im = df_fea_add.copy()
value_cols = [col for col in df_im.columns if col != "stay_id"]
df_im[value_cols] = df_fea_add.groupby("stay_id")[value_cols].ffill()

# Interpolation 2: MICE imputation of whatever is still missing.
# The kernel builds 3 imputed datasets; only dataset index 1 is read back.
kernel = mf.ImputationKernel(
    df_im,
    datasets=3,
    save_all_iterations=True,
    random_state=10
)
kernel.mice(iterations = 3, n_jobs=-1)
df_im2 = kernel.complete_data(dataset=1)

In [6]:
# Attach the cluster label to every imputed eICU row and tag the source dataset.
df_fea_eicu = df_im2.merge(df_type_filt, on="stay_id", how="inner")
df_fea_eicu = df_fea_eicu.assign(dataset="eicu")
df_fea_eicu.head(1)

Unnamed: 0,stay_id,time,fio2,po2,pco2,ph,aniongap,baseexcess,peep,albumin,...,heart_rate,resp_rate,spo2,temperature,sbp,dbp,mbp,urineoutput,groupHPD,dataset
0,243208,-1.0,0.7,66.2,37.0,7.56,13.0,3.433333,12.0,1.9,...,96.0,20.8,93.6,37.475,111.2,64.230769,78.678571,400.0,1,eicu


## AUMCdb

In [None]:
# Load the AUMCdb trajectory-cluster table and reduce it to the distinct
# (stay_id, groupHPD) pairs.
df_type = pd.read_csv("../../02.AUMCdb_SAKI_trajCluster/df_mixAK_fea3_C3_aumc.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()

# Stay ids of the cohort; used by the following cells to filter the raw tables.
all_lst = df_type_filt["stay_id"].unique().tolist()
print("Number of included patients:", len(all_lst))

# Cell output: number of rows per groupHPD value.
df_type_filt.groupHPD.value_counts()

In [None]:
# Load SAKI onset times for the AUMCdb cohort (stay_id / saki_onset only).
df_time = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_aumc/disease_definition/AKI/aumcdb_sk_event_time.csv")
df_time = df_time[df_time["stay_id"].isin(all_lst)]
df_time = df_time[["stay_id", "saki_onset"]]

# Author's note: each clinical measurement window starts at time//60 and ends at
# time//60 + 1, i.e. the hour since ICU admission.
# NOTE(review): charttime is used as-is here (not divided by 60 as in the eICU
# cell), so it is presumably already in hours -- confirm.
df_fea = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_aumc/feature_data/aumc_icu_feature.csv")
# The inner merge also restricts the features to cohort stays, because df_time
# was filtered above.
df_fea_add = pd.merge(df_fea, df_time, how="inner", on="stay_id")
# Day bin relative to onset: floor((charttime - onset) / 24), then shift the
# non-negative bins up by one so the bins run ..., -2, -1, 1, 2, ... (no day 0).
day_bin = np.floor((df_fea_add["charttime"] - df_fea_add["saki_onset"]) / 24)
df_fea_add["time"] = np.where(day_bin >= 0, day_bin + 1, day_bin)
# Keep the window from 2 days before onset to day 7 (both inclusive).
df_fea_add = df_fea_add[df_fea_add["time"].between(-2, 7)]

df_fea_add = df_fea_add.drop(["charttime", "saki_onset", "platelet"], axis=1)
daily = df_fea_add.groupby(["stay_id", "time"])
# Every feature is averaged per day, except urine output, which is totalled.
df_fea_add_1 = daily.mean().drop(["urineoutput"], axis=1).reset_index()
# min_count=1: a day with no urine-output reading stays NaN (and is imputed
# later) instead of being recorded as a total of 0.
df_fea_add_2 = daily["urineoutput"].sum(min_count=1).reset_index()
df_fea_add = pd.merge(df_fea_add_1, df_fea_add_2, how="inner", on=["stay_id", "time"])

In [None]:
# Imported here as well so this cell does not depend on another cell's import.
import miceforest as mf

# Interpolation 1: forward-fill missing values within each stay, in time order.
# A grouped ffill replaces the per-stay loop that re-concatenated the growing
# frame on every iteration (quadratic) and the deprecated fillna(method=...).
df_fea_add = df_fea_add.sort_values(by=["stay_id", "time"])
df_im = df_fea_add.copy()
value_cols = [col for col in df_im.columns if col != "stay_id"]
df_im[value_cols] = df_fea_add.groupby("stay_id")[value_cols].ffill()

# Interpolation 2: MICE imputation of whatever is still missing.
# A single imputed dataset is built and read back (index 0).
kernel = mf.ImputationKernel(
    df_im,
    datasets=1,
    save_all_iterations=True,
    random_state=10
)
kernel.mice(iterations = 3, n_jobs=-1)
df_im2 = kernel.complete_data(dataset=0)

In [10]:
# Attach the cluster label to every imputed AUMCdb row and tag the source dataset.
df_fea_aumc = df_im2.merge(df_type_filt, on="stay_id", how="inner")
df_fea_aumc = df_fea_aumc.assign(dataset="aumcdb")
df_fea_aumc.head(1)

Unnamed: 0,stay_id,time,baseexcess,rbc,mch,mcv,mchc,rdw,wbc,hematocrit,...,gcs,gcs_eyes,gcs_motor,gcs_verbal,po2,pco2,fio2,urineoutput,groupHPD,dataset
0,5,-1.0,4.2,4.7,30.100952,96.0,30.6147,13.9,20.600001,0.443333,...,15.0,4.0,6.0,5.0,162.0,42.0,41.0,40.0,2,aumcdb


## MIMIC-IV

In [None]:
# Load the MIMIC-IV trajectory-cluster table and reduce it to the distinct
# (stay_id, groupHPD) pairs.
df_type = pd.read_csv("../../01.MIMICIV_SAKI_trajCluster/df_mixAK_fea4_C3.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()

# Stay ids of the cohort; used by the following cells to filter the raw tables.
all_lst = df_type_filt["stay_id"].unique().tolist()
print("Number of included patients:", len(all_lst))

# Cell output: number of rows per groupHPD value.
df_type_filt.groupHPD.value_counts()

In [None]:
# Load SAKI onset timestamps for the MIMIC cohort (stay_id / saki_onset only).
df_time = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_mimic/disease_definition/AKI/sk_event_time.csv")
df_time = df_time[df_time["stay_id"].isin(all_lst)]
# .copy() so the datetime conversion below writes to an independent frame
# rather than to a filtered slice.
df_time = df_time[["stay_id", "saki_onset"]].copy()
df_time["saki_onset"] = pd.to_datetime(df_time["saki_onset"])

data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_mimic/feature_data/"
infile = data_dir + "sk_icu_feature.csv"
df_fea = pd.read_csv(infile)
df_fea["charttime"] = pd.to_datetime(df_fea["charttime"])
# The inner merge also restricts the features to cohort stays, because df_time
# was filtered above.
df_fea_add = pd.merge(df_fea, df_time, how="inner", on="stay_id")
# Day bin relative to onset: floor of the offset in days, then shift the
# non-negative bins up by one so the bins run ..., -2, -1, 1, 2, ... (no day 0).
day_bin = np.floor((df_fea_add["charttime"] - df_fea_add["saki_onset"]) / pd.Timedelta(days=1))
df_fea_add["time"] = np.where(day_bin >= 0, day_bin + 1, day_bin)
# Keep the window from 2 days before onset to day 7 (both inclusive).
df_fea_add = df_fea_add[df_fea_add["time"].between(-2, 7)]

df_fea_add = df_fea_add.drop(["charttime", "saki_onset"], axis=1)
daily = df_fea_add.groupby(["stay_id", "time"])
# Every feature is averaged per day, except urine output, which is totalled.
df_fea_add_1 = daily.mean().drop(["urineoutput"], axis=1).reset_index()
# min_count=1: a day with no urine-output reading stays NaN (and is imputed
# later) instead of being recorded as a total of 0.
df_fea_add_2 = daily["urineoutput"].sum(min_count=1).reset_index()
df_fea_add = pd.merge(df_fea_add_1, df_fea_add_2, how="inner", on=["stay_id", "time"])

In [None]:
import miceforest as mf

# Interpolation 1: forward-fill missing values within each stay, in time order.
# A grouped ffill replaces the per-stay loop that re-concatenated the growing
# frame on every iteration (quadratic) and the deprecated fillna(method=...).
df_fea_add = df_fea_add.sort_values(by=["stay_id", "time"])
df_im = df_fea_add.copy()
value_cols = [col for col in df_im.columns if col != "stay_id"]
df_im[value_cols] = df_fea_add.groupby("stay_id")[value_cols].ffill()

# Interpolation 2: MICE imputation of whatever is still missing.
# The kernel builds 3 imputed datasets; only dataset index 1 is read back.
kernel = mf.ImputationKernel(
    df_im,
    datasets=3,
    save_all_iterations=True,
    random_state=10
)
kernel.mice(iterations = 3, n_jobs=-1)
df_im2 = kernel.complete_data(dataset=1)

In [14]:
# Attach the cluster label to every imputed MIMIC row and tag the source dataset.
df_fea_mimic = df_im2.merge(df_type_filt, on="stay_id", how="inner")
df_fea_mimic = df_fea_mimic.assign(dataset="mimic")
df_fea_mimic.head(1)

Unnamed: 0,stay_id,time,heart_rate,sbp,dbp,mbp,resp_rate,temperature,spo2,glucose,...,sofa,respiration_sofa,coagulation_sofa,liver_sofa,cardiovascular_sofa,cns_sofa,renal_sofa,urineoutput,groupHPD,dataset
0,30003598,-1.0,75.588235,125.470588,59.117647,83.0,18.2,35.963333,100.0,142.0,...,3.857143,2.0,0.0,0.0,1.0,0.0,0.857143,100.0,1,mimic


## Merge: variables shared by all three datasets

In [None]:
# Align MIMIC column names with the other two datasets before comparing.
df_fea_mimic = df_fea_mimic.rename(columns={'bilirubin_total':'bilirubin','platelet':'platelets'})
# merge
fea1st1 = df_fea_mimic.columns.tolist()
fea1st2 = df_fea_aumc.columns.tolist()
fea1st3 = df_fea_eicu.columns.tolist()
# Columns present in all three datasets. They are listed in MIMIC column order
# rather than set order, so the exported column order is the same on every run.
common = set(fea1st1).intersection(fea1st2, fea1st3)
fea_select = [col for col in fea1st1 if col in common]

print(" Common Unique Features :",fea_select)
print(" Specific features of MIMIC :",[col for col in fea1st1 if col not in common])
print(" Unique features of AUMCdb :",[col for col in fea1st2 if col not in common])
print(" Unique Features of eICU :",[col for col in fea1st3 if col not in common])

# Stack the three datasets row-wise and keep only the shared columns.
df_merge = pd.concat([df_fea_aumc,df_fea_mimic,df_fea_eicu],axis=0)
df_merge = df_merge[fea_select]
all_lst = df_merge.stay_id.unique().tolist()
print(len(all_lst))

In [None]:
# There are 16 samples that do not have data with time= -1 and time=-2. Fill them with data with time= 1
# (done by the backward fill at the end of this cell).
# NOTE(review): the grid is keyed on stay_id alone, so this assumes stay_id
# values never collide across the three datasets -- confirm.
df_fea = df_merge
# Last observed day per stay, from one grouped max instead of one boolean scan
# of the whole frame per stay. sort=False keeps the stays in order of appearance.
last_day = df_merge.groupby("stay_id", sort=False)["time"].max()
# Full day grid for every stay, from day -2 to that stay's last observed day.
com_lst = [
    [stay_id, day]
    for stay_id, max_t in last_day.items()
    for day in range(-2, int(max_t) + 1)
]
df_complete = pd.DataFrame(com_lst,columns=["stay_id", "time"])
df_fea = pd.merge(df_complete, df_merge, how="left", on=["stay_id","time"])
# The day index has no 0 (it jumps from -1 to 1), so drop that grid row.
df_fea = df_fea[df_fea["time"]!=0]

df_fea = df_fea.sort_values(["stay_id","time"])
# Fill the grid rows that had no data: forward within each stay, then backward
# for gaps at the start of a stay. Grouping on stay_id while filling only the
# other columns keeps stay_id in the result.
fill_cols = [col for col in df_fea.columns if col != "stay_id"]
df_fea[fill_cols] = df_fea.groupby("stay_id")[fill_cols].ffill()
df_fea[fill_cols] = df_fea.groupby("stay_id")[fill_cols].bfill()
df_fea.to_csv("df_saki_timeseries_feature_24h.csv",index=False)

## Merge: shared variables plus selected additional variables

In [None]:
# Align MIMIC column names with the other two datasets before comparing.
df_fea_mimic = df_fea_mimic.rename(columns={'bilirubin_total':'bilirubin','platelet':'platelets'})
# merge
fea1st1 = df_fea_mimic.columns.tolist()
fea1st2 = df_fea_aumc.columns.tolist()
fea1st3 = df_fea_eicu.columns.tolist()
# Columns present in all three datasets. They are listed in MIMIC column order
# rather than set order, so the exported column order is the same on every run.
common = set(fea1st1).intersection(fea1st2, fea1st3)
fea_select = [col for col in fea1st1 if col in common]
# Extra features kept even though they are not shared by all three datasets.
add_feature = ['bilirubin', 'platelets','inr','pt', 'ptt' ,'alp', 'ast','alt','bun']

print(" Common Unique Features :",fea_select)
print(" Specific features of MIMIC :",[col for col in fea1st1 if col not in common])
print(" Unique features of AUMCdb :",[col for col in fea1st2 if col not in common])
print(" Unique Features of eICU :",[col for col in fea1st3 if col not in common])

# Append only extras that are not already selected, so a feature that is also
# common to all three datasets is not selected (and exported) twice.
fea_select.extend(col for col in add_feature if col not in common)
# Stack the three datasets row-wise; an extra feature that a dataset lacks is
# NaN for that dataset's rows.
df_merge = pd.concat([df_fea_aumc,df_fea_mimic,df_fea_eicu],axis=0)
df_merge = df_merge[fea_select]
all_lst = df_merge.stay_id.unique().tolist()
print(len(all_lst))

In [18]:
# One row per distinct (dataset, stay_id) pair in the merged table.
df_merge.loc[:, ["dataset", "stay_id"]].drop_duplicates()

Unnamed: 0,dataset,stay_id
0,aumcdb,5
4,aumcdb,15
12,aumcdb,20
20,aumcdb,35
28,aumcdb,38
...,...,...
12650,eicu,3338104
12655,eicu,3340575
12664,eicu,3344660
12672,eicu,3346371


In [None]:
# Expand every stay to a full day grid (day -2 to its last observed day), then
# fill the rows that had no data from neighbouring days of the same stay.
# NOTE(review): the grid is keyed on stay_id alone, so this assumes stay_id
# values never collide across the three datasets -- confirm.
df_fea = df_merge
# Last observed day per stay, from one grouped max instead of one boolean scan
# of the whole frame per stay. sort=False keeps the stays in order of appearance.
last_day = df_merge.groupby("stay_id", sort=False)["time"].max()
com_lst = [
    [stay_id, day]
    for stay_id, max_t in last_day.items()
    for day in range(-2, int(max_t) + 1)
]
df_complete = pd.DataFrame(com_lst,columns=["stay_id", "time"])
df_fea = pd.merge(df_complete, df_merge, how="left", on=["stay_id","time"])
# The day index has no 0 (it jumps from -1 to 1), so drop that grid row.
df_fea = df_fea[df_fea["time"]!=0]

df_fea = df_fea.sort_values(["stay_id","time"])
# Forward fill within each stay, then backward fill for gaps at the start of a
# stay. Grouping on stay_id while filling only the other columns keeps stay_id
# in the result.
fill_cols = [col for col in df_fea.columns if col != "stay_id"]
df_fea[fill_cols] = df_fea.groupby("stay_id")[fill_cols].ffill()
df_fea[fill_cols] = df_fea.groupby("stay_id")[fill_cols].bfill()
df_fea.to_csv("df_saki_timeseries_feature_all_24h.csv",index=False)
# Cell output: one row per distinct (dataset, stay_id) pair after completion.
df_fea[["dataset","stay_id"]].drop_duplicates()

Unnamed: 0,dataset,stay_id
0,aumcdb,5
6,aumcdb,15
16,aumcdb,20
26,aumcdb,35
36,aumcdb,38
...,...,...
52582,mimic,39985287
52592,mimic,39989733
52599,mimic,39992167
52609,mimic,39993683
