In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime as dt
import itertools
import random
import scipy.stats as stats
import warnings
from sklearn.preprocessing import scale

# SettingWithCopyWarning is exposed from pandas.errors since pandas 1.5; older
# releases only provide it under pandas.core.common. Try the current location
# first and fall back to the legacy one so this cell imports on either side.
# Some releases may not define the class at all, in which case there is
# nothing to silence.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

# Show up to 300 rows when a DataFrame/Series is displayed.
pd.options.display.max_rows = 300

# Silence chained-assignment warnings raised by the filtered-frame assignments below.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# 数据合成

## eICU

In [2]:
# Load the eICU trajectory-cluster assignments and reduce them to the distinct
# (stay_id, groupHPD) pairs.
df_type = pd.read_csv("../../03.eICU_SAKI_trajCluster/df_mixAK_fea4_C3_eicu.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()

# Distinct stays present in the cluster file; the following cells use this list
# to restrict the event-time and feature tables.
all_lst = df_type_filt["stay_id"].unique().tolist()
print("纳入患者数量：", len(all_lst))

# Cell output: number of rows per groupHPD label.
df_type_filt["groupHPD"].value_counts()

纳入患者数量： 1970


2    1136
1     648
3     186
Name: groupHPD, dtype: int64

In [3]:
# eICU event-time table: keep only the clustered stays and the two columns
# needed to align feature timestamps (stay_id, saki_onset).
df_time = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_eicu//disease_definition/AKI/eicu_saki_event_time.csv")
df_time = df_time.loc[df_time["stay_id"].isin(all_lst), ["stay_id", "saki_onset"]]

In [None]:
# Load the merged eICU feature table and keep only the clustered stays.
data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_eicu/feature_data/"
infile = data_dir + "eicu_data_merge.csv"
df_fea = pd.read_csv(infile)
df_fea = df_fea.loc[df_fea["stay_id"].isin(all_lst)]
# NOTE(review): presumably converts charttime from minutes to hours so that it
# is on the same scale as saki_onset -- confirm against the extraction step.
df_fea["charttime"] = df_fea["charttime"].div(60)

# Express every measurement as a signed day index relative to saki_onset:
#   ..., -2, -1 for the 24 h windows before onset,
#   1, 2, ...   for the 24 h windows from onset onwards (there is no day 0).
df_fea_add = df_fea.merge(df_time, how="inner", on="stay_id")
rel_days = (df_fea_add["charttime"] - df_fea_add["saki_onset"]) / 24
day_bin = np.floor(rel_days)
df_fea_add["time"] = day_bin.where(day_bin < 0, day_bin + 1)

# Keep the window from two days before onset to day 7 (both inclusive).
df_fea_add = df_fea_add[df_fea_add["time"].between(-2, 7)]

# Collapse to one row per stay and day: mean of every feature, except
# urineoutput which is totalled over the day.
# NOTE(review): pandas' sum returns 0 for a group whose urineoutput values are
# all missing, so such days are not seen as missing by the imputation cell
# that follows -- confirm this is intended.
df_fea_add = df_fea_add.drop(columns=["charttime", "saki_onset"])
per_stay_day = df_fea_add.groupby(["stay_id", "time"])
df_fea_add_1 = per_stay_day.agg("mean").reset_index().drop(columns=["urineoutput"])
df_fea_add_2 = per_stay_day.agg({"urineoutput": "sum"}).reset_index()
df_fea_add = df_fea_add_1.merge(df_fea_add_2, how="inner", on=["stay_id", "time"])

In [None]:
import miceforest as mf

# Imputation step 1: within each stay, carry the last observed value forward
# in time. Rows are ordered by (stay_id, time) first so "forward" means later days.
df_fea_add = df_fea_add.sort_values(by=["stay_id", "time"])

# Fill each stay separately so values never leak from one stay into the next.
# The per-stay pieces are collected and concatenated once; concatenating inside
# the loop would re-copy the accumulated frame on every iteration.
filled_stays = [stay_rows.ffill() for _, stay_rows in df_fea_add.groupby("stay_id")]
tmp_df = pd.concat(filled_stays, axis=0) if filled_stays else df_fea_add.copy()
df_im = tmp_df

# Imputation step 2: multiple imputation with miceforest for the values that
# forward-fill could not reach. The kernel is created with 3 datasets and a
# fixed random_state, run for 3 iterations, and the dataset at index 1 is kept.
# NOTE(review): df_im still contains stay_id and time, so they are part of the
# frame handed to the imputer -- confirm that is intended.
kernel = mf.ImputationKernel(
    df_im,
    datasets=3,
    save_all_iterations=True,
    random_state=10
)
kernel.mice(iterations=3, n_jobs=-1)
df_im2 = kernel.complete_data(dataset=1)

In [None]:
# Attach the cluster label to every imputed eICU row and tag the source database.
df_fea_eicu = df_im2.merge(df_type_filt, how="inner", on="stay_id")
df_fea_eicu = df_fea_eicu.assign(dataset="eicu")
# Cell output: first row as a quick sanity check of the columns.
df_fea_eicu.head(1)

## AUMCdb

In [None]:
# Load the AUMCdb trajectory-cluster assignments and reduce them to the
# distinct (stay_id, groupHPD) pairs.
df_type = pd.read_csv("../../02.AUMCdb_SAKI_trajCluster/df_mixAK_fea3_C3_aumc.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()

# Distinct stays present in the cluster file; used below to restrict the
# event-time table.
all_lst = df_type_filt["stay_id"].unique().tolist()
print("纳入患者数量：", len(all_lst))

# Cell output: number of rows per groupHPD label.
df_type_filt["groupHPD"].value_counts()

In [None]:
# AUMCdb event-time table: keep only the clustered stays and the two columns
# needed to align feature timestamps (stay_id, saki_onset).
df_time = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_aumc/disease_definition/AKI/aumcdb_sk_event_time.csv")
df_time = df_time.loc[df_time["stay_id"].isin(all_lst), ["stay_id", "saki_onset"]]

# Original author's note on the feature file: a record's start is time//60 and
# its end is time//60 + 1, i.e. the hour since ICU admission; the time unit of
# this file is hours.
df_fea = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_aumc/feature_data/aumc_icu_feature.csv")

# The inner merge also restricts the features to the clustered stays, because
# df_time was already filtered to them.
df_fea_add = df_fea.merge(df_time, how="inner", on="stay_id")

# Signed day index relative to saki_onset:
#   ..., -2, -1 for the 24 h windows before onset,
#   1, 2, ...   for the 24 h windows from onset onwards (there is no day 0).
rel_days = (df_fea_add["charttime"] - df_fea_add["saki_onset"]) / 24
day_bin = np.floor(rel_days)
df_fea_add["time"] = day_bin.where(day_bin < 0, day_bin + 1)

# Keep the window from two days before onset to day 7 (both inclusive).
df_fea_add = df_fea_add[df_fea_add["time"].between(-2, 7)]

# Drop the raw time columns; the platelet column is discarded for this database as well.
df_fea_add = df_fea_add.drop(columns=["charttime", "saki_onset", "platelet"])

# Collapse to one row per stay and day: mean of every feature, except
# urineoutput which is totalled over the day.
# NOTE(review): pandas' sum returns 0 for a group whose urineoutput values are
# all missing, so such days are not seen as missing by the imputation cell
# that follows -- confirm this is intended.
per_stay_day = df_fea_add.groupby(["stay_id", "time"])
df_fea_add_1 = per_stay_day.agg("mean").reset_index().drop(columns=["urineoutput"])
df_fea_add_2 = per_stay_day.agg({"urineoutput": "sum"}).reset_index()
df_fea_add = df_fea_add_1.merge(df_fea_add_2, how="inner", on=["stay_id", "time"])

In [None]:
# Imported here as well so this cell does not rely on the eICU imputation cell
# having been executed first.
import miceforest as mf

# Imputation step 1: within each stay, carry the last observed value forward
# in time. Rows are ordered by (stay_id, time) first so "forward" means later days.
df_fea_add = df_fea_add.sort_values(by=["stay_id", "time"])

# Fill each stay separately so values never leak from one stay into the next.
# The per-stay pieces are collected and concatenated once; concatenating inside
# the loop would re-copy the accumulated frame on every iteration.
filled_stays = [stay_rows.ffill() for _, stay_rows in df_fea_add.groupby("stay_id")]
tmp_df = pd.concat(filled_stays, axis=0) if filled_stays else df_fea_add.copy()
df_im = tmp_df

# Imputation step 2: miceforest imputation for the values that forward-fill
# could not reach. The kernel is created with a single dataset and a fixed
# random_state, run for 3 iterations, and dataset index 0 is kept.
# NOTE(review): the eICU and MIMIC cells of this notebook use datasets=3 and
# keep dataset=1 instead -- confirm the difference is deliberate.
kernel = mf.ImputationKernel(
    df_im,
    datasets=1,
    save_all_iterations=True,
    random_state=10
)
kernel.mice(iterations=3, n_jobs=-1)
df_im2 = kernel.complete_data(dataset=0)

In [None]:
# Attach the cluster label to every imputed AUMCdb row and tag the source database.
df_fea_aumc = df_im2.merge(df_type_filt, how="inner", on="stay_id")
df_fea_aumc = df_fea_aumc.assign(dataset="aumcdb")
# Cell output: first row as a quick sanity check of the columns.
df_fea_aumc.head(1)

## MIMIC-IV

In [None]:
# Load the MIMIC-IV trajectory-cluster assignments and reduce them to the
# distinct (stay_id, groupHPD) pairs.
df_type = pd.read_csv("../../01.MIMICIV_SAKI_trajCluster/df_mixAK_fea4_C3.csv")
df_type_filt = df_type[["stay_id", "groupHPD"]].drop_duplicates()

# Distinct stays present in the cluster file; used below to restrict the
# event-time table.
all_lst = df_type_filt["stay_id"].unique().tolist()
print("纳入患者数量：", len(all_lst))

# Cell output: number of rows per groupHPD label.
df_type_filt["groupHPD"].value_counts()

In [None]:
# MIMIC event-time table: keep only the clustered stays and the onset column,
# parsed as a timestamp (this database stores absolute datetimes).
df_time = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_mimic/disease_definition/AKI/sk_event_time.csv")
df_time = df_time.loc[df_time["stay_id"].isin(all_lst), ["stay_id", "saki_onset"]]
df_time["saki_onset"] = pd.to_datetime(df_time["saki_onset"])

# Feature table, with charttime parsed as a timestamp too.
data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v2/00.data_mimic/feature_data/"
infile = data_dir + "sk_icu_feature.csv"
df_fea = pd.read_csv(infile)
df_fea["charttime"] = pd.to_datetime(df_fea["charttime"])

# The inner merge also restricts the features to the clustered stays, because
# df_time was already filtered to them.
df_fea_add = df_fea.merge(df_time, how="inner", on="stay_id")

# Offset from onset in units of 24 h, floored to whole days: -1 is the 24 h
# before onset and 0 the first 24 h after it. Non-negative bins are then
# shifted by +1 so the first post-onset day is 1 and no day 0 remains.
rel_days = (df_fea_add["charttime"] - df_fea_add["saki_onset"]) / np.timedelta64(24, 'h')
day_bin = np.floor(rel_days)
df_fea_add["time"] = day_bin.where(day_bin < 0, day_bin + 1)

# Keep the window from two days before onset to day 7 (both inclusive).
df_fea_add = df_fea_add[df_fea_add["time"].between(-2, 7)]

# Collapse to one row per stay and day: mean of every feature, except
# urineoutput which is totalled over the day.
# NOTE(review): pandas' sum returns 0 for a group whose urineoutput values are
# all missing, so such days are not seen as missing by the imputation cell
# that follows -- confirm this is intended.
df_fea_add = df_fea_add.drop(columns=["charttime", "saki_onset"])
per_stay_day = df_fea_add.groupby(["stay_id", "time"])
df_fea_add_1 = per_stay_day.agg("mean").reset_index().drop(columns=["urineoutput"])
df_fea_add_2 = per_stay_day.agg({"urineoutput": "sum"}).reset_index()
df_fea_add = df_fea_add_1.merge(df_fea_add_2, how="inner", on=["stay_id", "time"])

In [None]:
import miceforest as mf

# Imputation step 1: within each stay, carry the last observed value forward
# in time. Rows are ordered by (stay_id, time) first so "forward" means later days.
df_fea_add = df_fea_add.sort_values(by=["stay_id", "time"])

# Fill each stay separately so values never leak from one stay into the next.
# The per-stay pieces are collected and concatenated once; concatenating inside
# the loop would re-copy the accumulated frame on every iteration.
filled_stays = [stay_rows.ffill() for _, stay_rows in df_fea_add.groupby("stay_id")]
tmp_df = pd.concat(filled_stays, axis=0) if filled_stays else df_fea_add.copy()
df_im = tmp_df

# Imputation step 2: multiple imputation with miceforest for the values that
# forward-fill could not reach. The kernel is created with 3 datasets and a
# fixed random_state, run for 3 iterations, and the dataset at index 1 is kept.
# NOTE(review): df_im still contains stay_id and time, so they are part of the
# frame handed to the imputer -- confirm that is intended.
kernel = mf.ImputationKernel(
    df_im,
    datasets=3,
    save_all_iterations=True,
    random_state=10
)
kernel.mice(iterations=3, n_jobs=-1)
df_im2 = kernel.complete_data(dataset=1)

In [None]:
# Attach the cluster label to every imputed MIMIC row and tag the source database.
df_fea_mimic = df_im2.merge(df_type_filt, how="inner", on="stay_id")
df_fea_mimic = df_fea_mimic.assign(dataset="mimic")
# Cell output: first row as a quick sanity check of the columns.
df_fea_mimic.head(1)

##  merge-相同变量

In [None]:
# Align the MIMIC column names with the names used for the same labs in the
# other frames before comparing column sets.
df_fea_mimic = df_fea_mimic.rename(columns={'bilirubin_total':'bilirubin','platelet':'platelets'})

# Column lists of the three per-database frames.
fea1st1 = df_fea_mimic.columns.tolist()
fea1st2 = df_fea_aumc.columns.tolist()
fea1st3 = df_fea_eicu.columns.tolist()

# Columns present in all three databases. The selection is listed in MIMIC
# column order rather than by iterating the set: set iteration order for
# strings can differ between kernel sessions, which would make the column
# order of the exported CSV vary from run to run.
shared_cols = set(fea1st1).intersection(fea1st2, fea1st3)
fea_select = [col for col in dict.fromkeys(fea1st1) if col in shared_cols]

print("共同特有的特征:",fea_select)
print("MIMIC特有的特征:",[col for col in fea1st1 if col not in shared_cols])
print("AUMCdb特有的特征:",[col for col in fea1st2 if col not in shared_cols])
print("eICU特有的特征:",[col for col in fea1st3 if col not in shared_cols])

# Stack the three databases and keep only the shared columns.
df_merge = pd.concat([df_fea_aumc, df_fea_mimic, df_fea_eicu], axis=0)
df_merge = df_merge[fea_select]

# Number of distinct stay_id values in the pooled frame.
all_lst = df_merge.stay_id.unique().tolist()
print(len(all_lst))

In [None]:
# Original author's note: 16 stays have no rows for time = -1 / -2.
# To give every stay a complete daily series, build a dense (stay_id, time)
# grid from day -2 up to the stay's last observed day, left-join the features
# onto it, and fill the gaps per stay: forward-fill first, then back-fill, so
# leading gaps take the values of the stay's earliest available day.

# Last observed day per stay, computed in one grouped pass instead of
# re-filtering the whole frame once per stay.
last_day = df_merge.groupby("stay_id", sort=False)["time"].max()
com_lst = [
    [stay, day]
    for stay, day_max in last_day.items()
    for day in range(-2, int(day_max) + 1)
]
df_complete = pd.DataFrame(com_lst, columns=["stay_id", "time"])
df_fea = pd.merge(df_complete, df_merge, how="left", on=["stay_id", "time"])

# The day index used upstream has no 0 (pre-onset days are negative, post-onset
# days start at 1), so the grid row for 0 is removed.
df_fea = df_fea[df_fea["time"] != 0]

df_fea = df_fea.sort_values(["stay_id", "time"])
# groupby().transform() leaves the grouping column out of its result, so group
# on a copy to keep stay_id in the output.
df_fea["stay_id_copy"] = df_fea["stay_id"]
df_fea = df_fea.groupby(["stay_id_copy"]).transform(lambda col: col.ffill().bfill())

# Export: features shared by all three databases.
df_fea.to_csv("df_saki_timeseries_feature_24h.csv", index=False)

##  merge-所有变量

In [None]:
# Align the MIMIC column names with the names used for the same labs in the
# other frames before comparing column sets (a no-op if already renamed).
df_fea_mimic = df_fea_mimic.rename(columns={'bilirubin_total':'bilirubin','platelet':'platelets'})

# Column lists of the three per-database frames.
fea1st1 = df_fea_mimic.columns.tolist()
fea1st2 = df_fea_aumc.columns.tolist()
fea1st3 = df_fea_eicu.columns.tolist()

# Columns present in all three databases, listed in MIMIC column order rather
# than by iterating the set: set iteration order for strings can differ between
# kernel sessions, which would make the exported CSV's column order vary.
shared_cols = set(fea1st1).intersection(fea1st2, fea1st3)
fea_select = [col for col in dict.fromkeys(fea1st1) if col in shared_cols]

# Extra features to keep even though not every database provides them.
add_feature = ['bilirubin', 'platelets','inr','pt', 'ptt' ,'alp', 'ast','alt','bun']

print("共同特有的特征:",fea_select)
print("MIMIC特有的特征:",[col for col in fea1st1 if col not in shared_cols])
print("AUMCdb特有的特征:",[col for col in fea1st2 if col not in shared_cols])
print("eICU特有的特征:",[col for col in fea1st3 if col not in shared_cols])

# Append only the extras that are not already selected; selecting the same
# label twice would produce duplicate columns in df_merge and in the CSV.
fea_select.extend(col for col in add_feature if col not in shared_cols)

# Stack the three databases; columns a database lacks are NaN for its rows.
df_merge = pd.concat([df_fea_aumc, df_fea_mimic, df_fea_eicu], axis=0)
df_merge = df_merge[fea_select]

# Number of distinct stay_id values in the pooled frame.
all_lst = df_merge.stay_id.unique().tolist()
print(len(all_lst))

In [None]:
# Cell output: the distinct (dataset, stay_id) pairs in the pooled frame.
df_merge.loc[:, ["dataset", "stay_id"]].drop_duplicates()

In [None]:
# Original author's note: 16 stays have no rows for time = -1 / -2.
# To give every stay a complete daily series, build a dense (stay_id, time)
# grid from day -2 up to the stay's last observed day, left-join the features
# onto it, and fill the gaps per stay: forward-fill first, then back-fill, so
# leading gaps take the values of the stay's earliest available day.

# Last observed day per stay, computed in one grouped pass instead of
# re-filtering the whole frame once per stay.
last_day = df_merge.groupby("stay_id", sort=False)["time"].max()
com_lst = [
    [stay, day]
    for stay, day_max in last_day.items()
    for day in range(-2, int(day_max) + 1)
]
df_complete = pd.DataFrame(com_lst, columns=["stay_id", "time"])
df_fea = pd.merge(df_complete, df_merge, how="left", on=["stay_id", "time"])

# The day index used upstream has no 0 (pre-onset days are negative, post-onset
# days start at 1), so the grid row for 0 is removed.
df_fea = df_fea[df_fea["time"] != 0]

df_fea = df_fea.sort_values(["stay_id", "time"])
# groupby().transform() leaves the grouping column out of its result, so group
# on a copy to keep stay_id in the output.
df_fea["stay_id_copy"] = df_fea["stay_id"]
df_fea = df_fea.groupby(["stay_id_copy"]).transform(lambda col: col.ffill().bfill())

# Export: shared features plus the extra features selected above. A feature
# that a stay never has stays NaN, since filling only happens within a stay.
df_fea.to_csv("df_saki_timeseries_feature_all_24h.csv", index=False)

# Cell output: the distinct (dataset, stay_id) pairs that were written.
df_fea[["dataset", "stay_id"]].drop_duplicates()