In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_rows = 300
import datetime as dt
import math
import seaborn as sns
import miceforest as mf
from tableone import TableOne, load_dataset

# Screen patients with S-AKI according to the definition

## sepsis 

In [None]:
# Load the sepsis cohort table (file name and message label it "sepsis3").
data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu/disease_definition/sepsis/"
infile = data_dir + "eicu_sepsis3.csv"
df_sepsis = pd.read_csv(infile)

# One list entry per table row; the count printed below is the row count.
sepsis_lst = df_sepsis["stay_id"].tolist()
print("The number of sepsis patients is (by sepsis3)：", len(df_sepsis))
df_sepsis.head(1)

## CKD

In [None]:
# Load the CKD table and keep the distinct stay ids as a set for fast exclusion.
data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu/disease_definition//CKD/"
infile = data_dir + "eicu_ckd.csv"
df_ckd = pd.read_csv(infile)
ckd_lst = set(df_ckd["stay_id"].tolist())
print("The number of ckd patients (by ICD):", len(ckd_lst))

## AKI

In [None]:
# Load the AKI table, keeping only the stay id and the first AKI onset time
# (exposed downstream under the shorter name "aki_onset").
data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu/disease_definition/AKI/"
infile = data_dir + "eicu_sk_first_and_max_stage.csv"
df_aki = (
    pd.read_csv(infile)[["stay_id", "first_aki_onset"]]
    .rename(columns={"first_aki_onset": "aki_onset"})
)
aki_lst = df_aki["stay_id"].tolist()
print("The number of aki patients is (by KDIGO):", len(aki_lst))


## sepsis without CKD;  sepsis with AKI

In [5]:
# Sepsis stays that are not in the CKD set.
sepsis_ids = set(sepsis_lst)
sepsis_minus_ckd_lst = list(sepsis_ids.difference(ckd_lst))
print("sepsis_minus_ckd_lst: " + str(len(sepsis_minus_ckd_lst)))

# Of those, the stays that also appear in the AKI list ...
sepsis_minusCKD_addAKI_lst = list(
    set(sepsis_minus_ckd_lst).intersection(aki_lst)
)

# ... and the sepsis stays that appear in neither the CKD set nor the AKI list.
sepsis_noKidney_lst = list(sepsis_ids.difference(ckd_lst, aki_lst))
print(
    "sepsis_minusCKD_addAKI_lst:", len(sepsis_minusCKD_addAKI_lst),
    " sepsis_noKidney:", len(sepsis_noKidney_lst),
)


sepsis_minus_ckd_lst: 12513
sepsis_minusCKD_addAKI_lst: 1748  sepsis_noKidney: 10765


  and should_run_async(code)


# S-AKI inclusion criteria

## Patients who developed AKI within one week after sepsis

In [None]:
# Restrict both tables to the sepsis + AKI (non-CKD) stays, then pair each
# stay's sepsis record with its AKI onset.
df_sepsis = df_sepsis[df_sepsis["stay_id"].isin(sepsis_minusCKD_addAKI_lst)]
df_aki = df_aki[df_aki["stay_id"].isin(sepsis_minusCKD_addAKI_lst)]
df_sepsis_aki = df_sepsis.merge(df_aki, how="inner", on=["stay_id"])

# Delay from sepsis onset to AKI onset; keep delays in (0, 168]
# (168 presumably hours = 7 days — confirm the units of the onset columns).
df_sepsis_aki["time_aki_minus_sepsis"] = df_sepsis_aki["aki_onset"].sub(
    df_sepsis_aki["sepsis_onset"]
)
aki_lag = df_sepsis_aki["time_aki_minus_sepsis"]
df_sepsis_aki = df_sepsis_aki[aki_lag.gt(0) & aki_lag.le(168)]
saki_lst = df_sepsis_aki["stay_id"].unique().tolist()
print("The number of patients who developed AKI within one week after sepsis:", len(saki_lst))


## Excluding patients with age < 18 or ICU length of stay < 24 hours

In [None]:
# Keep S-AKI stays with an ICU stay of at least 24 hours and age of at least 18.
df_demo = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu/feature_data/df_eicu_sk_icudetails.csv")
eligible = (
    df_demo["stay_id"].isin(saki_lst)
    & (df_demo["icu_los_hours"] >= 24)
    & (df_demo["age"] >= 18)
)
df_demo = df_demo[eligible]

saki_lst_filt_lst = list(df_demo["stay_id"].unique())
print("saki, Age >17 and los greater than 1 day:", len(saki_lst_filt_lst))
# all_lst is the running list of included stays used by the following steps.
all_lst = saki_lst_filt_lst

## Patients with a baseline creatinine below 0.5 or of 1.5 and above (or with no baseline value) were excluded

In [None]:
# Keep baseline creatinine values in [0.5, 1.5) and drop rows with any missing field.
df_base_crea = pd.read_csv("../00.data_eicu/disease_definition/AKI/df_base_crea.csv")
baseline_ok = (df_base_crea["baseline_creatinine"] >= 0.5) & (
    df_base_crea["baseline_creatinine"] < 1.5
)
df_base_crea = df_base_crea[baseline_ok].dropna()

# Intersect with the stays retained so far.
df_base_crea = df_base_crea[df_base_crea["stay_id"].isin(all_lst)]

all_lst = df_base_crea["stay_id"].unique()
print("Patients with no baseline, a baseline that is too low, or a baseline that has reached renal failure or uremia were excluded:", len(all_lst))

# Clinical data acquisition for clustering

## Load files

In [None]:
# Load the S-AKI event-time table and keep onsets that fall between
# intime and outtime (both bounds inclusive).
df_time = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu/disease_definition/AKI/eicu_saki_event_time.csv")
onset_in_stay = (df_time["saki_onset"] >= df_time["intime"]) & (
    df_time["saki_onset"] <= df_time["outtime"]
)
df_time = df_time.loc[onset_in_stay, ["stay_id", "saki_onset"]]
df_time = df_time[df_time["stay_id"].isin(all_lst)]
print(len(df_time["stay_id"].unique()))

  and should_run_async(code)


1417


In [None]:
# Load the merged clinical feature table for the included stays.
data_dir = "/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu/feature_data/"
infile = data_dir + "eicu_data_merge.csv"
df_fea = pd.read_csv(infile)
# The rescaling is done with .assign inside the chain instead of assigning a
# column on the boolean-filtered frame, which raised SettingWithCopyWarning.
df_fea = (
    df_fea[df_fea["stay_id"].isin(all_lst)]
    # charttime is divided by 60 (presumably minutes -> hours — confirm units).
    .assign(charttime=lambda frame: frame["charttime"] / 60)
    .drop_duplicates()
)
df_fea.head(1)

  and should_run_async(code)


Unnamed: 0,stay_id,charttime,fio2,po2,pco2,ph,aniongap,baseexcess,peep,urineoutput,...,alt,ast,alp,heart_rate,resp_rate,spo2,temperature,sbp,dbp,mbp
104572,243208,-5.5,,,,,13.0,,,,...,,,,,,,,,,


## Obtain data by time window


In [None]:
# Attach each stay's S-AKI onset and convert every measurement time into a
# signed window index relative to that onset (window width: 6 time units,
# presumably hours — confirm the units of charttime / saki_onset).
df_fea_add = df_fea.merge(df_time, how="left", on="stay_id")
df_fea_add["time"] = np.floor((df_fea_add["charttime"] - df_fea_add["saki_onset"]) / 6)
# Windows at or after onset are shifted up by one, so the index runs
# ..., -2, -1, 1, 2, ... and no window is numbered 0.
df_fea_add["time"] = df_fea_add["time"].mask(df_fea_add["time"] >= 0, df_fea_add["time"] + 1)
# Keep windows -4 .. 28; rows without an onset have a NaN index and drop out here.
df_fea_add = df_fea_add[df_fea_add["time"].between(-4, 28)]

# Average all measurements that fall into the same (stay, window) cell.
df_fea_add = (
    df_fea_add
    .drop(columns=["charttime", "saki_onset"])
    .groupby(["stay_id", "time"])
    .mean()
    .reset_index()
)

# condition2: at least 4 post-onset windows with data, reaching window 4 or later
tmp_df = (
    df_fea_add.loc[df_fea_add["time"] >= 0, ["stay_id", "time"]]
    .groupby("stay_id")
    .agg(["max", "count"])
    .reset_index()
)
# Flatten the two-level header: ("stay_id", "") -> "stay_id_", ("time", "max") -> "time_max", ...
tmp_df.columns = ["_".join(col) for col in tmp_df.columns]
enough_followup = (tmp_df["time_max"] >= 4) & (tmp_df["time_count"] >= 4)
stay_lst = tmp_df.loc[enough_followup, "stay_id_"].tolist()
df_fea_add = df_fea_add[df_fea_add["stay_id"].isin(stay_lst)]
all_lst = df_fea_add["stay_id"].unique()
print("Patients who recorded more than one day after the occurrence of saki:", len(all_lst))

# condition3: at least 4 stage records with aki_stage > 0 per stay
df_stage = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu//disease_definition/AKI/kdigo_aki_stage.csv")
df_stage = df_stage[df_stage["stay_id"].isin(all_lst) & (df_stage["aki_stage"] > 0)]
df_stage = (
    df_stage
    .sort_values(["stay_id", "time"])
    .groupby("stay_id")
    .agg({"aki_stage": "count"})
    .reset_index()
)
df_stage = df_stage[df_stage["aki_stage"] >= 4]
all_lst = df_stage["stay_id"].unique()
print("AKI has at least four data recording points", len(all_lst))
df_fea_add = df_fea_add[df_fea_add["stay_id"].isin(all_lst)]

# Final cohort after all inclusion conditions.
all_lst = df_fea_add["stay_id"].unique()
print("Finally, the number of included patients:", len(all_lst))
df_fea_add.head(1)

In [None]:
# Shapes of the rows with and without a urine output value. The share of
# missing values is large enough that forward-filling this column would
# sharply inflate urine output.
urine_missing = df_fea_add["urineoutput"].isnull()
df_fea_add[urine_missing].shape, df_fea_add[~urine_missing].shape

  and should_run_async(code)


((15522, 39), (15647, 39))

## Forward fill (last observation carried forward)

In [None]:
# Missing urine output is set to 0 instead of being carried forward, then the
# rows are put in chronological order within each stay.
df_fea_add = (
    df_fea_add
    .assign(urineoutput=lambda frame: frame["urineoutput"].fillna(0))
    .sort_values(by=["stay_id", "time"])
)

# Forward fill within each stay, so values never leak from one patient to the
# next. A single grouped ffill replaces the per-patient loop that grew a frame
# with pd.concat on every iteration (quadratic) and used the deprecated
# fillna(method="ffill"). Row order, index labels and column order are kept.
fill_cols = [col for col in df_fea_add.columns if col != "stay_id"]
df_im = df_fea_add.copy()
df_im[fill_cols] = df_fea_add.groupby("stay_id")[fill_cols].ffill()

  and should_run_async(code)


In [None]:
# Select features by their share of missing values after the forward fill:
# keep every column with at most 60% missing.
na_fraction = df_im.isna().mean()
fea_lst = na_fraction[na_fraction <= 0.6].index.tolist()

# The three blood-pressure features are always kept. They are appended only
# when not already selected; an unconditional extend would list them twice
# and produce duplicate columns whenever they pass the threshold themselves.
fea_lst.extend(col for col in ["mbp", "sbp", "dbp"] if col not in fea_lst)

df_im = df_im[fea_lst]
df_im.to_csv("./df_im_By_ffill.csv",index=False)
len(fea_lst), str(fea_lst)

  and should_run_async(code)


(35,
 "['stay_id', 'time', 'fio2', 'po2', 'pco2', 'ph', 'aniongap', 'baseexcess', 'urineoutput', 'albumin', 'bilirubin', 'bun', 'calcium', 'chloride', 'creatinine', 'glucose', 'bicarbonate', 'hematocrit', 'hemoglobin', 'inr', 'lactate', 'platelets', 'potassium', 'sodium', 'wbc', 'alt', 'ast', 'alp', 'heart_rate', 'resp_rate', 'spo2', 'temperature', 'mbp', 'sbp', 'dbp']")

## Multiple imputation

In [None]:
# Multiple imputation of the values that are still missing after the forward fill.
# Reload the forward-filled table from the CSV written by the feature-selection step.
df_im = pd.read_csv("./df_im_By_ffill.csv")

# Build a miceforest kernel holding 3 imputed datasets with a fixed random_state.
# NOTE(review): stay_id and time are columns of df_im, so they enter the
# imputation model like any other variable — confirm this is intended.
kernel = mf.ImputationKernel(
    df_im,
    datasets=3,
    save_all_iterations=True,
    random_state=10
)
# Run 3 MICE iterations; n_jobs=-1 is passed through to miceforest
# (presumably "use all cores" — confirm for the installed version).
kernel.mice(iterations = 3, n_jobs=-1)
# Only the imputed dataset with index 1 is used; the other two are not exported.
df_im2 = kernel.complete_data(dataset=1)
df_im2.to_csv("df_im_By_MI.csv",index=False)
df_im2.head(1)

  and should_run_async(code)


Unnamed: 0,stay_id,time,fio2,po2,pco2,ph,aniongap,baseexcess,urineoutput,albumin,...,alt,ast,alp,heart_rate,resp_rate,spo2,temperature,mbp,sbp,dbp
0,243208,-4.0,0.55,72.5,32.1,7.386,13.0,-4.8,0.0,3.0,...,18.0,20.0,68.0,107.25,20.0,93.5,36.8,86.470588,109.0,74.0


## Add additional feature - ratio of creatinine to baseline creatinine

In [None]:
# Add the ratio of creatinine to baseline creatinine as an extra feature.
df_base_crea = pd.read_csv("/public/hanl/jupyter_dir/kidney_sepsis_penotype_v3/00.data_eicu/disease_definition/AKI/df_base_crea.csv")
# Inner join: only stays that have a baseline creatinine row are kept.
df_feaf = df_im2.merge(df_base_crea, how="inner", on="stay_id")
df_feaf["crea_divide_basecrea"] = (
    df_feaf["creatinine"] / df_feaf["baseline_creatinine"]
).round(2)
# The baseline itself is not exported, only the ratio.
df_feaf = df_feaf.drop(columns=["baseline_creatinine"])
df_feaf.to_csv("sk_feature_timescale_eicu.csv", index=False)
df_feaf.head(2)

  and should_run_async(code)


Unnamed: 0,stay_id,time,fio2,po2,pco2,ph,aniongap,baseexcess,urineoutput,albumin,...,ast,alp,heart_rate,resp_rate,spo2,temperature,mbp,sbp,dbp,crea_divide_basecrea
0,243208,-4.0,0.55,72.5,32.1,7.386,13.0,-4.8,0.0,3.0,...,20.0,68.0,107.25,20.0,93.5,36.8,86.470588,109.0,74.0,1.0
1,243208,-3.0,0.3,54.0,31.5,7.42,13.0,-4.2,0.0,3.5,...,46.0,268.0,112.0,20.0,95.0,37.8,82.8,120.333333,65.75,1.0


## Different outputs
- Only include the average value

In [17]:
# Write the full feature table and show the number of distinct stays in it.
df_feaf.to_csv("sk_feature_timescale_eicu.csv", index=False)
df_feaf["stay_id"].unique().size

  and should_run_async(code)


1417

In [None]:
# Share of patients that have data in each of the four pre-onset windows
# (-4 .. -1). The original note states that, based on this, clustering starts
# 12 hours before S-AKI onset.
# NOTE(review): the original note quoted "51,912 patients", which does not
# match the cohort size this notebook reports — confirm the intended figure.
df = df_feaf
n_patients = len(df["stay_id"].unique())
tmp_df = df[df["time"].isin([-4, -3, -2, -1])]
tmp_df["time"].value_counts() / n_patients

  and should_run_async(code)


-1.0    1.000000
-2.0    0.969654
-3.0    0.767819
-4.0    0.589273
Name: time, dtype: float64

In [19]:
# Export only the windows from -2 onwards (the two windows before onset and everything after).
df = df_feaf.loc[df_feaf["time"].ge(-2)]
df.to_csv("sk_feature_timescale_Fb2_eicu.csv", index=False)

  and should_run_async(code)
