In [6]:
import pandas as pd
pd.set_option("display.max_rows", 400)
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import datetime
import os
import joblib

from sklearn.feature_extraction import DictVectorizer


import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# data load and merge

In [10]:
data_dir = "/public/hanl/jupyter_dir/ir_recommend_v2/02.state/"
file = data_dir + "S_basicinfo.csv"

df_demo = pd.read_csv(file)
df_demo.columns

Index(['stay_id', 'age', 'weight', 'gender', 'height', 'ethnicity',
       'type1_diabetes', 'type2_diabetes'],
      dtype='object')

In [13]:
data_dir = "/public/hanl/jupyter_dir/ir_recommend_v2/02.state/"
file = data_dir + "S_fea_win60.csv"
df_fea = pd.read_csv(file)
df_fea.columns

Index(['stay_id', 'starttime', 'endtime', 'traj_id', 'heart_rate_max',
       'heart_rate_min', 'heart_rate_mean', 'sbp_max', 'sbp_min', 'sbp_mean',
       ...
       'mcv_mean', 'platelet_max', 'platelet_min', 'platelet_mean', 'rbc_max',
       'rbc_min', 'rbc_mean', 'rdw_max', 'rdw_min', 'rdw_mean'],
      dtype='object', length=148)

In [27]:
# merge
df_fea_demo = pd.merge(df_fea, df_demo, how="left", left_on=["stay_id"], right_on=["stay_id"])
print(df_fea.shape, df_demo.shape, df_fea_demo.shape)
df_fea_demo["starttime"] = pd.to_datetime(df_fea_demo["starttime"])
df_fea_demo["endtime"] = pd.to_datetime(df_fea_demo["endtime"])
df_fea_demo = df_fea_demo.sort_values(by=["stay_id", "starttime"])
#df_fea_demo['tag'] = range(len(df_fea_demo))

(261707, 148) (11624, 8) (261707, 155)


# state 生成

- 1）仅仅只有血糖信息，每个state血糖的最大最小中值均值以及与上次血糖检测的差值
- 血糖信息 + demo信息
- 2）血糖信息即1） + 血糖相关信息（demo如病史 + 输液 + 葡糖 + 用药）
- 3）2）+ vital 均值 +  vital的其他统计信息
- 4）3） + lab的均值
- 5）4）+ lab的其他统计信息

In [15]:
fix_lst = ['stay_id', 'starttime', 'endtime']
demo_lst = [
 'age',
 'weight',
 'gender',
 'height',
 'ethnicity',
 'type1_diabetes',
 'type2_diabetes']

# state 1: glucose 
- S_glu.csv

In [17]:
glu_lst = ["glucose_mean", 'glucose_max', 'glucose_min']
df_s1 = df_fea_demo[glu_lst+fix_lst]

tmp_df = pd.DataFrame()
for i,j in df_s1.groupby("stay_id"):
    j = j.sort_values("starttime")
    j['glucose_mean_last'] = j['glucose_mean'].shift(+1)
    j['glucose_mean_last2'] = j['glucose_mean'].shift(+2)
    j['glucose_mean_last3'] = j['glucose_mean'].shift(+3)
    j["d_glucose_mean"] = j["glucose_mean"] - j["glucose_mean_last"]
    j["d_glucose_mean2"] = j["glucose_mean"] - j["glucose_mean_last2"]
    j["d_glucose_mean3"] = j["glucose_mean"] - j["glucose_mean_last3"]
    j = j.fillna(0)
    j = j.drop(["glucose_mean_last", "glucose_mean_last2", "glucose_mean_last3"], axis=1)
    tmp_df = pd.concat([tmp_df, j])
df_s1 = tmp_df
df_s1 = df_s1.fillna(0)
df_s1.to_csv("S_glucose.csv", index=False)

In [18]:
df_s1.columns
df_s1.isna().sum()

glucose_mean       0
glucose_max        0
glucose_min        0
stay_id            0
starttime          0
endtime            0
d_glucose_mean     0
d_glucose_mean2    0
d_glucose_mean3    0
dtype: int64

# state 2: glucose 相关
- S_glu_relate.csv

In [28]:
# 添加血糖记录差到总表中
df_s1 = pd.read_csv("S_glucose.csv")
df_s1["starttime"] = pd.to_datetime(df_s1["starttime"])
df_s1["endtime"] = pd.to_datetime(df_s1["endtime"])

df_fea_demo = pd.merge(df_fea_demo, df_s1, how="left", left_on = ['glucose_mean', 'glucose_max', 'glucose_min',
       'stay_id', 'starttime', 'endtime'], right_on = ['glucose_mean', 'glucose_max', 'glucose_min',
       'stay_id', 'starttime', 'endtime'])
df_fea_demo.shape

(261707, 158)

In [24]:
"""## 额外输出glucose + demo 信息
glu_lst = ["glucose_mean", 'glucose_max', 'glucose_min', "d_glucose_mean","d_glucose_mean2","d_glucose_mean3"]

fea_lst = fix_lst + demo_lst + glu_lst
df_s2p = df_fea_demo[fea_lst]

df_s2p.to_csv("S_glucose_demo.csv", index=False)"""

In [29]:
# 添加其他胰岛素用药
df_insu_pump = pd.read_csv("S_insu_pump.csv")
df_insu_pump["starttime"] = pd.to_datetime(df_insu_pump["starttime"], format='%Y/%m/%d %H:%M:%S')
df_insu_pump["endtime"] = pd.to_datetime(df_insu_pump["endtime"], format='%Y/%m/%d %H:%M:%S')
df_fea_demo_med = pd.merge(df_fea_demo, df_insu_pump, how= "left", on = ["stay_id", "starttime", "endtime"])

In [32]:
# 添加输液
df_infusion = pd.read_csv("S_bolus_dextrose.csv")
df_infusion["starttime"] = pd.to_datetime(df_infusion["starttime"], format='%Y/%m/%d %H:%M:%S')
df_infusion["endtime"] = pd.to_datetime(df_infusion["endtime"], format='%Y/%m/%d %H:%M:%S')

df_infusion = df_infusion.loc[:,["stay_id", "starttime", "endtime", "traj_id", 'colloid_bolus_win', 'dex_win']]

df_fea_demo_med_infu = pd.merge(df_fea_demo_med, df_infusion, how= "left", on = ["stay_id", "starttime", "endtime"])
df_fea_demo_med_infu = df_fea_demo_med_infu.fillna(0)

In [33]:
glu_lst = ["glucose_mean",  'glucose_max', 'glucose_min', "d_glucose_mean","d_glucose_mean2","d_glucose_mean3"]
med_lst = [ 'colloid_bolus_win', 'dex_win', 'OS_bt1_0', 'OS_bt1_1', 'OS_bt1_2', 'OS_t0_0', 'OS_t0_1', 'OS_t0_2', 'OS_t1_0', 'OS_t1_1', 'OS_t1_2', 'OS_t2_0', 'OS_t2_1', 'OS_t2_2', 'OS_t3_0', 'OS_t3_1', 'OS_t3_2', 'OS_t4_0', 'OS_t4_1', 'OS_t4_2', 'OS_t5_0', 'OS_t5_1', 'OS_t5_2', 'OS_t6_0', 'OS_t6_1', 'OS_t6_2', 'OS_t7_0', 'OS_t7_1', 'OS_t7_2', 'OS_t8_0', 'OS_t8_1', 'OS_t8_2', 'OS_bv1_0', 'OS_bv1_1', 'OS_bv1_2', 'OS_v0_0', 'OS_v0_1', 'OS_v0_2', 'OS_v1_0', 'OS_v1_1', 'OS_v1_2', 'OS_v2_0', 'OS_v2_1', 'OS_v2_2', 'OS_v3_0', 'OS_v3_1', 'OS_v3_2', 'OS_v4_0', 'OS_v4_1', 'OS_v4_2', 'OS_v5_0', 'OS_v5_1', 'OS_v5_2', 'OS_v6_0', 'OS_v6_1', 'OS_v6_2', 'OS_v7_0', 'OS_v7_1', 'OS_v7_2', 'OS_v8_0', 'OS_v8_1', 'OS_v8_2', 'L_bt1_0', 'L_bt1_1', 'L_t0_0', 'L_t0_1', 'L_t1_0', 'L_t1_1', 'L_t2_0', 'L_t2_1', 'L_t3_0', 'L_t3_1', 'L_t4_0', 'L_t4_1', 'L_t5_0', 'L_t5_1', 'L_t6_0', 'L_t6_1', 'L_t7_0', 'L_t7_1', 'L_t8_0', 'L_t8_1', 'L_t9_0', 'L_t9_1', 'L_t10_0', 'L_t10_1', 'L_t11_0', 'L_t11_1', 'L_t12_0', 'L_t12_1', 'L_t13_0', 'L_t13_1', 'L_t14_0', 'L_t14_1', 'L_t15_0', 'L_t15_1', 'L_t16_0', 'L_t16_1', 'L_t17_0', 'L_t17_1', 'L_t18_0', 'L_t18_1', 'L_t19_0', 'L_t19_1', 'L_t20_0', 'L_t20_1', 'L_t21_0', 'L_t21_1', 'L_t22_0', 'L_t22_1', 'L_t23_0', 'L_t23_1', 'L_t24_0', 'L_t24_1', 'L_bv1_0', 'L_bv1_1', 'L_v0_0', 'L_v0_1', 'L_v1_0', 'L_v1_1', 'L_v2_0', 'L_v2_1', 'L_v3_0', 'L_v3_1', 'L_v4_0', 'L_v4_1', 'L_v5_0', 'L_v5_1', 'L_v6_0', 'L_v6_1', 'L_v7_0', 'L_v7_1', 'L_v8_0', 'L_v8_1', 'L_v9_0', 'L_v9_1', 'L_v10_0', 'L_v10_1', 'L_v11_0', 'L_v11_1', 'L_v12_0', 'L_v12_1', 'L_v13_0', 'L_v13_1', 'L_v14_0', 'L_v14_1', 'L_v15_0', 'L_v15_1', 'L_v16_0', 'L_v16_1', 'L_v17_0', 'L_v17_1', 'L_v18_0', 'L_v18_1', 'L_v19_0', 'L_v19_1', 'L_v20_0', 'L_v20_1', 'L_v21_0', 'L_v21_1', 'L_v22_0', 'L_v22_1', 'L_v23_0', 'L_v23_1', 'L_v24_0', 'L_v24_1', 'M_bt1', 'M_t0', 'M_t1', 'M_t2', 'M_t3', 'M_t4', 'M_t5', 'M_t6', 'M_t7', 'M_t8', 'M_t9', 'M_t10', 'M_t11', 'M_t12', 'M_bv1', 'M_v0', 'M_v1', 'M_v2', 'M_v3', 'M_v4', 'M_v5', 'M_v6', 'M_v7', 'M_v8', 'M_v9', 'M_v10', 'M_v11', 'M_v12']

fea_lst = fix_lst + demo_lst + glu_lst + med_lst
df_s2 = df_fea_demo_med_infu[fea_lst]
#df_s2 = df_s2.drop("tag", axis=1)

## 这里还需添加输液，用药等相关信息
df_s2.to_csv("S_glucose_relate.csv", index=False)

# state 3 
- state 2 + viatl 统计值
- S_icufea_addHighFreq.csv

In [51]:
vitalname_lst = ['heart_rate', 'sbp', 'dbp', 'mbp','resp_rate', 'temperature', 'spo2']
add_lst = ["_mean", "_min", "_max"]
vital_lst = []
for i in vitalname_lst:
    for j in add_lst:
        vital_lst.append(i+j)
        
labname_lst = ['urineoutput_mean', 'po2_mean', 'bun_mean', 'aniongap_mean', 'so2_mean', 'aado2_calc_mean', 'ph_mean', 'lactate_mean', 'mchc_mean', 'creatinine_mean', 'chloride_mean', 'ptt_mean', 'mcv_mean', 'baseexcess_mean', 'platelet_mean', 'totalco2_mean', 'inr_mean', 'pao2fio2ratio_mean', 'free_calcium_mean', 'pco2_mean', 'pt_mean', 'hematocrit_mean', 'mch_mean', 'wbc_mean', 'hemoglobin_mean', 'bicarbonate_mean', 'rbc_mean', 'rdw_mean', 'heart_rate_mean', 'resp_rate_mean', 'fio2_chartevents_mean', 'sodium_mean', 'potassium_mean']

In [52]:
vital_lst = list(set(vital_lst).intersection(set(df_fea_demo_med_infu.columns.tolist())))
glu_lst = ["glucose_mean",  'glucose_max', 'glucose_min', "d_glucose_mean","d_glucose_mean2","d_glucose_mean3"]
fea_lst = fix_lst + demo_lst + glu_lst + vital_lst + med_lst + labname_lst

df_s3 = df_fea_demo_med_infu[fea_lst]
df_s3.to_csv("S_merge.csv", index=False)

In [53]:
str(df_s3.columns.tolist())

"['stay_id', 'starttime', 'endtime', 'age', 'weight', 'gender', 'height', 'ethnicity', 'type1_diabetes', 'type2_diabetes', 'glucose_mean', 'glucose_max', 'glucose_min', 'd_glucose_mean', 'd_glucose_mean2', 'd_glucose_mean3', 'sbp_max', 'temperature_max', 'spo2_max', 'mbp_max', 'heart_rate_min', 'resp_rate_max', 'resp_rate_min', 'dbp_max', 'sbp_min', 'temperature_mean', 'mbp_mean', 'temperature_min', 'spo2_min', 'dbp_min', 'dbp_mean', 'sbp_mean', 'spo2_mean', 'mbp_min', 'heart_rate_mean', 'resp_rate_mean', 'heart_rate_max', 'colloid_bolus_win', 'dex_win', 'OS_bt1_0', 'OS_bt1_1', 'OS_bt1_2', 'OS_t0_0', 'OS_t0_1', 'OS_t0_2', 'OS_t1_0', 'OS_t1_1', 'OS_t1_2', 'OS_t2_0', 'OS_t2_1', 'OS_t2_2', 'OS_t3_0', 'OS_t3_1', 'OS_t3_2', 'OS_t4_0', 'OS_t4_1', 'OS_t4_2', 'OS_t5_0', 'OS_t5_1', 'OS_t5_2', 'OS_t6_0', 'OS_t6_1', 'OS_t6_2', 'OS_t7_0', 'OS_t7_1', 'OS_t7_2', 'OS_t8_0', 'OS_t8_1', 'OS_t8_2', 'OS_bv1_0', 'OS_bv1_1', 'OS_bv1_2', 'OS_v0_0', 'OS_v0_1', 'OS_v0_2', 'OS_v1_0', 'OS_v1_1', 'OS_v1_2', 'OS_