In [1]:
import pandas as pd
import numpy as np
# Values of the "time" column that are kept for feature generation.
time_lst = [1, 2, 3, 4]

df = pd.read_csv("./df_saki_timeseries_feature.csv")
df = df.loc[df["time"].isin(time_lst)]

# Number of distinct stays left after the time filter.
print(len(df["stay_id"].unique()))

# Feature-only frame: "dataset" and "groupHPD" are removed here and
# re-attached from `df` once the per-stay features have been built.
df_ = df.drop(columns=["dataset", "groupHPD"])
df_.head(2)

8313


Unnamed: 0,stay_id,time,aniongap,bilirubin,po2,mbp,calcium,baseexcess,heart_rate,temperature,...,hemoglobin,lactate,ph,spo2,glucose,urineoutput,sodium,fio2,dbp,creatinine
2,5,1,10.0,5.0,95.0,72.807439,11.5,2.7,96.275,36.35,...,13.132095,2.35,7.32,96.5,102.600001,40.0,138.0,26.0,58.461111,0.6554
3,5,2,7.0,5.0,58.0,73.729776,11.799999,1.15,94.955556,36.3,...,13.05153,1.3,7.335,94.999999,117.899999,43.333333,137.25,26.0,60.433569,0.7571


In [None]:
import pandas as pd

def calculate_feature_changes(data, patient_id_col, time_col, feature_cols, hours_per_step=6):
    """
    Compute pairwise feature differences and change rates between time points.

    For every patient and every pair of time points ``t1 < t2`` at which the
    patient has exactly one row each, one output row is produced containing

    * ``<feature>_diff_<t1>_<t2>``  = value(t2) - value(t1)
    * ``<feature>_speed_<t1>_<t2>`` = diff / ((t2 - t1) * hours_per_step)

    Pairs for which the patient has zero or several rows at either time point
    are skipped.

    Parameters
    ----------
    data : DataFrame
        Long-format table with one row per (patient, time point).
    patient_id_col : str
        Name of the patient ID column.
    time_col : str
        Name of the time column; its values must support ``<`` and ``-``.
    feature_cols : list
        Feature columns for which differences and change rates are computed.
    hours_per_step : numeric, default 6
        Multiplier applied to ``t2 - t1`` in the denominator of the change
        rate. The default reproduces the previously hard-coded factor of 6.

    Returns
    -------
    DataFrame
        One row per (patient, time pair). Columns are ``patient_id_col``,
        then ``feature_cols``, then the derived ``*_diff_*`` / ``*_speed_*``
        columns. The plain ``feature_cols`` columns are never filled (all
        NaN); they are retained only so that existing callers which drop them
        keep working. Derived columns belonging to other time pairs are NaN
        in a given row, so callers typically collapse the result with
        ``groupby(patient_id_col).first()``.
    """
    feature_cols = list(feature_cols)
    base_columns = [patient_id_col] + feature_cols

    # Sort so the derived column order does not depend on row order in `data`.
    timepoints = sorted(data[time_col].unique())
    time_pairs = [(t1, t2) for t1 in timepoints for t2 in timepoints if t1 < t2]

    # Collect plain dict records and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and grew the frame quadratically.
    records = []
    for patient_id, group in data.groupby(patient_id_col):
        times = group[time_col]
        for t1, t2 in time_pairs:
            rows_t1 = group.loc[times == t1, feature_cols]
            rows_t2 = group.loc[times == t2, feature_cols]
            # Only unambiguous measurements (exactly one row per time point) are compared.
            if len(rows_t1) != 1 or len(rows_t2) != 1:
                continue

            diffs = rows_t2.to_numpy()[0] - rows_t1.to_numpy()[0]
            speeds = diffs / ((t2 - t1) * hours_per_step)

            record = {patient_id_col: patient_id}
            record.update(
                (f"{col}_diff_{t1}_{t2}", value) for col, value in zip(feature_cols, diffs)
            )
            record.update(
                (f"{col}_speed_{t1}_{t2}", value) for col, value in zip(feature_cols, speeds)
            )
            records.append(record)

    if not records:
        return pd.DataFrame(columns=base_columns)

    result = pd.DataFrame(records)
    derived_columns = [col for col in result.columns if col not in base_columns]
    # Re-insert the (empty) raw feature columns right after the ID column.
    return result.reindex(columns=base_columns + derived_columns)

# Generate features

In [None]:
data = df_
# Clinical variables for which pairwise differences and change rates are derived.
feature_cols = ['bicarbonate', 'sodium', 'temperature', 'po2',
       'fio2', 'dbp', 'creatinine', 'aniongap', 'heart_rate', 'pco2',
       'hematocrit', 'baseexcess', 'potassium', 'wbc', 'calcium', 'resp_rate',
       'chloride', 'glucose', 'urineoutput', 'spo2', 'hemoglobin',
       'crea_divide_basecrea', 'sbp', 'mbp', 'lactate', 'ph']
df_res1 = calculate_feature_changes(data, "stay_id", "time", feature_cols)
# The helper returns one sparse row per (stay, time pair); taking the first
# non-null value of every column collapses them into a single row per stay.
df_res1 = df_res1.groupby('stay_id').first().reset_index()
# The raw feature columns in the helper's output are never filled; drop them.
df_res1 = df_res1.drop(columns=feature_cols)
df_res1.to_csv("tmp_df_fea_change.csv", index=False)
# Preview goes last: Jupyter only renders a cell's final expression.
df_res1.head(2)


In [4]:
# Per-stay summary statistics; the time column is not aggregated.
tmp = df_.drop(columns="time")
df_res2 = tmp.groupby("stay_id").agg(["max", "min", "mean"])
# Flatten the (feature, statistic) column pairs into "<feature>_<statistic>".
df_res2.columns = ["_".join(pair) for pair in df_res2.columns.to_flat_index()]
df_res2 = df_res2.reset_index()
df_res2.head(2)

Unnamed: 0,stay_id,aniongap_max,aniongap_min,aniongap_mean,bilirubin_max,bilirubin_min,bilirubin_mean,po2_max,po2_min,po2_mean,...,sodium_mean,fio2_max,fio2_min,fio2_mean,dbp_max,dbp_min,dbp_mean,creatinine_max,creatinine_min,creatinine_mean
0,5,10.0,7.0,7.75,5.0,5.0,5.0,95.0,58.0,71.0,...,137.625,26.0,25.0,25.75,71.833333,58.461111,64.899364,0.7571,0.6554,0.731675
1,15,10.666667,8.5,9.708333,4.5,3.0,3.875,95.5,64.666667,76.541667,...,138.091667,40.0,35.0,38.75,70.579395,59.933752,65.902921,1.1865,1.1074,1.15825


In [5]:
# Combine change features with summary statistics; only stays present in both are kept.
df_merge = df_res1.merge(df_res2, on="stay_id", how="inner")
len(df_merge["stay_id"].unique())

8313

In [6]:
# One (stay_id, groupHPD, dataset) combination per stay.
tmp_df = df[["stay_id", "groupHPD", "dataset"]].drop_duplicates()
tmp_df.index = pd.RangeIndex(len(tmp_df.index))
# Attach labels by key rather than by row position: `tmp_df` follows the row
# order of `df`, while `df_merge` is ordered by the earlier groupby/merge, so
# a positional concat can pair a stay with another stay's labels.
# validate="one_to_one" raises if a stay carries more than one label combination.
df_merge = df_merge.merge(tmp_df, how="inner", on="stay_id", validate="one_to_one")
# Keep the established column layout: labels first, then stay_id and features.
df_merge = df_merge[
    ["groupHPD", "dataset"]
    + [column for column in df_merge.columns if column not in ("groupHPD", "dataset")]
]
df_merge.to_csv("df_saki_selfv2_generate_features_a1234.csv", index=False)
df_merge.shape

(8313, 396)

# Feature filtering

In [7]:
df_fea = pd.read_csv("./df_saki_selfv2_generate_features_a1234.csv")

# Target: kept as a single-column DataFrame.
y = df_fea[["groupHPD"]]
# Predictors: everything except the identifier, the dataset column and the target.
X = df_fea.drop(columns=["stay_id", "dataset", "groupHPD"])
X.shape

(8313, 393)

## Pearson correlation coefficient

In [None]:
import seaborn as sns
corr_df = X.corr()
# Features whose absolute correlation with an earlier feature exceeds this are dropped.
threshold = 0.9
# Keep only the strict upper triangle so every feature pair is inspected once.
# (Built-in `bool` is used because the `np.bool` alias was removed in NumPy 1.24.)
upper = corr_df.where(np.triu(np.ones(corr_df.shape, dtype=bool), k=1))
corr_drop = [column for column in upper.columns if (upper[column].abs() > threshold).any()]
# Filter in original column order; a set difference would make the column
# order depend on string hashing and therefore vary between kernels.
diff_fea = [column for column in X.columns if column not in corr_drop]
X = X[diff_fea]

## Mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import load_iris
# `y` is a single-column DataFrame; flatten it to the 1-D target the estimator expects.
mic_res = mutual_info_classif(X, y.values.ravel(), random_state=10)

# Rank features by score without transposing the whole feature matrix
# (transposing forces every column to a common dtype and copies all data twice).
mic_scores = pd.Series(mic_res, index=X.columns)
selected = mic_scores[mic_scores > 0].sort_values(ascending=False).index

# Features with a positive score, ordered from most to least informative.
XTT = X[selected]
# Transposed view kept under its previous name in case later cells reference it.
XT = XTT.T
df_final = pd.concat([df_fea[["stay_id","dataset",'groupHPD']],XTT],axis=1)

print(XTT.shape[1])

In [10]:
# Persist the identifier/label columns together with the features that passed both filters.
df_final.to_csv(
    "df_saki_selfv2_generate_features_a1234_CorrMICfilt.csv",
    index=False,
)