# Turns timeseries samples into summary statistics

In [1]:
import numpy as np
from helpers.utils import smart_to_numpy, smart_load

def transform_to_features(samples):
    """
    Collapse the paired channels of ``samples`` into a 5-channel array.

    NOTE: the channel layout handled here is specific to the eICU dataset and
    features trained on 7.24.

    ``samples`` is indexed as (sample, channel, step). For k = 0..3, output
    channel k is input channel 2k, with NaN written wherever input channel
    2k + 1 rounds to 1 (presumably a missingness indicator; confirm against
    the data extraction code). Output channel 4 is the rounded last input
    channel.

    Returns a new float array of shape (samples.shape[0], 5, samples.shape[2]).
    """
    assert isinstance(samples, np.ndarray), "samples must be a numpy array"
    n_samples, n_steps = samples.shape[0], samples.shape[2]
    features = np.zeros((n_samples, 5, n_steps))
    for out_channel in range(4):
        value_channel = 2 * out_channel
        flag_channel = value_channel + 1
        flagged = np.round(samples[:, flag_channel, :]) == 1
        features[:, out_channel, :] = np.where(flagged, np.nan, samples[:, value_channel, :])
    # The last input channel is rounded and stored as-is in output channel 4.
    features[:, 4, :] = np.round(samples[:, -1, :])
    return features

# Locations of the synthetic samples and of the real train / test extracts.
sync_path = "results/samples/ddpm_eicu_all_48hrs_1690178780_samples.npy"
train_path = "data/eicu-extract/TRAIN-eicu_multiple_60_2880_564.pt"
test_path = "data/eicu-extract/TEST-eicu_multiple_60_2880_564.pt"

# Load the three files in order (sync, train, test) and convert each to numpy.
sync_data, train_data, test_data = (
    smart_to_numpy(smart_load(path)) for path in (sync_path, train_path, test_path)
)

# Only the train / test extracts go through transform_to_features;
# the synthetic samples are used exactly as loaded.
train_data, test_data = transform_to_features(train_data), transform_to_features(test_data)
print(sync_data.shape, train_data.shape, test_data.shape)

(20000, 5, 560) (20230, 5, 564) (6705, 5, 564)


In [2]:
import pandas as pd
from scipy.stats import kurtosis, skew, mode
from tqdm import tqdm

def turn_timeseries_to_summary_stats_df(timeseries_sample, channels_to_features):
    """
    Collapse every patient's per-channel series into one row of summary statistics.

    Parameters
    ----------
    timeseries_sample : np.ndarray
        Array indexed as (patient, channel, step). Channel index 4 is read as
        the label channel; every other channel is treated as a feature.
    channels_to_features : mapping of int -> str
        Column-name prefix for each non-label channel index.

    Returns
    -------
    pd.DataFrame
        One row per patient. For every feature channel there are eight columns,
        ``<feature>_first``, ``_min``, ``_max``, ``_range``, ``_mean``, ``_std``,
        ``_mode`` and ``_skewness``, computed over the non-NaN entries of that
        channel. A channel with fewer than two non-NaN entries gets NaN for all
        eight. The ``hospital_expire_flag`` column holds the smallest distinct
        value found in the label channel.

    Raises
    ------
    KeyError
        If a non-label channel index is missing from ``channels_to_features``.
    """
    label_channel = 4
    label_column = 'hospital_expire_flag'
    stat_names = ("first", "min", "max", "range", "mean", "std", "mode", "skewness")

    columns = {}
    for patient_idx in tqdm(range(timeseries_sample.shape[0]), desc="Turning timeseries to summary stats..."):
        for channel_idx in range(timeseries_sample.shape[1]):
            series = timeseries_sample[patient_idx, channel_idx, :]

            if channel_idx == label_channel:
                # np.unique returns sorted values, so [0] is the smallest one.
                columns.setdefault(label_column, []).append(np.unique(series)[0])
                continue

            feature_name = channels_to_features[channel_idx]
            observed = series[~np.isnan(series)]

            if observed.size <= 1:
                # Zero or one observation: every statistic is reported as missing.
                stats = (np.nan,) * len(stat_names)
            else:
                lowest = observed.min()
                highest = observed.max()
                stats = (
                    observed[0],
                    lowest,
                    highest,
                    highest - lowest,
                    observed.mean(),
                    observed.std(),
                    mode(observed, keepdims=True)[0][0],
                    skew(observed),
                )

            # setdefault creates each column list on first use, so no exception
            # handling is needed to tell "first patient" from "later patients".
            for stat_name, value in zip(stat_names, stats):
                columns.setdefault(f"{feature_name}_{stat_name}", []).append(value)

    return pd.DataFrame(columns)

In [3]:
# Channel index -> column-name prefix used when building the summary table.
channels_to_features = {
    0: "heartrate",
    1: 'resprate',
    2: "spo2",
    3: 'meanbp',
    4: 'hospital_expire_flag',
}

In [4]:
# Summarise the synthetic samples and save the full per-patient table.
sync_df = turn_timeseries_to_summary_stats_df(timeseries_sample=sync_data, channels_to_features=channels_to_features)
print(sync_df.shape)
sync_df.to_csv("results/processed/ddpm_eicu_all_48hrs_1690178780_samples.csv", index=False)

# Complete-case subset: rows with no missing value in any column.
sync_df_nonan = sync_df.dropna()
print(sync_df_nonan.shape)
# sync_df_nonan is already NaN-free, so it is written directly (no second dropna()).
sync_df_nonan.to_csv("results/processed/ddpm_eicu_all_48hrs_1690178780_samples_dropna.csv", index=False)

Turning timeseries to summary stats...: 100%|██████████| 20000/20000 [00:40<00:00, 496.09it/s]


(20000, 53)
(5550, 53)


In [5]:
# Summarise the training extract and save the full per-patient table.
train_df = turn_timeseries_to_summary_stats_df(timeseries_sample=train_data, channels_to_features=channels_to_features)
print(train_df.shape)
train_df.to_csv("results/processed/TRAIN-eicu_multiple_60_2880_564.csv", index=False)

# Complete-case subset: rows with no missing value in any column.
train_df_nonan = train_df.dropna()
print(train_df_nonan.shape)
# train_df_nonan is already NaN-free, so it is written directly (no second dropna()).
train_df_nonan.to_csv("results/processed/TRAIN-eicu_multiple_60_2880_564_dropna.csv", index=False)

  kurtosis_ = kurtosis(arr)
  skewness = skew(arr)
Turning timeseries to summary stats...: 100%|██████████| 20230/20230 [00:40<00:00, 497.47it/s]


(20230, 53)
(6109, 53)


In [6]:
# Summarise the test extract and save the full per-patient table.
test_df = turn_timeseries_to_summary_stats_df(timeseries_sample=test_data, channels_to_features=channels_to_features)
print(test_df.shape)
test_df.to_csv("results/processed/TEST-eicu_multiple_60_2880_564.csv", index=False)

# Complete-case subset: rows with no missing value in any column.
test_df_nonan = test_df.dropna()
print(test_df_nonan.shape)
# test_df_nonan is already NaN-free, so it is written directly (no second dropna()).
test_df_nonan.to_csv("results/processed/TEST-eicu_multiple_60_2880_564_dropna.csv", index=False)

  kurtosis_ = kurtosis(arr)
  skewness = skew(arr)
Turning timeseries to summary stats...: 100%|██████████| 6705/6705 [00:13<00:00, 499.27it/s]


(6705, 53)
(2024, 53)


## Class Imbalance

In [7]:
import pandas as pd
# Mean of the hospital_expire_flag column for each processed split.
# NOTE(review): no cell visible in this notebook writes the two 24-hour CSVs;
# presumably they come from an earlier run on the 1440-minute extracts. Confirm.
train_24hrs = pd.read_csv("results/processed/TRAIN-eicu_multiple_60_1440_276.csv")
test_24hrs = pd.read_csv("results/processed/TEST-eicu_multiple_60_1440_276.csv")
train_48hrs = pd.read_csv("results/processed/TRAIN-eicu_multiple_60_2880_564.csv")
test_48hrs = pd.read_csv('results/processed/TEST-eicu_multiple_60_2880_564.csv')

mortality_report = [
    "Mortality Rate",
    f"Train 24 hrs: {train_24hrs['hospital_expire_flag'].mean()}",
    f"Test 24 hrs: {test_24hrs['hospital_expire_flag'].mean()}",
    f"Train 48 hrs: {train_48hrs['hospital_expire_flag'].mean()}",
    f"Test 48 hrs: {test_48hrs['hospital_expire_flag'].mean()}",
]
print("\n".join(mortality_report))

Mortality Rate
Train 24 hrs: 0.10703749625892514
Test 24 hrs: 0.10628389154704944
Train 48 hrs: 0.14320316361838853
Test 48 hrs: 0.14571215510812827


In [12]:
# Mean of hospital_expire_flag in the synthetic 24-hour samples, shown as the cell output.
(
    pd.read_csv(
        "results/processed/ddpm_eicu_all_24hrs_1690178742_samples.csv"
    )["hospital_expire_flag"]
    .mean()
)

0.10805