In [None]:
# Create the output directory used by the cells that write results below.
# Pure-Python replacement for the `mkdir -p` shell escape so the cell does not
# depend on a POSIX shell; exist_ok=True keeps it safe to re-run.
# The import is local because this cell runs before the notebook's import cell.
import os
os.makedirs('output', exist_ok=True)

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import pickle, os, time
import itertools
from datetime import datetime, timedelta
from collections import Counter, defaultdict, namedtuple
from PIL import Image
import yaml
from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
data_dir = './data/'

# Load the population, labels, and baseline features.
# Both tables are left-joined onto the population index (BMT_ID), so a patient
# missing from label.csv / static.csv shows up as a row of NaN.
pop = pd.read_csv(data_dir + 'population.csv').set_index('BMT_ID')

df_label_full = pop.join(pd.read_csv(data_dir + 'label.csv', index_col='BMT_ID'), how='left')
df_static = pop.join(pd.read_csv(data_dir + 'static.csv', index_col='BMT_ID'), how='left')
df_static.index.rename('id', inplace=True)

df_label = df_label_full['Label_GVHD']
# NOTE(review): a missing GVHD_max_grade compares as False here and therefore
# becomes label 0 -- confirm that is the intended handling of missing grades.
df_label34 = (df_label_full['GVHD_max_grade'] >= 3).astype(int)

# Every patient must have a complete set of static features. Test null-ness
# directly: checking the truthiness of rows that contain nulls would let an
# all-NaN row (patient absent from static.csv) slip through.
assert not df_static.isnull().any().any()

# Load the vital sign time series.
# pickle can execute arbitrary code on load -- only use with trusted files.
with open(data_dir + 'vitals_by_ID.p', 'rb') as vitals_file:
    ts_vitals_by_bmt = pickle.load(vitals_file)
# Keep only (and require) the patients in the population, in population order;
# a missing ID raises KeyError.
ts_vitals_by_bmt = {ID: ts_vitals_by_bmt[ID] for ID in pop.index}

In [3]:
# Report the cohort size and the positive rate of each label definition
print('Population size:', len(ts_vitals_by_bmt))
print()
print('Class balance')
for line_template, labels in (
    ('{{0,1}} vs. {{2,3,4}}: \t{:.1%}', df_label),
    ('{{0,1,2}} vs. {{3,4}}: \t{:.1%}', df_label34),
):
    print(line_template.format(labels.mean()))

Population size: 324

Class balance
{0,1} vs. {2,3,4}: 	31.8%
{0,1,2} vs. {3,4}: 	13.6%


In [4]:
# Extract vital sign features
# Columns to summarize, and the window [t0, T) on the 't' column split into bins of width dt
variables = ['HR', 'RR', 'SysBP', 'DiaBP', 'Temp', 'SpO2']
t0 = 0
T = 10
dt = 1

import tsfresh

def get_trend_features(t0, T, dt, ts_by_id=None, var_names=None):
    """Summarize each series per time window, then extract trend features.

    Two-stage ("recursive") summarization:

    1. For every ID, rows with ``t0 <= t < T`` are grouped into left-closed
       bins of width ``dt`` and every column is reduced to its
       mean / std / min / max within each bin.
    2. The per-window summaries are handed to ``tsfresh.extract_features`` as
       one short series per (id, summary column), from which the mean, linear
       trend slope, sample entropy and first FFT coefficient (abs, angle) are
       computed.

    Parameters
    ----------
    t0, T : number
        Bounds of the observation window on the ``t`` column (``T`` exclusive).
    dt : number
        Width of each summary window, in the same units as ``t``.
    ts_by_id : dict, optional
        Mapping of ID -> DataFrame holding a ``t`` column plus measurement
        columns. Defaults to the notebook-level ``ts_vitals_by_bmt``.
    var_names : list of str, optional
        Measurement columns that get the ``_dt=<dt>`` suffix. Defaults to the
        notebook-level ``variables``. Columns not listed are still aggregated,
        just not renamed.

    Returns
    -------
    The object returned by ``tsfresh.extract_features`` (presumably a
    DataFrame with one row per id -- confirm against the installed tsfresh).

    Raises
    ------
    AssertionError
        If any per-window summary is missing, e.g. a window with no readings,
        or a window with a single reading (std undefined).
    """
    # Fall back to the notebook globals so existing calls keep working.
    if ts_by_id is None:
        ts_by_id = ts_vitals_by_bmt
    if var_names is None:
        var_names = variables

    # Loop-invariant pieces: bin edges and the column renaming map.
    bin_edges = np.arange(t0, T + dt, dt)
    renamed_columns = {v: '{}_dt={}'.format(v, dt) for v in var_names}

    # Per-window summary statistics for each ID
    D_tmp = {}
    for key, df in ts_by_id.items():
        df = df[(t0 <= df['t']) & (df['t'] < T)].set_index('t')
        df = df.rename(columns=renamed_columns)
        # observed=False keeps empty windows as (NaN) rows so that every ID
        # has the same number of windows and gaps are detected below.
        df = df.groupby(
            pd.cut(df.index, bin_edges, right=False),
            observed=False,
        ).agg([
            'mean', 'std',
            'min', 'max',
        ])
        df.index.rename('t', inplace=True)
        D_tmp[key] = df.reset_index()

    timeseries = pd.concat(D_tmp)
    timeseries.index.rename('id', level=0, inplace=True)
    timeseries = timeseries.sort_index()
    timeseries = timeseries.reset_index(level=0).set_index(['id', 't'])

    # Flatten the (column, statistic) pairs into names like 'HR_dt=1_mean'
    timeseries.columns = ['_'.join(col).strip() for col in timeseries.columns.values]

    # Check for gaps BEFORE stacking: DataFrame.stack() drops NaN entries by
    # default in older pandas, which would make a check on the stacked values
    # unable to ever fail.
    assert not timeseries.isnull().any().any(), \
        'missing window summaries (empty window or undefined std)'

    # Long format expected by tsfresh: one row per (id, t, variable)
    stacked_ts = timeseries.stack().copy()
    stacked_ts.index.rename('variable', level=-1, inplace=True)
    stacked_ts.rename('value', inplace=True)
    stacked_ts = stacked_ts.reset_index()

    feature_params = {
        'mean': None,
        'linear_trend': [{'attr': 'slope'}],
        'sample_entropy': None,
        'fft_coefficient': [
            {'coeff': 1, 'attr': 'abs'},
            {'coeff': 1, 'attr': 'angle'},
        ],
    }
    extracted_features = tsfresh.extract_features(
        stacked_ts, column_id='id', column_sort='t', column_kind='variable', column_value='value',
        default_fc_parameters=feature_params,
    )

    return extracted_features


# extracted_features = get_trend_features(t0, T, dt)

In [5]:
# Run the extraction with the window settings configured above
extracted_features = get_trend_features(t0=t0, T=T, dt=dt)

Feature Extraction: 100%|██████████| 278/278 [00:00<00:00, 280.80it/s]


In [6]:
# Persist the raw (un-binned) extracted features, keeping the index as the first column
extracted_features.to_csv('output/ts_features.csv', index=True)

In [7]:
# Bin values by quintiles
# Each extracted feature is cut into (up to) five quantile bins
# (duplicates='drop' merges bins whose edges coincide) and one-hot encoded,
# then appended to the static features.
# dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version; newer pandas defaults to bool, which would turn df_features.values
# into an object array next to the integer static columns.
df_features = df_static.join(
    pd.get_dummies(
        extracted_features.apply(pd.qcut, q=5, duplicates='drop'),
        prefix_sep='_',
        dtype=np.uint8,
    )
)
print(df_features.shape)

(324, 652)


In [8]:
# Preview the first five rows of the combined feature table
df_features.head(n=5)

Unnamed: 0_level_0,"Age_(-0.001, 18.0]","Age_(18.0, 45.0]","Age_(45.0, 75.0]",Disease Code category_Malignant,Disease Code category_Non-malignant,Disease Risk_0 - Non-malignant,Disease Risk_1 - Low,Disease Risk_2 - Intermediate,Disease Risk_3 - High,Intensity_0 - Full,...,"Temp_dt=1_std__mean_(0.0977, 0.157]","Temp_dt=1_std__mean_(0.157, 0.194]","Temp_dt=1_std__mean_(0.194, 0.235]","Temp_dt=1_std__mean_(0.235, 0.292]","Temp_dt=1_std__mean_(0.292, 0.564]","Temp_dt=1_std__sample_entropy_(0.67, 1.861]","Temp_dt=1_std__sample_entropy_(1.861, 2.197]","Temp_dt=1_std__sample_entropy_(2.197, 2.42]","Temp_dt=1_std__sample_entropy_(2.42, 2.708]","Temp_dt=1_std__sample_entropy_(2.708, 3.807]"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
train_001,0,0,1,1,0,0,0,1,0,1,...,0,0,1,0,0,0,0,1,0,0
train_002,0,0,1,1,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
train_003,0,0,1,1,0,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,0
train_004,0,0,1,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
train_005,0,0,1,1,0,0,0,0,1,1,...,0,0,0,1,0,0,1,0,0,0


In [9]:
# Persist the combined feature table, keeping the id index as the first column
df_features.to_csv('output/df_features.csv', index=True)

In [10]:
# Feature matrix and Label_GVHD vector as plain arrays
X, y = df_features.values, df_label.values

# Make sure there are no nan values
assert not np.any(np.isnan(X))
assert not np.any(np.isnan(y))

In [11]:
# Grade >= 3 label vector as a plain array.
# NOTE(review): df_label34 is built with .astype(int), so it cannot contain NaN
# and this check cannot fail; missing grades would have to be caught before the cast.
y34 = df_label34.values
assert not np.any(np.isnan(y34))

In [12]:
# Display the feature-matrix and label-vector shapes side by side
(X.shape, y.shape)

((324, 652), (324,))

In [13]:
# Bundle the feature matrix and both label vectors into one .npz archive
np.savez('output/Xy.npz', **{'X': X, 'y': y, 'y34': y34})