In [15]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to the local package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
def support():
    """Load the SUPPORT dataset and return an ordered 80/20 train/test split.

    The features are passed through ``auton_survival``'s ``Preprocessor``
    using the categorical / numerical column lists defined below. Rows are
    NOT shuffled: the first 80% of rows form the training set and the last
    20% form the test set.

    Returns:
        tuple: ``(x_train, t_train, e_train, x_test, t_test, e_test)`` where
        ``x_*`` are the preprocessed feature values, ``t_*`` the values of
        the ``time`` outcome column and ``e_*`` the values of the ``event``
        outcome column.
    """
    import sys

    # Make the local checkout importable, but do not grow sys.path with a
    # duplicate entry every time this function is called.
    if '../' not in sys.path:
        sys.path.append('../')

    from auton_survival import datasets
    from auton_survival.preprocessing import Preprocessor

    outcomes, features = datasets.load_support()

    cat_feats = ['sex', 'dzgroup', 'dzclass', 'income', 'race', 'ca']
    num_feats = ['age', 'num.co', 'meanbp', 'wblc', 'hrt', 'resp',
                 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph',
                 'glucose', 'bun', 'urine', 'adlp', 'adls']

    features = Preprocessor().fit_transform(features, cat_feats=cat_feats, num_feats=num_feats)
    x, t, e = features.values, outcomes.time.values, outcomes.event.values

    n = len(x)
    tr_size = int(n * 0.80)
    te_size = int(n * 0.20)
    # Slice from an explicit start index: ``x[-te_size:]`` would return the
    # WHOLE array when te_size == 0 (fewer than 5 rows). For te_size > 0 the
    # two forms select exactly the same rows.
    test_start = n - te_size

    x_train, x_test = x[:tr_size], x[test_start:]
    t_train, t_test = t[:tr_size], t[test_start:]
    e_train, e_test = e[:tr_size], e[test_start:]
    return x_train, t_train, e_train, x_test, t_test, e_test

In [17]:
def synthetic():
    """Load the SYNTHETIC dataset and return a seeded random train/test split.

    The intervention column is appended to the features (as float64, under
    the column name ``'treat'``) so the returned feature matrices contain
    both the covariates and the treatment indicator.

    Side effects:
        Seeds NumPy's global random generator with 0 and prints the number
        of training and test rows.

    Returns:
        tuple: ``(x_train, t_train, e_train, x_test, t_test, e_test)`` as
        NumPy arrays; ``e_*`` are cast to float.
    """
    import sys

    import numpy as np
    import pandas as pd

    # Make the local checkout importable without adding a duplicate
    # sys.path entry on every call.
    if '../' not in sys.path:
        sys.path.append('../')

    from auton_survival.datasets import load_dataset

    # Load the synthetic dataset
    outcomes, features, interventions = load_dataset(dataset='SYNTHETIC')

    # Hyper-parameters
    random_seed = 0
    test_size = 0.25

    # Split the synthetic data into training and testing data
    np.random.seed(random_seed)
    n = features.shape[0]

    # Draw the test rows WITHOUT replacement so that exactly
    # int(n * test_size) rows end up in the test set. Sampling indices with
    # randint can pick the same row several times, which silently shrinks
    # the test set below the requested fraction.
    test_idx = np.zeros(n, dtype=bool)
    test_idx[np.random.choice(n, size=int(n * test_size), replace=False)] = True

    features_tr = features.iloc[~test_idx]
    outcomes_tr = outcomes.iloc[~test_idx]
    interventions_tr = interventions[~test_idx]
    print(f'Number of training data points: {len(features_tr)}')

    features_te = features.iloc[test_idx]
    outcomes_te = outcomes.iloc[test_idx]
    interventions_te = interventions[test_idx]
    print(f'Number of test data points: {len(features_te)}')

    # Name the intervention column before concatenating it onto the features.
    interventions_tr.name, interventions_te.name = 'treat', 'treat'
    features_tr_dcph = pd.concat([features_tr, interventions_tr.astype('float64')], axis=1)
    features_te_dcph = pd.concat([features_te, interventions_te.astype('float64')], axis=1)

    x_train = features_tr_dcph.values
    e_train = outcomes_tr['event'].values.astype(float)
    t_train = outcomes_tr['time'].values

    x_test = features_te_dcph.values
    e_test = outcomes_te['event'].values.astype(float)
    t_test = outcomes_te['time'].values
    return x_train, t_train, e_train, x_test, t_test, e_test


In [18]:
def kkbox():
    """Load the KKBox churn dataset via pycox and return an ordered 80/20 split.

    The ``event`` and ``duration`` columns become the event indicator and
    time arrays; ``msno`` is dropped from the features. The ``gender``,
    ``registered_via`` and ``city`` columns are label-encoded to integers
    and the whole feature table is cast to float. Rows are NOT shuffled:
    the first 80% form the training set and the last 20% the test set.

    Returns:
        tuple: ``(x_train, t_train, e_train, x_test, t_test, e_test)`` as
        NumPy arrays.
    """
    import numpy as np
    from pycox.datasets import from_kkbox
    from sklearn.preprocessing import LabelEncoder

    kkbox_data = from_kkbox._DatasetKKBoxChurn()
    # NOTE(review): presumably the raw data must already be present locally;
    # if it is not, run ``kkbox_data.download_kkbox()`` once beforehand.
    df = kkbox_data.read_df()

    e = np.array(df.event)
    t = np.array(df.duration)
    x = df.drop(columns=['event', 'duration', 'msno'])

    # Each column gets its own freshly fitted encoder (equivalent to
    # re-fitting a single shared encoder per column).
    for column in ('gender', 'registered_via', 'city'):
        x[column] = LabelEncoder().fit_transform(x[column])
    x = np.array(x).astype(float)

    n = len(x)
    tr_size = int(n * 0.80)
    te_size = int(n * 0.20)
    # Slice from an explicit start index: ``x[-te_size:]`` would return the
    # WHOLE array when te_size == 0. For te_size > 0 both forms select
    # exactly the same rows.
    test_start = n - te_size

    x_train, x_test = x[:tr_size], x[test_start:]
    t_train, t_test = t[:tr_size], t[test_start:]
    e_train, e_test = e[:tr_size], e[test_start:]
    return x_train, t_train, e_train, x_test, t_test, e_test

In [19]:
def mimic(data_dir='.'):
    """Load the pre-split MIMIC arrays from ``.npy`` files and clean them.

    For each of the ``train`` and ``test`` splits the files
    ``x_<split>.npy``, ``t_<split>.npy`` and ``e_<split>.npy`` are read from
    ``data_dir``. The stored event array is inverted (``1 - value``), rows
    whose time is ``<= 0`` are removed from all three arrays, and the
    features are averaged over axis 1.

    Args:
        data_dir: Directory containing the six ``.npy`` files. Defaults to
            the current working directory, which matches the previous
            hard-coded behaviour.

    Side effects:
        Prints the shape of the final training feature array.

    Returns:
        tuple: ``(x_train, t_train, e_train, x_test, t_test, e_test)``.
    """
    import os

    import numpy as np

    def _load_split(split):
        """Load, filter and reduce the arrays of one split ('train' or 'test')."""
        x = np.load(os.path.join(data_dir, f'x_{split}.npy'))
        t = np.load(os.path.join(data_dir, f't_{split}.npy'))
        # NOTE(review): the stored indicator is flipped here; presumably the
        # files hold a censoring flag rather than an event flag - confirm.
        e = 1 - np.load(os.path.join(data_dir, f'e_{split}.npy'))

        # Drop every sample with a non-positive time from all three arrays.
        invalid = np.where(t <= 0)[0]
        t = np.delete(t, invalid)
        e = np.delete(e, invalid)
        x = np.delete(x, invalid, axis=0)

        # Collapse axis 1 of the features by averaging
        # (assumes x has at least two dimensions - TODO confirm layout).
        return np.mean(x, axis=1), t, e

    x_train, t_train, e_train = _load_split('train')
    x_test, t_test, e_test = _load_split('test')
    print(x_train.shape)
    return x_train, t_train, e_train, x_test, t_test, e_test

In [20]:
def baseline(baseline, dataset, lr, n_components):
    """Train one survival model on one dataset and print its test metrics.

    Args:
        baseline: Model name, one of ``'DeepCox'``, ``'DSM'``, ``'DCM'`` or
            ``'DDPSM'``. Every model uses hidden layers ``[100, 100]``.
        dataset: Dataset name, one of ``'support'``, ``'synthetic'``,
            ``'kkbox'`` or ``'mimic'``; the loader function of the same name
            is called to obtain the train/test arrays.
        lr: Learning rate passed to ``model.fit``.
        n_components: Passed as ``k`` to the DSM, DCM and DDPSM models
            (unused for ``'DeepCox'``).

    Raises:
        ValueError: If ``baseline`` or ``dataset`` is not a supported name.
            The check runs before anything is imported, loaded or trained.

    Side effects:
        Fits the model for 100 iterations, then prints the shape of the
        predicted survival matrix followed by, for each of the 0.25 / 0.5 /
        0.75 / 0.9 quantiles of the observed event times, the
        time-dependent concordance index, Brier score and ROC AUC on the
        test set.
    """
    known_models = ('DeepCox', 'DSM', 'DCM', 'DDPSM')
    known_datasets = ('support', 'synthetic', 'kkbox', 'mimic')
    # Fail with a clear message instead of an unbound-local error further
    # down when a name is misspelled.
    if baseline not in known_models:
        raise ValueError(f"Unknown baseline {baseline!r}; expected one of {known_models}")
    if dataset not in known_datasets:
        raise ValueError(f"Unknown dataset {dataset!r}; expected one of {known_datasets}")

    # Imported locally so the function does not depend on a later cell
    # having put ``np`` into the notebook namespace.
    import numpy as np
    # Imported before training so a missing metrics package is reported
    # before any time is spent fitting.
    from sksurv.metrics import concordance_index_ipcw, brier_score, cumulative_dynamic_auc

    if baseline == 'DeepCox':
        from auton_survival import DeepCoxPH
        model = DeepCoxPH(layers=[100, 100])
    elif baseline == 'DSM':
        from auton_survival.models.dsm import DeepSurvivalMachines
        model = DeepSurvivalMachines(k=n_components,
                                     distribution='LogNormal',
                                     layers=[100, 100])
    elif baseline == 'DCM':
        from auton_survival.models.dcm import DeepCoxMixtures
        model = DeepCoxMixtures(k=n_components, layers=[100, 100])
    else:  # 'DDPSM'
        from auton_survival.models.dpsm import DeepDP
        model = DeepDP(k=n_components,
                       distribution='Weibull',
                       layers=[100, 100])

    if dataset == 'support':
        x_train, t_train, e_train, x_test, t_test, e_test = support()
    elif dataset == 'synthetic':
        x_train, t_train, e_train, x_test, t_test, e_test = synthetic()
    elif dataset == 'kkbox':
        x_train, t_train, e_train, x_test, t_test, e_test = kkbox()
    else:  # 'mimic'
        x_train, t_train, e_train, x_test, t_test, e_test = mimic()

    model.fit(x_train, t_train, e_train, iters=100, learning_rate=lr)

    # Evaluation times are quantiles of the event times observed across the
    # train and test sets combined.
    horizons = [0.25, 0.5, 0.75, 0.9]
    t = np.concatenate((t_train, t_test), axis=0)
    e = np.concatenate((e_train, e_test), axis=0)
    times = np.quantile(t[e == 1], horizons).tolist()

    # Predict once and derive the risk from the survival probabilities.
    out_survival = model.predict_survival(x_test, times)
    out_risk = 1 - out_survival
    print(out_survival.shape)

    def _as_structured(events, durations):
        """Pack event flags and times into the structured array sksurv expects."""
        packed = np.empty(len(events), dtype=[('e', bool), ('t', float)])
        packed['e'] = events
        packed['t'] = durations
        return packed

    et_train = _as_structured(e_train, t_train)
    et_test = _as_structured(e_test, t_test)

    # Each metric call returns a tuple; the indices pick the element that
    # the original evaluation reported.
    cis = [concordance_index_ipcw(et_train, et_test, out_risk[:, i], times[i])[0]
           for i in range(len(times))]
    brs = brier_score(et_train, et_test, out_survival, times)[1]
    roc_auc = [cumulative_dynamic_auc(et_train, et_test, out_risk[:, i], times[i])[0]
               for i in range(len(times))]

    for i, horizon in enumerate(horizons):
        print(f"For {horizon} quantile")
        print("TD Concordance Index:", cis[i])
        print("Brier Score:", brs[i])
        print("ROC AUC ", roc_auc[i][0], "\n")
    

In [21]:
import numpy as np

In [22]:
# baseline('DDPSM', 'kkbox', 1e-6, 10)

In [23]:
# Train the 'DDPSM' baseline (k=10 components, learning rate 1e-6) on the
# SUPPORT split and print its evaluation metrics.
# NOTE(review): the output recorded for this cell ends in
# "IndexError: Dimension out of range"; the failing frame is not shown in the
# notebook, so the root cause still has to be tracked down - TODO.
baseline('DDPSM', 'support', 1e-6, 10)

 18%|█▊        | 1820/10000 [00:04<00:20, 397.56it/s]
  0%|          | 0/100 [00:00<?, ?it/s]


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)