In [None]:
import os 
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import sys

sys.path.append('../')
sys.path.append('../DeepSurvivalMachines/')
from nsc import datasets

In [None]:
# Name of the dataset whose results are analyzed; change it to analyze another dataset.
# It is passed to `datasets.load_dataset` and used to pick the result files by name.
dataset = 'METABRIC'

In [None]:
path = '../Results/' # Directory that holds the saved result files
# Load the dataset. Later cells use `t` as times, `e` as event codes and
# `covariates` as the column names of `x` (presumably the feature matrix - TODO confirm).
x, t, e, covariates = datasets.load_dataset(dataset) # Open the data

In [None]:
from sksurv.metrics import concordance_index_ipcw, brier_score, cumulative_dynamic_auc, integrated_brier_score

### Utils
def evaluate(survival):
    """
    Compute per-fold, time-dependent metrics from saved survival predictions.

    The ground truth is read from the notebook-level arrays `e` (event code)
    and `t` (observed time), which must be row-aligned with `survival`.

    Args:
        survival: DataFrame whose last column stores the fold of each row and
            whose remaining columns carry two levels, read here as
            (risk, time), holding the predicted survival values.

    Returns:
        DataFrame indexed by ('Risk', 'Fold', 'Metric') with one column per
        time. Metric is one of "GCIS", "CIS", "BRS" and "ROCS".
    """
    fold_ids = survival.iloc[:, -1].values
    survival = survival.iloc[:, :-1]
    times = survival.columns.get_level_values(1).unique()
    risk = 1 - survival

    record_dtype = [('e', bool), ('t', float)]

    def to_records(events, durations):
        # Structured (event, time) array handed to the sksurv metric functions
        return np.array(list(zip(events, durations)), dtype = record_dtype)

    results = {}

    # One pass per risk: the event indicator is whether `e` equals that risk
    for r in survival.columns.get_level_values(0).unique():
        e_ = (e == int(r))
        for fold in np.arange(5):  # folds are expected to be labelled 0..4
            in_train, in_test = fold_ids != fold, fold_ids == fold
            t_train, t_test = t[in_train], t[in_test]

            et_train = to_records(e_[in_train], t_train)
            et_test = to_records(e_[in_test], t_test)

            # Keep test rows observed before the last training time, or with e == 0
            selection = (t_test < t_train.max()) | (e[in_test] == 0)
            et_test = et_test[selection]
            survival_fold = survival[in_test][r][selection]
            risk_fold = risk[in_test][r][selection]

            brs = brier_score(et_train, et_test, survival_fold.values, times)[1]
            # Concordance (untruncated and truncated at `time`) and AUC per time
            gcis = [concordance_index_ipcw(et_train, et_test, risk_fold[time])[0] for time in times]
            cis = [concordance_index_ipcw(et_train, et_test, risk_fold[time], float(time))[0] for time in times]
            rocs = [cumulative_dynamic_auc(et_train, et_test, risk_fold[time], float(time))[0][0] for time in times]

            metrics = {"GCIS": gcis, "CIS": cis, "BRS": brs, "ROCS": rocs}
            results[(r, fold)] = pd.DataFrame.from_dict(metrics, orient='index', columns = times)

    results = pd.concat(results)
    results.index = results.index.set_names(['Risk', 'Fold', 'Metric'])
    return results

In [None]:
# Open every result file of the selected dataset and compute its performance.
# A file is used when its name contains `dataset` and '.csv'; the model name is
# the part of the file name between the first '_' and the first '.'.
predictions, clusters, results, likelihood = {}, {}, {}, {}
for file_name in os.listdir(path):
    if dataset in file_name and '.csv' in file_name: 
        model = file_name       
        model = model[model.index('_') + 1: model.index('.')]

        print("Opening :", file_name, ' - ', model)
        predictions[model] = pd.read_csv(path + file_name, header = [0, 1], index_col = 0)
        results[model] = evaluate(predictions[model])

        cluster_file = file_name[: file_name.index('.')] + '_clusters.pickle'
        if os.path.isfile(path + cluster_file):
            # NOTE(review): unpickling can execute arbitrary code; only load files you produced.
            # The context manager closes the handle, which the previous bare open() never did.
            with open(path + cluster_file, 'rb') as cluster_handle:
                clusters[model] = pickle.load(cluster_handle)
# Rename
# TODO: Add your method in the list for nicer display
dict_name = {'nsc': 'NSC', 'cox': 'CoxPH', 'ds': 'DeepSurv', 'dsm': 'DSM', 'dcm': 'DCM', 'dh': 'DeepHit', 'sumo': 'SuMo'} 

likelihood = pd.DataFrame.from_dict(likelihood, 'index').rename(dict_name)
results = pd.concat(results).rename(dict_name)
# `level` must be passed by keyword: pandas 2.x no longer accepts it positionally
results.index.set_names('Model', level = 0, inplace = True)

In [None]:
# Summarise every (model, risk, metric) across folds as "mean (std)" per time
table = results.groupby(['Model', 'Risk', 'Metric']).apply(lambda x: pd.Series(["{:.3f} ({:.2f})".format(mean, std) for mean, std in zip(x.mean(), x.std())], index = x.columns.astype(float)))
# Keep only CIS and BRS and move the metric to the outer column level
table = table.loc[table.index.get_level_values(2).isin(['CIS', 'BRS'])].unstack(level=-1).stack(level=0).unstack(level=-1).loc[:, ['CIS', 'BRS']]

# Preferred display order. Models without results for this dataset are skipped,
# because selecting a missing label with `.loc[list]` raises a KeyError.
model_order = ['NSC', 'DCM', 'SuMo', 'DSM', 'DeepHit', 'DeepSurv', 'CoxPH']
available_models = table.index.get_level_values(0)
table = table.loc[[model for model in model_order if model in available_models]]

# A single risk needs no 'Risk' level; otherwise group the rows by risk first
if len(table.index.get_level_values(1).unique()) == 1:
    table = table.droplevel(1)
else:
    table = table.reorder_levels(['Risk', 'Model']).sort_index(level = 0, sort_remaining = False)

table

In [None]:
# Print the LaTeX source of the summary table
print(table.to_latex())

---------

# Cluster analysis

In [None]:
# Analyze the outcome of this method; the value is the key used to look up `clusters`
method_display = 'dcm'

In [None]:
# Average predicted curve of each cluster across folds, with a 95% confidence band.
horizons_pred = np.linspace(0, 0.75, 10)
# Time grid: quantiles of the observed times of samples with e == 1 (loop-invariant, computed once)
horizon_times = np.quantile(t[e==1], horizons_pred)

average, ordering = {}, {}
for fold in clusters[method_display]:
    average[fold] = pd.DataFrame(clusters[method_display][fold]['predictions'].T, columns = horizon_times).rename_axis('Cluster', axis = 'index')
    # Map each cluster id to its rank by increasing mean prediction, so that
    # cluster labels are comparable from one fold to the next
    ordering[fold] = {i: j for j, i in enumerate(average[fold].mean(axis = 1).sort_values().index)}
    average[fold] = average[fold].rename(index = ordering[fold])

# Reference ordering stored under key -1: rank labels 0..K-1, sized from fold 0
ordering[-1] = list(range(len(ordering[0])))
try: 
    average = pd.concat(average, names = ['Fold'])
    n_folds = len(average.index.get_level_values('Fold').unique())
    mean = average.groupby('Cluster').mean().T
    # Half-width of the normal 95% interval of the mean: 1.96 * std / sqrt(n).
    # Dividing by n instead of sqrt(n) would understate the band.
    confidence = 1.96 * average.groupby('Cluster').std().T / np.sqrt(n_folds)
    ax = mean.plot()
    for c, color in zip(mean.columns, list(mcolors.TABLEAU_COLORS.values())[:len(mean.columns)]):
        ax.fill_between(mean.index, (mean[c] - confidence[c]), (mean[c] + confidence[c]), color = color, alpha = .1)
    plt.xlabel('Time')
    plt.ylabel('Survival Probability')
    plt.grid(alpha = 0.3)
    plt.legend(title = 'Clusters')
    plt.show()
except Exception as error:
    # Best effort: keep the notebook running, but report what actually failed
    print('Not same number of clusters')
    print('Plot skipped because of:', repr(error))

In [None]:
# What is the distribution of probability to be part of a given cluster ?
clusters_assignment = {}
for fold in clusters[method_display]:
    alphas_test = clusters[method_display][fold]['alphas_test']
    # Column k is treated as original cluster k (as in the argmax-based assignment
    # further down), so it is labelled with that cluster's rank `ordering[fold][k]`.
    # Passing the dict itself as `columns` would use its keys in insertion order,
    # which is the inverse mapping and mislabels columns for non-swap permutations.
    ranked_columns = [ordering[fold][k] for k in range(alphas_test.shape[1])]
    clusters_assignment[fold] = pd.DataFrame(alphas_test.values, 
                                            index = alphas_test.index,
                                            columns = ranked_columns)[ordering[-1]]

# Stack all folds and overlay one histogram per cluster
clusters_assignment = pd.concat(clusters_assignment, axis = 0)
for cluster in clusters_assignment.columns:
    clusters_assignment[cluster].plot.hist(alpha = 0.5, bins = 100)
plt.xlabel('Probality cluster')
plt.grid(alpha = 0.3)
plt.legend(title = 'Clusters')
plt.show()

# Distribution maximally assigned
# NOTE(review): layout = (1, 3) only has room for three groups - confirm for other cluster counts
axes = clusters_assignment.groupby(clusters_assignment.apply(lambda x: np.argmax(x), axis = 1)).boxplot(layout = (1, 3), figsize = (7, 3), grid = 0.5)
for ax in axes:
    ax.grid(alpha = 0.3)

In [None]:
from lifelines import KaplanMeierFitter
from lifelines.statistics import multivariate_logrank_test

In [None]:
# Compute average life expectancy for each cluster
# For every fold: assign each test sample to its most probable cluster, estimate the
# Kaplan-Meier median survival time per cluster and print a log-rank test across clusters.
clusters_expectancy, clusters_assignments = [], []
for fold in clusters[method_display]:
    index = clusters[method_display][fold]['alphas_test'].index
    clusters_assignment = np.argmax(clusters[method_display][fold]['alphas_test'].values, axis = 1)
    # The Series must carry the test index. With its default RangeIndex, the DataFrame
    # constructor re-aligns it on `index` and yields NaN or another sample's cluster.
    assignment = pd.Series(clusters_assignment, index = index).replace(ordering[fold])
    clusters_assignment = pd.DataFrame({'Assignment': assignment, 'Event': e[index], 'Time': t[index]}, index = index)
    clusters_assignments.append(clusters_assignment)
    clusters_expectancy.append(clusters_assignment.groupby('Assignment').apply(lambda x: KaplanMeierFitter().fit(x['Time'], x['Event']).median_survival_time_))
    print(multivariate_logrank_test(clusters_assignment['Time'], clusters_assignment['Assignment'], clusters_assignment['Event']))
# `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally
clusters_assignments = pd.concat(clusters_assignments, axis = 0)
# Infinite medians are turned into NaN so they are ignored by later mean / std
clusters_expectancy = pd.concat(clusters_expectancy, axis = 1).replace([np.inf, -np.inf], np.nan)

In [None]:
# Mean and standard deviation, across folds, of the median survival time of each cluster
print("Survival")
clusters_expectancy.mean(axis = 1), clusters_expectancy.std(axis = 1)

In [None]:
# Mean of every covariate (plus Event and Time) within each assigned cluster
print("Characteristics Clusters")
(
    pd.concat([pd.DataFrame(x, columns = covariates), clusters_assignments], axis = 1)
    .groupby('Assignment')
    .mean()
)

In [None]:
# Share of the assigned samples that falls in each cluster, in percent
print("Percentage Population")
clusters_assignments.groupby('Assignment').size().mul(100).div(len(clusters_assignments))

In [None]:
# Percentage of censored samples (Event == 0) in each cluster.
# Counting Event == 0 directly stays correct when `e` holds competing-risk codes
# (1, 2, ...), where 1 - mean(Event) is no longer a censoring rate.
print("Censored")
clusters_assignments['Event'].eq(0).groupby(clusters_assignments['Assignment']).mean() * 100

In [None]:
# Log-rank test across the assigned clusters, on the assignments pooled over all folds;
# the cell displays the `summary` attribute of the test result
print("Difference in Survival")
multivariate_logrank_test(clusters_assignments['Time'], clusters_assignments['Assignment'], clusters_assignments['Event']).summary

In [None]:
# Feature importance obtained by test: one column per fold, sign flipped,
# rows labelled with the covariate names
importance = - pd.concat(
    [pd.Series(clusters[method_display][fold_id]['importance'][0]) for fold_id in clusters[method_display]],
    axis = 1,
)
importance.index = covariates

# Bars: mean across folds, sorted; error bars: standard deviation across folds
importance.mean(axis = 1).sort_values().plot.bar(yerr = importance.std(axis = 1))
plt.xlabel('Covariate')
plt.ylabel('Likelihood change')
plt.grid(alpha = 0.3)

In [None]:
from sklearn.manifold import TSNE

In [None]:
# 2D t-SNE projection of the training `alphas` of each fold, coloured by observed time
for fold_id in clusters[method_display]:
    alphas_train = clusters[method_display][fold_id]['alphas_train']
    projector = TSNE(n_components = 2, random_state=0)
    tsne = pd.DataFrame(
        projector.fit_transform(alphas_train),
        columns = ['Projection 1', 'Projection 2'],
        index = alphas_train.index,
    )
    # Raw times are used for the colour scale (binning them with np.digitize is an alternative)
    tsne['Survival time'] = t[tsne.index]
    tsne.plot.scatter('Projection 1', 'Projection 2', c = 'Survival time', cmap='viridis', alpha = 0.5)
    plt.show()