In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, inspect, sys
from sksurv.ensemble import RandomSurvivalForest
from sksurv.datasets import load_gbsg2
import eli5
from eli5.sklearn import PermutationImportance

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

Using TensorFlow backend.


This NB does random survival forest analysis on the real data. 

In [2]:
data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/Survival_df.csv'),
                  index_col=0)


In [3]:
data_x = data.iloc[:, 2:]
data_y = data.iloc[:, 0:2]
data_y_num = data_y.to_records(index=False)


## Random survival forest

In [5]:
rsf = RandomSurvivalForest(n_estimators=1000,
                           max_features="auto",
                          oob_score=True,
                          )
rsf.fit(data_x, data_y_num)


RandomSurvivalForest(bootstrap=True, max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None, min_samples_leaf=3,
                     min_samples_split=6, min_weight_fraction_leaf=0.0,
                     n_estimators=1000, n_jobs=None, oob_score=True,
                     random_state=None, verbose=0, warm_start=False)

## Out of bag score (Concordance index)

This is a measure of how well the model orders the sample. The model was trained on all the data because there is so little. This performs worse than random.

In [6]:
rsf.oob_score_

0.4305555555555556

## Feature importance by permutation

This estimates the importance of each feature by permutating it and looking at the effect on the model. This is not good when the features are correlated as they are in this case.

In [7]:
perm = PermutationImportance(rsf, n_iter=15)
perm.fit(data_x, data_y_num)

PermutationImportance(cv='prefit',
                      estimator=RandomSurvivalForest(bootstrap=True,
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_samples_leaf=3,
                                                     min_samples_split=6,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=1000,
                                                     n_jobs=None,
                                                     oob_score=True,
                                                     random_state=None,
                                                     verbose=0,
                                   

In [8]:
feature_names = data_x.columns.tolist()
eli5.show_weights(perm, feature_names=feature_names)

Weight,Feature
0.0778  ± 0.0551,ACPD_Avg_58
0.0153  ± 0.0363,NLE_ratio_51_17
0.0116  ± 0.0557,NLE_ratio_85_17
0.0102  ± 0.0175,amp_ratio
0.0097  ± 0.0316,NLO_avg
0.0069  ± 0.0209,ACEE_30_Avg_58
0.0060  ± 0.0100,pos_ratio
0.0060  ± 0.0123,Avg_RP
0.0051  ± 0.0080,NLE_ratio_51_119
0.0046  ± 0.0140,ACEE_500_Avg_58
