In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, inspect, sys
from sksurv.ensemble import RandomSurvivalForest
from sksurv.datasets import load_gbsg2
import eli5
from eli5.sklearn import PermutationImportance

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/Survival_df.csv'),
                  index_col=0)
data

Unnamed: 0_level_0,Observed,F_Time,NDE_cycle,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_119_17,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
41C,True,2154137.0,0,2.104871,1.185236,1.002988,1.559453,0.893268,1.107898,0.462483,5.576476,190.979294,-0.13764,-2.86603,4.804749
44C,True,1318034.0,0,2.280959,1.365329,0.998446,1.504218,0.888503,1.140535,0.47716,4.619563,187.204261,-0.596385,-2.7885,3.370729
45C,False,3245267.0,0,1.245687,0.80997,0.989351,1.282046,0.896534,1.137561,0.491483,3.458337,189.010633,-0.64571,-3.20222,4.186639
19C,True,464392.0,450000,0.763296,1.128928,1.002407,1.126849,0.978877,1.235508,0.530102,3.798736,171.925123,-1.065155,-1.57074,3.226957
32C,True,306142.0,450000,0.610771,1.137713,0.994283,0.844962,0.844824,1.091099,0.459167,4.296261,169.717689,-0.61719,-1.312415,2.907922
23C,False,2255225.0,600000,0.39758,0.919473,0.999861,0.846653,0.918699,1.114762,0.495867,5.621833,173.403495,-0.83111,-1.19454,3.061743
25C,False,1444844.0,600000,0.821445,1.178319,1.003158,1.141313,0.881052,1.083718,0.4775,3.061693,172.727399,-0.912065,-0.927155,3.284087
21C,True,200007.0,600000,0.705402,1.151889,0.984968,0.853808,0.929446,1.119635,0.493245,4.626314,174.003954,-0.8849,-1.578795,2.764862
27C,True,297627.0,750000,0.144147,0.777585,1.002966,0.394731,0.912962,1.112708,0.500035,5.020607,173.688949,-1.276355,-1.60215,2.746634
26C,False,2829121.0,750000,0.453026,1.052153,1.003191,0.868,0.890721,1.091059,0.494946,4.831161,169.891439,-0.6878,-1.269005,3.820881


In [3]:
data_x = data.iloc[:, 2:]
data_y = data.iloc[:, 0:2]
data_y_num = data_y.to_records(index=False)


In [4]:
X, y = load_gbsg2()
y
data_y

Unnamed: 0_level_0,Observed,F_Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
41C,True,2154137.0
44C,True,1318034.0
45C,False,3245267.0
19C,True,464392.0
32C,True,306142.0
23C,False,2255225.0
25C,False,1444844.0
21C,True,200007.0
27C,True,297627.0
26C,False,2829121.0


## Random survival forest

In [60]:
rsf = RandomSurvivalForest(n_estimators=1000,
                           max_features="auto",
                          oob_score=True,
                          max_depth=2 # default None
                          )
rsf.fit(data_x, data_y_num)


RandomSurvivalForest(bootstrap=True, max_depth=2, max_features='auto',
                     max_leaf_nodes=None, max_samples=None, min_samples_leaf=3,
                     min_samples_split=6, min_weight_fraction_leaf=0.0,
                     n_estimators=1000, n_jobs=None, oob_score=True,
                     random_state=None, verbose=0, warm_start=False)

## Out of bag score (Concordance index)

In [61]:
rsf.oob_score_

0.37209302325581395

## Feature importance by permutation

This estimates the importance of each feature by permutating it and looking at the effect on the model. This is not good when the features are correlated as they are in this case.

In [7]:
perm = PermutationImportance(rsf, n_iter=15)
perm.fit(data_x, data_y_num)

PermutationImportance(cv='prefit',
                      estimator=RandomSurvivalForest(bootstrap=True,
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_samples_leaf=3,
                                                     min_samples_split=6,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=1000,
                                                     n_jobs=None,
                                                     oob_score=True,
                                                     random_state=None,
                                                     verbose=0,
                                   

In [8]:
feature_names = data_x.columns.tolist()
eli5.show_weights(perm, feature_names=feature_names)

Weight,Feature
0.0760  ± 0.0569,ACPD_Avg_58
0.0279  ± 0.0378,amp_ratio
0.0217  ± 0.0415,ACEE_500_Avg_58
0.0178  ± 0.0369,NLE_ratio_51_17
0.0085  ± 0.0334,NLO_avg
0.0054  ± 0.0238,NLE_ratio_119_17
0.0008  ± 0.0158,NDE_cycle
0.0000  ± 0.0225,ACEE_30_Avg_58
0  ± 0.0000,pos_ratio
-0.0078  ± 0.0139,diff_two_peaks_ratio
