In [1]:
import numpy as np
import pandas as pd
from sksurv.util import Surv
from tqdm import tqdm
from icare.survival import IcareSurv, BaggedIcareSurv
from sklearn.model_selection import cross_val_score, ShuffleSplit
from joblib import Parallel, delayed
from icare.metrics import harrell_cindex
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/home/louis/Documents/work/uncool_package/dev/df_train.csv', index_col='PatientID')
df.shape

(486, 2433)

In [3]:
features = list(set(df.columns.tolist()) - set(['Relapse', 'RFS', 'Task 1', 'Task 2', 'CenterID']))
features = [x for x in features if 'lesions_merged' not in x and 'lymphnodes_merged' not in x]
extra_features = ['Gender',
                  'Age',
                  'Weight',
                  'Tobacco',
                  'Alcohol',
                  'Performance status',
                  'HPV status (0=-, 1=+)',
                  'Surgery',
                  'Chemotherapy', 'nb_lesions', 'nb_lymphnodes', 'whole_body_scan']

features_groups = np.unique([x.split('_shape_')[0].split('_PT_')[0].split('_CT_')[0] for x in features])
features_groups = list(set(features_groups) - set(extra_features))
features_groups = [x + '_' for x in features_groups]
features_groups.append('extra_features')
len(features_groups), features_groups

(14,
 ['everything_mergeddilat16mm_',
  'everything_merged4_',
  'everything_mergedBBox_',
  'everything_mergedshell2mm_',
  'everything_merged_',
  'everything_mergedshell4mm_',
  'everything_merged40%_',
  'everything_mergeddilat4mm_',
  'everything_mergeddilat8mm_',
  'everything_mergedshell8mm_',
  'everything_mergeddilat2mm_',
  'everything_mergeddilat1mm_',
  'everything_merged2.5_',
  'extra_features'])

In [4]:
y = Surv.from_arrays(event=df['Relapse'].values,
                     time=df['RFS'].values)
X = df[features]
mask_keep = (X.isna().sum(axis=1) < 1000).values
X, y = X.iloc[mask_keep], y[mask_keep]

In [5]:
features_groups_id = []
for f in X.columns:
    if f in extra_features:
        features_groups_id.append(features_groups.index('extra_features'))
    else:
        group = f.split('_shape_')[0].split('_PT_')[0].split('_CT_')[0] + '_'
        features_groups_id.append(features_groups.index(group))

# Hyperparameters sets search

In [6]:
def worker_cv_paral(model, X, y, feature_group, train_index, test_index):
    try:
        model.fit(X.iloc[train_index], y[train_index], feature_groups=feature_group)
        pred = model.predict(X.iloc[test_index])
        return harrell_cindex(y[test_index], pred)
    except:
        return np.nan

def cv_paral(model, X, y, feature_group, n_folds):
    cv = ShuffleSplit(n_splits=n_folds, test_size=.5)
    scores = Parallel(n_jobs=-1)(delayed(worker_cv_paral)(model, X, y, feature_group, train_index, test_index)
                                 for train_index, test_index in cv.split(X))

    return np.nanmean(scores)

In [7]:
def get_random_params():
     return {
    'max_features': np.random.uniform(1/len(features), 500/len(features)),
    'rho': np.round_(np.random.uniform(0.2,0.9), 2),
    'cmin': np.round_(np.random.uniform(0.5,0.6), 2),
    'mandatory_features': extra_features,
    'features_groups_to_use': np.random.choice(features_groups,
                                               np.random.randint(len(features_groups)),
                                               replace=False).tolist()
}

In [8]:
from tqdm import tqdm
best_score = None
for _ in tqdm(range(10)):
    params = get_random_params()
    score = cv_paral(IcareSurv(**params), X, y, features_groups_id, n_folds=30)
    if best_score is None or best_score < score:
        best_score = score
        print('New best:')
        print(best_score)
        print(params)
        print()

 10%|█         | 1/10 [00:02<00:23,  2.58s/it]

New best:
0.6096507501248235
{'max_features': 0.09673842853026991, 'rho': 0.51, 'cmin': 0.55, 'mandatory_features': ['Gender', 'Age', 'Weight', 'Tobacco', 'Alcohol', 'Performance status', 'HPV status (0=-, 1=+)', 'Surgery', 'Chemotherapy', 'nb_lesions', 'nb_lymphnodes', 'whole_body_scan'], 'features_groups_to_use': ['everything_mergeddilat1mm_', 'extra_features', 'everything_mergeddilat4mm_', 'everything_merged4_', 'everything_merged40%_', 'everything_mergeddilat2mm_', 'everything_merged_', 'everything_mergedshell2mm_', 'everything_mergeddilat16mm_', 'everything_mergedshell8mm_']}



 20%|██        | 2/10 [00:04<00:16,  2.06s/it]

New best:
0.62245853784188
{'max_features': 0.06122424281460999, 'rho': 0.35, 'cmin': 0.51, 'mandatory_features': ['Gender', 'Age', 'Weight', 'Tobacco', 'Alcohol', 'Performance status', 'HPV status (0=-, 1=+)', 'Surgery', 'Chemotherapy', 'nb_lesions', 'nb_lymphnodes', 'whole_body_scan'], 'features_groups_to_use': ['everything_mergeddilat4mm_', 'everything_mergedBBox_', 'everything_mergeddilat1mm_', 'extra_features']}



 30%|███       | 3/10 [00:05<00:11,  1.58s/it]

New best:
0.6248519739781531
{'max_features': 0.17604502290118526, 'rho': 0.6, 'cmin': 0.5, 'mandatory_features': ['Gender', 'Age', 'Weight', 'Tobacco', 'Alcohol', 'Performance status', 'HPV status (0=-, 1=+)', 'Surgery', 'Chemotherapy', 'nb_lesions', 'nb_lymphnodes', 'whole_body_scan'], 'features_groups_to_use': ['extra_features', 'everything_merged2.5_', 'everything_mergeddilat16mm_']}



100%|██████████| 10/10 [00:12<00:00,  1.26s/it]


In [None]:
all_params_sets, all_scores = [], []
for _ in tqdm(range(50)):
    params = get_random_params()
    model = BaggedIcareSurv(n_estimators=50,
                            parameters_sets=[params],
                            aggregation_method='median',
                            n_jobs=1)
    score = cv_paral(model, X, y, features_groups_id, n_folds=32)
    all_params_sets.append(params)
    all_scores.append(score)
sort_idx = np.argsort(all_scores)[::-1]
all_params_sets = np.array(all_params_sets)[sort_idx]
all_scores = np.array(all_scores)[sort_idx]

 42%|████▏     | 21/50 [10:00<13:37, 28.18s/it]

In [None]:
all_scores[:5]

In [None]:
all_score, all_nb_sets = [], []
for nb_sets in tqdm(range(1, 20)):
    sets = all_params_sets[:nb_sets]
    model = BaggedIcareSurv(n_estimators=50,
                            parameters_sets=sets,
                            aggregation_method='median',
                            n_jobs=1)
    score = cv_paral(model, X, y, features_groups_id, n_folds=32)
    all_nb_sets.append(nb_sets)
    all_score.append(score)
best_nb_sets = all_nb_sets[np.argmax(all_score)]
print("best_nb_sets", best_nb_sets)
plt.plot(all_nb_sets, all_score)
plt.show()
hyperparameters_sets = all_params_sets[:best_nb_sets]
print('best hyperparameters_sets:')
for x in hyperparameters_sets:
    print(x)

In [None]:
model = BaggedIcareSurv(n_estimators=50,
                        parameters_sets=hyperparameters_sets,
                        aggregation_method='median',
                        n_jobs=1)
score = cv_paral(model, X, y, features_groups_id, n_folds=32)
score

In [None]:
10 0.6389