In [6]:
%load_ext autoreload
%autoreload 2
%load_ext autotime
%config InlineBackend.figure_format = 'retina'

import itertools
import os
from pathlib import Path

# Switch the working directory to the nearest folder that holds a Pipfile,
# checking the current directory first and then each parent in turn
# (presumably this is the project root, so that the project-local imports
# below resolve). If no such folder exists the working directory is unchanged.
working_dir = Path.cwd()
pipfile_folder = next(
    (
        candidate
        for candidate in [working_dir, *working_dir.parents]
        if (candidate / 'Pipfile').exists()
    ),
    None,
)
if pipfile_folder is not None:
    os.chdir(pipfile_folder)

import logging
import shelve
from functools import partial
from matplotlib import pyplot as plt
from pandas import DataFrame, Series
from utils import remove_suffixes

from sklearn.model_selection import KFold
from typing import Mapping, List, Tuple, Iterable, Callable, TypedDict

from evaluation_functions import get_classification_metrics, compute_classification_metrics_from_results_with_statistics
from formatting import dict_to_table_horizontal, b
from functional import pipe
from nested_cv import evaluate_method_on_sets, DefaultHyperParameters, BayesianOptimization
from notebooks.heart_transplant.dependencies.heart_transplant_data import get_survival_dataset_cached
from notebooks.heart_transplant.dependencies.heart_transplant_functions import get_rolling_cv_cached, get_filtered_by_age, AgeGroup, get_ht_metrics_table, get_survival_y, get_shuffled_10_fold_callback, get_rolling_cv_callback
from notebooks.heart_transplant.dependencies.heart_transplant_pipelines import get_xgboost_pipeline, get_cox_ph_pipeline, cox_ph_hyperopt
from utils import evaluate_and_assign_if_not_present, mapping_subset
from visualisation import list_of_lists_to_html_table, display_html



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 11.1 ms (started: 2021-03-26 11:18:10 +00:00)


In [7]:
# Let DEBUG-level (and higher) records through the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Dataset requested with argument 365 (the 365-day variant, per the notebook's
# notes); the fifth returned item is kept as `dataset_raw`.
(X_365, y_365, futd_365, death_365, dataset_raw) = get_survival_dataset_cached(365)

# Dataset requested with argument 90; its fifth returned item is discarded.
(X_90, y_90, futd_90, death_90, _) = get_survival_dataset_cached(90)

[Memory]3612.7s, 60.2min: Loading get_survival_dataset...
[Memory]3613.8s, 60.2min: Loading get_survival_dataset...
time: 2.22 s (started: 2021-03-26 11:18:10 +00:00)


In [8]:
class InputData(TypedDict):
    """Everything needed to evaluate a model on one dataset/subset.

    Keys:
        cv: train-test splits, in a form compatible with sklearn.
        X: input data.
        y: survival at some time point.
        futd: follow-up time in days.
        death: outcome (0 or 1).
    """

    # Train-test splits (compatible with sklearn).
    # NOTE(review): other cells in this notebook call len() on this value, so
    # in practice it has to be sized, not merely iterable.
    cv: Iterable[Tuple[List[int], ...]]

    # Input data
    X: DataFrame

    # Survival at some time point
    y: Series

    # Follow-up time (days)
    futd: Series

    # Outcome (0 or 1)
    death: Series

def get_data(_X: DataFrame, _y: Series, futd: Series, death: Series, _dataset_raw: DataFrame, cv_callback: Callable ) -> InputData:
    """Bundle one dataset and its cross-validation splits into an InputData.

    Args:
        _X: input data; its index selects the rows kept from `futd` and `death`.
        _y: survival labels, stored as given (not re-indexed to `_X`).
        futd: follow-up time; restricted to the index of `_X`.
        death: outcome; restricted to the index of `_X`.
        _dataset_raw: raw dataset, only forwarded to `cv_callback`.
        cv_callback: called as `cv_callback(_X, _dataset_raw)`; its return
            value is stored under the 'cv' key.

    Returns:
        The assembled InputData dictionary.
    """
    # The splits are produced first, then the per-row series are aligned to _X.
    splits = cv_callback(_X, _dataset_raw)
    kept_rows = _X.index
    futd_for_rows = futd.loc[kept_rows]
    death_for_rows = death.loc[kept_rows]
    return InputData(cv=splits, X=_X, y=_y, futd=futd_for_rows, death=death_for_rows)

time: 728 µs (started: 2021-03-26 11:18:12 +00:00)


- Below is a list of datasets to evaluate the models on. It is a flat structure. You need a separate dataset for 365 and 90 days, since a few individuals are removed in each case
- Two types of CV: chronological (rolling) and shuffled 10-fold
- Subsets: e.g. 365 survival, $ \leq 18$, $ > 18$, 90 days

In [9]:
# Arguments that follow X in every get_data call, in the order it expects
# them: (y, futd, death, dataset_raw). Both groups use the same `dataset_raw`.
targets_365 = (y_365, futd_365, death_365, dataset_raw)
targets_90 = (y_90, futd_90, death_90, dataset_raw)

# Columns removed from the AgeGroup.L_18 subset only.
l_18_dropped_columns = ['biopsy_dgn', 'cig_use', 'ebv_igg_cad_don', 'prior_card_surg_trr', 'vessels_50sten']

# Flat mapping: one entry per (dataset, subset, CV scheme) combination.
data: Mapping[str, InputData] = {
    '365_all_rolling': get_data(X_365, *targets_365, get_rolling_cv_callback),
    '90_all_rolling': get_data(X_90, *targets_90, get_rolling_cv_callback),
    '365_all_shuffled_10_fold': get_data(X_365, *targets_365, get_shuffled_10_fold_callback),
    # Age-filtered subsets of the 365 dataset; y/futd/death are passed
    # unfiltered and get_data aligns futd/death to the filtered X.
    '365_me_18_rolling': get_data(
        get_filtered_by_age(AgeGroup.ME_18, X_365),
        *targets_365,
        get_rolling_cv_callback,
    ),
    '365_l_18_rolling': get_data(
        get_filtered_by_age(AgeGroup.L_18, X_365).drop(columns=l_18_dropped_columns),
        *targets_365,
        get_rolling_cv_callback,
    ),
}

________________________________________________________________________________
[Memory] Calling notebooks.heart_transplant.dependencies.heart_transplant_functions.get_rolling_cv...
get_rolling_cv(        thoracic_dgn gender abo   bmi_calc ebv_igg_cad_don  iabp_tcr  \
2             1002.0      F   O  25.059293               P         0   
3             1000.0      M   B  32.140327             NaN         0   
5             1007.0      M   A  25.501607               P         0   
6             1007.0      M   A  29.023330               P         0   
7             1000.0      M   A  21.894900               P         0   
...              ...    ...  ..        ...             ...       ...   
153202        1007.0      M   A  24.956678               P         0   
153203        1000.0      F   A  30.973245             NaN         0   
153204        1007.0      F   A  32.269808          ..., n_windows=None, test_size_years=1, minimum_training_years=10, year_stop=2016)
___________________

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  get_rolling_cv_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  get_rolling_cv_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the jobl

In [10]:
def get_cache():
    """Open the shelve database named 'cache' and return the open shelf.

    The name is relative, so the file is resolved against the current working
    directory. The caller is responsible for closing the returned shelf (for
    example by using it as a context manager).
    """
    cache_shelf = shelve.open('cache')
    return cache_shelf

# For every dataset: evaluate the Cox PH pipeline with default hyperparameters
# on that dataset's CV splits. The work is handed to
# evaluate_and_assign_if_not_present under the key "<subset_name>_default".
for subset_name, dataset in data.items():

    # Target passed to the evaluation: the 'futd' and 'death' entries of the
    # dataset, put into a DataFrame and passed through get_survival_y.
    y_survival = pipe(
        dataset,
        partial(mapping_subset, ('futd', 'death')),
        DataFrame,
        get_survival_y,
    )

    def make_cox_ph_pipeline():
        # NOTE: XGBoost normally wouldn't take 'death' as y
        return get_cox_ph_pipeline(dataset['X'])

    def classification_metrics(_, result):
        # Metrics are computed against the dataset's 'y' labels.
        return get_classification_metrics(dataset['y'], result)

    def evaluate_with_default_hyperparameters():
        cv_splits = dataset['cv']
        return evaluate_method_on_sets(
            make_cox_ph_pipeline,
            dataset['X'],
            y_survival,
            DefaultHyperParameters(),
            splits=cv_splits,
            parallel=True,
            n_jobs=len(cv_splits),  # one job per CV split
            get_metrics=classification_metrics,
        )

    evaluate_and_assign_if_not_present(
        get_cache,
        key=f'{subset_name}_default',
        callback=evaluate_with_default_hyperparameters,
    )

    # Disabled: tuned variant of the evaluation above.
    # TODO: when optimizing: cox ph stuck or taking too long, check cox_ph_hyperopt
    # evaluate_and_assign_if_not_present(
    #     get_cache,
    #     key=subset_name+"_tuned",
    #     callback=lambda: evaluate_method_on_sets(
    #         lambda: get_cox_ph_pipeline(dataset['X']),
    #         dataset['X'],
    #         y_survival,
    #         BayesianOptimization(cox_ph_hyperopt, iterations=1),
    #         splits=dataset['cv'],
    #         parallel=True,
    #         n_jobs=len(dataset['cv']),
    #         get_metrics=lambda _, result: get_classification_metrics(dataset['y'], result)
    #     ),
    # )

root:INFO: 365_ALL_ROLLING_DEFAULT
root:DEBUG: Key "365_all_rolling_default" not present, executing callback


.
.
.
.
.
.
.
.
.
.
.
.
.
fit         categorical__gender  categorical__abo  categorical__ebv_igg_cad_don  \
17                      NaN               NaN                           NaN   
19                      NaN               NaN                           NaN   
20                      NaN               NaN                           NaN   
25                      NaN               NaN                           NaN   
27                      NaN               NaN                           NaN   
...                     ...               ...                           ...   
153196                  NaN               NaN                           NaN   
153199                  NaN               NaN                           NaN   
153200                  NaN               NaN                           NaN   
153203                  NaN               NaN                           NaN   
153204                  NaN               NaN                           NaN   

        categorical__

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Render one metrics table per entry stored in the cache.
with get_cache() as cache:
    for subset_name, results in cache.items():
        # Cache keys are "<dataset key>_default" / "<dataset key>_tuned";
        # strip the suffix to find the dataset the entry was computed on.
        subset_name_base = remove_suffixes(('_default', '_tuned'), subset_name)
        data_item = data[subset_name_base]

        # `results` is already the value stored under `subset_name`; using it
        # directly avoids reading the same entry from the shelf a second time.
        metrics = compute_classification_metrics_from_results_with_statistics(
            data_item['y'],
            [results['chosen']['result']],
            ignore_warning=True,
        )

        # Cache key as a heading, followed by the metrics as an HTML table.
        b(subset_name)
        pipe(
            metrics,
            dict_to_table_horizontal,
            list_of_lists_to_html_table,
            display_html,
        )

In [None]:
# Export each dataset's inputs, follow-up times and outcomes to CSV files in
# the current working directory, named after the dataset key.
#
# The export is read-only: the CV splits held in `data` are deliberately left
# untouched here, so this cell can be re-run and later cells see the same
# splits as earlier ones.
for subset_name, dataset in data.items():
    dataset['X'].to_csv(f'{subset_name}.csv')
    dataset['futd'].to_csv(f'{subset_name}_futd.csv')
    dataset['death'].to_csv(f'{subset_name}_death.csv')
