In [None]:
import shap
import pandas as pd
import xgboost as xgb
from prediction.utils.utils import aggregate_features_over_time
from prediction.outcome_prediction.data_loading.data_loader import load_data
from prediction.utils.shap_helper_functions import check_shap_version_compatibility


In [None]:
# SHAP values require very specific package versions, so verify the installed
# environment before doing any work.
# NOTE(review): how the helper reports an incompatibility (raise vs. warn) is not
# visible from this notebook -- confirm in prediction.utils.shap_helper_functions.
check_shap_version_compatibility()

In [None]:
# --- Experiment settings (passed to load_data further down) ---
outcome = '3M mRS 0-2'
test_size = 0.2
n_splits = 5
seed = 42
n_epochs = 5000

# --- Preprocessing outputs ---
# NOTE(review): every path below is an absolute path on one specific machine;
# adjust them before running the notebook elsewhere.
features_path = '/Users/jk1/temp/opsum_prepro_output/gsu_prepro_01012023_233050/preprocessed_features_01012023_233050.csv'
labels_path = '/Users/jk1/temp/opsum_prepro_output/gsu_prepro_01012023_233050/preprocessed_outcomes_01012023_233050.csv'
normalisation_parameters_path = '/Users/jk1/temp/opsum_prepro_output/gsu_prepro_01012023_233050/logs_01012023_233050/normalisation_parameters.csv'

# --- Trained model and the hyperparameter table describing it ---
parameters_path = '/Users/jk1/temp/opsum_prediction_output/linear_72h_xgb/with_feature_aggregration/best_overall_parameters.csv'
model_path = '/Users/jk1/temp/opsum_prediction_output/linear_72h_xgb/with_feature_aggregration/testing/feature_aggregration_xgb_3M mRS 0-2_2_0.1_200_100.0_1.0_cv3.json'

In [None]:
# Read the hyperparameter table stored at parameters_path; later cells read
# row 0 of its columns (e.g. 'CV', 'learning_rate', 'max_depth').
# The bare name on the last line displays the frame as the cell output.
parameters_df = pd.read_csv(parameters_path)
parameters_df

In [None]:
# Use the 'moving_average' setting from row 0 of the parameter table when that
# column exists; otherwise fall back to False.
moving_average = parameters_df['moving_average'][0] if 'moving_average' in parameters_df else False

Load data

In [None]:
# Load the features and labels for `outcome` via load_data.
# Layout of the returned values as noted by the original author
# (not verifiable from this notebook -- confirm against load_data):
# (pid_train, pid_test), (train_X_np, train_y_np), (test_X_np, test_y_np), splits, test_features_lookup_table
pids, training_data, test_data, splits, test_features_lookup_table = load_data(features_path, labels_path, outcome, test_size, n_splits, seed)

In [None]:
# Unpack the test set into its features and labels.
test_X_np, test_y_np = test_data

In [None]:
# Build the aggregated feature matrix and matching labels for the test set.
X_test, y_test = aggregate_features_over_time(test_X_np, test_y_np, moving_average=moving_average)

# Only keep the prediction at the last timepoint: regroup the rows into blocks
# of 72 and keep the final row of each block.
# NOTE(review): 72 is presumably the number of time steps per sample
# (cf. "72h" in the model path) -- confirm.
X_test = (
    X_test
    .reshape(-1, 72, X_test.shape[-1])[:, -1, :]
    .astype('float32')
)
y_test = (
    y_test
    .reshape(-1, 72)[:, -1]
    .astype('float32')
)

In [None]:
# Select the cross-validation fold recorded in row 0 of the 'CV' column.
# NOTE(review): the model file name in model_path ends in "cv3" -- confirm it
# corresponds to the fold index read here.
selected_fold = int(parameters_df['CV'][0])
fold_X_train, fold_X_val, fold_y_train, fold_y_val = splits[selected_fold]

# Aggregate the fold's training data with the same moving_average setting as the test set.
X_train, y_train = aggregate_features_over_time(
    fold_X_train,
    fold_y_train,
    moving_average=moving_average,
)

Load model

In [None]:
# Instantiate the classifier with the hyperparameters from row 0 of parameters_df ...
xgb_model = xgb.XGBClassifier(
    learning_rate=parameters_df['learning_rate'][0],
    max_depth=parameters_df['max_depth'][0],
    n_estimators=parameters_df['n_estimators'][0],
    reg_lambda=parameters_df['reg_lambda'][0],
    alpha=parameters_df['alpha'][0],
)

# ... then load the saved model from disk into it.
xgb_model.load_model(model_path)

In [None]:
# Display the loaded model's predicted class probabilities for X_test.
xgb_model.predict_proba(X_test)

Compute SHAP

In [None]:
# Build a SHAP tree explainer for the loaded model, passing X_train as the
# explainer's data argument (the background dataset).
# Note: X_train holds every row returned by aggregate_features_over_time,
# whereas X_test was reduced to the last timepoint only.
explainer = shap.TreeExplainer(xgb_model, X_train)

In [None]:
# explain the testing instances (can use fewer instances)
# explaining each prediction requires 2 * background dataset size runs
# NOTE(review): this cost estimate may not apply to TreeExplainer -- confirm
# against the SHAP documentation.
shap_values = explainer.shap_values(X_test)


In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable in `l` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Column names for the aggregated feature matrix: the raw labels followed by
# their avg_/min_/max_ variants. The order presumably mirrors the concatenation
# done during feature aggregation (TODO confirm):
# all_features = np.concatenate([features, avg_features, min_features, max_features], 2)
sample_labels = list(test_features_lookup_table['sample_label'])

avg_feature_names = [f'avg_{item}' for item in sample_labels]
min_feature_names = [f'min_{item}' for item in sample_labels]
max_feature_names = [f'max_{item}' for item in sample_labels]

feature_names = sample_labels + avg_feature_names + min_feature_names + max_feature_names

In [None]:
# Summary plot of the SHAP values; the test matrix is wrapped in a DataFrame so
# that the columns carry the feature names.
shap.summary_plot(
    shap_values,
    pd.DataFrame(X_test, columns=feature_names),
)

In [None]:
# Load SHAP's JavaScript visualisation code into the notebook (used by the
# interactive force plot).
shap.initjs()

In [None]:
# Interactive force plot over all test samples, starting from the explainer's
# expected value; displayed as the cell output.
shap.force_plot(
    explainer.expected_value,
    shap_values,
    pd.DataFrame(X_test, columns=feature_names),
)