In [1]:
import sys
import os
# Add the parent directory to the system path
sys.path.append(os.path.abspath(os.path.join('..')))
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
f1_score, roc_auc_score, roc_curve, auc, confusion_matrix, classification_report)
from sklearn.utils import resample
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

In [2]:
from utilities import bs_calibrate, bs_results, utils

## Load the data

In [3]:
# Directory that holds the clinical CSV files. The default is the location the
# notebook was originally run against; set the DIBH_DATA_DIR environment
# variable (for example to "../dibh_clinical_only") to run on another machine
# without editing this cell.
DATA_DIR = os.environ.get(
    "DIBH_DATA_DIR",
    "/Users/chufal/projects/DIBHproject/dibh_clinical_only",
)

# Columns removed from the feature matrix, and the column used as the target.
NON_FEATURE_COLUMNS = ['crnumber', 'day', 'DIBH_Y0N1']
TARGET_COLUMN = 'DIBH_Y0N1'


def load_day1(csv_name):
    """Read ``csv_name`` from ``DATA_DIR`` and split its day-1 rows.

    Parameters
    ----------
    csv_name : str
        File name of the CSV inside ``DATA_DIR``.

    Returns
    -------
    tuple
        ``(frame, day1, features, target)`` where ``frame`` is the full CSV,
        ``day1`` keeps only the rows with ``day == 1``, ``features`` is
        ``day1`` without ``NON_FEATURE_COLUMNS`` and ``target`` is the
        ``TARGET_COLUMN`` column of ``day1``.
    """
    frame = pd.read_csv(os.path.join(DATA_DIR, csv_name))
    day1 = frame[frame['day'] == 1]
    features = day1.drop(NON_FEATURE_COLUMNS, axis=1)
    target = day1[TARGET_COLUMN]
    return frame, day1, features, target


# Training data: day-1 rows, plus a 70/30 split with a fixed seed.
df, data_day1, X, y = load_day1("original_training_data.csv")
X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.30, random_state=42)

# Internal-validation data, prepared the same way as the training data.
df_internal_val, data_day1_val, X_int_val, y_int_val = load_day1(
    "final_internal_validation13052024.csv"
)


## Load the model information for all ML models

In [10]:
# Short keys used in the artifact file names; their order fixes the order of
# the tuple unpacking below.
_MODEL_KEYS = ('xgb', 'cb', 'lgbm', 'gb', 'rf', 'nb', 'svm', 'knn', 'lr')

# Artifacts under bs_training_data: one "top_models_<key>.joblib" per key.
(
    xgb_load_models_info,
    cb_load_models_info,
    lgbm_load_models_info,
    gb_load_models_info,
    rf_load_models_info,
    nb_load_models_info,
    svm_load_models_info,
    knn_load_models_info,
    lr_load_models_info,
) = (
    joblib.load(f'../results/bs_training_data/python_objects/top_models_{_key}.joblib')
    for _key in _MODEL_KEYS
)

# Artifacts under non_bs_training_data: one "non_bs_model_<key>.joblib" per key.
(
    xgb_load_model,
    cb_load_model,
    lgbm_load_model,
    gb_load_model,
    rf_load_model,
    nb_load_model,
    svm_load_model,
    knn_load_model,
    lr_load_model,
) = (
    joblib.load(f'../results/non_bs_training_data/models/non_bs_model_{_key}.joblib')
    for _key in _MODEL_KEYS
)


In [11]:
# Show which keys the first entry of xgb_load_models_info carries
# (the object loaded from top_models_xgb.joblib).
xgb_load_models_info[0].keys()

dict_keys(['model_calibrated', 'fpr_calibrated', 'tpr_calibrated', 'thresholds_calibrated', 'roc_auc_calibrated', 'optimal_threshold_calibrated', 'accuracy_calibrated', 'precision_calibrated', 'recall_calibrated', 'f1_score_calibrated', 'confusion_matrix_calibrated', 'classification_report_calibrated', 'feature_importances_calibrated', 'model_non_calibrated', 'fpr_non_calibrated', 'tpr_non_calibrated', 'thresholds_non_calibrated', 'roc_auc_non_calibrated', 'optimal_threshold_non_calibrated', 'accuracy_non_calibrated', 'precision_non_calibrated', 'recall_non_calibrated', 'f1_score_non_calibrated', 'confusion_matrix_non_calibrated', 'classification_report_non_calibrated', 'feature_importances_non_calibrated'])

In [12]:
# Sanity check on one loaded estimator: the calibrated model stored in the
# first xgb entry.
check_model = xgb_load_models_info[0]["model_calibrated"]

try:
    # Raises NotFittedError for an unfitted estimator, which skips the lines
    # below and lands in the except branch.
    check_is_fitted(check_model)
    # NOTE(review): fit is called on the stored object itself (not a copy),
    # using the full day-1 training data X, y -- confirm this is intended.
    check_model.fit(X, y)
    print("Model is fitted to new data")
except NotFittedError:
    print("Not Fitted")


Model is fitted to new data


## Create ensembles for all ML models without optimal thresholds

In [13]:
# Loaded model-info objects, in the order used for the unpacking below.
_soft_models_info = (
    xgb_load_models_info,
    cb_load_models_info,
    lgbm_load_models_info,
    gb_load_models_info,
    rf_load_models_info,
    nb_load_models_info,
    svm_load_models_info,
    knn_load_models_info,
    lr_load_models_info,
)

# Ensembles built with the default arguments of utils.create_ensemble_model.
(
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_knn,
    ensemble_model_lr,
) = (utils.create_ensemble_model(_info) for _info in _soft_models_info)

# Same inputs, built with calibration=False.
(
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_knn_non_calibrated,
    ensemble_model_lr_non_calibrated,
) = (
    utils.create_ensemble_model(_info, calibration=False)
    for _info in _soft_models_info
)

## Create ensembles for all the ML models with optimal thresholds

In [8]:
# Loaded model-info objects, in the order used for the unpacking below.
_hard_models_info = (
    xgb_load_models_info,
    cb_load_models_info,
    lgbm_load_models_info,
    gb_load_models_info,
    rf_load_models_info,
    nb_load_models_info,
    svm_load_models_info,
    knn_load_models_info,
    lr_load_models_info,
)

# Ensembles built by utils.new_create_ensemble_model with
# use_optimum_threshold=True.
(
    hard_ensemble_model_xgb,
    hard_ensemble_model_cb,
    hard_ensemble_model_lgbm,
    hard_ensemble_model_gb,
    hard_ensemble_model_rf,
    hard_ensemble_model_nb,
    hard_ensemble_model_svm,
    hard_ensemble_model_knn,
    hard_ensemble_model_lr,
) = (
    utils.new_create_ensemble_model(_info, use_optimum_threshold=True)
    for _info in _hard_models_info
)

# Same inputs, additionally passing calibration=False.
(
    hard_ensemble_model_xgb_non_calibrated,
    hard_ensemble_model_cb_non_calibrated,
    hard_ensemble_model_lgbm_non_calibrated,
    hard_ensemble_model_gb_non_calibrated,
    hard_ensemble_model_rf_non_calibrated,
    hard_ensemble_model_nb_non_calibrated,
    hard_ensemble_model_svm_non_calibrated,
    hard_ensemble_model_knn_non_calibrated,
    hard_ensemble_model_lr_non_calibrated,
) = (
    utils.new_create_ensemble_model(
        _info, calibration=False, use_optimum_threshold=True
    )
    for _info in _hard_models_info
)

In [9]:
# Fit the xgb ensemble that was created with use_optimum_threshold=True on the
# day-1 training data (X, y).
# NOTE(review): the output recorded for this cell is a NotFittedError reporting
# that a Pipeline instance is not fitted yet; the cause is not visible from
# this notebook. The same call is repeated in the
# "Fit the ensemble model on the training data" cell.
hard_ensemble_model_xgb.fit(X,y)

NotFittedError: This Pipeline instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## Fit the ensemble model on the training data

In [None]:
# TODO (open question carried over from the original notes): should the models
# be fitted first and the ensembles created afterwards?

# Ensembles created with default arguments.
for _ensemble in (
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_knn,
    ensemble_model_lr,
):
    _ensemble.fit(X, y)

# Ensembles created with calibration=False.
for _ensemble in (
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_knn_non_calibrated,
    ensemble_model_lr_non_calibrated,
):
    _ensemble.fit(X, y)

# Ensembles created with use_optimum_threshold=True.
for _ensemble in (
    hard_ensemble_model_xgb,
    hard_ensemble_model_cb,
    hard_ensemble_model_lgbm,
    hard_ensemble_model_gb,
    hard_ensemble_model_rf,
    hard_ensemble_model_nb,
    hard_ensemble_model_svm,
    hard_ensemble_model_knn,
    hard_ensemble_model_lr,
):
    _ensemble.fit(X, y)

# Ensembles created with calibration=False and use_optimum_threshold=True.
for _ensemble in (
    hard_ensemble_model_xgb_non_calibrated,
    hard_ensemble_model_cb_non_calibrated,
    hard_ensemble_model_lgbm_non_calibrated,
    hard_ensemble_model_gb_non_calibrated,
    hard_ensemble_model_rf_non_calibrated,
    hard_ensemble_model_nb_non_calibrated,
    hard_ensemble_model_svm_non_calibrated,
    hard_ensemble_model_knn_non_calibrated,
):
    _ensemble.fit(X, y)

# Kept as the final bare expression so the cell still displays whatever this
# last fit call returns.
hard_ensemble_model_lr_non_calibrated.fit(X, y)



## Save all the ensemble models

In [None]:
# NOTE: every joblib.dump call in this cell is commented out, so running the
# notebook does not write any ensemble to ../results/models/. The
# "Loading all the models" cell reads these same paths with joblib.load.
# Only the ensemble_model_* objects are listed here; there is no dump line for
# the hard_ensemble_model_* objects.
# joblib.dump(ensemble_model_xgb, '../results/models/xgb_ensemble_model')
# joblib.dump(ensemble_model_cb, '../results/models/cb_ensemble_model')
# joblib.dump(ensemble_model_lgbm, '../results/models/lgbm_ensemble_model')
# joblib.dump(ensemble_model_gb, '../results/models/gb_ensemble_model')
# joblib.dump(ensemble_model_rf, '../results/models/rf_ensemble_model')
# joblib.dump(ensemble_model_nb, '../results/models/nb_ensemble_model')
# joblib.dump(ensemble_model_svm, '../results/models/svm_ensemble_model')
# joblib.dump(ensemble_model_lr, '../results/models/lr_ensemble_model')
# joblib.dump(ensemble_model_knn, '../results/models/knn_ensemble_model')

# # Save the non-calibrated models
# joblib.dump(ensemble_model_xgb_non_calibrated, '../results/models/xgb_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_cb_non_calibrated, '../results/models/cb_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_lgbm_non_calibrated, '../results/models/lgbm_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_gb_non_calibrated, '../results/models/gb_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_rf_non_calibrated, '../results/models/rf_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_nb_non_calibrated, '../results/models/nb_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_svm_non_calibrated, '../results/models/svm_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_lr_non_calibrated, '../results/models/lr_non_calibrated_ensemble_model')
# joblib.dump(ensemble_model_knn_non_calibrated, '../results/models/knn_non_calibrated_ensemble_model')

## Loading all the models

In [None]:
# File-name keys in the order the saved ensembles are read back
# (note: 'lr' comes before 'knn' here).
_SAVED_KEYS = ('xgb', 'cb', 'lgbm', 'gb', 'rf', 'nb', 'svm', 'lr', 'knn')

# Rebind the ensemble_model_* names to the objects stored under
# ../results/models/.
(
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_lr,
    ensemble_model_knn,
) = (
    joblib.load(f'../results/models/{_key}_ensemble_model')
    for _key in _SAVED_KEYS
)

# Same for the non-calibrated counterparts.
(
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_lr_non_calibrated,
    ensemble_model_knn_non_calibrated,
) = (
    joblib.load(f'../results/models/{_key}_non_calibrated_ensemble_model')
    for _key in _SAVED_KEYS
)


## Bootstrap Evaluation

In [None]:
# Short labels, index-aligned with both model lists below.
model_names = ['xgb', 'cb', 'lgbm', 'gb', 'rf', 'nb', 'svm', 'knn', 'lr']

# Ensembles in the same order as model_names.
models = [
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_knn,
    ensemble_model_lr,
]

# Labels for the non-calibrated ensembles: each short label with a
# "_non_calibrated" suffix.
non_calibrated_model_names = [f'{_name}_non_calibrated' for _name in model_names]

# Non-calibrated ensembles, again in model_names order.
non_calibrated_models = [
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_knn_non_calibrated,
    ensemble_model_lr_non_calibrated,
]



In [None]:
# probabilities_df_non_calibrated, metrics_df_non_calibrated = utils.evaluate_models_with_ci(non_calibrated_models, non_calibrated_model_names, 
#                                                                                    X_int_val, y_int_val)

In [None]:
# probabilities_df_calibrated, metrics_df_calibrated = utils.evaluate_models_with_ci(models, model_names, X_int_val, y_int_val)

## Save the dataframes to CSV files

In [None]:
# probabilities_df_calibrated.to_csv('../results/int_val_data/model_probabilities_calibrated.csv', index=False)
# metrics_df_calibrated.to_csv('../results/int_val_data/model_metrics_calibrated.csv', index=False)
# probabilities_df_non_calibrated.to_csv('../results/int_val_data/model_probabilities_non_calibrated.csv', index=False)
# metrics_df_non_calibrated.to_csv('../results/int_val_data/model_metrics_non_calibrated.csv', index=False)

## Save the dataframes as python objects

In [None]:
# joblib.dump(probabilities_df_calibrated, '../results/int_val_data/probabilities_df_calibrated.joblib')
# joblib.dump(metrics_df_calibrated, '../results/int_val_data/metrics_df_calibrated.joblib')
# joblib.dump(probabilities_df_non_calibrated, '../results/int_val_data/probabilities_df_non_calibrated.joblib')
# joblib.dump(metrics_df_non_calibrated, '../results/int_val_data/metrics_df_non_calibrated.joblib')

## One shot evaluation

In [None]:
# Single-pass evaluation on the internal-validation data. Each call yields a
# pair that is unpacked into a probabilities frame and a metrics frame.

# Calibrated ensembles.
(probabilities_df_one_shot, metrics_df_oneshot) = utils.evaluate_models(
    models,
    model_names,
    X_int_val,
    y_int_val,
)

# Non-calibrated ensembles.
(probabilities_df_nc_one_shot, metrics_df_nc_one_shot) = utils.evaluate_models(
    non_calibrated_models,
    non_calibrated_model_names,
    X_int_val,
    y_int_val,
)

In [None]:
# Write each one-shot result frame to its CSV file, without the index column.
for _frame, _csv_path in (
    (probabilities_df_one_shot, '../results/int_val_data/probabilities_df_one_shot.csv'),
    (metrics_df_oneshot, '../results/int_val_data/metrics_df_oneshot.csv'),
    (probabilities_df_nc_one_shot, '../results/int_val_data/probabilities_df_nc_one_shot.csv'),
    (metrics_df_nc_one_shot, '../results/int_val_data/metrics_df_nc_one_shot.csv'),
):
    _frame.to_csv(_csv_path, index=False)