In [1]:
import sys
import os
# Add the parent directory to the system path.
# '..' is resolved against the current working directory, so this only works when the
# notebook is run from its own folder. Presumably this is what makes the project-local
# `utilities` package importable in the next cell — TODO confirm.
sys.path.append(os.path.abspath(os.path.join('..')))
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
f1_score, roc_auc_score, roc_curve, auc, confusion_matrix, classification_report)
from sklearn.utils import resample

In [2]:
from utilities import bs_calibrate, bs_results, utils

## Load the data

In [3]:
# Development cohort: read the training CSV and keep only the rows where day == 1.
df = pd.read_csv("../dibh_clinical_only/original_training_data.csv")
data_day1 = df.loc[df['day'] == 1]
# The label is the 'DIBH_Y0N1' column; the features are everything except
# 'crnumber', 'day' and that label column.
y = data_day1['DIBH_Y0N1']
X = data_day1.drop(columns=['crnumber', 'day', 'DIBH_Y0N1'])
# 70/30 split with a fixed seed. The resulting pieces are not used by the cells
# visible in this notebook section (the ensembles are fitted on the full X, y).
X_t, X_v, y_t, y_v = train_test_split(X, y, random_state=42, test_size=0.30)

# Internal validation cohort: same day == 1 filter and the same feature/label layout.
df_internal_val = pd.read_csv("../dibh_clinical_only/final_internal_validation13052024.csv")
data_day1_val = df_internal_val.loc[df_internal_val['day'] == 1]
y_int_val = data_day1_val['DIBH_Y0N1']
X_int_val = data_day1_val.drop(columns=['crnumber', 'day', 'DIBH_Y0N1'])


## Load the model information for all ML models

In [4]:
# Read back the stored "top models" information, one joblib file per algorithm family.
# joblib.load unpickles arbitrary Python objects, so only point it at trusted files.
(
    xgb_load_models_info,
    cb_load_models_info,
    lgbm_load_models_info,
    gb_load_models_info,
    rf_load_models_info,
    nb_load_models_info,
    svm_load_models_info,
    knn_load_models_info,
    lr_load_models_info,
) = [
    joblib.load(info_path)
    for info_path in (
        '../XGBoost_Models/top_models_xgb.joblib',
        '../Catboost_Models/top_models_cb.joblib',
        '../LightGBM_Models/top_models_lgbm.joblib',
        '../GB_Models/top_models_gb.joblib',
        '../RF_Models/top_models_rf.joblib',
        '../NB_Models/top_models_nb.joblib',
        '../SVM_Models/top_models_svm.joblib',
        '../KNN_Models/top_models_knn.joblib',
        '../LR_Models/top_models_lr.joblib',
    )
]


## Create an ensemble for each ML model

In [6]:

# Build one ensemble per algorithm family from its loaded model information.
# Only the model info is passed here; presumably calibration is enabled by default
# in utils.create_ensemble_model — TODO confirm against utilities/utils.
(
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_knn,
    ensemble_model_lr,
) = [
    utils.create_ensemble_model(models_info)
    for models_info in (
        xgb_load_models_info,
        cb_load_models_info,
        lgbm_load_models_info,
        gb_load_models_info,
        rf_load_models_info,
        nb_load_models_info,
        svm_load_models_info,
        knn_load_models_info,
        lr_load_models_info,
    )
]

# Counterparts built from the same model information with calibration=False.
(
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_knn_non_calibrated,
    ensemble_model_lr_non_calibrated,
) = [
    utils.create_ensemble_model(models_info, calibration=False)
    for models_info in (
        xgb_load_models_info,
        cb_load_models_info,
        lgbm_load_models_info,
        gb_load_models_info,
        rf_load_models_info,
        nb_load_models_info,
        svm_load_models_info,
        knn_load_models_info,
        lr_load_models_info,
    )
]

## Fit the ensemble models on the training data

In [8]:

# Fit every ensemble on the complete day-1 training set (X, y), not on the
# X_t / y_t split. The calibrated ensembles are fitted first, followed by the
# non-calibrated ones, in the same per-algorithm order.
for _ensemble in (
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_knn,
    ensemble_model_lr,
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_knn_non_calibrated,
    ensemble_model_lr_non_calibrated,
):
    _ensemble.fit(X, y)


## Save all the ensemble models

In [9]:
# Make sure the output directory exists before writing: joblib.dump does not create
# missing parent directories, so on a fresh checkout the first dump would otherwise
# fail with FileNotFoundError.
os.makedirs('../saved_models', exist_ok=True)

# Save the fitted calibrated ensembles (the files are written without an extension;
# the loading cell below uses exactly these paths).
joblib.dump(ensemble_model_xgb, '../saved_models/xgb_ensemble_model')
joblib.dump(ensemble_model_cb, '../saved_models/cb_ensemble_model')
joblib.dump(ensemble_model_lgbm, '../saved_models/lgbm_ensemble_model')
joblib.dump(ensemble_model_gb, '../saved_models/gb_ensemble_model')
joblib.dump(ensemble_model_rf, '../saved_models/rf_ensemble_model')
joblib.dump(ensemble_model_nb, '../saved_models/nb_ensemble_model')
joblib.dump(ensemble_model_svm, '../saved_models/svm_ensemble_model')
joblib.dump(ensemble_model_lr, '../saved_models/lr_ensemble_model')
joblib.dump(ensemble_model_knn, '../saved_models/knn_ensemble_model')

# Save the non-calibrated models
joblib.dump(ensemble_model_xgb_non_calibrated, '../saved_models/xgb_non_calibrated_ensemble_model')
joblib.dump(ensemble_model_cb_non_calibrated, '../saved_models/cb_non_calibrated_ensemble_model')
joblib.dump(ensemble_model_lgbm_non_calibrated, '../saved_models/lgbm_non_calibrated_ensemble_model')
joblib.dump(ensemble_model_gb_non_calibrated, '../saved_models/gb_non_calibrated_ensemble_model')
joblib.dump(ensemble_model_rf_non_calibrated, '../saved_models/rf_non_calibrated_ensemble_model')
joblib.dump(ensemble_model_nb_non_calibrated, '../saved_models/nb_non_calibrated_ensemble_model')
joblib.dump(ensemble_model_svm_non_calibrated, '../saved_models/svm_non_calibrated_ensemble_model')
joblib.dump(ensemble_model_lr_non_calibrated, '../saved_models/lr_non_calibrated_ensemble_model')
# Kept as the final expression so the cell still displays the list of written file names.
joblib.dump(ensemble_model_knn_non_calibrated, '../saved_models/knn_non_calibrated_ensemble_model')

['../saved_models/knn_non_calibrated_ensemble_model']

## Load all the saved models

In [10]:
# Reload the fitted calibrated ensembles from disk, rebinding the in-memory names.
# joblib.load unpickles arbitrary Python objects, so only load files you wrote yourself.
(
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_lr,
    ensemble_model_knn,
) = map(joblib.load, (
    '../saved_models/xgb_ensemble_model',
    '../saved_models/cb_ensemble_model',
    '../saved_models/lgbm_ensemble_model',
    '../saved_models/gb_ensemble_model',
    '../saved_models/rf_ensemble_model',
    '../saved_models/nb_ensemble_model',
    '../saved_models/svm_ensemble_model',
    '../saved_models/lr_ensemble_model',
    '../saved_models/knn_ensemble_model',
))

# Same for the non-calibrated ensembles.
(
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_lr_non_calibrated,
    ensemble_model_knn_non_calibrated,
) = map(joblib.load, (
    '../saved_models/xgb_non_calibrated_ensemble_model',
    '../saved_models/cb_non_calibrated_ensemble_model',
    '../saved_models/lgbm_non_calibrated_ensemble_model',
    '../saved_models/gb_non_calibrated_ensemble_model',
    '../saved_models/rf_non_calibrated_ensemble_model',
    '../saved_models/nb_non_calibrated_ensemble_model',
    '../saved_models/svm_non_calibrated_ensemble_model',
    '../saved_models/lr_non_calibrated_ensemble_model',
    '../saved_models/knn_non_calibrated_ensemble_model',
))


## Bootstrap Evaluation

In [12]:
# Short tags for the calibrated ensembles. Presumably they are paired with `models`
# by position inside utils.evaluate_models, so both lists keep the same order.
model_names = [
    'xgb',
    'cb',
    'lgbm',
    'gb',
    'rf',
    'nb',
    'svm',
    'knn',
    'lr',
]

# The loaded calibrated ensembles, in the order given by `model_names`.
models = [
    ensemble_model_xgb,
    ensemble_model_cb,
    ensemble_model_lgbm,
    ensemble_model_gb,
    ensemble_model_rf,
    ensemble_model_nb,
    ensemble_model_svm,
    ensemble_model_knn,
    ensemble_model_lr,
]

# Tags for the non-calibrated ensembles, in the same per-algorithm order.
non_calibrated_model_names = [
    'xgb_non_calibrated',
    'cb_non_calibrated',
    'lgbm_non_calibrated',
    'gb_non_calibrated',
    'rf_non_calibrated',
    'nb_non_calibrated',
    'svm_non_calibrated',
    'knn_non_calibrated',
    'lr_non_calibrated',
]

# The loaded non-calibrated ensembles, in the order given by `non_calibrated_model_names`.
non_calibrated_models = [
    ensemble_model_xgb_non_calibrated,
    ensemble_model_cb_non_calibrated,
    ensemble_model_lgbm_non_calibrated,
    ensemble_model_gb_non_calibrated,
    ensemble_model_rf_non_calibrated,
    ensemble_model_nb_non_calibrated,
    ensemble_model_svm_non_calibrated,
    ensemble_model_knn_non_calibrated,
    ensemble_model_lr_non_calibrated,
]



In [16]:
# Evaluate the non-calibrated ensembles on the internal validation set.
# The call returns two objects, unpacked as a probabilities frame and a metrics frame.
# NOTE(review): no seed is passed here; whether the bootstrap resampling is
# reproducible depends on utils.evaluate_models — confirm.
(
    probabilities_df_non_calibrated,
    metrics_df_non_calibrated,
) = utils.evaluate_models(
    non_calibrated_models,
    non_calibrated_model_names,
    X_int_val,
    y_int_val,
)

Bootstrapong Evaluation done for -- xgb_non_calibrated
Bootstrapong Evaluation done for -- cb_non_calibrated
Bootstrapong Evaluation done for -- lgbm_non_calibrated
Bootstrapong Evaluation done for -- gb_non_calibrated
Bootstrapong Evaluation done for -- rf_non_calibrated
Bootstrapong Evaluation done for -- nb_non_calibrated
Bootstrapong Evaluation done for -- svm_non_calibrated
Bootstrapong Evaluation done for -- knn_non_calibrated
Bootstrapong Evaluation done for -- lr_non_calibrated


In [17]:
# Evaluate the calibrated ensembles on the internal validation set.
# The call returns two objects, unpacked as a probabilities frame and a metrics frame.
(
    probabilities_df_calibrated,
    metrics_df_calibrated,
) = utils.evaluate_models(
    models,
    model_names,
    X_int_val,
    y_int_val,
)

Bootstrapong Evaluation done for -- xgb
Bootstrapong Evaluation done for -- cb
Bootstrapong Evaluation done for -- lgbm
Bootstrapong Evaluation done for -- gb
Bootstrapong Evaluation done for -- rf
Bootstrapong Evaluation done for -- nb
Bootstrapong Evaluation done for -- svm
Bootstrapong Evaluation done for -- knn
Bootstrapong Evaluation done for -- lr


## Save the DataFrames to CSV files

In [19]:
# Write each result table into ../saved_models; the row index is not written.
for _frame, _csv_path in (
    (probabilities_df_calibrated, '../saved_models/model_probabilities_calibrated.csv'),
    (metrics_df_calibrated, '../saved_models/model_metrics_calibrated.csv'),
    (probabilities_df_non_calibrated, '../saved_models/model_probabilities_non_calibrated.csv'),
    (metrics_df_non_calibrated, '../saved_models/model_metrics_non_calibrated.csv'),
):
    _frame.to_csv(_csv_path, index=False)

In [14]:
# Display the metrics summary for the non-calibrated ensembles: one row per model,
# with mean / std / CI-bound columns for ROC AUC, accuracy, precision, recall and F1.
metrics_df_non_calibrated

Unnamed: 0,model,roc_auc_mean,roc_auc_std,roc_auc_ci_lower,roc_auc_ci_upper,accuracy_mean,accuracy_std,accuracy_ci_lower,accuracy_ci_upper,precision_mean,...,precision_ci_lower,precision_ci_upper,recall_mean,recall_std,recall_ci_lower,recall_ci_upper,f1_score_mean,f1_score_std,f1_score_ci_lower,f1_score_ci_upper
0,xgb_non_calibrated,0.792017,0.064441,0.646072,0.904259,0.700404,0.065033,0.574468,0.809043,0.614832,...,0.409091,0.809524,0.683434,0.110241,0.449861,0.882353,0.641744,0.08864,0.444144,0.790698
1,cb_non_calibrated,0.783696,0.069011,0.644663,0.906255,0.701319,0.069437,0.553191,0.829787,0.629431,...,0.4,0.850179,0.631368,0.117784,0.4,0.85,0.62393,0.097483,0.421053,0.791875
2,lgbm_non_calibrated,0.788799,0.066575,0.649077,0.902096,0.700511,0.067155,0.553191,0.808511,0.631537,...,0.4,0.833333,0.626556,0.110986,0.4,0.833333,0.623102,0.093278,0.411765,0.780488
3,gb_non_calibrated,0.787135,0.066786,0.649417,0.903929,0.68134,0.069787,0.553191,0.808511,0.60438,...,0.388889,0.826087,0.629244,0.114348,0.4,0.85,0.61028,0.094863,0.411727,0.789474
4,rf_non_calibrated,0.808272,0.063224,0.679961,0.922943,0.72066,0.063261,0.595745,0.851064,0.632663,...,0.439937,0.823529,0.730768,0.103014,0.533235,0.923077,0.672516,0.083082,0.5125,0.820588
5,nb_non_calibrated,0.788553,0.070568,0.630909,0.91459,0.74434,0.06246,0.617021,0.851064,0.632238,...,0.444333,0.8125,0.888627,0.072058,0.727273,1.0,0.734476,0.073373,0.578918,0.862745
6,svm_non_calibrated,0.728525,0.082046,0.554126,0.87917,0.65834,0.068584,0.510638,0.787234,0.572808,...,0.333333,0.777778,0.630485,0.115664,0.4,0.842206,0.593742,0.093961,0.386879,0.760048
7,knn_non_calibrated,0.764389,0.074631,0.605311,0.899015,0.682553,0.07005,0.531915,0.808511,0.625633,...,0.375,0.851984,0.529563,0.117645,0.293908,0.75,0.566803,0.104002,0.347826,0.750139
8,lr_non_calibrated,0.761694,0.072726,0.616882,0.888907,0.67766,0.066808,0.55266,0.808511,0.599833,...,0.368301,0.809598,0.634638,0.109414,0.416544,0.8335,0.610726,0.091579,0.4,0.769231


In [20]:
# Display the metrics summary for the calibrated ensembles: one row per model,
# with mean / std / CI-bound columns for ROC AUC, accuracy, precision, recall and F1.
metrics_df_calibrated

Unnamed: 0,model,roc_auc_mean,roc_auc_std,roc_auc_ci_lower,roc_auc_ci_upper,accuracy_mean,accuracy_std,accuracy_ci_lower,accuracy_ci_upper,precision_mean,...,precision_ci_lower,precision_ci_upper,recall_mean,recall_std,recall_ci_lower,recall_ci_upper,f1_score_mean,f1_score_std,f1_score_ci_lower,f1_score_ci_upper
0,xgb,0.800931,0.064998,0.657394,0.917767,0.745723,0.064775,0.617021,0.87234,0.667847,...,0.458239,0.863636,0.73489,0.10398,0.521659,0.933333,0.69435,0.086767,0.514249,0.844444
1,cb,0.786415,0.068483,0.643476,0.905491,0.725255,0.063344,0.595745,0.851064,0.637459,...,0.437432,0.842105,0.737565,0.100769,0.529412,0.928571,0.678454,0.082925,0.5,0.826087
2,lgbm,0.807147,0.063193,0.676235,0.919423,0.724745,0.06532,0.595745,0.851064,0.670958,...,0.444271,0.882516,0.630762,0.115804,0.4,0.85,0.643255,0.094481,0.451613,0.808511
3,gb,0.788515,0.065987,0.643109,0.905602,0.724553,0.065872,0.595745,0.851064,0.628138,...,0.437432,0.809656,0.78522,0.097333,0.588235,0.952381,0.692936,0.081094,0.512821,0.836364
4,rf,0.81073,0.063222,0.674295,0.920328,0.749064,0.063129,0.617021,0.87234,0.669681,...,0.454545,0.866667,0.744493,0.099716,0.538462,0.9375,0.699651,0.083116,0.516129,0.844444
5,nb,0.796809,0.070402,0.647842,0.921425,0.724149,0.067724,0.595745,0.851064,0.60782,...,0.419355,0.785714,0.899511,0.074344,0.733333,1.0,0.720714,0.078073,0.555556,0.857143
6,svm,0.713812,0.080291,0.552612,0.869761,0.637447,0.07033,0.510106,0.765957,0.540213,...,0.346094,0.727424,0.736185,0.10146,0.538333,0.923214,0.617806,0.086824,0.438986,0.77551
7,knn,0.772802,0.068174,0.632727,0.893185,0.746298,0.063232,0.617021,0.87234,0.65518,...,0.454545,0.850179,0.790566,0.092784,0.6,0.952381,0.711428,0.079096,0.549886,0.851064
8,lr,0.763643,0.072518,0.609255,0.891024,0.681383,0.068873,0.531915,0.808511,0.604136,...,0.399783,0.8125,0.638873,0.109979,0.420943,0.842105,0.615182,0.092139,0.420991,0.775567
