In [None]:
import os
from functools import partial
from datetime import datetime
import optuna
import torch as ch
from os import path
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

from sklearn.metrics import accuracy_score, roc_auc_score, matthews_corrcoef, average_precision_score
import xgboost as xgb

from prediction.short_term_outcome_prediction.timeseries_decomposition import prepare_aggregate_dataset
from prediction.utils.scoring import precision, recall, specificity
from prediction.utils.utils import ensure_dir

In [None]:
# Location of the serialized train/validation data splits (loaded with torch further down).
# The commented-out assignment is presumably the same kind of file on another machine -- TODO confirm.
# data_path = '/mnt/data1/klug/datasets/opsum/short_term_outcomes/gsu_Extraction_20220815_prepro_08062024_083500/early_neurological_deterioration_train_data_splits/train_data_splits_early_neurological_deterioration_ts0.8_rs42_ns5.pth'
data_path = '/Users/jk1/temp/opsum_end/preprocessing/gsu_Extraction_20220815_prepro_09052025_220520/early_neurological_deterioration_train_data_splits/train_data_splits_early_neurological_deterioration_ts0.8_rs42_ns5.pth'
# Output directory; NOTE(review): not referenced by any of the visible cells.
output_dir = '/Users/jk1/Downloads'


In [None]:
# Deserialize the pre-computed data splits. `data_path` is already a complete
# file path, so it is handed to torch directly.
splits = ch.load(data_path)

In [None]:
# Take the first stored split (index 0) and build the aggregate dataset from it.
selected_split_data = splits[0]
dataset = prepare_aggregate_dataset(
    selected_split_data,
    rescale=True,
    target_time_to_outcome=6,
    target_interval=True,
    restrict_to_first_event=False,
)

In [None]:
# Unpack the four-element result of prepare_aggregate_dataset;
# presumably features (X) and labels (y) for the train and validation parts -- TODO confirm order against the helper.
X_train, X_val, y_train, y_val = dataset

In [None]:
# Summarize the dataset: shapes of all four arrays, then the share of
# positive labels (in percent) for the train and validation parts.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

train_positive_pct = y_train.sum() / len(y_train)*100
val_positive_pct = y_val.sum() / len(y_val)*100
print(f'Positive percentage in train: {train_positive_pct:.2f}')
print(f'Positive percentage in val: {val_positive_pct:.2f}')


In [None]:
# Reduced subset for quick experiments: first 10000 training rows and first 1000 validation rows.
# NOTE(review): the following cell re-binds the same four names to the full data,
# so on a sequential run this subset is never used.
subset_X_train, subset_y_train = X_train[:10000], y_train[:10000]
subset_X_val, subset_y_val = X_val[:1000], y_val[:1000]


In [None]:

# Use the complete training and validation data (no row limit) for the cells below.
subset_X_train, subset_y_train = X_train, y_train
subset_X_val, subset_y_val = X_val, y_val

In [None]:
# Class-imbalance weight: ratio of negative to positive training labels,
# passed to XGBoost as `scale_pos_weight`.
n_negative = len(subset_y_train[subset_y_train == 0])
n_positive = len(subset_y_train[subset_y_train == 1])
if n_positive == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError.
    raise ValueError('subset_y_train contains no positive samples; cannot compute scale_pos_weight')
scale_pos_weight = n_negative / n_positive
print(f'Scale pos weight: {scale_pos_weight:.2f}')

# Base learner configuration.
# `num_boost_round` was removed: it is an argument of `xgboost.train`, not of the
# scikit-learn wrapper, where the number of boosting rounds is set by `n_estimators`.
xgb_model = xgb.XGBClassifier(
    reg_lambda=10,  # L2 regularization (default is 1)
    reg_alpha=1,  # L1 regularization
    scale_pos_weight=scale_pos_weight,
    max_depth=4,
    min_child_weight=3,
    learning_rate=0.1,
    n_estimators=500,  # number of boosting rounds
    subsample=0.8,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    booster='dart',
    grow_policy='lossguide',
)

# trained_xgb = xgb_model.fit(subset_X_train, subset_y_train, eval_metric=["aucpr", "auc"],
#                                 early_stopping_rounds=70,  # Stop if no improvement in 70 rounds

#                                     eval_set=[(subset_X_train, subset_y_train), (subset_X_val, subset_y_val)])
# train_history = trained_xgb.evals_result()


In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 10 copies of the XGBoost classifier defined above.
# Each member is fitted on a bootstrap sample of 80% of the rows and on 80% of the
# feature columns (features drawn without replacement).
#
# The base estimator is passed positionally on purpose: its keyword is `estimator`
# in scikit-learn >= 1.2 and `base_estimator` before (removed in 1.4), whereas the
# first positional parameter is the base estimator in both.
model = BaggingClassifier(
    xgb_model,
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1,
    random_state=42,
)
# `fit` returns the fitted ensemble itself; the name `trained_xgb` is kept because
# the evaluation cell refers to it.
trained_xgb = model.fit(subset_X_train, subset_y_train)

In [None]:

# Evaluate the fitted model on the training and validation data and draw the ROC
# and precision-recall curves side by side.
#
# The former first panel (AUC per boosting iteration) was dropped: it needs
# `train_history` from the single-model fit that is commented out in the
# model-definition cell, so it was always empty and its `legend()` call only
# produced a "no artists with labels" warning.

y_pred_proba_train = trained_xgb.predict_proba(subset_X_train)
y_pred_proba_val = trained_xgb.predict_proba(subset_X_val)

# ROC curves; column 1 of predict_proba is used as the positive-class score.
fpr_val, tpr_val, _ = sklearn.metrics.roc_curve(subset_y_val, y_pred_proba_val[:, 1])
fpr_train, tpr_train, _ = sklearn.metrics.roc_curve(subset_y_train, y_pred_proba_train[:, 1])
auc_val = sklearn.metrics.auc(fpr_val, tpr_val)
auc_train = sklearn.metrics.auc(fpr_train, tpr_train)

# Precision-recall curves.
# NOTE(review): the area is computed with the trapezoidal rule, as before;
# `average_precision_score` is the non-interpolated alternative.
precision_val, recall_val, _ = sklearn.metrics.precision_recall_curve(subset_y_val, y_pred_proba_val[:, 1])
precision_train, recall_train, _ = sklearn.metrics.precision_recall_curve(subset_y_train, y_pred_proba_train[:, 1])
pr_auc_val = sklearn.metrics.auc(recall_val, precision_val)
pr_auc_train = sklearn.metrics.auc(recall_train, precision_train)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: ROC
ax[0].set_title(f"ROC curve, AUC=(val: {auc_val:.4f}, train: {auc_train:.4f})")
ax[0].plot(fpr_val, tpr_val, label="val data")
ax[0].plot(fpr_train, tpr_train, label="train data")
ax[0].set_xlabel("False positive rate")
ax[0].set_ylabel("True positive rate")
ax[0].legend()

# Right panel: precision-recall
ax[1].set_title(f"PR curve, AUC=(val: {pr_auc_val:.4f}, train: {pr_auc_train:.4f})")
ax[1].plot(recall_val, precision_val, label="val data")
ax[1].plot(recall_train, precision_train, label="train data")
ax[1].set_xlabel("Recall")
ax[1].set_ylabel("Precision")
ax[1].legend()

plt.tight_layout()

In [None]:
# Display the validation-set class probabilities returned by predict_proba.
y_pred_proba_val

In [None]:
# best for now
# reg_lambda=1,  
# reg_alpha=1