In [1]:
import pandas as pd

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, make_scorer
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Visualization imports (if you wish to visualize metrics)
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the preprocessed datasets
combined_y1 = pd.read_csv("../data/preprocessed/prp_combined_Y1.csv")
combined_y2 = pd.read_csv("../data/preprocessed/prp_combined_Y2.csv")

# Combine Y1 and Y2 data for training
combined_df = pd.concat([combined_y1, combined_y2], axis=0)

# Separate the features and target variable
X = combined_df.drop(columns=['DaysInHospitalY2', 'DaysInHospitalY3', 'MemberID'])
y = combined_df['DaysInHospitalY2']

# Display a summary of the features and the target
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (147396, 16)
Target shape: (147396,)


In [3]:
# Check if there are any NaN values in y
print(f"Number of NaN values in y before dropping: {y.isna().sum()}")

# Drop rows where y is NaN
X = X[~y.isna()]
y = y.dropna()

# Confirm there are no NaN values left
print(f"Number of NaN values in y after dropping: {y.isna().sum()}")

Number of NaN values in y before dropping: 71400
Number of NaN values in y after dropping: 0


In [4]:
# Set up nested cross-validation strategy
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [50, 100, 200, 300],
    'subsample': [0.8, 0.9, 1.0]
}

# Define the XGBoost classifier
xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=inner_cv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Cross-validation scoring metrics
scorers = {
    'roc_auc': make_scorer(roc_auc_score),
    'precision': make_scorer(precision_score, zero_division=1),
    'recall': make_scorer(recall_score, zero_division=1)
}

# Set up empty lists to store scores
outer_scores = []

# Loop for the outer cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Perform hyperparameter tuning using inner cross-validation
    random_search.fit(X_train, y_train)
    
    # Best estimator from inner cross-validation
    best_model = random_search.best_estimator_
    
    # Evaluate the model on the outer test fold
    y_pred_probs = best_model.predict_proba(X_test)  # Get the predicted probabilities
    y_pred = best_model.predict(X_test)  # Get the predicted class labels

    # Ensure y_pred and y_test are one-dimensional
    y_pred = y_pred.ravel() if len(y_pred.shape) > 1 else y_pred
    y_test = y_test.ravel() if len(y_test.shape) > 1 else y_test

    # Evaluate the model on the outer test fold
    if len(np.unique(y_test)) > 2:
        roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class='ovr')
        precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
        recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
    else:
        roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])  # For binary classification, take the second column
        precision = precision_score(y_test, y_pred, average='binary', zero_division=1)
        recall = recall_score(y_test, y_pred, average='binary', zero_division=1)
    
    # Store the scores
    outer_scores.append({'roc_auc': roc_auc, 'precision': precision, 'recall': recall})

# Convert the scores list into a DataFrame to analyze the results
results_df = pd.DataFrame(outer_scores)
print("Mean ROC-AUC:", results_df['roc_auc'].mean())
print("Mean Precision:", results_df['precision'].mean())
print("Mean Recall:", results_df['recall'].mean())

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Mean ROC-AUC: 0.7528702669235218
Mean Precision: 0.8311581339610541
Mean Recall: 0.06312740781028725
