In [25]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import yaml
from google.colab import files, drive
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from xgboost import XGBClassifier

In [24]:

# Canonical locations (Colab session root) that the rest of the notebook reads.
# Defined up front so the upload logic and later cells agree on the same paths.
CONFIG_PATH_COLAB = "config_train_xgb.yaml"
DATA_PATH_COLAB = "processed_data.csv"

# Upload files directly
print("Please upload your 'config_train_xgb.yaml' file:")
uploaded = files.upload()

# files.upload() never overwrites an existing file: a re-upload is stored under
# a de-duplicated name such as "config_train_xgb (7).yaml", so CONFIG_PATH_COLAB
# would silently keep pointing at the stale copy. Persist the bytes that were
# just received under the canonical name instead.
if uploaded:
    _, config_bytes = next(iter(uploaded.items()))
    with open(CONFIG_PATH_COLAB, "wb") as config_file:
        config_file.write(config_bytes)

# Check and upload data file (only prompts when it is not already present,
# so the first upload keeps its original name).
if not os.path.exists(DATA_PATH_COLAB):
    print("\nPlease upload your 'processed_data.csv' file:")
    files.upload()

print("Files uploaded to the Colab session root directory.")

Please upload your 'config_train_xgb.yaml' file:


Saving config_train_xgb.yaml to config_train_xgb (7).yaml
Files uploaded to the Colab session root directory.


In [None]:
#  Split data into training and testing sets
# Stratified so both partitions keep the class ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)
print(f"Data split into training (shape: {X_train.shape}) and testing (shape: {X_test.shape}) sets.")

# Initial CV for Baseline (Using X_train/y_train only)
xgb_baseline_estimator = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    n_estimators=100, learning_rate=0.1, max_depth=5
)

# Shared fold definition; later tuning code reuses `skf`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# A single cross_validate pass scores both metrics on the same fitted fold
# models. Two separate cross_val_score calls would train every fold twice
# for identical numbers.
baseline_cv = cross_validate(
    xgb_baseline_estimator, X_train, y_train,
    cv=skf, scoring=["f1_macro", "accuracy"], n_jobs=-1
)
f1_scores = baseline_cv["test_f1_macro"]
acc_scores = baseline_cv["test_accuracy"]

print(f"CV Macro F1 (Baseline): {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")
print(f"CV Accuracy (Baseline): {acc_scores.mean():.4f} ± {acc_scores.std():.4f}")

# Imbalance Handling (SMOTETomek)
# This resampled copy is used for the size report and stays defined for any
# later cell that reads X_train_res / y_train_res. The hyperparameter search
# below does NOT train on it directly; it resamples inside each CV fold.
if APPLY_SMOTE:
    print("Applying SMOTETomek resampling on training data...")
    smt = SMOTETomek(random_state=RANDOM_STATE, n_jobs=-1)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)
    print(f"Training data resampled from {X_train.shape[0]} to {X_train_res.shape[0]} rows.")
else:
    X_train_res, y_train_res = X_train, y_train


## Hyperparameter Tuning with Randomized Search & Stratified CV

print("\n--- Starting RandomizedSearchCV for Optimal XGBoost Tuning ---")

# 1. Define the XGBoost Estimator
# `use_label_encoder` is intentionally not passed: recent XGBoost releases no
# longer use it and only emit a warning when it is supplied.
xgb_tuner_estimator = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss', # Use logloss for tuning stability
    random_state=RANDOM_STATE,
    n_jobs=-1,
    scale_pos_weight=1.0
)

# 2. Wrap sampler + classifier in an imblearn Pipeline so resampling is fitted
# on the training part of every fold only. Resampling before CV lets synthetic
# rows derived from validation-fold samples leak into training folds and
# inflates the CV score. The sampler is skipped at predict time.
tuning_steps = [("clf", xgb_tuner_estimator)]
if APPLY_SMOTE:
    tuning_steps.insert(0, ("resample", SMOTETomek(random_state=RANDOM_STATE, n_jobs=-1)))
tuning_pipeline = ImbPipeline(tuning_steps)

# 3. Route the search space to the classifier step ("clf__<param>").
# RandomizedSearchCV accepts either one dict or a list of dicts; handle both.
param_grids = param_dist if isinstance(param_dist, (list, tuple)) else [param_dist]
search_space = [
    {f"clf__{name}": dist for name, dist in grid.items()}
    for grid in param_grids
]

N_ITER = config.get("n_iter", 5)

## Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=tuning_pipeline,
    param_distributions=search_space,
    n_iter=N_ITER,
    scoring='f1',
    cv=skf,
    verbose=1,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# Search on the ORIGINAL training data; the pipeline resamples per fold.
start_time = datetime.now()
random_search.fit(X_train, y_train)
end_time = datetime.now()

# Strip the "clf__" routing prefix so the report shows plain XGBoost names.
# NOTE: random_search.best_params_ itself keeps the prefixed keys.
best_xgb_params = {
    name.split("__", 1)[1]: value
    for name, value in random_search.best_params_.items()
}

print(f"Randomized Search completed in {(end_time - start_time).total_seconds():.2f} seconds.")
print("\n--- Best Model Found ---")
print(f"Best CV F1 Score: {random_search.best_score_:.4f}")
print("Best Parameters:", best_xgb_params)

# 6. Train Final Model using the BEST Estimator
# refit=True (the default) has already refitted the best pipeline on the full
# training set (resampled when APPLY_SMOTE is set). Expose the fitted
# classifier step so `model` remains a plain XGBClassifier.
model = random_search.best_estimator_.named_steps["clf"]

## Evaluation
print("\nEvaluation on Test Set (using Best Model)")
# predict_proba yields one column per class; keep column index 1 as the score
# used for AUC and thresholding below.
test_proba_matrix = model.predict_proba(X_test)
y_pred_proba = test_proba_matrix[:, 1]

def find_best_threshold(y_true, y_probs, thresholds=None):
    """Return the probability cut-off that maximises binary F1, and that F1.

    Args:
        y_true: Ground-truth binary labels.
        y_probs: Positive-class scores aligned with ``y_true``.
        thresholds: Optional iterable of candidate cut-offs. Defaults to
            0.10 .. 0.90 in steps of 0.01 (81 values).

    Returns:
        ``(best_thresh, best_f1)``. When no candidate reaches an F1 above 0
        the fallback ``(0.5, 0.0)`` is returned. On ties the lowest
        threshold wins because only strict improvements replace the best.

    Note:
        The returned F1 is optimistic for the data it was selected on. Choose
        the threshold on data that is not used for the final reported score.
    """
    if thresholds is None:
        # Search from 10% to 90% in small steps
        thresholds = np.linspace(0.1, 0.9, 81)

    y_probs = np.asarray(y_probs)
    best_thresh, best_f1 = 0.5, 0.0
    for t in thresholds:
        preds = (y_probs >= t).astype(int)
        # zero_division=0 keeps the score at 0.0 when nothing is predicted
        # positive, without emitting an UndefinedMetricWarning per threshold.
        score = f1_score(y_true, preds, zero_division=0)
        if score > best_f1:
            best_thresh, best_f1 = float(t), score
    return best_thresh, best_f1

# Select the decision threshold WITHOUT looking at the test labels. Tuning it
# on y_test and then reporting F1 on the same y_test is optimistically biased.
# Instead, build out-of-fold probabilities on the training data with an
# unfitted copy of the chosen model, resampling inside each fold when
# APPLY_SMOTE is set so the fold models are trained like the final model.
oof_steps = [("clf", clone(model))]
if APPLY_SMOTE:
    oof_steps.insert(0, ("resample", SMOTETomek(random_state=RANDOM_STATE, n_jobs=-1)))
oof_proba = cross_val_predict(
    ImbPipeline(oof_steps), X_train, y_train,
    cv=skf, method="predict_proba", n_jobs=-1
)[:, 1]
best_thresh, oof_f1 = find_best_threshold(y_train, oof_proba)

# Apply the frozen threshold to the untouched test set.
y_pred_tuned = (y_pred_proba >= best_thresh).astype(int)
best_f1 = f1_score(y_test, y_pred_tuned)

# AUC is threshold-free, so it is computed on the raw test probabilities.
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"ROC AUC Score: **{auc_score:.4f}**")
print(f"Out-of-fold F1 used to pick threshold: {oof_f1:.4f}")
print(f"Test F1 Score (Threshold={best_thresh:.3f}): **{best_f1:.4f}**")
# print("\nClassification Report (Tuned Threshold):")
# print(classification_report(y_test, y_pred_tuned))