In [20]:
from preprocess import preprocess_data_flat
from sklearn.model_selection import RandomizedSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer, accuracy_score, balanced_accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from tabulate import tabulate 

In [21]:
X_train_flat, y_train_flat = preprocess_data_flat("2", 15)
X_test_flat, y_test_flat = preprocess_data_flat("1", 15)

In [22]:
model = XGBClassifier(random_state=123, eval_metric="logloss")

param_dist = {
    "max_depth": [2, 3, 4, 5, 6],
    "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
    "n_estimators": [300, 500, 700, 900, 1100],
    "min_child_weight": [1, 3, 5, 7],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
    "reg_alpha": [0, 0.1, 0.5, 1.0, 1.5],
    "reg_lambda": [0.5, 1.0, 1.5, 2.0, 2.5],
    "gamma": [0, 0.1, 0.5, 1.0, 1.5],
    "colsample_bylevel": [0.7, 0.8, 0.9, 1.0],
    "scale_pos_weight": [1, 3, 5, 10],
}

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    scoring="balanced_accuracy",
    cv=cv,
    random_state=123,
    n_jobs=10,
    refit="balanced_accuracy",
    verbose=0,
)

random_search.fit(X_train_flat, y_train_flat)
best_model = random_search.best_estimator_


In [23]:
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "balanced_accuracy": make_scorer(balanced_accuracy_score),
    "roc_auc": make_scorer(roc_auc_score),
    "f1": make_scorer(f1_score),
}


cv_results = cross_validate(
    best_model,
    X_train_flat,
    y_train_flat,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1,
)


fold_results = []
for i in range(n_splits):
    fold_results.append(
        {
            "Fold": i + 1,
            "Train Acc": f"{cv_results['train_accuracy'][i]:.4f}",
            "Test Acc": f"{cv_results['test_accuracy'][i]:.4f}",
            "Train BA": f"{cv_results['train_balanced_accuracy'][i]:.4f}",
            "Test BA": f"{cv_results['test_balanced_accuracy'][i]:.4f}",
            "Train AUC": f"{cv_results['train_roc_auc'][i]:.4f}",
            "Test AUC": f"{cv_results['test_roc_auc'][i]:.4f}",
            "Train F1": f"{cv_results['train_f1'][i]:.4f}",
            "Test F1": f"{cv_results['test_f1'][i]:.4f}",
            "Time (s)": f"{cv_results['fit_time'][i] + cv_results['score_time'][i]:.3f}",
        }
    )

summary_results = {
    "F1 Score": f"{cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}",
    "Accuracy": f"{cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}",
    "Balanced Accuracy": f"{cv_results['test_balanced_accuracy'].mean():.4f} ± {cv_results['test_balanced_accuracy'].std():.4f}",
    "ROC AUC": f"{cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}",
    "Total Time (s)": f"{(cv_results['fit_time'] + cv_results['score_time']).sum():.3f}",
}

print("\n" + "=" * 80)
print("CROSS-VALIDATION RESULTS")
print("=" * 80)

print("\n RESULTS BY FOLD:")
print(tabulate(fold_results, headers="keys", tablefmt="pretty"))

print("\n STATISTICS (Test Set within CV):")
for metric, value in summary_results.items():
    print(f"   • {metric:<20}: {value}")


print("\n" + "=" * 80)
print("FINAL EVALUATION ON HELD-OUT TEST SET")
print("=" * 80)
y_pred_test = best_model.predict(X_test_flat)
y_prob_test = best_model.predict_proba(X_test_flat)[:, 1]

test_f1 = f1_score(y_test_flat, y_pred_test)
test_accuracy = accuracy_score(y_test_flat, y_pred_test)
test_balanced_accuracy = balanced_accuracy_score(y_test_flat, y_pred_test)
test_roc_auc = roc_auc_score(y_test_flat, y_prob_test)

print(f"   • {'F1 Score':<20}: {test_f1:.4f}")
print(f"   • {'Accuracy':<20}: {test_accuracy:.4f}")
print(f"   • {'Balanced Accuracy':<20}: {test_balanced_accuracy:.4f}")
print(f"   • {'ROC AUC':<20}: {test_roc_auc:.4f}")
print("=" * 80)


CROSS-VALIDATION RESULTS

 RESULTS BY FOLD:
+------+-----------+----------+----------+---------+-----------+----------+----------+---------+----------+
| Fold | Train Acc | Test Acc | Train BA | Test BA | Train AUC | Test AUC | Train F1 | Test F1 | Time (s) |
+------+-----------+----------+----------+---------+-----------+----------+----------+---------+----------+
|  1   |  1.0000   |  0.9247  |  1.0000  | 0.9230  |  1.0000   |  0.9230  |  1.0000  | 0.9320  |  1.585   |
|  2   |  1.0000   |  0.8602  |  1.0000  | 0.8492  |  1.0000   |  0.8492  |  1.0000  | 0.8829  |  1.553   |
|  3   |  1.0000   |  0.8710  |  1.0000  | 0.8614  |  1.0000   |  0.8614  |  1.0000  | 0.8909  |  1.455   |
|  4   |  1.0000   |  0.8817  |  1.0000  | 0.8710  |  1.0000   |  0.8710  |  1.0000  | 0.9009  |  1.517   |
|  5   |  1.0000   |  0.9130  |  1.0000  | 0.9120  |  1.0000   |  0.9120  |  1.0000  | 0.9216  |  1.569   |
+------+-----------+----------+----------+---------+-----------+----------+----------+-----