In [15]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from preprocess import preprocess_data_flat
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score,
    f1_score,
    make_scorer, 
)
from xgboost import XGBClassifier
import pandas as pd
from tabulate import tabulate

In [16]:
df_final = [preprocess_data_flat(x, 60) for x in ["nico", "sofia"]]
x_final, y_final = [pd.concat([x[i] for x in df_final], ignore_index=True) for i in range(2)]
X_train_flat, X_test_flat, y_train_flat, y_test_flat = train_test_split(
    x_final, y_final, test_size=0.5, random_state=123
)

In [17]:
n_splits = 5

cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

model = XGBClassifier(
    colsample_bytree=0.7,
    learning_rate=0.05,
    max_depth=2,
    min_child_weight=3,
    reg_lambda=2.0,
    reg_alpha=0.5,
    random_state=123,
    subsample=0.7,
    n_estimators=150,
)


scoring = {
    'accuracy': make_scorer(accuracy_score),
    'balanced_accuracy': make_scorer(balanced_accuracy_score),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score)
}

cv_results = cross_validate(
    model, 
    x_final, 
    y_final, 
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1,
)


In [18]:
fold_results = []
for i in range(n_splits):
    fold_results.append({
        'Fold': i + 1,
        'Train Acc': f"{cv_results['train_accuracy'][i]:.4f}",
        'Test Acc': f"{cv_results['test_accuracy'][i]:.4f}",
        'Train BA': f"{cv_results['train_balanced_accuracy'][i]:.4f}",
        'Test BA': f"{cv_results['test_balanced_accuracy'][i]:.4f}",
        'Train AUC': f"{cv_results['train_roc_auc'][i]:.4f}",
        'Test AUC': f"{cv_results['test_roc_auc'][i]:.4f}",
        'Train F1': f"{cv_results['train_f1'][i]:.4f}",
        'Test F1': f"{cv_results['test_f1'][i]:.4f}",
        'Time (s)': f"{cv_results['fit_time'][i] + cv_results['score_time'][i]:.3f}"
    })

summary_results = {
    'Accuracy': f"{cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}",
    'Balanced Accuracy': f"{cv_results['test_balanced_accuracy'].mean():.4f} ± {cv_results['test_balanced_accuracy'].std():.4f}",
    'ROC AUC': f"{cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}",
    'F1 Score': f"{cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}",
    'Total Time (s)': f"{(cv_results['fit_time'] + cv_results['score_time']).sum():.3f}"
}

print("\n" + "="*80)
print("CROSS-VALIDATION RESULTS")
print("="*80)

print("\n RESULTS BY FOLD:")
print(tabulate(fold_results, headers='keys', tablefmt='pretty'))

print("\n STATISTICS (Test Set):")
for metric, value in summary_results.items():
    print(f"   • {metric:<20}: {value}")



CROSS-VALIDATION RESULTS

 RESULTS BY FOLD:
+------+-----------+----------+----------+---------+-----------+----------+----------+---------+----------+
| Fold | Train Acc | Test Acc | Train BA | Test BA | Train AUC | Test AUC | Train F1 | Test F1 | Time (s) |
+------+-----------+----------+----------+---------+-----------+----------+----------+---------+----------+
|  1   |  1.0000   |  0.7447  |  1.0000  | 0.7194  |  1.0000   |  0.7194  |  1.0000  | 0.6471  |  1.288   |
|  2   |  0.9946   |  0.7609  |  0.9935  | 0.7495  |  0.9935   |  0.7495  |  0.9935  | 0.7027  |  1.269   |
|  3   |  0.9946   |  0.7609  |  0.9954  | 0.7417  |  0.9954   |  0.7417  |  0.9935  | 0.6857  |  1.273   |
|  4   |  0.9892   |  0.7391  |  0.9889  | 0.7232  |  0.9889   |  0.7232  |  0.9870  | 0.6667  |  1.278   |
|  5   |  0.9784   |  0.7174  |  0.9759  | 0.6735  |  0.9759   |  0.6735  |  0.9737  | 0.5517  |  1.298   |
+------+-----------+----------+----------+---------+-----------+----------+----------+-----

In [14]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, _ = clf.fit(X_train_flat, X_test_flat, y_train_flat, y_test_flat)
models

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 96, number of negative: 135
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33361
[LightGBM] [Info] Number of data points in the train set: 231, number of used features: 442
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.415584 -> initscore=-0.340927
[LightGBM] [Info] Start training from score -0.340927


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.75,0.74,0.74,0.75,0.41
LGBMClassifier,0.75,0.73,0.73,0.75,0.06
ExtraTreesClassifier,0.75,0.73,0.73,0.75,0.08
NuSVC,0.74,0.71,0.71,0.73,0.02
RandomForestClassifier,0.72,0.71,0.71,0.72,0.21
Perceptron,0.7,0.69,0.69,0.7,0.01
PassiveAggressiveClassifier,0.69,0.69,0.69,0.69,0.02
LogisticRegression,0.69,0.69,0.69,0.7,0.03
KNeighborsClassifier,0.69,0.69,0.69,0.7,0.02
LinearSVC,0.69,0.68,0.68,0.69,0.06
