In [None]:
from sklearn.decomposition import IncrementalPCA
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier


# Load the training table, then separate the 'Outcome' target from the
# feature columns ('Id' is excluded from the features as well).
data = pd.read_csv('train_2025_2026.csv')
y = data['Outcome']
X = data.drop(['Outcome', 'Id'], axis=1)



# Pipeline: impute missing values -> oversample with SMOTE -> PCA -> XGBoost.
# The number of PCA components is left at its default here and is chosen
# by the hyperparameter search.
xgb_settings = {
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'tree_method': 'hist',
    'max_bin': 128,
}

pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(metric='nan_euclidean', n_neighbors=3)),
    ('smote', SMOTE(random_state=42, k_neighbors=3)),
    ('pca', IncrementalPCA()),
    ('xgb', XGBClassifier(**xgb_settings)),
])



# Scoring: the two metrics reported for every outer fold.
# ROC-AUC is computed one-vs-rest from predicted class probabilities;
# F1 is macro-averaged over the classes.
roc_auc_ovr_scorer = make_scorer(
    roc_auc_score,
    response_method='predict_proba',
    multi_class='ovr',
)
f1_macro_scorer = make_scorer(f1_score, average='macro')

scoring = {
    'roc_auc_ovr': roc_auc_ovr_scorer,
    'f1_macro': f1_macro_scorer,
}



# Hyperparameter search space, keyed by '<pipeline step>__<parameter>'.
pca_space = {
    'pca__n_components': [20, 50, 100, 200],
}

xgb_space = {
    # Boosting schedule, tree size and row/column subsampling.
    'xgb__n_estimators': [200, 500, 800, 1200],
    'xgb__learning_rate': [0.01, 0.03, 0.1],
    'xgb__max_depth': [3, 5, 7],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
    # Split constraints and regularisation terms.
    'xgb__min_child_weight': [1, 3, 5, 10],
    'xgb__gamma': [0, 0.1, 0.3, 1],
    'xgb__reg_alpha': [0, 0.1, 0.5],
    'xgb__reg_lambda': [1, 1.5, 2],
}

param_distributions = {**pca_space, **xgb_space}



# Inner CV search: 3-fold stratified splits used to pick hyperparameters.
inner_cv = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

# Despite its name, `grid_search` is a randomized search: it draws 40
# candidates from `param_distributions`, ranks them by macro F1 and
# refits the best one.
grid_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=40,
    scoring='f1_macro',
    cv=inner_cv,
    refit=True,
    random_state=42,
    n_jobs=1,
)


# Outer CV evaluation: 10 stratified folds, each running the full inner
# search on its training part and scoring the refit model on its test part.
outer_cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# The fitted search object of every fold is kept (return_estimator=True)
# so the selected hyperparameters can be inspected afterwards.
nested_scores = cross_validate(
    estimator=grid_search,
    X=X,
    y=y,
    scoring=scoring,
    cv=outer_cv,
    return_estimator=True,
    n_jobs=-2,
)

# Print metrics: average each test score over the outer folds.
roc_aucreg = nested_scores['test_roc_auc_ovr'].mean()
f1_macroreg = nested_scores['test_f1_macro'].mean()

print(
    "Nested CV Metrics",
    f"Average ROC-AUC OvR: {roc_aucreg:.4f}",
    f"Average F1-macro: {f1_macroreg:.4f}",
    sep="\n",
)


# Best params per fold
# Each outer-fold estimator is a fitted search object; collect the
# hyperparameters it selected.
best_params_per_fold = [est.best_params_ for est in nested_scores['estimator']]


# Most common hyperparameters across folds
df_params = pd.DataFrame(best_params_per_fold)

# Take the mode column by column. Pulling a row out of df_params.mode()
# upcasts every value to float64 (e.g. n_estimators -> 1200.0), which
# integer-valued parameters may reject if the result is reused; an object
# Series keeps each column's own type. Ties resolve to the smallest value.
# NOTE: the modes are computed independently per parameter, so the
# combination shown may never have been selected together in a single fold.
most_common_params = pd.Series(
    {name: df_params[name].mode().iloc[0] for name in df_params.columns},
    dtype=object,
    name=0,
)

print("Most Frequent Parameters")
print(most_common_params)






Nested CV Metrics
Average ROC-AUC OvR: 0.9666
Average F1-macro: 0.8445
Most Frequent Parameters
xgb__subsample              1.0
xgb__reg_lambda             1.5
xgb__reg_alpha              0.1
xgb__n_estimators        1200.0
xgb__min_child_weight      10.0
xgb__max_depth              3.0
xgb__learning_rate          0.1
xgb__gamma                  0.1
xgb__colsample_bytree       0.6
pca__n_components         200.0
Name: 0, dtype: float64
