In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score

from tqdm import tqdm

import warnings; warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [3]:
import random
import os

def seed_everything(seed: int = 42):
    """
    Seed the random number sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    then stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every seeding call.
    """
    # Same order as before: stdlib generator first, NumPy second.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    os.environ["PYTHONHASHSEED"] = f"{seed}"

In [4]:
from pathlib import Path
# Input and output directories, relative to the notebook's working directory.
DATA_PATH = Path("./data/")
OUTPUT_PATH = Path("./output/")

# General settings
# SEED: random seed for the run (presumably passed to seed_everything and the
# CV splitter -- confirm at the call sites).
# N_FOLDS: number of cross-validation folds (presumably passed as n_splits).
SEED = 42
N_FOLDS = 5

# Metric used to score predictions; adjust to match the competition.
# evaluate() in this notebook accepts "auc", "rmse" and "accuracy".
METRIC = "auc"

In [None]:
def get_folds(
    X,
    y,
    cv_type="kfold",
    n_splits=5,
    shuffle=True,
    random_state=42,
    groups=None,
):
    """
    Unified CV builder for Kaggle workflows.

    Parameters
    ----------
    X : array-like
        Feature matrix; only passed through to the splitter.
    y : array-like
        Target values. Used for stratification when ``cv_type="stratified"``.
    cv_type : {"kfold", "stratified", "group"}, default "kfold"
        Splitting strategy (case-insensitive).
    n_splits : int, default 5
        Number of folds.
    shuffle : bool, default True
        Whether to shuffle before splitting. Applied to "kfold" and
        "stratified" only; the "group" splitter is built from ``n_splits``
        alone.
    random_state : int or None, default 42
        Seed for the shuffle. Ignored when ``shuffle`` is False, and not used
        by the "group" splitter.
    groups : array-like or None
        Group labels; required when ``cv_type="group"``.

    Returns
    -------
    list of (train_idx, valid_idx) pairs, one per fold.

    Raises
    ------
    ValueError
        If ``cv_type`` is unknown, if ``groups`` is missing for "group", or
        if any class has fewer than ``n_splits`` samples for "stratified".
    """

    cv_type = cv_type.lower()

    if cv_type not in ("kfold", "stratified", "group"):
        raise ValueError(f"Unknown cv_type: {cv_type}")

    # KFold / StratifiedKFold raise ValueError when random_state is set while
    # shuffle=False. Forward the seed only when it is actually used, so that
    # get_folds(..., shuffle=False) works with the default random_state.
    effective_random_state = random_state if shuffle else None

    # -------------------------
    # Stratified KFold
    # -------------------------
    if cv_type == "stratified":
        # Safety: each class must appear at least n_splits times
        _, counts = np.unique(y, return_counts=True)
        if np.any(counts < n_splits):
            raise ValueError(
                "Some classes have fewer samples than n_splits. "
                "StratifiedKFold would be invalid."
            )

        splitter = StratifiedKFold(
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=effective_random_state,
        )
        folds = list(splitter.split(X, y))

    # -------------------------
    # Group KFold
    # -------------------------
    elif cv_type == "group":
        if groups is None:
            raise ValueError("groups must be provided for GroupKFold.")

        # shuffle / random_state are intentionally not forwarded here.
        splitter = GroupKFold(n_splits=n_splits)
        folds = list(splitter.split(X, y, groups))

    # -------------------------
    # Standard KFold
    # -------------------------
    else:  # "kfold"
        splitter = KFold(
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=effective_random_state,
        )
        folds = list(splitter.split(X, y))

    return folds


In [None]:
def evaluate(y_true, y_pred, metric="auc"):
    """
    Compute score based on chosen metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets.
    y_pred : array-like
        Predictions: scores/probabilities for "auc", continuous values for
        "rmse", class labels for "accuracy".
    metric : {"auc", "rmse", "accuracy"}, default "auc"
        Metric name (case-insensitive).

    Returns
    -------
    The metric value as computed by scikit-learn (RMSE is the square root of
    ``mean_squared_error``).

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    metric = metric.lower()

    if metric == "auc":
        return roc_auc_score(y_true, y_pred)
    elif metric == "rmse":
        # The ``squared=False`` keyword was deprecated in scikit-learn 1.4 and
        # removed in 1.6; taking the root explicitly works on every version.
        return np.sqrt(mean_squared_error(y_true, y_pred))
    elif metric == "accuracy":
        return accuracy_score(y_true, y_pred)
    else:
        raise ValueError(f"Unknown metric: {metric}")


In [None]:
def run_training(X, y, model, folds, metric="auc"):
    """
    Train model across folds and return OOF predictions and scores.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Targets; rows are selected positionally with ``.iloc``.
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``, optionally
        ``predict_proba(X)``. The same instance is refit on every fold.
    folds : iterable of (train_idx, val_idx)
        Positional index pairs, e.g. the output of ``get_folds``.
    metric : str, default "auc"
        Metric name forwarded to ``evaluate``.

    Returns
    -------
    oof_preds : numpy.ndarray of shape (len(y),)
        Out-of-fold predictions. Rows that never appear in a validation
        fold stay at 0.
    scores : list
        One score per fold.

    Notes
    -----
    When the model exposes ``predict_proba`` and the metric is not
    "accuracy", column 1 of the probability matrix is used. For "accuracy"
    the class labels from ``predict`` are used instead, because
    ``accuracy_score`` cannot compare labels against continuous
    probabilities.
    """
    oof_preds = np.zeros(len(y))
    scores = []

    # Label-based metrics need hard predictions, not probabilities.
    wants_labels = metric.lower() == "accuracy"
    use_proba = hasattr(model, "predict_proba") and not wants_labels

    for fold, (train_idx, val_idx) in enumerate(folds):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        if use_proba:
            # Column 1 of predict_proba -- assumes a binary target.
            preds = model.predict_proba(X_val)[:, 1]
        else:
            preds = model.predict(X_val)
        oof_preds[val_idx] = preds

        score = evaluate(y_val, preds, metric)
        scores.append(score)
        print(f"Fold {fold+1} | {metric.upper()}: {score:.4f}")

    print(f"CV mean {metric.upper()}: {np.mean(scores):.4f} | std: {np.std(scores):.4f}")
    return oof_preds, scores