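Univariate feature selection (SelectKBest)
Automatically chooses classification vs. regression, tunes k with 5-fold cross-validation, and saves the fitted selector plus the selected train/val/test sets.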

In [None]:
from functools import partial
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# ---------------------------------------------------------------------
# Detect task type (classification vs regression) from y_train
def _is_classification(y: pd.Series, max_classes: int = 50) -> bool:
    """Guess whether target *y* describes a classification problem.

    The decision is a dtype heuristic:

    * categorical, object, or string/bytes-like targets are classification;
    * boolean or integer targets are classification when they contain at
      most ``max_classes`` distinct values (missing values count as a value);
    * everything else (e.g. floats) is treated as regression.

    Args:
        y: Target values. A ``pd.Series`` is expected; other 1-D array-likes
            are wrapped in a Series first.
        max_classes: Largest number of distinct values a boolean/integer
            target may have and still be considered classification.

    Returns:
        ``True`` for classification, ``False`` for regression.
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    dtype = y.dtype
    # "O" is the object kind; "U"/"S" are string/bytes kinds that some
    # extension dtypes may report instead of "O".
    if isinstance(dtype, pd.CategoricalDtype) or dtype.kind in ("O", "U", "S"):
        return True

    # Integer/bool labels: only a small label set counts as classes, so that
    # count-like integer targets still fall through to regression.
    return dtype.kind in ("b", "i", "u") and y.nunique(dropna=False) <= max_classes

IS_CLASSIFICATION = _is_classification(y_train)

# The mutual-information estimators draw random numbers internally, so an
# unseeded score function can rank features differently on every run and CV
# fold. Binding a fixed seed keeps the selected features reproducible.
KBEST_SEED = 42

score_func = partial(
    mutual_info_classif if IS_CLASSIFICATION else mutual_info_regression,
    random_state=KBEST_SEED,
)

# Downstream estimator used to score each candidate k during the grid search.
# LogisticRegression's n_jobs is left at its default (None).
model = LogisticRegression(max_iter=1000) if IS_CLASSIFICATION else Ridge()

# Short aliases for the scaled train/validation/test feature matrices
# produced earlier in the notebook.
Xtr = X_train_std
Xva = X_val_std
Xte = X_test_std

# Candidate k values; never ask for more features than there are columns.
# If every candidate is too large, fall back to a single clamped value.
n_feats = Xtr.shape[1]
k_grid = [k for k in (5, 10, 15, 20, 30, 40, 60) if k <= n_feats] or [
    min(10, max(1, n_feats))
]

# Selection happens inside the pipeline so that each CV fold fits its own
# selector. The initial k is only a placeholder: the grid search overrides it.
pipe = Pipeline(
    steps=[
        ("kbest", SelectKBest(score_func, k=k_grid[0])),
        ("model", model),
    ]
)

# Only the number of kept features is tuned.
param_grid = {"kbest__k": k_grid}

# 5-fold cross-validated search over k, parallelised across all cores.
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=0)
grid.fit(Xtr, y_train)

# Winning k and the fitted selector step of the best pipeline
best_k = grid.best_params_["kbest__k"]
selector: SelectKBest = grid.best_estimator_.named_steps["kbest"]

# Names of the columns the selector keeps
kept_mask = selector.get_support()
kept_cols = Xtr.columns[kept_mask].tolist()

# Reduce each split to the selected columns
Xtr_sel_np, Xva_sel_np, Xte_sel_np = (
    selector.transform(split) for split in (Xtr, Xva, Xte)
)

# Wrap the transformed values back into DataFrames, restoring the original
# row index and the kept column names.
Xtr_sel, Xva_sel, Xte_sel = (
    pd.DataFrame(values, index=source.index, columns=kept_cols)
    for values, source in (
        (Xtr_sel_np, Xtr),
        (Xva_sel_np, Xva),
        (Xte_sel_np, Xte),
    )
)

# Per-feature selection scores, highest first (features with a NaN score are
# left out), plus the ten best for a quick report.
scores = (
    pd.Series(selector.scores_, index=Xtr.columns)
    .dropna()
    .sort_values(ascending=False)
)
top10 = scores.iloc[:10]

if IS_CLASSIFICATION:
    print("Task type: Classification")
else:
    print("Task type: Regression")
print(f"Best k selected via 5-fold CV: {best_k}")
print(f"Best CV score (pipeline): {grid.best_score_:.4f}")
print("\nTop 10 feature scores:")
print(top10)

# Save selector + selected datasets.
# OUT is expected to be defined earlier in the notebook; coerce it to a Path
# only when it is not one already.
if not isinstance(OUT, Path):
    OUT = Path(OUT)
OUT.mkdir(parents=True, exist_ok=True)

joblib.dump(selector, OUT / "selector_kbest.pkl")

# Attach the target to each selected split. `.values` assigns positionally,
# so the targets are not re-aligned on the index.
train_kbest = Xtr_sel.copy()
train_kbest[TARGET] = y_train.values

val_kbest = Xva_sel.copy()
val_kbest[TARGET] = y_val.values

test_kbest = Xte_sel.copy()
test_kbest[TARGET] = y_test.values

# index=False: the row index is not written to the CSV files.
train_kbest.to_csv(OUT / "train_selected_kbest.csv", index=False)
val_kbest.to_csv(OUT / "val_selected_kbest.csv", index=False)
test_kbest.to_csv(OUT / "test_selected_kbest.csv", index=False)

print("\nSaved:")
print("- selector_kbest.pkl")
print("- train_selected_kbest.csv")
print("- val_selected_kbest.csv")
print("- test_selected_kbest.csv")
print(f"\nKept {len(kept_cols)}/{n_feats} features.")
