Clustering/KNN with leak-free pipeline helpers

In [32]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler

from src.data_preprocessing.features import (
    stratified_split,
    feature_mask_from_train,
    fit_transforms,
    evaluate_k,
    stability_score,
    build_per_cluster_knn,
)
from src.data_preprocessing.groups import GROUPS

# Columns that select_top_loading_cols() excludes when it builds its list of
# candidate feature columns for the PCA-loading ranking.
# NOTE(review): despite the name, several entries (e.g. "Age", "Min",
# "numeric_wage", "W", "D", "L") look numeric rather than categorical; the list
# appears to mean "identifier / context columns that must not be used as
# features" -- confirm before renaming or reusing it elsewhere.
CATEGORICAL_COLS = [
    "Rk",
    "Player",
    "Nation",
    "Pos",
    "Squad",
    "Comp",
    "Age",
    "Born",
    "MP",
    "Starts",
    "Min",
    "90s",
    "numeric_wage",
    "foot",
    "W",
    "D",
    "L",
]


# Resolve the processed-data directory relative to the repository root, taken
# here to be two levels above the directory that contains this file
# (parents[0] is the containing directory).
# NOTE(review): this relies on `__file__`, which is normally undefined when the
# code is executed as a notebook cell -- confirm how this module is run.
DATA_DIR = Path(__file__).resolve().parents[2] / "data" / "processed"



def load_position_df(pos: str) -> pd.DataFrame:
    """Load the normalized parquet table for one position code.

    The file is looked up as ``players_data_<pos>_normalized.parquet`` inside
    ``DATA_DIR``; the returned frame has a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if the expected parquet file does not exist.
    """
    path = DATA_DIR / f"players_data_{pos}_normalized.parquet"
    if path.exists():
        frame = pd.read_parquet(path)
        return frame.reset_index(drop=True)
    raise FileNotFoundError(f"Missing normalized parquet for {pos} at {path}")


def select_group_columns(df: pd.DataFrame, use_groups) -> list[str] | None:
    if not use_groups:
        return None
    selected = []
    for g in use_groups:
        cols = GROUPS.get(g, [])
        selected.extend(cols)
    selected = list(dict.fromkeys(selected))  # dedupe, keep order
    return [c for c in selected if c in df.columns]


def select_top_loading_cols(train_df: pd.DataFrame, base_allowed: list[str] | None, top_n: int = 15) -> list[str]:
    """Pick the columns with the largest absolute loadings on PC1 and PC2.

    Candidate columns are every column of ``train_df`` not listed in
    ``CATEGORICAL_COLS`` (further restricted to ``base_allowed`` when it is not
    None). The candidates are median-imputed, standardized and fed to a PCA that
    is fitted on ``train_df`` only.

    Args:
        train_df: Training split; the only data used for fitting.
        base_allowed: Optional whitelist of column names; None means no whitelist.
        top_n: Number of columns to take per principal component.

    Returns:
        The top ``top_n`` columns for PC1 followed by those for PC2 that were
        not already selected. If fewer than two components are available, all
        candidate columns are returned. An empty list is returned when there is
        no candidate column with at least one observed value.
    """
    excluded = set(CATEGORICAL_COLS)
    allowed = None if base_allowed is None else set(base_allowed)
    numeric_cols = [
        c for c in train_df.columns
        if c not in excluded and (allowed is None or c in allowed)
    ]
    if not numeric_cols:
        return []

    num_df = train_df[numeric_cols]
    # SimpleImputer (non-constant strategies) discards columns that are entirely
    # missing at fit time, which would leave fewer matrix columns than names and
    # break the loadings table below. Drop such columns explicitly instead.
    has_data = num_df.notna().any(axis=0).to_numpy()
    if not has_data.all():
        numeric_cols = [c for c, ok in zip(numeric_cols, has_data) if ok]
        num_df = num_df.loc[:, has_data]
    if not numeric_cols:
        return []

    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    num_imp = imputer.fit_transform(num_df)
    num_scaled = scaler.fit_transform(num_imp)
    # fit PCA on train only; full rank but we only need first 2 comps
    pca = PCA(random_state=42)
    pca.fit(num_scaled)
    comps = pca.components_
    if comps.shape[0] < 2:
        return numeric_cols  # not enough components to rank; fallback to all
    loadings = pd.DataFrame(comps[:2].T, columns=["PC1", "PC2"], index=numeric_cols)
    top_pc1 = loadings["PC1"].abs().nlargest(top_n).index
    top_pc2 = loadings["PC2"].abs().nlargest(top_n).index
    # Union of both rankings, PC1 picks first, without duplicates.
    return list(dict.fromkeys(list(top_pc1) + list(top_pc2)))


def sweep_k(train_X, val_X, val_df, k_grid, seed=42):
    """Evaluate every candidate cluster count and tabulate the results.

    For each ``k`` in ``k_grid`` this calls ``evaluate_k`` and then
    ``stability_score`` on the train/validation matrices and returns one row
    per ``k`` holding ``k``, the metrics dict from ``evaluate_k`` and the
    stability value under ``"stability_ari"``.

    ``val_df`` is accepted for interface compatibility but is not used here.
    """

    def _row_for(n_clusters):
        # evaluate_k returns a 3-tuple; only its first element (metrics) is used.
        metrics, _second, _third = evaluate_k(train_X, val_X, n_clusters, seed=seed)
        stability = stability_score(train_X, val_X, n_clusters, seed=seed)
        return {"k": n_clusters, **metrics, "stability_ari": stability}

    return pd.DataFrame([_row_for(n_clusters) for n_clusters in k_grid])


def choose_k(results):
    """Return the preferred ``k`` from a sweep-results table.

    Rows with a NaN ``silhouette`` are ignored. Among the rest, the winner has
    the highest ``silhouette``; ties are broken by higher ``ch`` and then lower
    ``db``. If every silhouette is NaN, the ``k`` of the first row of
    ``results`` is returned.
    """
    usable = results.dropna(subset=["silhouette"])
    if usable.empty:
        first_row = results.iloc[0]
        return int(first_row["k"])
    ranked = usable.sort_values(
        by=["silhouette", "ch", "db"],
        ascending=[False, False, True],
    )
    winner = ranked.iloc[0]
    return int(winner["k"])


def fit_final_model(train_X, k: int, seed=42):
    """Fit and return a KMeans model with ``k`` clusters on ``train_X``.

    Uses 20 initializations, at most 300 iterations, tolerance 1e-4 and the
    "elkan" algorithm; ``seed`` is passed as ``random_state``.
    """
    kmeans_params = {
        "n_clusters": k,
        "n_init": 20,
        "max_iter": 300,
        "tol": 1e-4,
        "random_state": seed,
        "algorithm": "elkan",
    }
    model = KMeans(**kmeans_params)
    model.fit(train_X)
    return model


def nearest_train_neighbors(
    km,
    train_X,
    train_df,
    test_X,
    test_df,
    test_labels,
    k = 5,
    id_col = "Player",
    prefer_same_foot = False,
    prefer_same_side = False,
    max_age_diff = None,
):
    """Build a lookup returning the nearest training rows for a given test row.

    Neighbors are searched with cosine distance, only among training rows that
    ``km`` assigned to the same cluster as the test row.

    Args:
        km: Fitted clustering model; its ``labels_`` partition ``train_X``.
        train_X: Training feature matrix (row i corresponds to ``train_df.iloc[i]``).
        train_df: Training metadata frame used for display and filtering.
        test_X: Test feature matrix.
        test_df: Test metadata frame (row i corresponds to ``test_X[i]``).
        test_labels: Cluster label of every test row.
        k: Maximum number of neighbors to return.
        id_col: Identifier column shown first in the result, if present.
        prefer_same_foot: Keep only candidates whose ``foot`` equals the query's
            (case-insensitive). Skipped when the query's foot is missing.
        prefer_same_side: Keep only candidates whose ``Pos`` has the same L/R
            side as the query. Skipped when the query has no recognisable side.
        max_age_diff: Keep only candidates whose ``Age`` is within this many
            years of the query's. Skipped when ages cannot be read as numbers
            or the query's age is missing.

    Returns:
        A function ``for_test_idx(test_idx)`` that returns a DataFrame of up to
        ``k`` training rows (display columns plus ``distance`` and ``cluster``),
        nearest first.
    """
    # Presumably maps cluster label -> (fitted neighbor index, array of the
    # positions in train_X belonging to that cluster); that is how it is used.
    per_cluster = build_per_cluster_knn(train_X, km.labels_, n_neighbors=k + 1, metric="cosine")
    cols_to_show = [c for c in [id_col, "Rk", "Pos", "Squad", "Comp"] if c in train_df.columns]

    def side(val):
        """Return "L"/"R" when the position string contains that letter, else None."""
        if not isinstance(val, str):
            return None
        val = val.upper()
        if "L" in val:
            return "L"
        if "R" in val:
            return "R"
        return None

    def for_test_idx(test_idx: int) -> pd.DataFrame:
        cluster = test_labels[test_idx]
        nn, idx = per_cluster[int(cluster)]
        n_q = min(k * 2, len(idx))  # grab more to allow post-filters
        dists, inds = nn.kneighbors(test_X[test_idx].reshape(1, -1), n_neighbors=n_q)
        global_inds = idx[inds[0]]

        # Filters are evaluated on the FULL candidate rows. Filtering the
        # display subset instead would silently skip the foot and age filters,
        # because "foot" and "Age" are not among the displayed columns.
        candidates = train_df.iloc[global_inds]
        query = test_df.iloc[test_idx]
        keep = np.ones(len(candidates), dtype=bool)

        if prefer_same_foot and "foot" in candidates.columns and "foot" in query.index:
            if pd.notna(query["foot"]):
                qf = str(query["foot"]).lower()
                same_foot = candidates["foot"].astype(str).str.lower() == qf
                keep &= same_foot.to_numpy(dtype=bool)

        if prefer_same_side and "Pos" in candidates.columns and "Pos" in query.index:
            qs = side(query["Pos"])
            if qs is not None:
                same_side = candidates["Pos"].apply(side) == qs
                keep &= same_side.to_numpy(dtype=bool)

        if max_age_diff is not None and "Age" in candidates.columns and "Age" in query.index:
            try:
                qa = float(query["Age"])
                ages = candidates["Age"].astype(float)
            except (TypeError, ValueError):
                # Best effort: ages that are not numeric disable the age filter.
                pass
            else:
                if not np.isnan(qa):
                    close_age = (ages - qa).abs() <= max_age_diff
                    keep &= close_age.to_numpy(dtype=bool)

        out = candidates[cols_to_show].copy()
        out["distance"] = dists[0]
        out["cluster"] = cluster
        out = out[keep].head(k)
        return out.reset_index(drop=True)

    return for_test_idx


def knn_reciprocity_stats(X: np.ndarray, labels: np.ndarray, k: int = 5) -> dict:
    """Simple KNN graph stats on a given split (uses that split as both query and pool).

    Within every cluster of at least two points, each point's up-to-``k``
    nearest other points (cosine distance) form directed edges.

    Returns:
        A dict with
        ``reciprocity``: fraction of directed edges i -> j for which i is also
        among j's ``k`` nearest other points (NaN when there are no edges), and
        ``mean_kth_dist``: mean distance to the farthest retained neighbor of
        each point (NaN when no point has a neighbor).
    """
    per_cluster = build_per_cluster_knn(X, labels, n_neighbors=k + 1, metric="cosine")
    total = 0
    mutual = 0
    kth_dists = []
    for _, (nn, idx) in per_cluster.items():
        if len(idx) < 2:
            continue
        n_q = min(k + 1, len(idx))
        # One batched query per cluster; row i describes cluster member idx[i].
        # Returned indices are positions within the cluster (assumes the index
        # was fitted on X[idx], as the idx[...] remapping elsewhere implies).
        dists, inds = nn.kneighbors(X[idx], n_neighbors=n_q)

        # Top-k neighbors of every member with the member itself masked out by
        # identity (with duplicate points "self" is not guaranteed to be first).
        neigh_lists = []
        neigh_dists = []
        for pos, (row_inds, row_dists) in enumerate(zip(inds, dists)):
            not_self = row_inds != pos
            neigh_lists.append(row_inds[not_self][:k])
            neigh_dists.append(row_dists[not_self][:k])
        neigh_sets = [set(row.tolist()) for row in neigh_lists]

        for pos, (neighs, nd) in enumerate(zip(neigh_lists, neigh_dists)):
            if len(neighs) == 0:
                continue
            total += len(neighs)
            kth_dists.append(nd[-1])
            # Edge pos -> other is mutual iff pos is in other's own top-k list.
            mutual += sum(1 for other in neighs.tolist() if pos in neigh_sets[other])
    reciprocity = mutual / total if total else np.nan
    return {
        "reciprocity": reciprocity,
        "mean_kth_dist": float(np.mean(kth_dists)) if kth_dists else np.nan,
    }


def self_hit_rate(X: np.ndarray, labels: np.ndarray, k: int = 5, eps: float = 1e-9) -> float:
    """
    Share of points whose closest other point in the same cluster is a near-duplicate.

    Each point is queried against its own cluster (cosine distance) with the
    point itself removed from the results; it counts as a hit when the nearest
    remaining neighbor lies at distance <= eps. Clusters with fewer than two
    points are skipped. Returns NaN when no point could be evaluated.
    Handy for spotting collapsed or duplicated rows.
    """
    cluster_index = build_per_cluster_knn(X, labels, n_neighbors=k + 1, metric="cosine")
    n_hits = 0
    n_evaluated = 0
    for _, (nn, members) in cluster_index.items():
        if len(members) < 2:
            continue
        n_query = min(k + 1, len(members))
        all_dists, all_inds = nn.kneighbors(X[members], n_neighbors=n_query)
        for pos, (local_inds, local_dists) in enumerate(zip(all_inds, all_dists)):
            # Drop the query point itself, then keep at most k distances.
            is_other = members[local_inds] != members[pos]
            other_dists = local_dists[is_other][:k]
            if len(other_dists) == 0:
                continue
            n_evaluated += 1
            if other_dists[0] <= eps:
                n_hits += 1
    if not n_evaluated:
        return np.nan
    return n_hits / n_evaluated


def run_position(
    pos="FW",
    k_grid=(2, 3, 4),
    max_missing=0.4,
    min_variance=1e-6,
    corr_thresh=0.9,
    with_pca=0.95,
    seed=42,
    use_groups=None,
    group_presets=None,
    include_pca_top=False,
    pca_top_n=15,
):
    """Run the clustering + neighbor pipeline for one position and report metrics.

    Steps: load the position table, split it into train/val/test, and for every
    feature-group combination fit the transforms on train, sweep ``k_grid`` on
    validation and remember the combination whose chosen ``k`` has the highest
    validation silhouette. The winning combination is then refitted on train
    and scored on test; progress and metrics are printed along the way.

    Args:
        pos: Position code passed to ``load_position_df``.
        k_grid: Candidate cluster counts.
        max_missing, min_variance, corr_thresh: Forwarded to
            ``feature_mask_from_train``.
        with_pca: Forwarded to ``fit_transforms``.
        seed: Seed forwarded to the split, transforms, sweep and final model.
        use_groups: Single group combination used when ``group_presets`` is
            empty; also the whitelist for the "pca_top" combination.
        group_presets: Optional list of group combinations to compare
            (``None`` inside the list means "all groups").
        include_pca_top: Also try the "pca_top" combination, i.e. the columns
            chosen by ``select_top_loading_cols``.
        pca_top_n: ``top_n`` for ``select_top_loading_cols``.

    Returns:
        Dict with ``val_results`` (one summary per combination), ``best_combo``,
        ``best_k`` and ``test_metrics``.
    """
    df = load_position_df(pos)
    train_df, val_df, test_df = stratified_split(df, seed=seed)

    combos = list(group_presets) if group_presets else [use_groups]
    if include_pca_top and "pca_top" not in combos:
        combos.append("pca_top")
    results_summary = []
    best_combo = None
    best_state = None
    best_sil = -np.inf

    for combo in combos:
        if combo == "pca_top":
            base_allowed = select_group_columns(train_df, use_groups) if use_groups else None
            allowed_numeric = select_top_loading_cols(train_df, base_allowed, top_n=pca_top_n)
        else:
            allowed_numeric = select_group_columns(train_df, combo)
        numeric_cols = feature_mask_from_train(
            train_df,
            max_missing=max_missing,
            min_variance=min_variance,
            corr_thresh=corr_thresh,
            allowed_numeric=allowed_numeric,
        )
        feats = fit_transforms(
            train_df,
            val_df,
            test_df,
            numeric_cols,
            with_pca=with_pca,
            seed=seed,
        )

        group_msg = f"groups={combo}" if combo else "groups=all"
        print(f"\n{pos}: kept {len(numeric_cols)} numeric cols; with_pca={with_pca}; {group_msg}")
        val_results = sweep_k(feats["X_train"], feats["X_val"], val_df, k_grid, seed=seed)
        print("Validation metrics:")
        print(val_results)

        best_k = choose_k(val_results)
        chosen_row = val_results[val_results["k"] == best_k].iloc[0]
        results_summary.append({"groups": combo, "best_k": best_k, "silhouette": chosen_row["silhouette"]})

        # Rank a NaN silhouette as the worst possible score. Comparing against
        # a stored NaN is always False, so without this an undefined first
        # combination could never be replaced by a later, valid one.
        sil = chosen_row["silhouette"]
        sil_rank = -np.inf if pd.isna(sil) else float(sil)
        if best_state is None or sil_rank > best_sil:
            best_sil = sil_rank
            best_combo = combo
            best_state = (numeric_cols, feats, best_k, group_msg)

    # Train/test on best combo
    numeric_cols, feats, best_k, group_msg = best_state
    print(f"\nSelected combo: {group_msg} with k={best_k}")
    km = fit_final_model(feats["X_train"], best_k, seed=seed)
    test_labels = km.predict(feats["X_test"])
    # The internal cluster metrics are undefined for a single predicted cluster.
    if len(np.unique(test_labels)) > 1:
        test_sil = silhouette_score(feats["X_test"], test_labels)
        test_db = davies_bouldin_score(feats["X_test"], test_labels)
        test_ch = calinski_harabasz_score(feats["X_test"], test_labels)
    else:
        test_sil = test_db = test_ch = np.nan

    print("\nTest metrics:")
    test_knn_stats = knn_reciprocity_stats(feats["X_test"], test_labels, k=10)
    test_self_hit = self_hit_rate(feats["X_test"], test_labels, k=10)
    print(
        {
            # `v == v` is False only for NaN.
            "silhouette": float(test_sil) if test_sil == test_sil else np.nan,
            "db": float(test_db) if test_db == test_db else np.nan,
            "ch": float(test_ch) if test_ch == test_ch else np.nan,
            **test_knn_stats,
            "self_hit_at_10": test_self_hit,
        }
    )

    neighbors_fn = nearest_train_neighbors(
        km,
        feats["X_train"],
        train_df,
        feats["X_test"],
        test_df,
        test_labels,
        k=5,
        prefer_same_foot=False,
        prefer_same_side=False,
        max_age_diff=None,
    )
    # Example: show neighbors for first test row if available
    if len(test_df) > 0:
        example = neighbors_fn(0)
        print("\nNearest train neighbors for first test player:")
        print(example)
    return {
        "val_results": results_summary,
        "best_combo": best_combo,
        "best_k": best_k,
        "test_metrics": {
            "silhouette": test_sil,
            "db": test_db,
            "ch": test_ch,
            **test_knn_stats,
            "self_hit_at_10": test_self_hit,
        },
    }


if __name__ == "__main__":
    # Feature-group combinations compared on validation, from no restriction
    # (None) to progressively larger sets of groups.
    _group_presets = [
        None,
        ["goal_shot_creation"],
        ["passing", "goal_shot_creation"],
        ["passing", "goal_shot_creation", "pass_types", "possession"],
        ["passing", "goal_shot_creation", "pass_types", "possession", "defense", "misc"],
    ]
    run_position(
        pos="FW",
        k_grid=(3, 4),
        with_pca=0.6,
        include_pca_top=True,
        pca_top_n=15,
        group_presets=_group_presets,
    )