In [1]:
# ===========================================
# 03_GK Static anomaly (IsolationForest)
# ===========================================
import os
import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest

# Input (02_gk): root directory holding one sub-directory per athlete.
BASE_02_GK = "/workspace/data/02_gk/players"

# Output (03_gk): root directory for the static anomaly labels; created up front.
BASE_03_GK = "/workspace/data/03_gk/static"
os.makedirs(BASE_03_GK, exist_ok=True)

# Settings for the per-player static anomaly run. The whole dict is also
# serialised with json.dumps into the "params" column of every saved result.
GK_STATIC_CONFIG = {
    "scaler": "RobustScaler",
    "contamination": 0.05,      # GK data is relatively sparse, so a slightly higher value than FP (0.03) is recommended
    "top_k_features": 5,
    "random_state": 42,
    "min_samples": 12,          # players with too few samples are skipped
}

# Metadata (non-feature) column names.
# NOTE(review): not referenced anywhere else in this cell - confirm whether later cells use it.
META_COLS = ["athlete_id", "athlete_name", "date_", "md_offset", "md_phase", "is_match_day"]

# Candidate GK feature columns. prepare_gk_static_features keeps only those
# that are present in a player's frame, numeric, and non-constant.
GK_COLS = [
    "total_dive_load_centre", "total_dive_load_left", "total_dive_load_right",
    "dive_right_count", "dive_left_count", "dive_centre_count",
    "total_dives_centre", "total_dives_left", "total_dives_right",
    "total_time_to_feet_centre", "total_time_to_feet_left", "total_time_to_feet_right"
]

def prepare_gk_static_features(df_gk: pd.DataFrame):
    """
    Select the usable GK feature columns from one player's frame.

    Selection rules:
    - only columns listed in GK_COLS are considered, so metadata columns
      never enter the feature matrix
    - of those, only the ones actually present in ``df_gk``
    - numeric dtypes only
    - columns whose standard deviation is not strictly positive are dropped
      (constant columns, and columns whose std is NaN)

    Returns
    -------
    (pd.DataFrame, list)
        The filtered feature frame (same row index as ``df_gk``) and the
        list of retained column names, in GK_COLS order.
    """
    present = [name for name in GK_COLS if name in df_gk.columns]
    numeric = df_gk[present].select_dtypes(include=[np.number])

    # Keep a column only when its spread is strictly positive; a NaN std
    # fails the comparison and is therefore dropped as well.
    spread = numeric.std()
    kept = [name for name, value in spread.items() if value > 0]

    return numeric[kept], kept

def compute_top_features_z(row, mean_vec, std_vec, cols, top_k=5):
    """
    Rank the features of one row by absolute z-score, in an explainable form.

    Parameters
    ----------
    row : pd.Series
        Feature values of a single observation.
    mean_vec, std_vec : pd.Series
        Per-feature mean and standard deviation used for standardisation.
    cols : list
        Feature names to evaluate.
    top_k : int
        Number of features to report.

    Returns
    -------
    list of dict
        ``[{"feature": ..., "z": ...}, ...]`` ordered by decreasing ``|z|``;
        ``z`` keeps its sign.
    """
    z_scores = row[cols].sub(mean_vec).div(std_vec)
    # A zero or missing std yields inf/NaN; treat those features as "no deviation".
    z_scores = z_scores.replace([np.inf, -np.inf], np.nan).fillna(0)

    ranked = z_scores.abs().sort_values(ascending=False).head(top_k)

    return [{"feature": name, "z": float(z_scores[name])} for name in ranked.index.tolist()]

def run_gk_static_anomaly_single_player(df_gk: pd.DataFrame, config: dict):
    """
    Fit an IsolationForest on one player's GK features and label every row.

    Parameters
    ----------
    df_gk : pd.DataFrame
        One player's rows. Must contain a "date_" column; feature columns are
        picked by prepare_gk_static_features. The frame may carry any index:
        all row matching inside this function is positional.
    config : dict
        Reads the keys "min_samples", "contamination", "random_state" and
        "top_k_features".

    Returns
    -------
    pd.DataFrame or None
        None when the player cannot be modelled: fewer than
        ``config["min_samples"]`` rows, or no usable (numeric, non-constant)
        feature column. Otherwise a frame with a fresh 0..n-1 index, one row
        per input row in input order, and the columns:
        - date_          : "date_" parsed with pd.to_datetime
        - static_score   : IsolationForest decision_function value
        - static_thr     : contamination-quantile of static_score (same on every row)
        - static_anomaly : 1 where fit_predict returned -1, else 0
        - top_features   : JSON string of the top |z| features on anomalous
                           rows, None elsewhere
    """
    X_raw, feature_cols = prepare_gk_static_features(df_gk)

    if X_raw.shape[0] < config["min_samples"]:
        return None

    # With enough rows but no varying numeric feature the matrix has zero
    # columns, and fitting scikit-learn estimators on it raises. Report the
    # player as not modellable instead of aborting the caller's batch loop.
    if len(feature_cols) == 0:
        return None

    # Impute missing values with the per-column median.
    X_filled = X_raw.fillna(X_raw.median())

    # Robust scaling
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X_filled)

    # IsolationForest
    iso = IsolationForest(
        contamination=config["contamination"],
        random_state=config["random_state"],
    )
    iso_pred  = iso.fit_predict(X_scaled)
    iso_score = iso.decision_function(X_scaled)  # higher means more normal (sklearn convention)

    # Threshold taken from the lower tail of the scores (the contamination
    # fraction). It is stored for reference only; the anomaly flag below
    # comes from fit_predict.
    thr = float(np.quantile(iso_score, config["contamination"]))

    # Positional boolean mask (NumPy array). Using an array rather than a
    # Series keeps the row selection independent of df_gk's index labels.
    is_anomaly = iso_pred == -1

    out = pd.DataFrame({
        "date_": pd.to_datetime(df_gk["date_"]).values,
        "static_score": iso_score,
        "static_thr": thr,
        "static_anomaly": is_anomaly.astype(int),
    })

    # top_features is filled on anomalous rows only; z-scores use the mean/std
    # of the imputed (unscaled) features. A zero std becomes NaN so the helper
    # maps that feature to z = 0.
    mean_vec = X_filled.mean()
    std_vec  = X_filled.std().replace(0, np.nan)

    out["top_features"] = None
    if is_anomaly.any():
        top_json = (
            X_filled.loc[is_anomaly]
            .apply(lambda row: compute_top_features_z(row, mean_vec, std_vec, feature_cols, config["top_k_features"]), axis=1)
            .apply(json.dumps)  # JSON string so the column is stored reliably in parquet
        )
        # Assign by position: top_json carries df_gk's index labels, which
        # need not match out's RangeIndex.
        out.loc[is_anomaly, "top_features"] = top_json.to_numpy()

    return out

# ===========================================
# Run for ALL GK players
# ===========================================
# Every sub-directory of BASE_02_GK is treated as one athlete id.
athlete_dirs = sorted(
    entry
    for entry in os.listdir(BASE_02_GK)
    if os.path.isdir(os.path.join(BASE_02_GK, entry))
)

print("GK players:", len(athlete_dirs))

# Per-player result frames, collected for the summary after the loop.
results = []

for athlete_id in athlete_dirs:
    in_path = os.path.join(BASE_02_GK, athlete_id, f"{athlete_id}_gk_static.parquet")
    if not os.path.exists(in_path):
        # This athlete has no static feature file; nothing to score.
        continue

    # Load in chronological order with a clean 0..n-1 index.
    df_gk = pd.read_parquet(in_path)
    df_gk = df_gk.sort_values("date_").reset_index(drop=True)

    res = run_gk_static_anomaly_single_player(df_gk, GK_STATIC_CONFIG)
    if res is None:
        print(f"‚ö† skip (too few samples): {athlete_id}")
        continue

    # Attach provenance: who was scored, with which method and settings.
    res = res.assign(
        athlete_id=athlete_id,
        method="IsolationForest",
        params=json.dumps(GK_STATIC_CONFIG),
    )

    # One output folder per athlete under BASE_03_GK.
    out_dir = os.path.join(BASE_03_GK, athlete_id)
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "static_labels.parquet")
    res.to_parquet(out_path, index=False)

    results.append(res)
    print(f"‚úÖ saved GK static anomaly ‚Üí {athlete_id}")

print("üéâ GK static anomaly completed")

# -------------------------------------------
# sanity check
# -------------------------------------------
# Only meaningful when at least one player was scored.
if results:
    df_all = pd.concat(results, ignore_index=True)

    # Overall number of normal (0) vs anomalous (1) rows.
    anomaly_counts = df_all["static_anomaly"].value_counts()
    print("\n=== GK static_anomaly counts ===")
    print(anomaly_counts)

    # Distribution of the per-player anomaly share.
    anomaly_rate_by_player = df_all.groupby("athlete_id")["static_anomaly"].mean()
    print("\n=== GK player-wise anomaly rate (describe) ===")
    print(anomaly_rate_by_player.describe())

    display(df_all.head())


GK players: 4
‚úÖ saved GK static anomaly ‚Üí 09bb407a-a555-496a-957d-25a97ccb1519
‚úÖ saved GK static anomaly ‚Üí 0bced2f8-0a31-4d07-b836-f7456918c0dd
‚úÖ saved GK static anomaly ‚Üí abe73ab3-ecbc-41f5-8d98-45856655d084
‚úÖ saved GK static anomaly ‚Üí dc36ad4c-11e2-4ca1-865c-239a0845d4bd
üéâ GK static anomaly completed

=== GK static_anomaly counts ===
static_anomaly
0    885
1     49
Name: count, dtype: int64

=== GK player-wise anomaly rate (describe) ===
count    4.000000
mean     0.052493
std      0.001272
min      0.051064
25%      0.051611
50%      0.052595
75%      0.053478
max      0.053719
Name: static_anomaly, dtype: float64


Unnamed: 0,date_,static_score,static_thr,static_anomaly,top_features,athlete_id,method,params
0,2023-01-12,0.035705,-1.3986210000000002e-17,0,,09bb407a-a555-496a-957d-25a97ccb1519,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
1,2023-01-13,0.178917,-1.3986210000000002e-17,0,,09bb407a-a555-496a-957d-25a97ccb1519,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
2,2023-01-14,-0.172331,-1.3986210000000002e-17,1,"[{""feature"": ""total_dives_centre"", ""z"": 7.6572...",09bb407a-a555-496a-957d-25a97ccb1519,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
3,2023-01-16,0.089095,-1.3986210000000002e-17,0,,09bb407a-a555-496a-957d-25a97ccb1519,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
4,2023-01-17,0.147581,-1.3986210000000002e-17,0,,09bb407a-a555-496a-957d-25a97ccb1519,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
