In [6]:
# ===========================================
# 0. Setup
# ===========================================
import os
import json
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest

# Input directory (02)
BASE_02 = "/workspace/data/02/players"

# Output directory (03); created here if it does not exist yet
BASE_03 = "/workspace/data/03/static"
os.makedirs(BASE_03, exist_ok=True)

# Parameters (★ to be saved to config.json later)
STATIC_CONFIG = dict(
    scaler="RobustScaler",
    contamination=0.03,   # lowest fraction of scores treated as anomalous (used for the score threshold)
    top_k_features=5,
    random_state=42,
)


In [7]:
# ===========================================
# 1. Utility functions (Static)
# ===========================================
# Goalkeeper-specific columns: never used as static features.
GK_COLS = [
    "total_dive_load_centre",
    "total_dive_load_left",
    "total_dive_load_right",
    "dive_right_count",
    "dive_left_count",
    "dive_centre_count",
    "total_dives_centre",
    "total_dives_left",
    "total_dives_right",
    "total_time_to_feet_centre",
    "total_time_to_feet_left",
    "total_time_to_feet_right",
]

# Identifier / calendar columns: never used as static features.
META_COLS = [
    "athlete_id",
    "athlete_name",
    "date_",
    "md_offset",
    "md_phase",
    "is_match_day",
]


def prepare_static_features(df_static: pd.DataFrame):
    """
    Select the feature matrix used by the static anomaly model.

    Steps:
      - drop every column listed in META_COLS or GK_COLS
      - keep numeric columns only
      - drop columns whose standard deviation is not strictly positive
        (constant columns, and columns whose std is NaN)

    Returns:
      (X, cols): the filtered DataFrame (missing values are left untouched)
      and the list of its column names, in the original column order.
    """
    excluded = set(META_COLS) | set(GK_COLS)
    candidates = [col for col in df_static.columns if col not in excluded]

    numeric = df_static[candidates].select_dtypes(include=[np.number])

    # A NaN std compares False against 0, so such columns are dropped as well.
    spread = numeric.std()
    keep = spread.loc[spread > 0].index.tolist()

    return numeric[keep], keep

def compute_top_features_with_z(row, mean_vec, std_vec, cols, top_k=5):
    """
    Explain one anomalous day by its most deviating features.

    - computes the signed z-score of ``row[cols]`` against ``mean_vec`` / ``std_vec``
    - features whose std is 0 (or whose z-score is otherwise NaN) are skipped
    - returns the ``top_k`` features with the largest |z| as
      ``[{"feature": name, "z": signed_z}, ...]``, largest |z| first
    """
    # A zero std becomes NaN so the division yields NaN instead of +/-inf.
    safe_std = std_vec.replace(0, np.nan)
    z_scores = ((row[cols] - mean_vec) / safe_std).dropna()

    # Rank by magnitude, but report the signed value.
    ranked = z_scores.abs().sort_values(ascending=False)

    explanation = []
    for name in ranked.head(top_k).index:
        explanation.append({"feature": name, "z": float(z_scores[name])})
    return explanation

In [8]:
# ===========================================
# 2. Static anomaly for single player
# ===========================================
def run_static_anomaly_single_player(
    df_static: pd.DataFrame,
    contamination: float,
    top_k: int,
    random_state: int
):
    """
    Fit an Isolation Forest on one player's static features and label anomalous days.

    Parameters:
      df_static:     one player's daily static table; must contain a ``date_`` column.
                     Its index may be arbitrary (rows are handled positionally).
      contamination: fraction passed to IsolationForest and also used as the score
                     quantile below which a day is flagged.
      top_k:         number of features listed in the explanation of each anomalous day.
      random_state:  seed for IsolationForest.

    Returns:
      out_df: DataFrame with
        [date_, static_score, static_thr, static_anomaly, top_features]
        (``top_features`` is a JSON string for anomalous rows, None otherwise)
      artifacts: dict (for backend use) with feature_cols, impute_median,
        static_thr and the config actually used for this run.

      ``(None, None)`` is returned when there are fewer than 10 rows or when no
      usable feature column remains after filtering.
    """

    # ---------- feature preparation ----------
    X_raw, feature_cols = prepare_static_features(df_static)

    # Too few rows, or nothing left to model on (the scaler / forest would fail
    # on a zero-column matrix).
    if X_raw.shape[0] < 10 or len(feature_cols) == 0:
        return None, None

    # Fill missing values with the per-column median
    impute_median = X_raw.median()
    X_filled = X_raw.fillna(impute_median)

    # Robust scaling
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X_filled)

    # ---------- Isolation Forest (fit) ----------
    # contamination is passed to fit as well, but the final decision is made
    # with the score threshold below (S1).
    iso = IsolationForest(
        contamination=contamination,
        random_state=random_state
    )
    iso.fit(X_scaled)

    # score (larger = more normal)
    static_score = iso.decision_function(X_scaled)

    # ---------- S1: thresholding by score quantile ----------
    # The lowest `contamination` fraction of scores is treated as anomalous.
    # NOTE(review): with a numeric contamination, scikit-learn documents that
    # decision_function is offset so training outliers score below 0, so this
    # threshold is expected to sit at ~0 - confirm that is intended.
    static_thr = float(np.quantile(static_score, contamination))
    anomaly_mask = static_score <= static_thr   # positional NumPy bool array

    # ---------- result dataframe ----------
    out = pd.DataFrame({
        "date_": df_static["date_"].values,
        "static_score": static_score,
        "static_thr": static_thr,
        "static_anomaly": anomaly_mask.astype(int),
    })

    # ---------- S2: top features with signed z ----------
    mean_vec = X_filled.mean()
    std_vec = X_filled.std()

    out["top_features"] = None

    if anomaly_mask.any():
        # `out` has a fresh RangeIndex while X_filled keeps df_static's index.
        # Selecting and assigning by position (NumPy mask / values) keeps the
        # rows aligned even when the caller did not reset the index.
        explanations = (
            X_filled.loc[anomaly_mask]
            .apply(
                lambda row: json.dumps(
                    compute_top_features_with_z(row, mean_vec, std_vec, feature_cols, top_k),
                    ensure_ascii=False
                ),
                axis=1
            )
        )
        out.loc[anomaly_mask, "top_features"] = explanations.to_numpy()

    # ---------- S3: artifacts for backend ----------
    # Record the parameters this run actually used: start from the notebook-level
    # config and overlay the arguments, so a call with non-default values is not
    # mis-reported.
    effective_config = {
        **STATIC_CONFIG,
        "contamination": contamination,
        "top_k_features": top_k,
        "random_state": random_state,
    }
    artifacts = {
        "feature_cols": feature_cols,
        "impute_median": {k: float(v) if pd.notna(v) else None for k, v in impute_median.to_dict().items()},
        "static_thr": static_thr,
        "config": effective_config,
    }

    return out, artifacts

In [9]:
# ===========================================
# 3. Run for ALL players
# ===========================================
# Every sub-directory of BASE_02 is treated as one player; its name is the athlete_id.
athlete_dirs = sorted([
    d for d in os.listdir(BASE_02)
    if os.path.isdir(os.path.join(BASE_02, d))
])

print("対象選手数:", len(athlete_dirs))

# Per-player label frames, kept for the sanity check that follows.
results = []

for athlete_id in athlete_dirs:

    # Players without a static parquet file are skipped silently.
    static_path = os.path.join(BASE_02, athlete_id, f"{athlete_id}_static.parquet")
    if not os.path.exists(static_path):
        continue

    # Chronological order + fresh 0..n-1 index.
    df_static = pd.read_parquet(static_path).sort_values("date_").reset_index(drop=True)

    res, artifacts = run_static_anomaly_single_player(
        df_static=df_static,
        contamination=STATIC_CONFIG["contamination"],
        top_k=STATIC_CONFIG["top_k_features"],
        random_state=STATIC_CONFIG["random_state"],
    )

    # None means the player was not modelled (e.g. too few rows).
    if res is None:
        continue

    # Attach metadata; athlete_id is stored as the directory-name string.
    res["athlete_id"] = athlete_id
    res["method"] = "IsolationForest"
    res["params"] = json.dumps(STATIC_CONFIG, ensure_ascii=False)

    # Save labels
    out_dir = os.path.join(BASE_03, athlete_id)
    os.makedirs(out_dir, exist_ok=True)

    out_path = os.path.join(out_dir, "static_labels.parquet")
    res.to_parquet(out_path, index=False)

    # Save S3 artifacts. ensure_ascii=False emits non-ASCII characters verbatim,
    # so the file encoding is pinned to UTF-8 instead of the platform default.
    art_path = os.path.join(out_dir, "static_artifacts.json")
    with open(art_path, "w", encoding="utf-8") as f:
        json.dump(artifacts, f, ensure_ascii=False)

    results.append(res)

    print(f"✅ saved static anomaly → {athlete_id}")

print("完了：static anomaly (all players)")

対象選手数: 30
✅ saved static anomaly → 121b05df-f5f6-4029-92a7-5420dea45e4d
✅ saved static anomaly → 13bb34b4-8c38-4c86-86b8-bbe8574988c8
✅ saved static anomaly → 15d36f96-6a91-4787-96f8-5fdf8565006b
✅ saved static anomaly → 223a7cbc-a76b-4e36-ab5c-215fc9492e84
✅ saved static anomaly → 3ded61ff-c67b-4776-a1ef-5050bb5c7fd3
✅ saved static anomaly → 44eea4b6-3614-4ca2-b8d7-098b6120c1fb
✅ saved static anomaly → 45f771a4-dab1-4e2a-8f92-cd219b677ab9
✅ saved static anomaly → 4759d9d8-9e0e-44b8-9c70-874bf974c4ba
✅ saved static anomaly → 68341b43-2561-4972-95f4-341f8530e023
✅ saved static anomaly → 6eda50d0-970c-44ab-b470-de9ebc71ae52
✅ saved static anomaly → 714e5fd4-609c-4e96-bea6-b7f3966bf681
✅ saved static anomaly → 7995a06b-bb0d-4343-bc18-cbdf623d4640
✅ saved static anomaly → 83761ac5-fbd3-4422-ba30-b6b4915da945
✅ saved static anomaly → 83a85906-44bd-4976-8906-53faed1684f3
✅ saved static anomaly → 854ad249-d0e7-4e32-bdab-adde8740c980
✅ saved static anomaly → 8fdd27ba-fabe-4df0-a4ec-a53e7cda838

In [10]:
# ===========================================
# 4. Sanity check
# ===========================================
# Stack every player's labels and look at the overall normal (0) / anomaly (1) split.
# Nothing is shown when no player produced a result.
if results:
    df_all_static = pd.concat(results, ignore_index=True)
    print(df_all_static["static_anomaly"].value_counts())
    display(df_all_static.head())


static_anomaly
0    10841
1      353
Name: count, dtype: int64


Unnamed: 0,date_,static_score,static_thr,static_anomaly,top_features,athlete_id,method,params
0,2023-01-12,0.149732,1.466383e-17,0,,121b05df-f5f6-4029-92a7-5420dea45e4d,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
1,2023-01-13,0.159617,1.466383e-17,0,,121b05df-f5f6-4029-92a7-5420dea45e4d,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
2,2023-01-14,0.062777,1.466383e-17,0,,121b05df-f5f6-4029-92a7-5420dea45e4d,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
3,2023-01-16,0.020711,1.466383e-17,0,,121b05df-f5f6-4029-92a7-5420dea45e4d,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
4,2023-01-17,0.138838,1.466383e-17,0,,121b05df-f5f6-4029-92a7-5420dea45e4d,IsolationForest,"{""scaler"": ""RobustScaler"", ""contamination"": 0...."
