In [3]:
import os
import glob
from datetime import datetime
import pandas as pd

# --- Paths ---
# SRC_DIR: folder scanned for model_metrics_*.csv inputs.
# DEST_DIR: sub-folder that receives the merged COMBINED CSV; created at import time if missing.
# PATH_HT_SCORE: folder whose .pkl files are pruned, and the base for relative 'model_pkl' paths.
SRC_DIR  = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\Under_2_5"
DEST_DIR = os.path.join(SRC_DIR, "best_model_metrics")
PATH_HT_SCORE = os.path.join(SRC_DIR, "model_file")  # directory containing .pkl model files
os.makedirs(DEST_DIR, exist_ok=True)

# Safety switch for model-file pruning.
#   True  -> prune_pkls_keep_one only prints what it would delete.
#   False -> non-best .pkl files are actually removed from PATH_HT_SCORE.
DRY_RUN_PKLS = False

# ---------- delete *_FAILED files ----------
def is_failed_filename(path: str) -> bool:
    """Return True when *path* names a file carrying the ``_FAILED`` marker.

    Only the final path component is inspected. It matches when the file name
    ends with ``_FAILED`` or ``_FAILED.csv``, or when the name with its last
    extension removed ends with ``_FAILED``.
    """
    filename = os.path.basename(path)
    if filename.endswith(("_FAILED", "_FAILED.csv")):
        return True
    stem = os.path.splitext(filename)[0]
    return stem.endswith("_FAILED")

def purge_failed_files(dirs: list[str]) -> None:
    """Delete every ``model_metrics_*.csv`` file flagged as FAILED in *dirs*.

    Each directory is globbed non-recursively. Deletion is best-effort: a file
    that cannot be removed is reported with a warning and the scan continues.
    """
    for folder in dirs:
        for candidate in glob.glob(os.path.join(folder, "model_metrics_*.csv")):
            if not os.path.isfile(candidate):
                continue
            if not is_failed_filename(candidate):
                continue
            try:
                os.remove(candidate)
                print(f"✂ Deleted FAILED file: {candidate}")
            except Exception as e:
                print(f"[WARN] Could not delete FAILED file {candidate}: {e}")

# ---------- utils ----------
def read_csv_safely(path: str) -> pd.DataFrame:
    """Load the CSV at *path*, returning an empty DataFrame on any failure.

    Read errors are not propagated; a warning naming the file and the error is
    printed instead, so callers can detect failure via ``DataFrame.empty``.
    """
    try:
        frame = pd.read_csv(path, low_memory=False)
    except Exception as e:
        print(f"[WARN] Failed to read {path}: {e}")
        frame = pd.DataFrame()
    return frame

def _normalise_model_path(p: str) -> str | None:
    """Turn a raw ``model_pkl`` value into a resolved filesystem path.

    Surrounding whitespace and quote characters are stripped. A relative path
    is joined onto PATH_HT_SCORE before being resolved. Returns None when *p*
    is not a string or is blank.
    """
    if not isinstance(p, str):
        return None
    cleaned = p.strip()
    if not cleaned:
        return None

    cleaned = cleaned.strip('"').strip("'")
    candidate = cleaned if os.path.isabs(cleaned) else os.path.join(PATH_HT_SCORE, cleaned)
    try:
        resolved = os.path.realpath(candidate)
    except Exception:
        # Fall back to a purely lexical absolute path if resolution fails.
        resolved = os.path.abspath(candidate)
    return resolved

def choose_best_model_path(df: pd.DataFrame) -> str | None:
    """Pick the first non-empty model_pkl in the already-sorted combined DataFrame."""
    if "model_pkl" not in df.columns:
        print("[WARN] 'model_pkl' column not found in combined CSV — cannot retain any model.")
        return None
    for _, row in df.iterrows():
        raw = str(row.get("model_pkl", "") or "").strip()
        if raw:
            p = _normalise_model_path(raw)
            if p:
                return p
    print("[WARN] No non-empty 'model_pkl' found in combined CSV — cannot retain any model.")
    return None

def prune_pkls_keep_one(keep_path: str | None) -> None:
    """Remove every ``.pkl`` file directly inside PATH_HT_SCORE except *keep_path*.

    Paths are compared after ``os.path.realpath`` resolution. When *keep_path*
    is None or empty nothing is retained, so every ``.pkl`` is a deletion
    candidate. With DRY_RUN_PKLS set, the candidates are only listed (first 20)
    and nothing is removed. Deletion failures are reported and skipped.
    """
    if not os.path.isdir(PATH_HT_SCORE):
        print(f"[WARN] PATH_HT_SCORE does not exist: {PATH_HT_SCORE}")
        return

    # Collect resolved paths of regular files ending in .pkl (case-insensitive).
    found = []
    for entry in os.listdir(PATH_HT_SCORE):
        full = os.path.join(PATH_HT_SCORE, entry)
        if os.path.isfile(full) and entry.lower().endswith(".pkl"):
            found.append(os.path.realpath(full))

    keeper = os.path.realpath(keep_path) if keep_path else keep_path
    doomed = sorted(path for path in found if path != keeper)
    kept_label = keeper if keeper else '(none — no best model_pkl found)'

    print("\n— Pruning model_file directory —")
    print(f" Found .pkl files: {len(found)}")
    print(f" Keeping: {kept_label}")
    print(f" Will delete: {len(doomed)}")

    if not DRY_RUN_PKLS:
        for path in doomed:
            try:
                os.remove(path)
                print(f" ✂ Deleted: {path}")
            except Exception as e:
                print(f" [WARN] Could not delete {path}: {e}")
        return

    # Dry run: show a bounded preview instead of deleting.
    preview_limit = 20
    for path in doomed[:preview_limit]:
        print(f"   (dry-run) would delete: {path}")
    overflow = len(doomed) - preview_limit
    if overflow > 0:
        print(f"   ...and {overflow} more.")
    print(" Set DRY_RUN_PKLS = False to actually delete.")

def deduplicate_sorted(df: pd.DataFrame) -> pd.DataFrame:
    """Drop repeated candidates from an already-sorted frame, keeping the first.

    A candidate is identified by ``threshold`` plus whichever of the known XGB
    and MLP hyper-parameter columns are present in *df*. If none of those
    columns exist, only rows that are identical in every column are dropped.
    The result has a fresh 0..n-1 index.
    """
    xgb_params = (
        "n_estimators", "max_depth", "learning_rate", "min_child_weight",
        "subsample", "colsample_bytree", "reg_lambda",
    )
    mlp_params = (
        "hidden_layer_sizes", "alpha", "learning_rate_init",
        "batch_size", "max_iter",
    )
    present = [name for name in ("threshold", *xgb_params, *mlp_params) if name in df.columns]
    # subset=None makes drop_duplicates compare whole rows (the fallback case).
    unique_rows = df.drop_duplicates(subset=present or None, keep="first")
    return unique_rows.reset_index(drop=True)

# -----------------------------
def main():
    """Merge all metrics CSVs into one ranked file, then clean up inputs and models.

    Steps:
      1. Delete ``*_FAILED`` metrics CSVs in SRC_DIR and DEST_DIR.
      2. Collect every metrics CSV in SRC_DIR plus every COMBINED CSV in DEST_DIR.
      3. Load and concatenate the readable, non-empty ones.
      4. Sort by ``val_precision_lcb`` descending (stable) and de-duplicate.
      5. Write a timestamped COMBINED CSV into DEST_DIR.
      6. Delete the input CSVs that were merged (never the file just written).
      7. Pick the best model path from the top-most row with a ``model_pkl``.
      8. Prune PATH_HT_SCORE so only that model remains.

    Raises:
        KeyError: if the merged data has no ``val_precision_lcb`` column.
    """
    # 1) Clean up *_FAILED CSVs first (SRC and DEST)
    purge_failed_files([SRC_DIR, DEST_DIR])

    # 2) Collect ALL CSVs to merge:
    #    - Every CSV in SRC (single-run + COMBINED)
    #    - Every COMBINED CSV in DEST (older results)
    src_csvs  = [p for p in glob.glob(os.path.join(SRC_DIR,  "model_metrics_*.csv")) if not is_failed_filename(p)]
    dest_csvs = [p for p in glob.glob(os.path.join(DEST_DIR, "model_metrics_*COMBINED_*.csv")) if not is_failed_filename(p)]
    merge_inputs = sorted(set(src_csvs + dest_csvs))

    if not merge_inputs:
        print("No CSVs found to merge; nothing to do.")
        return

    # 3) Load, concat
    frames, used_paths = [], []
    for p in merge_inputs:
        df = read_csv_safely(p)
        if df.empty:
            print(f"  [WARN] Empty or unreadable CSV (skipped): {p}")
            continue
        frames.append(df)
        used_paths.append(p)

    if not frames:
        print("No usable CSV data after reading; nothing to do.")
        return

    combined = pd.concat(frames, ignore_index=True, sort=False)

    # 4) Sort by val_precision_lcb (desc) and de-duplicate logical duplicates
    if "val_precision_lcb" not in combined.columns:
        raise KeyError("Expected column 'val_precision_lcb' not found in CSVs.")
    combined["val_precision_lcb"] = pd.to_numeric(combined["val_precision_lcb"], errors="coerce")
    # mergesort is stable, so rows with equal scores keep their load order.
    combined = combined.sort_values(by=["val_precision_lcb"], ascending=False, kind="mergesort").reset_index(drop=True)
    combined = deduplicate_sorted(combined)

    # 5) Write fresh COMBINED to DEST
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(DEST_DIR, f"model_metrics_COMBINED_{ts}.csv")
    combined.to_csv(out_path, index=False)
    print(f"✓ Wrote combined CSV ({len(combined)} rows):\n  {out_path}")

    # 6) Delete ALL CSV inputs used for the merge (both SRC and DEST)
    # The output name has one-second resolution, so an existing DEST COMBINED
    # input can share it (e.g. two runs within the same second). Compare
    # resolved paths explicitly so the freshly written file is never removed.
    out_key = os.path.normcase(os.path.realpath(out_path))
    deleted, failed = 0, 0
    for p in used_paths:
        if os.path.normcase(os.path.realpath(p)) == out_key:
            print(f" [SKIP] Not deleting freshly written output: {p}")
            continue
        try:
            os.remove(p)
            deleted += 1
            print(f" ✂ Deleted merged input CSV: {p}")
        except Exception as e:
            failed += 1
            print(f" [WARN] Could not delete CSV {p}: {e}")
    print(f"Input CSV cleanup — deleted: {deleted}, failed: {failed}")

    # 7) Pick the best model_pkl from the top-most row with a non-empty model_pkl
    best_model_path = choose_best_model_path(combined)
    if best_model_path and not os.path.exists(best_model_path):
        print(f"[WARN] Best model path does not exist on disk (will still keep path): {best_model_path}")

    # 8) Prune PKLs in PATH_HT_SCORE to keep only the best
    # NOTE(review): when best_model_path is None or not present in the folder,
    # the prune step matches nothing and removes every .pkl (unless
    # DRY_RUN_PKLS is True). Confirm that this is the intended behaviour.
    prune_pkls_keep_one(best_model_path)

    print("\nAll done.")

# Run the merge-and-prune workflow only when this code is the entry point
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


✂ Deleted FAILED file: C:\Users\leere\PycharmProjects\Football_ML3\Goals\Under_2_5\model_metrics_20250819_190150_FAILED.csv
✂ Deleted FAILED file: C:\Users\leere\PycharmProjects\Football_ML3\Goals\Under_2_5\model_metrics_20250819_211324_FAILED.csv
✂ Deleted FAILED file: C:\Users\leere\PycharmProjects\Football_ML3\Goals\Under_2_5\model_metrics_20250820_020937_FAILED.csv
✓ Wrote combined CSV (5 rows):
  C:\Users\leere\PycharmProjects\Football_ML3\Goals\Under_2_5\best_model_metrics\model_metrics_COMBINED_20250820_070744.csv
 ✂ Deleted merged input CSV: C:\Users\leere\PycharmProjects\Football_ML3\Goals\Under_2_5\best_model_metrics\model_metrics_COMBINED_20250819_112054.csv
Input CSV cleanup — deleted: 1, failed: 0

— Pruning model_file directory —
 Found .pkl files: 1
 Keeping: C:\Users\leere\PycharmProjects\Football_ML3\Goals\Under_2_5\model_file\best_model_xgb_calibrated_20250818_175932.pkl
 Will delete: 0

All done.
