In [3]:
import os
import glob
from datetime import datetime
import pandas as pd

# ========= Settings =========
# Root folder; each market has its own sub-folder directly beneath it.
BASE = r"C:\Users\leere\PycharmProjects\Football_ML3\FT Results"

# Market sub-folder names, processed in this order.
MARKETS = [
    "Lay_Home",
    "Lay_Away",
    "Lay_Draw",
    "Back_Home",
    "Back_Away",
    "Back_Draw",
]

# Safety switch: while True, .pkl pruning only prints what would be deleted.
DRY_RUN_PKLS = True

# ========= Helpers =========
def metrics_dir(market: str) -> str:
    """Return the path ``<BASE>/<market>/best_model_metrics`` for *market*."""
    market_root = os.path.join(BASE, market)
    return os.path.join(market_root, "best_model_metrics")

def model_dir(market: str) -> str:
    """Return the path ``<BASE>/<market>/model_file`` for *market*."""
    market_root = os.path.join(BASE, market)
    return os.path.join(market_root, "model_file")

def is_failed_filename(path: str) -> bool:
    """Return True when the file name of *path* carries the ``_FAILED`` marker.

    Only the final path component is inspected, so a ``_FAILED`` directory
    higher up in the path does not count. The marker may sit anywhere in the
    name (``*_FAILED*``), not just at the very end: a name such as
    ``run_FAILED_20250907.csv`` is treated as failed too, so it is neither
    merged nor deleted as a merged input.
    """
    return "_FAILED" in os.path.basename(path)

def read_csv_safely(path: str) -> pd.DataFrame:
    """Read the CSV at *path*, returning an empty DataFrame on any failure.

    This is deliberately best-effort: a read error is reported with a
    printed warning instead of being raised, so one bad file does not stop
    the caller from processing the others.
    """
    try:
        frame = pd.read_csv(path, low_memory=False)
    except Exception as exc:
        print(f"  [WARN] Failed to read {path}: {exc}")
        frame = pd.DataFrame()
    return frame

def deduplicate_sorted(df: pd.DataFrame) -> pd.DataFrame:
    """Drop repeated candidates from an already-sorted frame, keeping the first.

    A "candidate" is identified by ``threshold`` plus whichever of the known
    XGBoost / MLP hyper-parameter columns are present in *df*. Because the
    first occurrence wins, the caller's sort order decides which duplicate
    survives. If none of those columns exist, only fully identical rows are
    removed. The returned frame has a fresh 0..n-1 index.
    """
    xgb_params = (
        "n_estimators", "max_depth", "learning_rate", "min_child_weight",
        "subsample", "colsample_bytree", "reg_lambda",
    )
    mlp_params = (
        "hidden_layer_sizes", "alpha", "learning_rate_init", "batch_size", "max_iter",
    )
    present = [
        name
        for name in ("threshold", *xgb_params, *mlp_params)
        if name in df.columns
    ]
    # subset=None makes drop_duplicates compare entire rows.
    unique_rows = df.drop_duplicates(subset=present or None, keep="first")
    return unique_rows.reset_index(drop=True)

def normalise_model_path(p: str) -> str | None:
    if not isinstance(p, str) or not p.strip():
        return None
    p = p.strip().strip('"').strip("'")
    try:
        return os.path.realpath(p)
    except Exception:
        return os.path.abspath(p)

def choose_best_model_path(df: pd.DataFrame) -> str | None:
    if "model_pkl" not in df.columns:
        print("  [WARN] 'model_pkl' column not in CSVs; cannot pick a model to keep.")
        return None
    for _, row in df.iterrows():
        raw = (row.get("model_pkl") or "").strip()
        if raw:
            p = normalise_model_path(raw)
            if p:
                return p
    print("  [WARN] No non-empty 'model_pkl' found in combined CSV.")
    return None

def prune_pkls_keep_one(folder: str, keep_path: str | None) -> None:
    print("— Pruning model_file directory —")
    if not os.path.isdir(folder):
        print(f"  [WARN] model folder does not exist: {folder}")
        return

    all_pkls = []
    for name in os.listdir(folder):
        p = os.path.join(folder, name)
        if os.path.isfile(p) and name.lower().endswith(".pkl"):
            all_pkls.append(os.path.realpath(p))

    keep_real = os.path.realpath(keep_path) if keep_path else None
    to_delete = sorted([p for p in all_pkls if p != keep_real])

    print(f"  Found .pkl files: {len(all_pkls)}")
    print(f"  Keeping: {keep_real if keep_real else '(none)'}")
    print(f"  Will delete: {len(to_delete)}")

    if DRY_RUN_PKLS:
        for p in to_delete[:20]:
            print(f"    (dry-run) would delete: {p}")
        if len(to_delete) > 20:
            print(f"    ...and {len(to_delete) - 20} more.")
        print("  Set DRY_RUN_PKLS = False to actually delete.")
        return

    for p in to_delete:
        try:
            os.remove(p)
            print(f"   ✂ Deleted: {p}")
        except Exception as e:
            print(f"   [WARN] Could not delete {p}: {e}")

def collect_csvs(mdir: str) -> list[str]:
    """
    Gather the metrics CSVs in *mdir*: individual run files and any existing
    COMBINED files. Names flagged by ``is_failed_filename`` are left out.

    Returns the resolved (realpath) locations, de-duplicated and sorted.
    """
    patterns = (
        os.path.join(mdir, "model_metrics_*.csv"),
        os.path.join(mdir, "model_metrics_*COMBINED_*.csv"),
    )
    # A set collapses files matched by more than one pattern.
    resolved = {
        os.path.realpath(match)
        for pattern in patterns
        for match in glob.glob(pattern)
        if not is_failed_filename(match)
    }
    return sorted(resolved)

def merge_and_rank_for_market(market: str) -> None:
    """Merge, rank and clean up the metrics CSVs of one market.

    Steps, in order:
      1. Make sure the market's metrics and model folders exist.
      2. Load every CSV returned by ``collect_csvs``; empty or unreadable
         files are skipped and left on disk.
      3. Concatenate the frames, coerce ``val_precision_lcb`` to numeric,
         drop rows where it is missing, sort descending with a stable sort
         and de-duplicate via ``deduplicate_sorted``.
      4. Write the result to a new timestamped
         ``model_metrics_COMBINED_<ts>.csv`` in the metrics folder.
      5. Delete the input CSVs that were merged.
      6. Prune the model folder with ``prune_pkls_keep_one``, keeping the
         path picked by ``choose_best_model_path``.

    Returns early, with a printed message, whenever there is nothing to
    merge or rank.
    """
    metrics_folder = metrics_dir(market)
    models_folder = model_dir(market)
    for folder in (metrics_folder, models_folder):
        os.makedirs(folder, exist_ok=True)

    print(f"\n===== {market} =====")
    print(f" Metrics dir: {metrics_folder}")
    print(f" Model dir:   {models_folder}")

    sources = collect_csvs(metrics_folder)
    if not sources:
        print("  No CSVs found to merge; skipping.")
        return

    # (path, frame) pairs for every CSV that actually contributed rows.
    loaded = []
    for src in sources:
        frame = read_csv_safely(src)
        if frame.empty:
            print(f"  [WARN] Empty/unreadable CSV skipped: {src}")
        else:
            loaded.append((src, frame))

    if not loaded:
        print("  No usable CSV data; skipping.")
        return

    combined = pd.concat([frame for _, frame in loaded], ignore_index=True, sort=False)

    score_col = "val_precision_lcb"
    if score_col not in combined.columns:
        print("  [WARN] 'val_precision_lcb' missing; cannot rank. Skipping.")
        return

    # rank: non-numeric scores become NaN and are dropped
    combined[score_col] = pd.to_numeric(combined[score_col], errors="coerce")
    combined = combined.dropna(subset=[score_col]).reset_index(drop=True)
    if combined.empty:
        print("  No rows with numeric 'val_precision_lcb'; skipping.")
        return

    # mergesort is stable, so equal scores keep their original relative order
    ranked = combined.sort_values(by=[score_col], ascending=False, kind="mergesort")
    combined = deduplicate_sorted(ranked.reset_index(drop=True))

    # write combined
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(metrics_folder, f"model_metrics_COMBINED_{stamp}.csv")
    combined.to_csv(out_path, index=False)
    print(f"  ✓ Wrote COMBINED CSV ({len(combined)} rows): {out_path}")

    # delete inputs used for this merge (both runs and older COMBINEDs)
    removed = 0
    errors = 0
    for src, _ in loaded:
        try:
            if src != out_path and os.path.exists(src):
                os.remove(src)
                removed += 1
                print(f"   ✂ Deleted merged input CSV: {src}")
        except Exception as exc:
            errors += 1
            print(f"   [WARN] Could not delete CSV {src}: {exc}")
    if removed or errors:
        print(f"  Input CSV cleanup — deleted: {removed}, failed: {errors}")

    # keep only the top model's PKL
    best_model = choose_best_model_path(combined)
    if best_model and not os.path.exists(best_model):
        print(f"  [WARN] Best model path does not exist on disk (keeping path anyway): {best_model}")
    prune_pkls_keep_one(models_folder, best_model)

def main():
    """Run the merge/rank/prune pass for every market in ``MARKETS``, in order."""
    for market in MARKETS:
        merge_and_rank_for_market(market)


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()



===== Lay_Home =====
 Metrics dir: C:\Users\leere\PycharmProjects\Football_ML3\FT Results\Lay_Home\best_model_metrics
 Model dir:   C:\Users\leere\PycharmProjects\Football_ML3\FT Results\Lay_Home\model_file
  No CSVs found to merge; skipping.

===== Lay_Away =====
 Metrics dir: C:\Users\leere\PycharmProjects\Football_ML3\FT Results\Lay_Away\best_model_metrics
 Model dir:   C:\Users\leere\PycharmProjects\Football_ML3\FT Results\Lay_Away\model_file
  ✓ Wrote COMBINED CSV (13 rows): C:\Users\leere\PycharmProjects\Football_ML3\FT Results\Lay_Away\best_model_metrics\model_metrics_COMBINED_20250907_134415.csv
   ✂ Deleted merged input CSV: C:\Users\leere\PycharmProjects\Football_ML3\FT Results\Lay_Away\best_model_metrics\model_metrics_LAY_AWAY_20250907_132958.csv
   ✂ Deleted merged input CSV: C:\Users\leere\PycharmProjects\Football_ML3\FT Results\Lay_Away\best_model_metrics\model_metrics_LAY_AWAY_20250907_133926.csv
  Input CSV cleanup — deleted: 2, failed: 0
  [WARN] Best model path does 