## XGBoost (Kaggle) — Train by LOCATION_ID (20 Locations)

**Yêu cầu Dataset:**
- Chạy `fetch-demo-data-singlekeys.ipynb` trước (đã fetch 20 tỉnh/thành)
- Upload output (`weather_20loc`) thành Kaggle Dataset
- Add dataset vào notebook này

**Config:**
- LAG = 49h lookback
- HORIZON = 100h forecast (~4 ngÃ y)
- 20 locations

**Speed Optimizations:**
- `learning_rate = 0.08` (tăng từ 0.05) → hội tụ nhanh hơn
- `NUM_BOOST = 2500` (giảm từ 5000) → vẫn đủ với early stopping
- `EARLY_STOP = 100` (giảm từ 200) → check nhanh hơn
- `nthread = all CPUs` → song song hóa

**Bins Reporting (giống GRU/TCN/LightGBM):**
- `1-24h`: Ngắn hạn (1 ngày)
- `25-48h`: Trung hạn (1-2 ngày)
- `49-72h`: Trung-dài (2-3 ngày)
- `73-100h`: Dài hạn (3-4 ngày)

**Features:**
- GPU auto-detect với fallback CPU
- Tự dò weather_20loc/data và load location_ids từ metadata
- Mưa 2-stage (event + amount với log1p)
- Có thể chạy 1 target hoặc tất cả

In [None]:
# ============================================================
# XGBoost trainer - Train by LOCATION_ID from Kaggle Dataset
# 20 provinces/cities, LAG=49, HORIZON=100
# OPTIMIZED: parallel training, bins reporting
# ============================================================

!pip -q install -U "xgboost>=2.0"

import os, gc, json, subprocess
from pathlib import Path
from typing import List, Tuple
import numpy as np
import pandas as pd
import xgboost as xgb

print("XGBoost version:", xgb.__version__)

# ============================================================
# 0) GPU CHECK
# ============================================================
_NVIDIA_SMI_OK = None  # memoized probe result; None means "not probed yet"


def can_run_nvidia_smi():
    """Return True when the ``nvidia-smi`` executable runs and exits with code 0.

    The probe spawns a subprocess, and this function is called again every time
    a training device is chosen, so the first answer is cached and reused for
    the rest of the session (re-running the cell resets the cache).

    Returns:
        bool: True if ``nvidia-smi`` exited with status 0, False if it exited
        with another status or could not be started at all.
    """
    global _NVIDIA_SMI_OK
    if _NVIDIA_SMI_OK is None:
        try:
            # Only the exit status matters, so the tool's output is discarded.
            r = subprocess.run(
                ["nvidia-smi"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            _NVIDIA_SMI_OK = r.returncode == 0
        except (OSError, subprocess.SubprocessError):
            # Missing or non-executable binary: treat as "no usable GPU tooling".
            _NVIDIA_SMI_OK = False
    return _NVIDIA_SMI_OK

# Probe once and reuse the answer for both the log line and the GPU listing,
# instead of spawning the probe subprocess twice in a row.
_gpu_tool_ok = can_run_nvidia_smi()
print("nvidia-smi available:", _gpu_tool_ok)
if _gpu_tool_ok:
    # Informational only: list the GPUs; check=False keeps a non-zero exit
    # status from raising.
    _ = subprocess.run(["nvidia-smi", "-L"], check=False)

# ============================================================
# 1) RUN CONTROL (OPTIMIZED)
# ============================================================
TARGETS_TO_RUN = "all"  # "all" or one of: temp, rain, u10, v10, rh, press, cloud
USE_GPU = True          # request CUDA; choose_device() still needs nvidia-smi to succeed

# Horizons (in steps ahead) to train, inclusive on both ends.
H_START = 1
H_END   = 100
# LAG and H are part of the input parquet file names; H is also the number of
# label columns (y_t+001 .. y_t+H) expected in every table.
LAG = 49
H   = 100

# === BINS for reporting (same as GRU/TCN/LightGBM) ===
# Inclusive (first_h, last_h) horizon ranges used to aggregate the metrics.
BINS = ((1,24), (25,48), (49,72), (73,100))

# === LOCATION BATCHING ===
# Slice [START_LOC_IDX:END_LOC_IDX] of the location list is trained in this run;
# a negative END_LOC_IDX means "through the last location".
START_LOC_IDX = 0
END_LOC_IDX = -1

# Split role -> split tag as it appears in the parquet file names.
SPLITS = {
    "train": "train_2021_2023",
    "val":   "val_2024",
    "test":  "test_2025_01_to_2025_11",
}

CANON_KEYS = ["temp","rain","u10","v10","rh","press","cloud"]

# Explicit exceptions instead of `assert`: asserts are stripped under `python -O`,
# which would let an invalid configuration through silently.
if TARGETS_TO_RUN == "all":
    # Copy so later edits to TARGETS cannot mutate CANON_KEYS.
    TARGETS = list(CANON_KEYS)
elif TARGETS_TO_RUN in CANON_KEYS:
    TARGETS = [TARGETS_TO_RUN]
else:
    raise ValueError(
        f"TARGETS_TO_RUN must be 'all' or one of {sorted(CANON_KEYS)}, got {TARGETS_TO_RUN!r}"
    )

# Horizon h is read from label column index h-1, so h < 1 would wrap around to
# the last column and h > H would run past the available labels.
if H_START < 1 or H_END > H:
    raise ValueError(f"H_START..H_END must lie within 1..{H}, got {H_START}..{H_END}")

# ============================================================
# 2) AUTO-DETECT DATA DIR + LOAD LOCATION_IDS
# ============================================================
INPUT_ROOT = Path("/kaggle/input")


def find_data_dir(root=None):
    """Locate the dataset's ``data`` directory below ``root``.

    Search order:
      1. any directory matching ``weather_20loc/data``;
      2. otherwise any ``data/tabular`` directory, in which case its parent
         (the ``data`` directory) is returned.

    Args:
        root: Directory to search recursively. Defaults to ``INPUT_ROOT``.

    Returns:
        Path: the first matching ``data`` directory, in sorted path order.

    Raises:
        FileNotFoundError: if neither layout exists below ``root``.
    """
    base = INPUT_ROOT if root is None else Path(root)
    # rglob yields entries in filesystem order; sorting makes the choice
    # reproducible when several datasets match.
    for p in sorted(base.rglob("weather_20loc/data")):
        if p.is_dir():
            return p
    # Fallback for datasets uploaded under a different top-level folder name.
    for p in sorted(base.rglob("data/tabular")):
        if p.is_dir():
            return p.parent
    raise FileNotFoundError(f"Không tìm thấy weather_20loc/data trong {base}")

# Resolve the dataset root once; the tabular parquet files and the metadata
# live in fixed sub-directories of it.
DATA_DIR = find_data_dir()
TAB_DIR, META_DIR = DATA_DIR / "tabular", DATA_DIR / "meta"

for _label, _path in (("DATA_DIR", DATA_DIR), ("TAB_DIR", TAB_DIR)):
    print(f"{_label} = {_path}")

def load_location_ids():
    """Return the location ids to train on, plus an id -> display-name mapping.

    Preferred source is ``META_DIR / "locations.json"``, read for the keys
    ``location_ids`` (list of ids) and ``locations`` (list of dicts with
    ``name`` and ``location_id``). When ``location_ids`` is missing or empty,
    the ids are taken from ``locations`` instead.

    Without the metadata file, the ids are recovered from the names of the
    ``temp`` training parquet files in ``TAB_DIR`` and no names are available.

    Returns:
        tuple: ``(loc_ids, names)`` where ``names`` maps location id to name
        (empty when the ids came from the file scan).
    """
    meta_file = META_DIR / "locations.json"
    if meta_file.exists():
        # Read as UTF-8 explicitly rather than relying on the platform default.
        with open(meta_file, encoding="utf-8") as f:
            meta = json.load(f)
        locations = meta.get("locations", [])
        # Fall back to the ids embedded in "locations" so a file that only
        # lists locations does not yield an empty run.
        loc_ids = meta.get("location_ids") or [loc["location_id"] for loc in locations]
        print(f"Loaded {len(loc_ids)} locations from metadata:")
        for loc in locations:
            print(f"  {loc['name']:15s} = {loc['location_id']}")
        return loc_ids, {loc["location_id"]: loc["name"] for loc in locations}

    print("[warn] locations.json not found, scanning files...")
    suffix = f"_{SPLITS['train']}_tab_temp_lag{LAG}_h{H}.parquet"
    # Strip the known suffix instead of splitting on "_", so ids that contain
    # underscores themselves are recovered intact.
    loc_ids = sorted({f.name[:-len(suffix)] for f in TAB_DIR.glob(f"*{suffix}")})
    print(f"Found {len(loc_ids)} location_ids from files")
    return loc_ids, {}

# Load every known location, then keep only the configured index window.
LOCATION_IDS_ALL, LOC_NAMES = load_location_ids()

# === LOCATION BATCHING ===
# A negative END_LOC_IDX means "through the last location".
_start = START_LOC_IDX
_end = len(LOCATION_IDS_ALL) if END_LOC_IDX < 0 else END_LOC_IDX
LOCATION_IDS = LOCATION_IDS_ALL[_start:_end]
print(
    f"[LOCATION BATCH] Using {len(LOCATION_IDS)}/{len(LOCATION_IDS_ALL)} "
    f"locations (idx {_start}:{_end})"
)

# ============================================================
# 3) OUTPUT DIRS
# ============================================================
# Everything this notebook writes goes under OUT_DIR: saved boosters in
# MODEL_DIR, CSV reports in REPORT_DIR.
OUT_DIR = Path("/kaggle/working/xgb_out_singlekeys")
MODEL_DIR, REPORT_DIR = OUT_DIR / "models", OUT_DIR / "reports"
for _out_subdir in (MODEL_DIR, REPORT_DIR):
    _out_subdir.mkdir(parents=True, exist_ok=True)

# ============================================================
# 4) IO HELPERS
# ============================================================
def ycol(h: int):
    """Return the label column name for horizon ``h``, zero-padded to 3 digits."""
    return "y_t+{:03d}".format(h)

def load_split(loc_id: str, split_name: str, target_key: str) -> pd.DataFrame:
    """Read the tabular parquet file for one location, split and target.

    The file name is built from the arguments and the module-level ``LAG`` and
    ``H`` values, and is looked up in ``TAB_DIR``.

    Raises:
        FileNotFoundError: if the expected file does not exist.
    """
    path = TAB_DIR / f"{loc_id}_{split_name}_tab_{target_key}_lag{LAG}_h{H}.parquet"
    if path.exists():
        return pd.read_parquet(path)
    raise FileNotFoundError(f"Missing: {path}")

def get_cols_from_df(df):
    """Return ``(feature_columns, label_columns)`` for a tabular frame.

    Features are the columns of ``df`` whose name contains ``"_lag"``; labels
    are the ``H`` names produced by ``ycol`` for horizons 1..H (they are not
    checked against ``df``).
    """
    lagged = [col for col in df.columns if "_lag" in col]
    labels = list(map(ycol, range(1, H + 1)))
    return lagged, labels

def loc_short_name(loc_id: str) -> str:
    """Return the display name for ``loc_id``, or its first 8 characters if unknown."""
    if loc_id in LOC_NAMES:
        return LOC_NAMES[loc_id]
    return loc_id[:8]

# ============================================================
# 5) METRICS
# ============================================================
def mae(yhat, y):
    """Mean absolute error, computed in float32 and returned as a Python float."""
    pred = np.asarray(yhat, np.float32)
    truth = np.asarray(y, np.float32)
    return float(np.abs(pred - truth).mean())

def rmse(yhat, y):
    """Root mean squared error, computed in float32 and returned as a Python float."""
    err = np.asarray(yhat, np.float32) - np.asarray(y, np.float32)
    mean_sq = np.mean(np.square(err))
    return float(np.sqrt(mean_sq))

def event_metrics(y_true01, y_pred01):
    """Precision, recall and F1 for binary (0/1) event labels.

    Both inputs are cast to int32 first. A small epsilon in each denominator
    keeps the ratios finite when a count is zero.

    Returns:
        tuple: ``(precision, recall, f1, tp, fp, fn)``.
    """
    truth = np.asarray(y_true01).astype(np.int32)
    guess = np.asarray(y_pred01).astype(np.int32)

    is_pos, said_pos = truth == 1, guess == 1
    is_neg, said_neg = truth == 0, guess == 0

    tp = int(np.count_nonzero(is_pos & said_pos))
    fp = int(np.count_nonzero(is_neg & said_pos))
    fn = int(np.count_nonzero(is_pos & said_neg))

    eps = 1e-9
    prec = tp / (tp + fp + eps)
    rec = tp / (tp + fn + eps)
    f1 = 2 * prec * rec / (prec + rec + eps)
    return float(prec), float(rec), float(f1), tp, fp, fn

# ============================================================
# 6) XGBoost params (SPEED OPTIMIZED)
# ============================================================
# Upper bound on boosting rounds per model (passed as num_boost_round).
NUM_BOOST = 2500        # Reduced from 5000
# Early-stopping patience in rounds, evaluated on the validation set.
EARLY_STOP = 100        # Reduced from 200

# Rain amounts at or above this value are labelled as a rain event.
# NOTE(review): the name suggests millimetres -- confirm against the dataset.
RAIN_MM_THR   = 0.1
# Candidate probability cut-offs for the event classifier:
# 19 evenly spaced values from 0.05 to 0.95, rounded to 2 decimals.
P_THR_CAND    = np.round(np.linspace(0.05, 0.95, 19), 2).tolist()
# Minimum number of rain events needed in the train / validation split before
# the amount regressor is fitted on rainy rows only (otherwise all rows are used).
MIN_POS_TRAIN = 300
MIN_POS_VAL   = 50
# When True, the amount regressor learns log1p(amount) and its predictions are
# mapped back with expm1.
USE_LOG1P_AMOUNT = True

def tune_p_thr_on_val(y_true_evt: np.ndarray, p_pred: np.ndarray, candidates=P_THR_CAND):
    """Pick the probability cut-off that maximises F1 on the given labels.

    Each candidate is applied as ``p_pred >= cutoff``. Ties keep the earliest
    candidate. If ``candidates`` is empty the result is ``(0.5, -1.0)``.

    Returns:
        tuple: ``(best_threshold, best_f1)``.
    """
    winner, winner_f1 = 0.5, -1.0
    for cutoff in candidates:
        flagged = (p_pred >= cutoff).astype(np.int32)
        f1_here = event_metrics(y_true_evt, flagged)[2]
        if f1_here > winner_f1:
            winner, winner_f1 = cutoff, f1_here
    return winner, winner_f1

def choose_device():
    """Return ``"cuda"`` when GPU use is enabled and ``nvidia-smi`` works, else ``"cpu"``."""
    if not USE_GPU:
        return "cpu"
    if can_run_nvidia_smi():
        return "cuda"
    return "cpu"

def common_params(device: str):
    """Build the XGBoost parameters shared by every model in this notebook.

    Args:
        device: value for the ``device`` parameter (``"cuda"`` or ``"cpu"``).

    Returns:
        dict: a fresh parameter dict; objective and eval metric are added by
        the callers. For ``"cuda"`` gradient-based sampling is enabled as well.
    """
    threads = os.cpu_count() or 4  # cpu_count() may return None
    params = dict(
        tree_method="hist",
        device=device,
        learning_rate=0.08,      # raised from 0.05 for faster convergence
        max_depth=6,
        min_child_weight=5,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        max_bin=256,
        nthread=threads,         # use every CPU core
        verbosity=0,             # keep the training log quiet
    )
    if device == "cuda":
        params["sampling_method"] = "gradient_based"
    return params

def params_reg(device: str):
    """Shared parameters plus a squared-error objective evaluated with RMSE."""
    return {
        **common_params(device),
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
    }

def params_clf(device: str, scale_pos_weight: float):
    """Shared parameters plus a binary-logistic objective evaluated with log-loss.

    ``scale_pos_weight`` is coerced to a plain ``float`` before being stored.
    """
    return {
        **common_params(device),
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "scale_pos_weight": float(scale_pos_weight),
    }

def predict_best(model: xgb.Booster, dmat: xgb.DMatrix):
    """Predict with the trees up to and including the best iteration.

    When the booster exposes no ``best_iteration`` (or it is ``None``), all
    trees are used.
    """
    best = getattr(model, "best_iteration", None)
    if best is not None:
        return model.predict(dmat, iteration_range=(0, int(best) + 1))
    return model.predict(dmat)

def train_with_es(params, dtr, dva):
    """Train a booster with early stopping on the validation set.

    The ``EarlyStopping`` callback with ``save_best=True`` is preferred. If the
    callback cannot be constructed (class missing or ``save_best`` not
    accepted), training uses the ``early_stopping_rounds`` argument instead.

    Only the callback construction is guarded. Training errors propagate to the
    caller immediately, so a failed run (for example a GPU error) is not
    silently repeated with identical parameters before the caller's device
    fallback can react.

    Args:
        params: XGBoost parameter dict.
        dtr: training ``DMatrix`` with labels set.
        dva: validation ``DMatrix`` with labels set; evaluated as ``"val"``.

    Returns:
        The trained booster returned by ``xgb.train``.
    """
    try:
        callbacks = [xgb.callback.EarlyStopping(rounds=EARLY_STOP, save_best=True)]
    except (AttributeError, TypeError):
        # Callback API not available in this XGBoost build.
        callbacks = None

    if callbacks is None:
        return xgb.train(
            params=params,
            dtrain=dtr,
            num_boost_round=NUM_BOOST,
            evals=[(dva, "val")],
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=500,
        )
    return xgb.train(
        params=params,
        dtrain=dtr,
        num_boost_round=NUM_BOOST,
        evals=[(dva, "val")],
        callbacks=callbacks,
        verbose_eval=500,
    )

def train_with_device_fallback(make_params_fn, dtr, dva, *args):
    """Train on the preferred device and retry on CPU after a GPU-related failure.

    Args:
        make_params_fn: callable ``(device, *args) -> params dict``.
        dtr, dva: training and validation ``DMatrix`` objects.
        *args: extra arguments forwarded to ``make_params_fn``.

    Returns:
        tuple: ``(model, device_used)``.

    Raises:
        xgb.core.XGBoostError: when training fails on CPU, or fails on CUDA
            with a message that mentions none of "cuda", "gpu" or "device".
    """
    dev = choose_device()
    try:
        return train_with_es(make_params_fn(dev, *args), dtr, dva), dev
    except xgb.core.XGBoostError as e:
        msg = str(e).lower()
        gpu_related = any(token in msg for token in ("cuda", "gpu", "device"))
        if dev != "cuda" or not gpu_related:
            raise
        print("[warn] GPU failed, fallback CPU.\n", str(e)[:200])
        return train_with_es(make_params_fn("cpu", *args), dtr, dva), "cpu"

def _make_params_reg(device, *_unused):
    """Params factory for regressors; extra positional arguments are ignored."""
    return params_reg(device)

def _make_params_clf(device, scale_pos_weight):
    """Params factory for the rain-event classifier."""
    return params_clf(device, scale_pos_weight)

# ============================================================
# 7) TRAIN 1-stage reg (OPTIMIZED)
# ============================================================
def train_reg_1stage(loc_id: str, target_key: str, h_start: int, h_end: int):
    """Train one XGBoost regressor per forecast horizon for one location and target.

    Horizons whose model file already exists are skipped, so an interrupted run
    can be resumed. If nothing is pending, the data is not loaded at all.

    Metrics of the horizons finished in this call are appended to the
    per-location CSV report. The report is written in a ``finally`` block, so
    an exception or interrupt part-way through does not discard the metrics of
    models already saved (resume would otherwise skip those horizons and never
    report them).

    Args:
        loc_id: location identifier used in the data and model file names.
        target_key: target name used in the data and model file names.
        h_start: first horizon to train (inclusive, >= 1).
        h_end: last horizon to train (inclusive, <= ``H``).

    Returns:
        pd.DataFrame: the full report for this location/target (previous rows
        plus new ones), or an empty frame when there is no report at all.

    Raises:
        ValueError: if the horizon range leaves ``1..H``.
        FileNotFoundError: if one of the split files is missing.
    """
    # Horizon h is read from label column h-1: h < 1 would wrap around to the
    # last column, h > H would index past the labels.
    if h_start < 1 or h_end > H:
        raise ValueError(f"horizons must lie within 1..{H}, got {h_start}..{h_end}")

    name = loc_short_name(loc_id)
    print(f"\n=== XGB 1-STAGE | {name} ({loc_id[:8]}...) target={target_key} h={h_start:03d}-{h_end:03d} ===")

    mdir = MODEL_DIR / target_key
    mdir.mkdir(parents=True, exist_ok=True)
    report_path = REPORT_DIR / f"report_xgb_{target_key}_{loc_id}.csv"
    old = pd.read_csv(report_path) if report_path.exists() else None

    def _model_path(h):
        return mdir / f"xgb_{target_key}_{loc_id}_h{h:03d}.json"

    # Decide what is left to do before touching the (large) data files.
    pending = [h for h in range(h_start, h_end + 1) if not _model_path(h).exists()]
    if not pending:
        print("[info] nothing new trained (resume hit).")
        return old if old is not None else pd.DataFrame()

    df_tr = load_split(loc_id, SPLITS["train"], target_key)
    df_va = load_split(loc_id, SPLITS["val"], target_key)
    df_te = load_split(loc_id, SPLITS["test"], target_key)

    feat_cols, y_cols = get_cols_from_df(df_tr)

    Xtr = df_tr[feat_cols].to_numpy(np.float32)
    Xva = df_va[feat_cols].to_numpy(np.float32)
    Xte = df_te[feat_cols].to_numpy(np.float32)

    Ytr = df_tr[y_cols].to_numpy(np.float32)
    Yva = df_va[y_cols].to_numpy(np.float32)
    Yte = df_te[y_cols].to_numpy(np.float32)

    # The features are identical for every horizon, so each DMatrix is built
    # once and only its label is swapped inside the loop.
    dtr = xgb.DMatrix(Xtr)
    dva = xgb.DMatrix(Xva)
    dte = xgb.DMatrix(Xte)

    rows = []
    try:
        for h in pending:
            mp = _model_path(h)

            dtr.set_label(Ytr[:, h-1])
            dva.set_label(Yva[:, h-1])

            model, used_dev = train_with_device_fallback(_make_params_reg, dtr, dva)

            pred = predict_best(model, dte)
            yte = Yte[:, h-1]

            # Save first so a report row never refers to a model file that
            # failed to be written.
            model.save_model(mp)
            rows.append({
                "location_id": loc_id,
                "location_name": name,
                "target": target_key,
                "h": h,
                "device": used_dev,
                "best_iter": int(getattr(model, "best_iteration", -1)),
                "test_mae": mae(pred, yte),
                "test_rmse": rmse(pred, yte),
                "model": mp.name,
            })

            if h % 25 == 0 or h == h_end:
                print(f"[{target_key}] {name} h={h:03d} dev={used_dev} mae={rows[-1]['test_mae']:.4f}")

            del model, pred
            gc.collect()
    finally:
        # Persist whatever finished, even when the loop was interrupted.
        if rows:
            new = pd.DataFrame(rows)
            out = pd.concat([old, new], ignore_index=True) if old is not None else new
            out.to_csv(report_path, index=False)
        else:
            out = old if old is not None else pd.DataFrame()

        del Xtr, Xva, Xte, Ytr, Yva, Yte, dtr, dva, dte, df_tr, df_va, df_te
        gc.collect()

    return out

# ============================================================
# 8) TRAIN 2-stage rain (OPTIMIZED)
# ============================================================
def train_rain_2stage(loc_id: str, h_start: int, h_end: int):
    """Train the two-stage rain model (event classifier + amount regressor) per horizon.

    For every horizon in ``[h_start, h_end]`` whose two model files are not
    both on disk:

    1. A binary classifier is fitted on ``amount >= RAIN_MM_THR`` (positives
       re-weighted by the negative/positive ratio), and its probability
       threshold is tuned on the validation split for best F1.
    2. A regressor is fitted on the rain amount (``log1p``-transformed when
       ``USE_LOG1P_AMOUNT`` is set). It uses rainy rows only when both the
       train and validation splits have at least ``MIN_POS_TRAIN`` /
       ``MIN_POS_VAL`` of them, otherwise all rows.
    3. The test forecast is the (non-negative) regressor output where the
       classifier predicts an event, and 0 elsewhere.

    Both stages use ``train_with_device_fallback``, so a GPU failure is handled
    the same way everywhere. The report is written in a ``finally`` block so an
    interrupted run keeps the metrics of the horizons it completed. If nothing
    is pending, the data is not loaded.

    Args:
        loc_id: location identifier used in the data and model file names.
        h_start: first horizon to train (inclusive, >= 1).
        h_end: last horizon to train (inclusive, <= ``H``).

    Returns:
        pd.DataFrame: the full rain report for this location (previous rows
        plus new ones), or an empty frame when there is no report at all.

    Raises:
        ValueError: if the horizon range leaves ``1..H``.
        FileNotFoundError: if one of the split files is missing.
    """
    # Horizon h is read from label column h-1: h < 1 would wrap around to the
    # last column, h > H would index past the labels.
    if h_start < 1 or h_end > H:
        raise ValueError(f"horizons must lie within 1..{H}, got {h_start}..{h_end}")

    name = loc_short_name(loc_id)
    print(f"\n=== XGB 2-STAGE RAIN | {name} ({loc_id[:8]}...) h={h_start:03d}-{h_end:03d} ===")

    mdir = MODEL_DIR / "rain"
    mdir.mkdir(parents=True, exist_ok=True)
    report_path = REPORT_DIR / f"report_xgb_rain_{loc_id}.csv"
    old = pd.read_csv(report_path) if report_path.exists() else None

    def _model_paths(h):
        return (
            mdir / f"xgb_rain_clf_{loc_id}_h{h:03d}.json",
            mdir / f"xgb_rain_reg_{loc_id}_h{h:03d}.json",
        )

    # A horizon counts as done only when both of its models exist.
    pending = [
        h for h in range(h_start, h_end + 1)
        if not all(p.exists() for p in _model_paths(h))
    ]
    if not pending:
        print("[info] nothing new trained (resume hit).")
        return old if old is not None else pd.DataFrame()

    df_tr = load_split(loc_id, SPLITS["train"], "rain")
    df_va = load_split(loc_id, SPLITS["val"], "rain")
    df_te = load_split(loc_id, SPLITS["test"], "rain")

    feat_cols, y_cols = get_cols_from_df(df_tr)

    Xtr = df_tr[feat_cols].to_numpy(np.float32)
    Xva = df_va[feat_cols].to_numpy(np.float32)
    Xte = df_te[feat_cols].to_numpy(np.float32)

    Ytr = df_tr[y_cols].to_numpy(np.float32)
    Yva = df_va[y_cols].to_numpy(np.float32)
    Yte = df_te[y_cols].to_numpy(np.float32)

    # Full-data matrices are built once; only their labels change per horizon.
    dtr = xgb.DMatrix(Xtr)
    dva = xgb.DMatrix(Xva)
    dte = xgb.DMatrix(Xte)

    rows = []
    try:
        for h in pending:
            mp_clf, mp_reg = _model_paths(h)

            ytr_amt = Ytr[:, h-1]
            yva_amt = Yva[:, h-1]
            yte_amt = Yte[:, h-1]
            ytr_evt = (ytr_amt >= RAIN_MM_THR).astype(np.float32)
            yva_evt = (yva_amt >= RAIN_MM_THR).astype(np.float32)
            yte_evt = (yte_amt >= RAIN_MM_THR).astype(np.int32)

            # Class weight = negatives / positives, never below 1.
            pos = float(ytr_evt.sum())
            neg = float(len(ytr_evt) - pos)
            spw = max(1.0, neg / max(pos, 1.0))

            # CLASSIFIER
            dtr.set_label(ytr_evt)
            dva.set_label(yva_evt)
            clf, dev_clf = train_with_device_fallback(_make_params_clf, dtr, dva, spw)

            p_va = predict_best(clf, dva)
            best_p_thr, val_f1 = tune_p_thr_on_val(yva_evt.astype(np.int32), p_va)
            p_te = predict_best(clf, dte)

            # REGRESSOR
            idx_tr = ytr_evt > 0.5
            idx_va = yva_evt > 0.5

            if USE_LOG1P_AMOUNT:
                ytr_amt_log = np.log1p(ytr_amt)
                yva_amt_log = np.log1p(yva_amt)
            else:
                ytr_amt_log = ytr_amt
                yva_amt_log = yva_amt

            if idx_tr.sum() < MIN_POS_TRAIN or idx_va.sum() < MIN_POS_VAL:
                # Too few rainy rows: fit the amount on every row instead.
                dtr.set_label(ytr_amt_log)
                dva.set_label(yva_amt_log)
                reg, dev_reg = train_with_device_fallback(_make_params_reg, dtr, dva)
            else:
                dtr_r = xgb.DMatrix(Xtr[idx_tr], label=ytr_amt_log[idx_tr])
                dva_r = xgb.DMatrix(Xva[idx_va], label=yva_amt_log[idx_va])
                reg, dev_reg = train_with_device_fallback(_make_params_reg, dtr_r, dva_r)
                # Per-horizon subsets are not reused; release them right away.
                del dtr_r, dva_r
            pred_log = predict_best(reg, dte)

            if USE_LOG1P_AMOUNT:
                yhat_amt = np.expm1(pred_log).astype(np.float32)
            else:
                yhat_amt = pred_log.astype(np.float32)
            yhat_amt = np.maximum(yhat_amt, 0.0)

            yhat_evt = (p_te >= best_p_thr).astype(np.int32)
            yhat = np.where(yhat_evt == 1, yhat_amt, 0.0).astype(np.float32)

            prec, rec, f1, tp, fp, fn = event_metrics(yte_evt, yhat_evt)

            # Save first so a report row never refers to model files that
            # failed to be written.
            clf.save_model(mp_clf)
            reg.save_model(mp_reg)
            rows.append({
                "location_id": loc_id,
                "location_name": name,
                "target": "rain", "h": h,
                "device_clf": dev_clf, "device_reg": dev_reg,
                "best_iter_clf": int(getattr(clf, "best_iteration", -1)),
                "best_iter_reg": int(getattr(reg, "best_iteration", -1)),
                "p_thr_tuned": best_p_thr,
                "use_log1p": USE_LOG1P_AMOUNT,
                "test_mae": mae(yhat, yte_amt),
                "test_rmse": rmse(yhat, yte_amt),
                "prec": prec, "rec": rec, "f1": f1, "tp": tp, "fp": fp, "fn": fn,
                "model_clf": mp_clf.name, "model_reg": mp_reg.name,
            })

            if h % 25 == 0 or h == h_end:
                print(f"[rain] {name} h={h:03d} p_thr={best_p_thr:.2f} mae={rows[-1]['test_mae']:.4f} f1={f1:.3f}")

            del clf, reg, p_te, p_va, yhat_amt, yhat, yhat_evt, pred_log
            gc.collect()
    finally:
        # Persist whatever finished, even when the loop was interrupted.
        if rows:
            new = pd.DataFrame(rows)
            out = pd.concat([old, new], ignore_index=True) if old is not None else new
            out.to_csv(report_path, index=False)
        else:
            out = old if old is not None else pd.DataFrame()

        del Xtr, Xva, Xte, Ytr, Yva, Yte, dtr, dva, dte, df_tr, df_va, df_te
        gc.collect()

    return out

# ============================================================
# 9) BINS SUMMARY (same as GRU/TCN/LightGBM)
# ============================================================
def summarize_bins(df: pd.DataFrame, target: str, bins=None) -> pd.DataFrame:
    """Average the per-horizon metrics of one report inside each horizon bin.

    Args:
        df: per-horizon report with an ``h`` column and any of the metric
            columns ``test_mae``, ``test_rmse``, ``f1``, ``prec``, ``rec``.
        target: target name written to the ``target`` column of the result.
        bins: iterable of inclusive ``(first_h, last_h)`` pairs. Defaults to
            the module-level ``BINS``.

    Returns:
        pd.DataFrame: one row per non-empty bin with ``target``,
        ``horizon_bin`` (``"a-b"``), ``n_horizons`` and a ``*_mean`` column for
        every metric present in ``df``. Empty when ``df`` is ``None`` or has no
        rows.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()
    if bins is None:
        bins = BINS

    # (report column, summary column); the order fixes the output column order.
    metric_map = (
        ("test_mae", "mae_mean"),
        ("test_rmse", "rmse_mean"),
        ("f1", "f1_mean"),
        ("prec", "prec_mean"),
        ("rec", "rec_mean"),
    )

    rows = []
    for a, b in bins:
        sub = df[(df["h"] >= a) & (df["h"] <= b)]
        if len(sub) == 0:
            continue

        row = {
            "target": target,
            "horizon_bin": f"{a}-{b}",
            "n_horizons": len(sub),
        }
        for src, dst in metric_map:
            if src in sub.columns:
                row[dst] = float(sub[src].mean())
        rows.append(row)

    return pd.DataFrame(rows)

def summarize_all_bins() -> pd.DataFrame:
    """Collect the horizon-bin summaries of every target/location report on disk.

    Reports that do not exist, or that produce an empty summary, are skipped.

    Returns:
        pd.DataFrame: all bin summaries stacked, each tagged with
        ``location_id`` and ``location_name``; empty when nothing was found.
    """
    collected = []
    for tkey in TARGETS:
        for loc_id in LOCATION_IDS:
            # Rain and non-rain reports share one naming scheme.
            report_path = REPORT_DIR / f"report_xgb_{tkey}_{loc_id}.csv"
            if not report_path.exists():
                continue

            bins_df = summarize_bins(pd.read_csv(report_path), tkey)
            if len(bins_df) == 0:
                continue

            bins_df["location_id"] = loc_id
            bins_df["location_name"] = loc_short_name(loc_id)
            collected.append(bins_df)

    return pd.concat(collected, ignore_index=True) if collected else pd.DataFrame()

# ============================================================
# 10) RUN - Train theo LOCATION_ID
# ============================================================
# Train every (target, location) pair and keep one leaderboard entry per pair.
summaries = []
for tkey in TARGETS:
    for loc_id in LOCATION_IDS:
        name = loc_short_name(loc_id)
        # Rain uses the two-stage model; every other target is plain regression.
        rep = (
            train_rain_2stage(loc_id, H_START, H_END)
            if tkey == "rain"
            else train_reg_1stage(loc_id, tkey, H_START, H_END)
        )

        s = {"location_id": loc_id, "location_name": name, "target": tkey}
        if rep is None or len(rep) == 0:
            s["status"] = "empty"
        else:
            s["status"] = "ok"
            s["n_rows"] = int(len(rep))
            # Mean over all reported horizons for each metric that is present.
            for _src, _dst in (
                ("test_mae", "test_mae_mean"),
                ("test_rmse", "test_rmse_mean"),
                ("f1", "f1_mean"),
            ):
                if _src in rep.columns:
                    s[_dst] = float(rep[_src].mean())
        summaries.append(s)

# ============================================================
# 11) SAVE RESULTS + BINS SUMMARY
# ============================================================
# One row per (target, location) with the mean metrics over all horizons.
leader = pd.DataFrame(summaries)
leader_path = REPORT_DIR / "xgb_leaderboard.csv"
leader.to_csv(leader_path, index=False)

# Generate bins summary
bins_summary = summarize_all_bins()
if len(bins_summary) > 0:
    bins_path = REPORT_DIR / "xgb_bins_summary.csv"
    bins_summary.to_csv(bins_path, index=False)

    # Aggregate bins across all locations.
    # Every metric is grouped by (target, horizon_bin) in one pass. The rain F1
    # used to be merged back on horizon_bin alone, which copied it onto the
    # rows of every other target; now targets without an F1 keep NaN there.
    _metric_cols = [c for c in ("mae_mean", "rmse_mean", "f1_mean") if c in bins_summary.columns]
    agg_bins = (
        bins_summary.groupby(["target", "horizon_bin"])[_metric_cols]
        .mean()
        .reset_index()
    )

    agg_bins_path = REPORT_DIR / "xgb_bins_aggregate.csv"
    agg_bins.to_csv(agg_bins_path, index=False)
    print("\n📊 BINS SUMMARY (aggregated):")
    print(agg_bins.to_string(index=False))

print("\n" + "="*60)
print("✅ DONE!")
print("="*60)
print("Models:", MODEL_DIR)
print("Reports:", REPORT_DIR)
if len(bins_summary) > 0:
    print("Saved bins summary:", bins_path)
    print("Saved bins aggregate:", agg_bins_path)
# Last expression of the cell: the notebook renders the leaderboard.
leader