## LightGBM (Kaggle) — Train by LOCATION_ID (20 Locations)

**Yêu cầu Dataset:**
- Chạy `fetch-demo-data-singlekeys.ipynb` trước (đã fetch 20 tỉnh/thành)
- Upload output thành Kaggle Dataset
- Add dataset vào notebook này

**Config:**
- LAG = 49h lookback
- HORIZON = 100h forecast (~4 ngày)
- 20 locations thay vì 34/63

**Speed Optimizations:**
- `learning_rate = 0.08` (tăng từ 0.05) → hội tụ nhanh hơn
- `NUM_BOOST = 2000` (giảm từ 3000) → vẫn đủ với early stopping
- `EARLY_STOP = 100` (giảm từ 150) → check nhanh hơn
- `num_threads = all CPUs` → song song hóa
- `force_row_wise = True` → tối ưu cho dataset vừa
- Reuse Dataset objects → giảm overhead

**Bins Reporting (giống GRU/TCN):**
- `1-24h`: Ngắn hạn (1 ngày)
- `25-48h`: Trung hạn (1-2 ngày)
- `49-72h`: Trung-dài (2-3 ngày)
- `73-100h`: Dài hạn (3-4 ngày)

**Features:**
- Thử GPU rồi fallback CPU
- Tự dò TAB_DIR và load location_ids từ metadata
- Mưa 2-stage (event + amount)
- Có thể chạy 1 target hoặc tất cả

In [None]:
# ============================================================
# LightGBM trainer - Train by LOCATION_ID from Kaggle Dataset
# 20 provinces/cities, LAG=49, HORIZON=100
# OPTIMIZED: parallel training, bins reporting
# ============================================================

import importlib, sys, subprocess, os, json, gc
from pathlib import Path
from typing import List, Tuple

def pip_install(pkgs):
    """Quietly pip-install ``pkgs`` (a list of requirement strings).

    The install runs with the interpreter executing this notebook, and a
    failing pip invocation raises ``subprocess.CalledProcessError``.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-q"]
    subprocess.check_call(pip_command + pkgs)

# LightGBM is required: if the import fails for any reason, install the pinned
# version and import again (a second failure propagates).
try:
    import lightgbm as lgb
except Exception:
    pip_install(["lightgbm==4.6.0"])
    import lightgbm as lgb

# pyarrow is only probed here; after a fallback install it is not re-imported
# at module level (get_schema_cols imports pyarrow.parquet lazily and the
# pandas parquet reads below request the "pyarrow" engine or the default one).
try:
    import pyarrow
except Exception:
    pip_install(["pyarrow<20"])

import numpy as np
import pandas as pd

print("LightGBM version:", lgb.__version__)

# ============================================================
# 0) RUN CONTROL (OPTIMIZED)
# ============================================================
TARGETS_TO_RUN = "all"  # "all", a single key like "temp", or a list like ["temp","rain"]
USE_GPU = False         # True: try the "gpu" device first and fall back to "cpu"

LAG = 49                # 49h lookback (part of the input file names)
H   = 100               # 100h forecast (number of y_t+XXX columns per file)
H_START = 1             # first horizon to train (inclusive)
H_END   = 100           # last horizon to train (inclusive)

# === BINS for reporting (same as GRU/TCN): inclusive (first_h, last_h) ranges ===
BINS = ((1,24), (25,48), (49,72), (73,100))

# === LOCATION BATCHING: slice of the location list handled by this run ===
START_LOC_IDX = 0
END_LOC_IDX = -1        # -1 = all remaining

# Split label -> split name as embedded in the parquet file names.
SPLITS = {
    "train": "train_2021_2023",
    "val":   "val_2024",
    "test":  "test_2025_01_to_2025_11",
}

LOAD_Y_MODE = "all"     # "all": load every horizon column at once; otherwise one column per horizon
SEED = 42

RAIN_2STAGE = True      # rain uses the classifier + regressor pipeline instead of plain regression
RAIN_MM_THR = 0.1       # amounts >= this threshold count as a rain event
P_THR_CAND = np.round(np.linspace(0.05, 0.95, 19), 2).tolist()  # candidate probability thresholds
MIN_POS_TRAIN = 300     # minimum rain events in train to fit the regressor on events only
MIN_POS_VAL   = 50      # same requirement for the validation split
USE_LOG1P_AMOUNT = True # regress log1p(amount) and invert with expm1 at prediction time

CANON_KEYS = ["temp","rain","u10","v10","rh","press","cloud"]

if TARGETS_TO_RUN == "all":
    # Copy so that editing TARGETS later cannot alter CANON_KEYS.
    TARGETS = list(CANON_KEYS)
elif isinstance(TARGETS_TO_RUN, str):
    # A bare string is one target; list("temp") would yield ['t', 'e', 'm', 'p'].
    TARGETS = [TARGETS_TO_RUN]
else:
    TARGETS = list(TARGETS_TO_RUN)

# ============================================================
# 1) AUTO-DETECT DATA DIR + LOAD LOCATION_IDS
# ============================================================
# Root directory that find_data_dir() searches recursively for the dataset.
INPUT_ROOT = Path("/kaggle/input")

def find_data_dir():
    """Locate the dataset's ``data`` directory below ``INPUT_ROOT``.

    The known ``weather_*loc/data`` layouts are tried in priority order and
    the first matching directory wins. If none matches, the parent of the
    first ``data/tabular`` directory found is returned instead.

    Raises:
        FileNotFoundError: when neither search finds a directory.
    """
    known_layouts = (
        "weather_20loc/data",
        "weather_34loc/data",
        "weather_63loc/data",
        "weather_4loc/data",
    )
    for layout in known_layouts:
        match = next((cand for cand in INPUT_ROOT.rglob(layout) if cand.is_dir()), None)
        if match is not None:
            return match

    tabular = next((cand for cand in INPUT_ROOT.rglob("data/tabular") if cand.is_dir()), None)
    if tabular is not None:
        return tabular.parent

    raise FileNotFoundError("KhÃ´ng tÃ¬m tháº¥y data directory")

# Resolve the dataset root once, then derive the sub-directories read later:
# per-location parquet tables and the optional location metadata.
DATA_DIR = find_data_dir()
TAB_DIR, META_DIR = DATA_DIR / "tabular", DATA_DIR / "meta"

print(f"DATA_DIR = {DATA_DIR}\nTAB_DIR = {TAB_DIR}")

def load_location_ids():
    """Return ``(location_ids, id_to_name)`` for the dataset.

    Preferred source is ``META_DIR / "locations.json"``: ids come from its
    ``"location_ids"`` list, or from the entries under ``"locations"`` when
    that list is missing or empty; names come from ``"locations"``.

    If the metadata file is absent, or yields no ids at all, the ids are
    recovered from the training parquet files of the ``temp`` target in
    ``TAB_DIR`` (file-name prefix before the first underscore).

    Returns:
        tuple: a list of location ids and a dict mapping id -> display name
        (empty when no metadata names are available).
    """
    names = {}
    meta_file = META_DIR / "locations.json"
    if meta_file.exists():
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(meta_file, encoding="utf-8") as f:
            meta = json.load(f)
        locations = meta.get("locations", [])
        names = {loc["location_id"]: loc["name"] for loc in locations}
        # Without this fallback, a missing "location_ids" key silently means
        # "zero locations" even though "locations" lists them.
        loc_ids = meta.get("location_ids") or list(names)
        if loc_ids:
            print(f"Loaded {len(loc_ids)} locations:")
            for loc in locations:
                print(f"  {loc['name']:15s} = {loc['location_id']}")
            return loc_ids, names
        print(f"[warn] {meta_file} lists no locations -> scanning {TAB_DIR}")

    # NOTE(review): assumes a location id never contains "_" - confirm against
    # the naming used by the data-fetch notebook.
    files = TAB_DIR.glob(f"*_{SPLITS['train']}_tab_temp_lag{LAG}_h{H}.parquet")
    loc_ids = sorted({f.name.split("_")[0] for f in files})
    print(f"Found {len(loc_ids)} location_ids from files")
    return loc_ids, names

LOCATION_IDS_ALL, LOC_NAMES = load_location_ids()

# === LOCATION BATCHING ===
# Restrict this run to LOCATION_IDS_ALL[_start:_end]; a negative END_LOC_IDX
# means "through the last location".
_start = START_LOC_IDX
_end = len(LOCATION_IDS_ALL) if END_LOC_IDX < 0 else END_LOC_IDX
LOCATION_IDS = LOCATION_IDS_ALL[_start:_end]
print(
    f"[LOCATION BATCH] Using {len(LOCATION_IDS)}/{len(LOCATION_IDS_ALL)} locations (idx {_start}:{_end})"
)

# ============================================================
# 2) OUTPUT DIRS
# ============================================================
# Output layout: saved boosters under models/, CSV reports under reports/.
OUT_DIR = Path("/kaggle/working/lgb_out_singlekeys")
MODEL_DIR, REPORT_DIR = OUT_DIR / "models", OUT_DIR / "reports"
for _out_dir in (MODEL_DIR, REPORT_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# ============================================================
# 3) IO HELPERS
# ============================================================
def ycol(h: int):
    """Return the label column name for horizon ``h``, zero-padded to 3 digits."""
    return "y_t+{:03d}".format(h)

def file_path(loc_id: str, split_name: str, target_key: str) -> Path:
    """Build the parquet path in ``TAB_DIR`` for a location, split and target.

    The file name also embeds the global ``LAG`` and ``H`` settings.
    """
    file_name = f"{loc_id}_{split_name}_tab_{target_key}_lag{LAG}_h{H}.parquet"
    return TAB_DIR / file_name

def loc_short_name(loc_id: str) -> str:
    """Return the display name for ``loc_id``, or its first 8 characters if unnamed."""
    abbreviated = loc_id[:8]
    return LOC_NAMES[loc_id] if loc_id in LOC_NAMES else abbreviated

def get_schema_cols(path: Path):
    """Return the column names of the parquet file at ``path``.

    The names are read from the parquet schema via pyarrow; if that fails
    for any reason, the file is loaded with pandas and its columns are
    returned instead.
    """
    try:
        from pyarrow.parquet import ParquetFile
        column_names = ParquetFile(path).schema.names
    except Exception:
        frame = pd.read_parquet(path, engine="pyarrow", columns=None)
        column_names = frame.columns.tolist()
    return column_names

def load_XY(loc_id: str, split_name: str, target_key: str):
    """Load features (and optionally all labels) for one location/split/target.

    Feature columns are those whose name contains ``"_lag"``; label columns
    are ``ycol(1) .. ycol(H)``. Labels are loaded only when
    ``LOAD_Y_MODE == "all"``; otherwise ``Y`` is ``None``.

    Returns:
        tuple: ``(X, Y, feat_cols, y_cols, path)`` with ``X``/``Y`` as
        float32 arrays.

    Raises:
        FileNotFoundError: when the parquet file does not exist.
    """
    parquet_path = file_path(loc_id, split_name, target_key)
    if not parquet_path.exists():
        raise FileNotFoundError(f"Missing: {parquet_path}")

    feat_cols = [col for col in get_schema_cols(parquet_path) if "_lag" in col]
    y_cols = [ycol(step) for step in range(1, H + 1)]

    X = pd.read_parquet(parquet_path, columns=feat_cols).to_numpy(np.float32)
    if LOAD_Y_MODE == "all":
        Y = pd.read_parquet(parquet_path, columns=y_cols).to_numpy(np.float32)
    else:
        Y = None

    gc.collect()
    return X, Y, feat_cols, y_cols, parquet_path

def load_y_per_h(path: Path, y_name: str) -> np.ndarray:
    """Read the single label column ``y_name`` from ``path`` as a float32 array."""
    label_column = pd.read_parquet(path, columns=[y_name])[y_name]
    return label_column.to_numpy(np.float32)

# ============================================================
# 4) METRICS
# ============================================================
def mae(yhat, y):
    """Mean absolute error between predictions and targets, computed in float32."""
    predicted = np.asarray(yhat, np.float32)
    actual = np.asarray(y, np.float32)
    return float(np.mean(np.abs(predicted - actual)))

def rmse(yhat, y):
    """Root mean squared error between predictions and targets, computed in float32."""
    residual = np.asarray(yhat, np.float32) - np.asarray(y, np.float32)
    mean_sq = np.mean(residual * residual)
    return float(np.sqrt(mean_sq))

def event_metrics(y_true01, y_pred01):
    """Precision, recall and F1 of binary event predictions, plus raw counts.

    Both inputs are cast to int32 and compared against 0/1. A small epsilon
    in each denominator keeps the ratios finite when a count is zero.

    Returns:
        tuple: ``(precision, recall, f1, tp, fp, fn)``.
    """
    truth = np.asarray(y_true01).astype(np.int32)
    guess = np.asarray(y_pred01).astype(np.int32)

    truth_pos = truth == 1
    guess_pos = guess == 1
    tp = int(np.count_nonzero(truth_pos & guess_pos))
    fp = int(np.count_nonzero((truth == 0) & guess_pos))
    fn = int(np.count_nonzero(truth_pos & (guess == 0)))

    prec = tp / (tp + fp + 1e-9)
    rec = tp / (tp + fn + 1e-9)
    f1 = 2 * prec * rec / (prec + rec + 1e-9)
    return float(prec), float(rec), float(f1), tp, fp, fn

def tune_p_thr_on_val(y_true_evt: np.ndarray, p_pred: np.ndarray, candidates=P_THR_CAND):
    """Pick the probability threshold from ``candidates`` with the highest F1.

    Ties keep the earliest candidate. With no candidates the defaults
    ``(0.5, -1.0)`` are returned.

    Returns:
        tuple: ``(best_threshold, best_f1)``.
    """
    chosen_thr, top_f1 = 0.5, -1.0
    for cutoff in candidates:
        flagged = (p_pred >= cutoff).astype(np.int32)
        score = event_metrics(y_true_evt, flagged)[2]
        if not score > top_f1:
            continue
        chosen_thr, top_f1 = cutoff, score
    return chosen_thr, top_f1

# ============================================================
# 5) LightGBM params (SPEED OPTIMIZED for Kaggle)
# ============================================================
# Upper bound on boosting rounds (was 3000; early stopping usually ends sooner).
NUM_BOOST = 2000
# Early-stopping patience passed to lgb.early_stopping (was 150).
EARLY_STOP = 100

def base_common(device: str):
    """Return the LightGBM parameters shared by every model, for ``device``.

    ``num_threads`` is the detected CPU count (4 when it cannot be
    determined). A fresh dict is built on each call.
    """
    cpu_threads = os.cpu_count() or 4
    shared = dict(
        learning_rate=0.08,      # raised from 0.05 for faster convergence
        num_leaves=31,
        min_data_in_leaf=500,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l2=1.5,
        max_bin=127,
        seed=SEED,
        verbose=-1,
        device=device,
        num_threads=cpu_threads,  # use every CPU core
        force_row_wise=True,      # better for small-medium datasets
    )
    return shared

def params_reg(device: str):
    """Shared parameters plus the regression objective with RMSE as metric."""
    return {**base_common(device), "objective": "regression", "metric": ["rmse"]}

def params_clf(device: str):
    """Shared parameters plus the binary objective with log-loss as metric.

    AUC is deliberately not tracked (it was removed for speed).
    """
    return {**base_common(device), "objective": "binary", "metric": ["binary_logloss"]}

def lgb_train_try_gpu_then_cpu(params_fn, dtrain, dvalid):
    """Train a booster, trying the GPU first when ``USE_GPU`` is set.

    ``params_fn`` maps a device name (``"gpu"``/``"cpu"``) to a parameter
    dict. Training runs for at most ``NUM_BOOST`` rounds with early stopping
    on ``dvalid``. Any exception during the GPU attempt is reported and
    training is retried on the CPU.

    Returns:
        tuple: ``(booster, device_name_used)``.
    """
    callbacks = [
        lgb.early_stopping(EARLY_STOP, verbose=False),
        lgb.log_evaluation(period=1000),  # log rarely to keep output short
    ]

    def _fit(device):
        # One training run on `device`; both attempts share the callbacks list.
        return lgb.train(
            params_fn(device),
            dtrain,
            num_boost_round=NUM_BOOST,
            valid_sets=[dvalid],
            valid_names=["val"],
            callbacks=callbacks,
        )

    if USE_GPU:
        try:
            print("[device attempt] gpu")
            return _fit("gpu"), "gpu"
        except Exception as e:
            print("[warn] gpu failed -> fallback cpu\n", str(e)[:400])
    return _fit("cpu"), "cpu"

# ============================================================
# 6) TRAIN 1-stage regression (OPTIMIZED: batch all horizons)
# ============================================================
def train_reg_1stage(loc_id: str, target_key: str, h_start: int, h_end: int):
    """Train one LightGBM regressor per horizon for a location and target.

    For every horizon ``h`` in ``[h_start, h_end]`` whose model file does not
    exist yet, a regressor is fitted on the train split with early stopping
    on the validation split, scored on the test split (MAE / RMSE) and saved
    to ``MODEL_DIR / target_key``. Horizons with an existing model file are
    skipped (resume).

    The per-location report CSV is rewritten after every trained horizon, so
    an interrupted run keeps the metrics of the models it already saved.

    Args:
        loc_id: location identifier used in input and output file names.
        target_key: target variable key (part of the input file name).
        h_start: first horizon to train, inclusive.
        h_end: last horizon to train, inclusive.

    Returns:
        pandas.DataFrame: previous report rows (if any) plus the new rows;
        the previous report alone when nothing was trained; an empty frame
        when there is no report at all.

    Raises:
        FileNotFoundError: from ``load_XY`` when an input file is missing.
    """
    name = loc_short_name(loc_id)
    print(f"\n=== LGB 1-STAGE | {name} ({loc_id[:8]}...) target={target_key} h={h_start:03d}-{h_end:03d} ===")

    Xtr, Ytr, _, _, p_tr = load_XY(loc_id, SPLITS["train"], target_key)
    Xva, Yva, _, _, p_va = load_XY(loc_id, SPLITS["val"], target_key)
    Xte, Yte, _, _, p_te = load_XY(loc_id, SPLITS["test"], target_key)

    mdir = MODEL_DIR / target_key
    mdir.mkdir(parents=True, exist_ok=True)
    report_path = REPORT_DIR / f"report_lgb_{target_key}_{loc_id}.csv"
    old = pd.read_csv(report_path) if report_path.exists() else None

    # Label-free base datasets, created once and used as the reference for
    # every per-horizon dataset below.
    # NOTE(review): dataset-level parameters such as max_bin are presumably
    # taken from these reference datasets (built with defaults) rather than
    # from the training params - confirm against the LightGBM Dataset docs.
    dtrain_base = lgb.Dataset(Xtr, free_raw_data=False)
    dvalid_base = lgb.Dataset(Xva, reference=dtrain_base, free_raw_data=False)

    rows = []
    out = None
    for h in range(h_start, h_end + 1):
        mp = mdir / f"lgb_{target_key}_{loc_id}_h{h:03d}.txt"
        if mp.exists():
            continue  # resume: this horizon was trained in an earlier run

        if LOAD_Y_MODE == "all":
            # Column h-1 holds horizon h (labels were loaded for 1..H in order).
            ytr = Ytr[:, h-1]; yva = Yva[:, h-1]; yte = Yte[:, h-1]
        else:
            yname = ycol(h)
            ytr = load_y_per_h(p_tr, yname)
            yva = load_y_per_h(p_va, yname)
            yte = load_y_per_h(p_te, yname)

        # Same features, new labels: derive the datasets from the bases.
        dtrain = dtrain_base.create_valid(Xtr, label=ytr)
        dvalid = dvalid_base.create_valid(Xva, label=yva)

        booster, used_dev = lgb_train_try_gpu_then_cpu(params_reg, dtrain, dvalid)

        # best_iteration is falsy when early stopping never triggered.
        n_iter = booster.best_iteration or booster.current_iteration()
        pred_te = booster.predict(Xte, num_iteration=n_iter)

        # Save the model first, then persist its metrics right away: the
        # resume check only looks at model files, so a report written only
        # after the whole loop would lose rows if the run were interrupted.
        booster.save_model(str(mp))
        rows.append({
            "location_id": loc_id,
            "location_name": name,
            "target": target_key, "h": h,
            "device": used_dev,
            "best_iter": int(n_iter),
            "test_mae": mae(pred_te, yte),
            "test_rmse": rmse(pred_te, yte),
            "model": mp.name,
        })
        new = pd.DataFrame(rows)
        out = pd.concat([old, new], ignore_index=True) if old is not None else new
        out.to_csv(report_path, index=False)

        if h % 25 == 0 or h == h_end:
            print(f"[{target_key}] {name} h={h:03d} dev={used_dev} mae={rows[-1]['test_mae']:.4f}")

        del booster, pred_te
        gc.collect()

    del Xtr, Xva, Xte, Ytr, Yva, Yte, dtrain_base, dvalid_base
    gc.collect()

    if rows:
        return out
    print("[info] resume hit.")
    return old if old is not None else pd.DataFrame()

# ============================================================
# 7) TRAIN 2-stage rain (OPTIMIZED)
# ============================================================
def train_rain_2stage(loc_id: str, h_start: int, h_end: int):
    """Train the two-stage rain model (event classifier + amount regressor).

    For every horizon ``h`` in ``[h_start, h_end]`` that does not yet have
    both model files:

    1. A binary classifier predicts the rain event
       (``amount >= RAIN_MM_THR``); its decision threshold is tuned for F1
       on the validation split.
    2. A regressor predicts the amount (``log1p``-transformed when
       ``USE_LOG1P_AMOUNT``). It is fitted on event samples only, unless
       train or validation has fewer events than ``MIN_POS_TRAIN`` /
       ``MIN_POS_VAL``, in which case all samples are used.
    3. The test prediction is the regressed amount (clipped at 0) where the
       classifier flags an event, and 0 elsewhere.

    The per-location report CSV is rewritten after every trained horizon, so
    an interrupted run keeps the metrics of the models it already saved.

    Args:
        loc_id: location identifier used in input and output file names.
        h_start: first horizon to train, inclusive.
        h_end: last horizon to train, inclusive.

    Returns:
        pandas.DataFrame: previous report rows (if any) plus the new rows;
        the previous report alone when nothing was trained; an empty frame
        when there is no report at all.

    Raises:
        FileNotFoundError: from ``load_XY`` when an input file is missing.
    """
    name = loc_short_name(loc_id)
    print(f"\n=== LGB 2-STAGE RAIN | {name} ({loc_id[:8]}...) h={h_start:03d}-{h_end:03d} ===")

    Xtr, Ytr, _, _, p_tr = load_XY(loc_id, SPLITS["train"], "rain")
    Xva, Yva, _, _, p_va = load_XY(loc_id, SPLITS["val"], "rain")
    Xte, Yte, _, _, p_te = load_XY(loc_id, SPLITS["test"], "rain")

    mdir = MODEL_DIR / "rain"
    mdir.mkdir(parents=True, exist_ok=True)
    report_path = REPORT_DIR / f"report_lgb_rain_{loc_id}.csv"
    old = pd.read_csv(report_path) if report_path.exists() else None

    rows = []
    out = None
    for h in range(h_start, h_end + 1):
        mp_clf = mdir / f"lgb_rain_clf_{loc_id}_h{h:03d}.txt"
        mp_reg = mdir / f"lgb_rain_reg_{loc_id}_h{h:03d}.txt"
        if mp_clf.exists() and mp_reg.exists():
            continue  # resume: both stages already trained for this horizon

        if LOAD_Y_MODE == "all":
            # Column h-1 holds horizon h (labels were loaded for 1..H in order).
            ytr_amt = Ytr[:, h-1]; yva_amt = Yva[:, h-1]; yte_amt = Yte[:, h-1]
        else:
            yname = ycol(h)
            ytr_amt = load_y_per_h(p_tr, yname)
            yva_amt = load_y_per_h(p_va, yname)
            yte_amt = load_y_per_h(p_te, yname)

        # Binary event labels derived from the amounts.
        ytr_evt = (ytr_amt >= RAIN_MM_THR).astype(np.float32)
        yva_evt = (yva_amt >= RAIN_MM_THR).astype(np.float32)
        yte_evt = (yte_amt >= RAIN_MM_THR).astype(np.int32)

        # CLASSIFIER
        dtr_c = lgb.Dataset(Xtr, label=ytr_evt, free_raw_data=False)
        dva_c = lgb.Dataset(Xva, label=yva_evt, reference=dtr_c, free_raw_data=False)
        clf, dev_clf = lgb_train_try_gpu_then_cpu(params_clf, dtr_c, dva_c)

        # best_iteration is falsy when early stopping never triggered.
        clf_iter = clf.best_iteration or clf.current_iteration()
        p_va_prob = clf.predict(Xva, num_iteration=clf_iter)
        # The threshold is chosen on validation only; test stays untouched.
        best_p_thr, val_f1 = tune_p_thr_on_val(yva_evt.astype(np.int32), p_va_prob)
        p_te_prob = clf.predict(Xte, num_iteration=clf_iter)

        # REGRESSOR
        idx_tr = ytr_evt > 0.5
        idx_va = yva_evt > 0.5

        if USE_LOG1P_AMOUNT:
            ytr_lbl = np.log1p(ytr_amt)
            yva_lbl = np.log1p(yva_amt)
        else:
            ytr_lbl = ytr_amt
            yva_lbl = yva_amt

        if idx_tr.sum() < MIN_POS_TRAIN or idx_va.sum() < MIN_POS_VAL:
            # Too few rain events: fit the regressor on all samples instead.
            dtr_r = lgb.Dataset(Xtr, label=ytr_lbl, free_raw_data=False)
            dva_r = lgb.Dataset(Xva, label=yva_lbl, reference=dtr_r, free_raw_data=False)
        else:
            dtr_r = lgb.Dataset(Xtr[idx_tr], label=ytr_lbl[idx_tr], free_raw_data=False)
            dva_r = lgb.Dataset(Xva[idx_va], label=yva_lbl[idx_va], reference=dtr_r, free_raw_data=False)

        reg, dev_reg = lgb_train_try_gpu_then_cpu(params_reg, dtr_r, dva_r)
        reg_iter = reg.best_iteration or reg.current_iteration()
        pred_log = reg.predict(Xte, num_iteration=reg_iter)

        if USE_LOG1P_AMOUNT:
            yhat_amt = np.expm1(pred_log).astype(np.float32)  # invert log1p
        else:
            yhat_amt = pred_log.astype(np.float32)
        yhat_amt = np.maximum(yhat_amt, 0.0)  # amounts cannot be negative

        # Combine the stages: amount where an event is predicted, else 0.
        yhat_evt = (p_te_prob >= best_p_thr).astype(np.int32)
        yhat = np.where(yhat_evt == 1, yhat_amt, 0.0).astype(np.float32)

        prec, rec, f1, tp, fp, fn = event_metrics(yte_evt, yhat_evt)

        # Save the models first, then persist their metrics right away: the
        # resume check only looks at model files, so a report written only
        # after the whole loop would lose rows if the run were interrupted.
        clf.save_model(str(mp_clf))
        reg.save_model(str(mp_reg))

        rows.append({
            "location_id": loc_id,
            "location_name": name,
            "target": "rain", "h": h,
            "device_clf": dev_clf, "device_reg": dev_reg,
            "best_iter_clf": int(clf_iter),
            "best_iter_reg": int(reg_iter),
            "p_thr_tuned": best_p_thr,
            "use_log1p": USE_LOG1P_AMOUNT,
            "test_mae": mae(yhat, yte_amt),
            "test_rmse": rmse(yhat, yte_amt),
            "prec": prec, "rec": rec, "f1": f1, "tp": tp, "fp": fp, "fn": fn,
            "model_clf": mp_clf.name, "model_reg": mp_reg.name,
        })
        new = pd.DataFrame(rows)
        out = pd.concat([old, new], ignore_index=True) if old is not None else new
        out.to_csv(report_path, index=False)

        if h % 25 == 0 or h == h_end:
            print(f"[rain] {name} h={h:03d} p_thr={best_p_thr:.2f} mae={rows[-1]['test_mae']:.4f} f1={f1:.3f}")

        del clf, reg, dtr_c, dva_c, dtr_r, dva_r, p_te_prob, p_va_prob, yhat_amt, yhat, yhat_evt, pred_log
        gc.collect()

    del Xtr, Xva, Xte, Ytr, Yva, Yte
    gc.collect()

    if rows:
        return out
    print("[info] resume hit.")
    return old if old is not None else pd.DataFrame()

# ============================================================
# 8) BINS SUMMARY (same as GRU/TCN)
# ============================================================
def summarize_bins(df: pd.DataFrame, target: str, bins=None) -> pd.DataFrame:
    """Average the per-horizon metrics of ``df`` inside each horizon bin.

    Args:
        df: per-horizon report with an ``"h"`` column and any of the metric
            columns ``test_mae``, ``test_rmse``, ``f1``, ``prec``, ``rec``.
        target: value written to the ``"target"`` column of every row.
        bins: iterable of inclusive ``(first_h, last_h)`` pairs; defaults to
            the module-level ``BINS``.

    Returns:
        pandas.DataFrame: one row per non-empty bin with ``target``,
        ``horizon_bin`` (``"a-b"``), ``n_horizons`` (rows in the bin) and a
        ``*_mean`` column per metric present in ``df``. Empty when ``df`` is
        ``None`` or has no rows.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    if bins is None:
        bins = BINS

    # Source metric column -> name of its per-bin mean in the output.
    metric_columns = (
        ("test_mae", "mae_mean"),
        ("test_rmse", "rmse_mean"),
        ("f1", "f1_mean"),
        ("prec", "prec_mean"),
        ("rec", "rec_mean"),
    )

    rows = []
    for a, b in bins:
        sub = df[(df["h"] >= a) & (df["h"] <= b)]
        if len(sub) == 0:
            continue

        row = {
            "target": target,
            "horizon_bin": f"{a}-{b}",
            "n_horizons": len(sub),
        }
        for source_col, mean_col in metric_columns:
            if source_col in sub.columns:
                row[mean_col] = float(sub[source_col].mean())
        rows.append(row)

    return pd.DataFrame(rows)

def summarize_all_bins() -> pd.DataFrame:
    """Collect the binned summaries of every (target, location) report on disk.

    Pairs without a report file, or whose report gives an empty summary, are
    skipped. Each summary is tagged with ``location_id`` and
    ``location_name``. Returns an empty frame when nothing was collected.
    """
    collected = []

    for tkey in TARGETS:
        for loc_id in LOCATION_IDS:
            # The rain report follows the same naming pattern as the other
            # targets, so one template covers both cases.
            report_path = REPORT_DIR / f"report_lgb_{tkey}_{loc_id}.csv"
            if not report_path.exists():
                continue

            per_location = summarize_bins(pd.read_csv(report_path), tkey)
            if len(per_location) == 0:
                continue

            per_location["location_id"] = loc_id
            per_location["location_name"] = loc_short_name(loc_id)
            collected.append(per_location)

    if collected:
        return pd.concat(collected, ignore_index=True)
    return pd.DataFrame()

# ============================================================
# 9) RUN - Train theo LOCATION_ID
# ============================================================
# Train every (target, location) pair and keep one leaderboard record each.
summaries = []
for tkey in TARGETS:
    for loc_id in LOCATION_IDS:
        name = loc_short_name(loc_id)
        # Rain goes through the two-stage pipeline when enabled; every other
        # target is a plain per-horizon regression.
        rep = (
            train_rain_2stage(loc_id, H_START, H_END)
            if tkey == "rain" and RAIN_2STAGE
            else train_reg_1stage(loc_id, tkey, H_START, H_END)
        )

        s = {"location_id": loc_id, "location_name": name, "target": tkey}
        if rep is None or len(rep) == 0:
            s["status"] = "empty"
        else:
            s["status"] = "ok"
            s["n_rows"] = int(len(rep))
            # Average each metric the report provides (f1 only exists for rain).
            for _metric in ("test_mae", "test_rmse", "f1"):
                if _metric in rep.columns:
                    s[f"{_metric}_mean"] = float(rep[_metric].mean())
        summaries.append(s)

# ============================================================
# 10) SAVE RESULTS + BINS SUMMARY
# ============================================================
# Leaderboard: one row per (target, location) trained above.
leader = pd.DataFrame(summaries)
leader_path = REPORT_DIR / "lgb_leaderboard.csv"
leader.to_csv(leader_path, index=False)

# Generate bins summary
bins_summary = summarize_all_bins()
if len(bins_summary) > 0:
    bins_path = REPORT_DIR / "lgb_bins_summary.csv"
    bins_summary.to_csv(bins_path, index=False)

    # Aggregate bins across all locations, per (target, horizon_bin).
    # f1_mean is averaged inside the same groupby: merging a rain-only F1
    # table on horizon_bin alone would copy rain's F1 onto every other
    # target. Targets without F1 values simply get NaN.
    _agg_cols = [c for c in ("mae_mean", "rmse_mean", "f1_mean") if c in bins_summary.columns]
    agg_bins = (
        bins_summary.groupby(["target", "horizon_bin"])[_agg_cols]
        .mean()
        .reset_index()
    )

    agg_bins_path = REPORT_DIR / "lgb_bins_aggregate.csv"
    agg_bins.to_csv(agg_bins_path, index=False)
    print("\nðŸ“Š BINS SUMMARY (aggregated):")
    print(agg_bins.to_string(index=False))

print("\n" + "="*60)
print("âœ… DONE!")
print("="*60)
print("Saved leaderboard:", leader_path)
if len(bins_summary) > 0:
    print("Saved bins summary:", bins_path)
    print("Saved bins aggregate:", agg_bins_path)
# Last expression of the cell: the notebook displays the leaderboard.
leader