## Ridge Regression (Kaggle) — Train by LOCATION_ID (20 Locations)

**Yêu cầu Dataset:**
- Chạy `fetch-demo-data-singlekeys.ipynb` trước (đã fetch 20 tỉnh/thành)
- Upload output thành Kaggle Dataset
- Add dataset vào notebook này

**Config:**
- LAG = 49h lookback
- HORIZON = 100h forecast (~4 ngày)
- 20 locations thay vì 34/63
- Ridge là mô hình nhẹ (lightweight), nên có thể train lần lượt tất cả locations trong một lần chạy (mỗi location/target một model riêng)

**Features:**
- Có thể chạy nhiều targets trong một lần (`temp`, `rain`, `u10`, `v10`, `rh`, `press`, `cloud`)
- Tự dò TAB_DIR và load location_ids từ metadata
- Fast multi-output: fit 1 lần cho tất cả horizons

In [None]:
# ============================================================
# Ridge Regression trainer - Train by LOCATION_ID from Kaggle Dataset
# 20 provinces/cities, LAG=49, HORIZON=100
# ============================================================

%pip install -q "scikit-learn==1.8.0"

import os, json, gc
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
import joblib

# ============================================================
# 0) RUN CONTROL (OPTIMIZED for 20 locations)
# ============================================================
# Target variables; one multi-output model is trained per (location, target).
TARGETS = ["temp", "rain", "u10", "v10", "rh", "press", "cloud"]

# Window sizes. Both values are embedded in the parquet file names that are
# loaded, so they must match the files produced by the data-prep step.
LAG = 49  # lookback, hours
H = 100   # forecast horizons stored per file, hours

# Inclusive, 1-based range of horizons to train.
H_START, H_END = 1, 100

# === LOCATION BATCHING ===
# Slice bounds into the full location list; a negative END_LOC_IDX selects
# everything from START_LOC_IDX onward.
START_LOC_IDX, END_LOC_IDX = 0, -1

# Logical split name -> split tag used inside the parquet file names.
SPLITS = dict(
    train="train_2021_2023",
    val="val_2024",
    test="test_2025_01_to_2025_11",
)

# Ridge hyper-parameters, passed straight to sklearn.linear_model.Ridge.
ALPHA = 1.0
FIT_INTERCEPT = True
RIDGE_SOLVER = "auto"
RIDGE_TOL = 1e-4

# When true, rain predictions are clamped at zero after the inverse transform.
CLIP_NONNEG_FOR_RAIN = True

# NOTE(review): SEED is not referenced anywhere in the visible code.
SEED = 42

# ============================================================
# 1) AUTO-DETECT DATA DIR + LOAD LOCATION_IDS
# ============================================================
INPUT_ROOT = Path("/kaggle/input")

def find_data_dir(root=None):
    """Locate the dataset's ``data`` directory below *root*.

    The search first looks for a known ``weather_<N>loc/data`` directory
    (20, 34, 63, then 4 locations, in that priority order). If none is found,
    it falls back to the parent of any ``data/tabular`` directory.

    Args:
        root: Directory to search recursively. Defaults to ``INPUT_ROOT``.

    Returns:
        Path of the detected data directory.

    Raises:
        FileNotFoundError: If nothing matching is found below *root*.
    """
    search_root = INPUT_ROOT if root is None else Path(root)

    # rglob yields entries in an unspecified order; sorting makes the choice
    # deterministic when more than one attached dataset matches a pattern.
    for pattern in ["weather_20loc/data", "weather_34loc/data", "weather_63loc/data", "weather_4loc/data"]:
        matches = sorted(p for p in search_root.rglob(pattern) if p.is_dir())
        if matches:
            return matches[0]

    tabular_dirs = sorted(p for p in search_root.rglob("data/tabular") if p.is_dir())
    if tabular_dirs:
        return tabular_dirs[0].parent

    # Message reads: "data directory not found".
    raise FileNotFoundError("Không tìm thấy data directory")

# Resolve the dataset root once, then derive the sub-directories used below:
# tabular/ holds the parquet files, meta/ holds locations.json.
DATA_DIR = find_data_dir()
TAB_DIR, META_DIR = DATA_DIR / "tabular", DATA_DIR / "meta"

print(f"DATA_DIR = {DATA_DIR}", f"TAB_DIR = {TAB_DIR}", sep="\n")

def load_location_ids(meta_dir=None, tab_dir=None):
    """Return ``(location_ids, id_to_name)`` for the dataset.

    Location ids are read from ``locations.json`` in the metadata directory.
    If that file has no ``location_ids`` key, the ids are derived from its
    ``locations`` entries. If the file is absent or yields no ids at all, the
    ids are recovered from the names of the train-split ``temp`` parquet
    files in the tabular directory, and the name mapping is empty.

    Args:
        meta_dir: Directory containing ``locations.json``. Defaults to
            ``META_DIR``.
        tab_dir: Directory containing the parquet files. Defaults to
            ``TAB_DIR``.

    Returns:
        A tuple ``(loc_ids, names)`` where ``names`` maps location id to a
        display name (empty when ids come from the file scan).
    """
    meta_root = META_DIR if meta_dir is None else Path(meta_dir)
    meta_file = meta_root / "locations.json"
    if meta_file.exists():
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding when names contain non-ASCII characters.
        with open(meta_file, encoding="utf-8") as f:
            meta = json.load(f)
        locations = meta.get("locations", [])
        # Tolerate metadata that lists locations without a separate id array.
        loc_ids = meta.get("location_ids") or [loc["location_id"] for loc in locations]
        if loc_ids:
            print(f"Loaded {len(loc_ids)} locations:")
            for loc in locations:
                print(f"  {loc['name']:15s} = {loc['location_id']}")
            return loc_ids, {loc["location_id"]: loc["name"] for loc in locations}
        # Metadata present but empty: fall through to the file scan.

    # Fallback: scan files
    tab_root = TAB_DIR if tab_dir is None else Path(tab_dir)
    suffix = f"_{SPLITS['train']}_tab_temp_lag{LAG}_h{H}.parquet"
    # Strip the known suffix instead of splitting on "_" so that ids which
    # themselves contain underscores are recovered intact.
    loc_ids = sorted({f.name[: -len(suffix)] for f in tab_root.glob(f"*{suffix}")})
    print(f"Found {len(loc_ids)} location_ids from files")
    return loc_ids, {}

# Full list of location ids plus an id -> display-name mapping.
LOCATION_IDS_ALL, LOC_NAMES = load_location_ids()

# === LOCATION BATCHING ===
# Restrict this run to a contiguous slice of the location list; a negative
# END_LOC_IDX means "through the end of the list".
_start = START_LOC_IDX
_end = len(LOCATION_IDS_ALL) if END_LOC_IDX < 0 else END_LOC_IDX
LOCATION_IDS = LOCATION_IDS_ALL[_start:_end]
print(f"[LOCATION BATCH] Using {len(LOCATION_IDS)}/{len(LOCATION_IDS_ALL)} locations (idx {_start}:{_end})")

# ============================================================
# 2) OUTPUT DIRS
# ============================================================
# All artifacts live under one working directory: fitted models and scalers
# in models/, per-run CSV reports in reports/.
OUT_DIR = Path("/kaggle/working/ridge_out_singlekeys_fast")
MODEL_DIR, REPORT_DIR = OUT_DIR / "models", OUT_DIR / "reports"

# Create both directories up front; existing ones are left untouched.
MODEL_DIR.mkdir(exist_ok=True, parents=True)
REPORT_DIR.mkdir(exist_ok=True, parents=True)

# ============================================================
# 3) IO HELPERS
# ============================================================
def ycol(h: int) -> str:
    """Return the target column name for horizon *h*, zero-padded to 3 digits."""
    return "y_t+{:03d}".format(h)

def file_path(loc_id: str, split_name: str, target_key: str) -> Path:
    """Build the parquet path for one (location, split, target) combination.

    The file name also encodes the module-level ``LAG`` and ``H`` settings.
    """
    filename = f"{loc_id}_{split_name}_tab_{target_key}_lag{LAG}_h{H}.parquet"
    return TAB_DIR / filename

def loc_short_name(loc_id: str) -> str:
    """Return the display name for *loc_id*, or its first 8 characters if unknown."""
    abbreviated = loc_id[:8]
    return LOC_NAMES.get(loc_id, abbreviated)

def get_schema_cols(path: Path):
    """Return the column names stored in the parquet file at *path*.

    The names are read from the parquet schema via pyarrow. If that fails for
    any reason, the file is loaded through pandas and the resulting frame's
    column list is returned instead.
    """
    try:
        import pyarrow.parquet as pq
        parquet_file = pq.ParquetFile(path)
        names = parquet_file.schema.names
    except Exception:
        # Best-effort fallback: read the file and take the frame's columns.
        frame = pd.read_parquet(path, engine="pyarrow", columns=None)
        names = frame.columns.tolist()
    return names

def load_XY_all(loc_id: str, split_name: str, target_key: str):
    """Load the feature matrix and all-horizon target matrix for one file.

    Feature columns are every column whose name contains ``"_lag"``; target
    columns are ``ycol(1)`` through ``ycol(H)``. Only those columns are read
    from disk.

    Args:
        loc_id: Location identifier used in the file name.
        split_name: Split tag used in the file name (a value of ``SPLITS``).
        target_key: Target variable key used in the file name.

    Returns:
        ``(X, Y, feat_cols, y_cols)`` where ``X`` and ``Y`` are float32
        arrays and the two lists give the column names in array order.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
        ValueError: If the file has no feature columns or lacks any of the
            expected target columns.
    """
    path = file_path(loc_id, split_name, target_key)
    if not path.exists():
        raise FileNotFoundError(f"Missing: {path}")

    cols = get_schema_cols(path)
    feat_cols = [c for c in cols if "_lag" in c]
    # Fail here with a clear message rather than later inside the scaler or
    # the model when handed a zero-width feature matrix.
    if not feat_cols:
        raise ValueError(f"{path.name}: no feature columns (expected names containing '_lag')")

    y_cols = [ycol(h) for h in range(1, H + 1)]
    available = set(cols)
    missing_y = [c for c in y_cols if c not in available]
    if missing_y:
        raise ValueError(f"{path.name}: missing {len(missing_y)} y columns")

    use_cols = feat_cols + y_cols
    df = pd.read_parquet(path, columns=use_cols)

    X = df[feat_cols].to_numpy(np.float32, copy=False)
    Y = df[y_cols].to_numpy(np.float32, copy=False)

    # Drop the frame eagerly to keep peak memory down between loads.
    del df
    gc.collect()
    return X, Y, feat_cols, y_cols

# ============================================================
# 4) METRICS
# ============================================================
def mae(yhat, y):
    """Mean absolute error between *yhat* and *y*, computed in float32."""
    predicted = np.asarray(yhat, np.float32)
    actual = np.asarray(y, np.float32)
    return float(np.abs(predicted - actual).mean())

def rmse(yhat, y):
    """Root mean squared error between *yhat* and *y*, computed in float32."""
    residual = np.asarray(yhat, np.float32) - np.asarray(y, np.float32)
    mean_squared = np.mean(np.square(residual))
    return float(np.sqrt(mean_squared))

# ============================================================
# 5) TRAIN (multi-output, one fit per loc_id/target)
# ============================================================
def train_ridge_multi(loc_id: str, target_key: str, h_start: int, h_end: int):
    """Fit one multi-output Ridge model for a location/target and report metrics.

    A single Ridge model is fit on standardized train features to predict all
    horizons ``h_start..h_end`` (inclusive, 1-based) at once. The scaler is
    fit on the train split only. For the ``"rain"`` target the model is
    trained on ``log1p`` of the targets; predictions are mapped back with
    ``expm1`` (and clamped at zero when ``CLIP_NONNEG_FOR_RAIN`` is set), so
    validation/test metrics are always computed on the original scale.

    The model, the scaler and a per-horizon CSV report are written under
    ``MODEL_DIR`` / ``REPORT_DIR``. If all three already exist, nothing is
    loaded or trained and the saved report is returned.

    Args:
        loc_id: Location identifier.
        target_key: Target variable key (e.g. ``"temp"``, ``"rain"``).
        h_start: First horizon to train, 1-based.
        h_end: Last horizon to train, inclusive.

    Returns:
        DataFrame with one row per horizon holding val/test MAE and RMSE.

    Raises:
        ValueError: If the horizon range is not within ``1..H`` or the data
            file is missing expected columns.
        FileNotFoundError: If a split's parquet file is missing.
    """
    # Reject bad ranges up front: an h_end beyond H would otherwise be
    # silently truncated by the slice and fail later with an IndexError.
    if not (1 <= h_start <= h_end <= H):
        raise ValueError(
            f"Invalid horizon range {h_start}-{h_end}; expected 1 <= h_start <= h_end <= {H}"
        )

    name = loc_short_name(loc_id)
    print(f"\n=== RIDGE MULTI-OUTPUT | {name} ({loc_id[:8]}...) target={target_key} h={h_start:03d}-{h_end:03d} ===")

    mdir = MODEL_DIR / target_key
    mdir.mkdir(parents=True, exist_ok=True)

    mp = mdir / f"ridge_{target_key}_{loc_id}_multi_h{h_start:03d}-{h_end:03d}.joblib"
    sp = mdir / f"scaler_{target_key}_{loc_id}_lag{LAG}_h{H}.joblib"
    report_path = REPORT_DIR / f"report_ridge_{target_key}_{loc_id}_h{h_start:03d}-{h_end:03d}.csv"

    # Resume check comes before any data is read, so a finished job costs
    # nothing beyond reading its report back.
    if mp.exists() and sp.exists() and report_path.exists():
        print("[resume] model+scaler+report exist, skip.")
        return pd.read_csv(report_path)

    Xtr, Ytr, _, _ = load_XY_all(loc_id, SPLITS["train"], target_key)
    Xva, Yva, _, _ = load_XY_all(loc_id, SPLITS["val"], target_key)
    Xte, Yte, _, _ = load_XY_all(loc_id, SPLITS["test"], target_key)

    # Column i of the Y matrices holds horizon i + 1.
    sl = slice(h_start - 1, h_end)
    Ytr_s = Ytr[:, sl]
    Yva_s = Yva[:, sl]
    Yte_s = Yte[:, sl]

    # === LOG1P TRANSFORM FOR RAIN (precipitation is heavily skewed) ===
    use_log1p = (target_key == "rain")
    if use_log1p:
        Ytr_s = np.log1p(Ytr_s)  # Train on log1p(precip)
        # Keep Yva_s, Yte_s in original scale for fair metric comparison

    # copy=False lets the scaler standardize the feature arrays in place.
    scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
    scaler.fit(Xtr)
    Xtr = scaler.transform(Xtr)
    Xva = scaler.transform(Xva)
    Xte = scaler.transform(Xte)

    model = Ridge(
        alpha=ALPHA,
        fit_intercept=FIT_INTERCEPT,
        solver=RIDGE_SOLVER,
        tol=RIDGE_TOL,
    )
    model.fit(Xtr, Ytr_s)

    pred_va = model.predict(Xva)
    pred_te = model.predict(Xte)

    # === INVERSE TRANSFORM FOR RAIN ===
    if use_log1p:
        pred_va = np.expm1(pred_va)  # expm1 = exp(x) - 1, inverse of log1p
        pred_te = np.expm1(pred_te)

    # Clip non-negative for rain (after inverse transform)
    if target_key == "rain" and CLIP_NONNEG_FOR_RAIN:
        pred_va = np.maximum(pred_va, 0.0)
        pred_te = np.maximum(pred_te, 0.0)

    rows = []
    for i, h in enumerate(range(h_start, h_end + 1)):
        rows.append({
            "location_id": loc_id,
            "location_name": name,
            "target": target_key,
            "h": h,
            "alpha": ALPHA,
            "val_mae": mae(pred_va[:, i], Yva_s[:, i]),
            "val_rmse": rmse(pred_va[:, i], Yva_s[:, i]),
            "test_mae": mae(pred_te[:, i], Yte_s[:, i]),
            "test_rmse": rmse(pred_te[:, i], Yte_s[:, i]),
            "model": mp.name,
        })

    rep = pd.DataFrame(rows)
    rep.to_csv(report_path, index=False)

    joblib.dump(model, mp, compress=3)
    joblib.dump(scaler, sp, compress=3)
    print(f"[saved] {mp.name}")

    # Release the large arrays before the next (location, target) is loaded.
    del Xtr, Xva, Xte, Ytr, Yva, Yte, Ytr_s, Yva_s, Yte_s, model, scaler, pred_va, pred_te
    gc.collect()
    return rep

# ============================================================
# 6) RUN - Train theo LOCATION_ID
# ============================================================
# Train every (target, location) pair and collect one summary row per model,
# averaging each metric over all trained horizons.
summ = []
for target in TARGETS:
    for loc_id in LOCATION_IDS:
        name = loc_short_name(loc_id)
        rep = train_ridge_multi(loc_id, target, H_START, H_END)
        summ.append({
            "location_id": loc_id,
            "location_name": name,
            "target": target,
            "n_rows": int(len(rep)),
            "val_mae_mean": float(rep["val_mae"].mean()),
            "test_mae_mean": float(rep["test_mae"].mean()),
            "test_rmse_mean": float(rep["test_rmse"].mean()),
        })

if not summ:
    print("[warn] No (target, location) pairs were trained; leaderboard is empty.")

# Explicit columns keep the sort below valid even when nothing was trained
# (an empty frame without columns would raise KeyError on sort_values).
leader = pd.DataFrame(
    summ,
    columns=[
        "location_id",
        "location_name",
        "target",
        "n_rows",
        "val_mae_mean",
        "test_mae_mean",
        "test_rmse_mean",
    ],
).sort_values(["test_mae_mean"], ascending=True)
leader_path = REPORT_DIR / "ridge_leaderboard.csv"
leader.to_csv(leader_path, index=False)

print("\n" + "="*60)
print("✅ DONE!")
print("="*60)
print("Saved leaderboard:", leader_path)
# Bare expression so the notebook renders the leaderboard as cell output.
leader