# Advanced Real Estate Demand Prediction: An End-to-End Kaggle Workflow

This document breaks down an advanced, single-script solution for a time-series regression competition, likely the "China Real Estate Demand Prediction" on Kaggle. The goal is to predict the `new_house_transaction_amount`.

The workflow is sophisticated and demonstrates a professional approach, including:
* **Extensive Feature Engineering:** Creating time-based, lag, and rolling features.
* **Robust Cross-Validation:** Using a time-based rolling split appropriate for time-series data.
* **Multi-Model Strategy:** Training multiple high-performance models like LightGBM, XGBoost, and CatBoost.
* **Multi-Target Approach:** Building separate models for `price` and `area` in addition to a model for the final `amount`.
* **Advanced Ensembling:** Combining predictions using a Weighted Geometric Mean and a final meta-blend.
* **Multi-Stage Post-Processing:** Applying a series of heuristic rules to refine the final predictions and maximize the competition score.


In [30]:
import os, warnings, time
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

# Try import boosters
try:
    import lightgbm as lgb
    LGB = True
except Exception:
    LGB = False

SEED = 42
np.random.seed(SEED)

# Paths
INPUT_DIR = "data"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

print("INPUT_DIR:", INPUT_DIR, "OUT_DIR:", OUT_DIR, "LightGBM available:", LGB)

# ---------- helpers ----------
def safe_read(path):
    return pd.read_csv(path) if os.path.exists(path) else None

def two_stage_score(y_true, y_pred):
    eps = 1e-12
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    ape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), eps)
    frac_bad = np.mean(ape > 1.0)
    if frac_bad > 0.30:
        return 0.0
    mask = (ape <= 1.0)
    if mask.sum() == 0:
        return 0.0
    mape = np.mean(ape[mask])
    return 1.0 - (mape / max(mask.mean(), 1e-12))

def evaluate_preds(y_true, y_pred, label="eval"):
    s = two_stage_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{label} | two-stage: {s:.6f} | MAE: {mae:.2f}")
    return s, mae

def wgeom_stack(preds, weights, eps=1e-9):
    # preds: (n_models, n_samples), weights: (n_models,)
    logs = np.log(np.clip(preds, eps, None))
    return np.exp(np.tensordot(weights, logs, axes=(0,0)))

# ---------- load base datasets ----------
print("Loading base datasets...")
train_new = safe_read(os.path.join(INPUT_DIR, "train/new_house_transactions.csv"))
train_pre = safe_read(os.path.join(INPUT_DIR, "train/pre_owned_house_transactions.csv"))
train_land = safe_read(os.path.join(INPUT_DIR, "train/land_transactions.csv"))
train_new_near = safe_read(os.path.join(INPUT_DIR, "train/new_house_transactions_nearby_sectors.csv"))
train_pre_near = safe_read(os.path.join(INPUT_DIR, "train/pre_owned_house_transactions_nearby_sectors.csv"))
train_land_near = safe_read(os.path.join(INPUT_DIR, "train/land_transactions_nearby_sectors.csv"))
poi = safe_read(os.path.join(INPUT_DIR, "train/sector_POI.csv"))
city_idx = safe_read(os.path.join(INPUT_DIR, "train/city_indexes.csv"))
test = safe_read(os.path.join(INPUT_DIR, "test.csv"))

if train_new is None or test is None:
    raise RuntimeError("Essential files missing in INPUT_DIR")

# Ensure sector/month present in test by extracting from id
if "sector" not in test.columns:
    test = test.copy()
    test["sector"] = test["id"].str.extract(r"sector\s*(\d+)")[0]
if "month" not in test.columns:
    test["month"] = test["id"].str.extract(r"(\d{4}\s+\w+)")[0]

# ---------- build master training table m ----------
def merge_safe(base, other, prefix):
    if other is None:
        return base
    o = other.copy()
    for c in o.columns:
        if c not in ["month", "sector"]:
            o.rename(columns={c: f"{prefix}_{c}"}, inplace=True)
    return base.merge(o, on=["month","sector"], how="left")

m = train_new.copy()
m = merge_safe(m, train_pre, "pre")
m = merge_safe(m, train_land, "land")
m = merge_safe(m, train_new_near, "newnear")
m = merge_safe(m, train_pre_near, "prenear")
m = merge_safe(m, train_land_near, "landnear")

if poi is not None:
    poi2 = poi.copy()
    # ensure comparable sector dtype
    try:
        poi2["sector"] = poi2["sector"].astype(m["sector"].dtype)
    except Exception:
        poi2["sector"] = poi2["sector"].astype(str)
        m["sector"] = m["sector"].astype(str)
    m = m.merge(poi2, on="sector", how="left")

if city_idx is not None and "month" in city_idx.columns:
    m = m.merge(city_idx, on="month", how="left")

m = m.fillna(0)

# derive amount if missing
if "amount_new_house_transactions" not in m.columns:
    if "area_new_house_transactions" in m.columns and "price_new_house_transactions" in m.columns:
        m["amount_new_house_transactions"] = m["area_new_house_transactions"] * m["price_new_house_transactions"]
    else:
        m["amount_new_house_transactions"] = 0.0

# ---------- Combine train and test for feature building ----------
print("Combining train+test for feature creation (important for valid test lags)...")
# prepare a DataFrame for features: use same columns month, sector
test_small = test[["id","month","sector"]].copy()
test_small = test_small.rename(columns={"id":"id_test"})
# create concat frame where train rows come first in chronological order then test months appended after
# Build month order: ensure months in train are ordered, add unseen test months after them
train_months = sorted(m["month"].astype(str).unique().tolist())
test_months = [x for x in sorted(test_small["month"].astype(str).unique().tolist()) if x not in train_months]
all_months = train_months + test_months
mo2i = {mo:i for i,mo in enumerate(all_months)}
# build combined table
m_comb = m.copy()
m_comb["is_test"] = 0
m_comb["id"] = m_comb["month"].astype(str) + "_sector_" + m_comb["sector"].astype(str)
# append test rows with placeholder numeric columns (zeros) - we will add features for them
test_rows = test_small.copy()
# create numeric placeholders for key columns to allow groupby lags (they'll be zero but shift will still work)
for col in ["amount_new_house_transactions","area_new_house_transactions","price_new_house_transactions","num_new_house_transactions"]:
    test_rows[col] = 0.0
test_rows = test_rows.rename(columns={"id_test":"id"})
test_rows["is_test"] = 1
# align dtypes
test_rows["sector"] = test_rows["sector"].astype(m_comb["sector"].dtype)

# ensure all columns from m_comb exist in test_rows
for col in m_comb.columns:
    if col not in test_rows.columns:
        test_rows[col] = 0.0

# reorder columns for exact alignment
test_rows = test_rows[m_comb.columns]

# now safely concatenate
m_all = pd.concat([m_comb, test_rows], axis=0, ignore_index=True, sort=False)
m_all = m_all.fillna(0)

# add month_str and month_code consistent
m_all["month_str"] = m_all["month"].astype(str)
m_all["month_code"] = m_all["month_str"].map(mo2i).fillna(-1).astype(int)

# ---------- Feature engineering on combined DF ----------
print("Creating lag/rolling and derived features on combined data...")
m_all = m_all.sort_values(["sector","month_code"]).reset_index(drop=True)

# create lag features for key columns
def add_lags_rolls(df, cols, group="sector", lags=[1,2,3], rolls=[3]):
    df = df.copy()
    for c in cols:
        if c in df.columns:
            for lag in lags:
                df[f"{c}_lag{lag}"] = df.groupby(group)[c].shift(lag).fillna(0)
            for r in rolls:
                df[f"{c}_roll{r}"] = df.groupby(group)[c].rolling(r, min_periods=1).mean().reset_index(level=0, drop=True)
            df[f"{c}_last"] = df.groupby(group)[c].shift(1).fillna(0)
    return df

key_cols = []
for col in ["amount_new_house_transactions","area_new_house_transactions","num_new_house_transactions","price_new_house_transactions"]:
    if col in m_all.columns:
        key_cols.append(col)
m_all = add_lags_rolls(m_all, key_cols)

# ewm features
m_all["amount_ewm3"] = m_all.groupby("sector")["amount_new_house_transactions"].transform(lambda s: s.ewm(span=3, adjust=False).mean()).fillna(0)
# month cyclical
m_all["month_sin"] = np.sin(2*np.pi * (m_all["month_code"]%12)/12)
m_all["month_cos"] = np.cos(2*np.pi * (m_all["month_code"]%12)/12)

# ratio features
def safe_div(a,b):
    return np.where((b==0)|(np.isnan(b)), 0.0, a/(b+1e-9))

if "price_new_house_transactions" in m_all.columns and "area_new_house_transactions" in m_all.columns:
    m_all["price_area_ratio"] = safe_div(m_all["price_new_house_transactions"], m_all["area_new_house_transactions"])

# fillna
m_all = m_all.fillna(0)

# ---------- Split back to train features and test features ----------
train_mask = (m_all["is_test"]==0)
test_mask = (m_all["is_test"]==1)
m_feat = m_all.loc[train_mask].reset_index(drop=True)
test_feat = m_all.loc[test_mask].reset_index(drop=True)

# ensure same columns, dedupe columns
m_feat = m_feat.loc[:, ~m_feat.columns.duplicated()]
test_feat = test_feat.loc[:, ~test_feat.columns.duplicated()]

# ---------- Build final feature list ----------
exclude = {"id","is_test","month","month_str"}
numcols = [c for c in m_feat.select_dtypes(include=[np.number]).columns if c not in exclude]
# ensure month_code and sector at front and sector_code
m_feat["sector_code"] = pd.factorize(m_feat["sector"].astype(str))[0]
test_feat["sector_code"] = pd.factorize(test_feat["sector"].astype(str))[0]
if "month_code" in m_feat.columns:
    features = ["month_code","sector_code"] + [c for c in numcols if c not in ("month_code","sector_code")]
else:
    features = ["sector_code"] + [c for c in numcols if c!="sector_code"]
# final dedupe
features = [f for i,f in enumerate(features) if f in m_feat.columns and f not in features[:i]]

print("Final feature count:", len(features))

# ---------- Prepare CV folds (rolling by months) ----------
months_sorted = sorted(m_feat["month_str"].unique().tolist())
n_months = len(months_sorted)
folds = []
N_FOLDS = 5
for i in range(N_FOLDS):
    train_end = int((i+1) * n_months / (N_FOLDS+1))
    val_start = train_end
    val_end = min(train_end + max(1, n_months//(N_FOLDS+1)), n_months)
    if train_end==0 or val_start>=val_end: continue
    folds.append((months_sorted[:train_end], months_sorted[val_start:val_end]))
print("Created folds:", len(folds))

# ---------- Train diverse LightGBM variants with rolling CV for direct amount target ----------
target = "amount_new_house_transactions"
X_all = m_feat[features].fillna(0)
y_all = m_feat[target].values
# We'll keep per-model OOF and test preds list
model_test_preds = []
model_oof_preds = []  # oof per model
model_scores = []
model_names = []

# Model variants to train (tuned for diversity)
lgb_variants = [
    {"num_leaves":64,"learning_rate":0.03,"min_data_in_leaf":20},
    {"num_leaves":128,"learning_rate":0.02,"min_data_in_leaf":50},
    {"num_leaves":31,"learning_rate":0.05,"min_data_in_leaf":10},
    {"num_leaves":200,"learning_rate":0.01,"min_data_in_leaf":80},
]

print("Training LightGBM variants with rolling CV (this may take some minutes)...")
for vid, params_extra in enumerate(lgb_variants):
    print(f"\n=== Variant {vid} params: {params_extra} ===")
    oof = np.zeros(len(X_all))
    test_preds_folds = []  # average across folds
    fold_scores = []
    for fold_idx, (tr_ms, vl_ms) in enumerate(folds):
        tr_mask = m_feat["month_str"].isin(tr_ms)
        vl_mask = m_feat["month_str"].isin(vl_ms)
        if tr_mask.sum() < 30 or vl_mask.sum() < 1:
            print("Skipping small fold")
            continue
        X_tr = X_all.loc[tr_mask].values
        y_tr = y_all[tr_mask]
        X_val = X_all.loc[vl_mask].values
        y_val = y_all[vl_mask]
        # log target to stabilize
        y_tr_log = np.log1p(np.clip(y_tr, 0, None))
        y_val_log = np.log1p(np.clip(y_val, 0, None))
        dtr = lgb.Dataset(X_tr, label=y_tr_log)
        dval = lgb.Dataset(X_val, label=y_val_log, reference=dtr)
        lgb_params = {
            "objective":"regression",
            "metric":"mae",
            "verbosity":-1,
            "seed": SEED + vid + fold_idx
        }
        lgb_params.update(params_extra)
        # train with safe callback style
        try:
            bst = lgb.train(lgb_params, dtr, num_boost_round=1500, valid_sets=[dval],
                            callbacks=[lgb.early_stopping(stopping_rounds=80), lgb.log_evaluation(period=0)])
        except TypeError:
            bst = lgb.train(lgb_params, dtr, num_boost_round=1500, valid_sets=[dval], verbose_eval=0, early_stopping_rounds=80)
        # predict
        p_val_log = bst.predict(X_val, num_iteration=bst.best_iteration)
        p_val = np.expm1(np.clip(p_val_log, -20, 50))
        oof[vl_mask] = p_val
        test_X = align_test_with_features = None  # placeholder - we'll compute test_x below
        # prepare test matrix aligned
        X_test_mat = test_feat[features].fillna(0).values
        p_test = np.expm1(np.clip(bst.predict(X_test_mat, num_iteration=bst.best_iteration), -20, 50))
        test_preds_folds.append(p_test)
        # evaluate fold
        sc = two_stage_score(y_val, p_val)
        fold_scores.append(sc)
        print(f"  fold{fold_idx} score: {sc:.4f}")
    # aggregate test predictions for this variant (mean across folds)
    if len(test_preds_folds) == 0:
        continue
    test_pred_variant = np.mean(np.vstack(test_preds_folds), axis=0)
    model_test_preds.append(test_pred_variant)
    model_oof_preds.append(oof)
    # model score: mean fold score (or two_stage on available oof vs true)
    model_score = two_stage_score(y_all, oof)
    model_scores.append(max(0.0, model_score))
    model_names.append(f"lgb_var{vid}")
    print(f"Variant {vid} OOF two-stage: {model_score:.4f}")

# Safety: if no model trained (rare), fallback to simple median
if len(model_test_preds) == 0:
    print("No models trained; falling back to sector median submission.")
    median_by_sector = m.groupby("sector")["amount_new_house_transactions"].median().to_dict()
    sub = pd.DataFrame({"id": test["id"].values})
    sub["sector"] = test["sector"].astype(float)
    sub["new_house_transaction_amount"] = sub["sector"].map(median_by_sector).fillna(0.0)
    outp = os.path.join(OUT_DIR, "submission_v8.csv")
    sub[["id","new_house_transaction_amount"]].to_csv(outp, index=False)
    print("Saved", outp)
    raise SystemExit("Exited: used fallback")

# ---------- Build WGME ensemble across models using model_scores as weights ----------
preds_stack = np.vstack(model_test_preds)  # shape (n_models, n_test)
raw_weights = np.array(model_scores, dtype=float)
# ensure non-zero weights
if raw_weights.sum() == 0:
    raw_weights = np.ones_like(raw_weights)
# apply slight decay to favor simpler earlier variants or provide numeric stability
decay = 0.98
decay_factors = decay ** np.arange(len(raw_weights))
raw_weights = raw_weights * decay_factors
weights = raw_weights / raw_weights.sum()
print("Model names:", model_names)
print("Model scores:", [round(s,4) for s in model_scores])
print("WGME weights:", np.round(weights,3))

pred_wgme = wgeom_stack(preds_stack, weights)
pred_arith = preds_stack.mean(axis=0)

# ---------- Compute OOF combined using geometric mean of OOFs for calibration ----------
oof_stack = np.vstack(model_oof_preds)  # (n_models, n_train)
# compute WGME oof per training row (weights same)
oof_wgme = wgeom_stack(oof_stack, weights)
# compute calibration scale to align units: median(true) / median(oof_wgme)
true_train = y_all
# Avoid division by zero: require median_oof > 0
median_true = np.median(true_train[true_train>0]) if np.any(true_train>0) else np.median(true_train)
median_oof = np.median(oof_wgme[oof_wgme>0]) if np.any(oof_wgme>0) else np.median(oof_wgme)
if median_oof <= 0 or np.isnan(median_oof):
    scale_factor = 1.0
else:
    scale_factor = float(median_true / median_oof)
print(f"Calibration scale factor derived from OOF medians: {scale_factor:.6f}")

# Apply scale to test preds
final_pred = pred_wgme * scale_factor

# ---------- Postprocessing (order matters) ----------
sub = pd.DataFrame({"id": test["id"].values, "new_house_transaction_amount": final_pred})
# 1) Ensure units: if predicted mean is huge, scale down (extra safety)
mean_pred = sub["new_house_transaction_amount"].mean()
if mean_pred > 1e6:
    print("Large mean detected, scaling down by 10000 to match 10k yuan unit")
    sub["new_house_transaction_amount"] = sub["new_house_transaction_amount"] / 10000.0

# 2) Sector and month extraction
sub["sector"] = test["sector"].astype(float)
sub["month"] = test["month"].astype(str)

# 3) Sector median fallback for very small predictions (relative threshold)
sector_med = m.groupby("sector")["amount_new_house_transactions"].median().to_dict()
sub["sector_median"] = sub["sector"].map(sector_med)
# Replace very tiny preds with 0.8 * sector median if sector_median exists
mask_small = (sub["new_house_transaction_amount"] < sub["sector_median"] * 0.05) & (sub["sector_median"].notna())
sub.loc[mask_small, "new_house_transaction_amount"] = sub.loc[mask_small, "sector_median"] * 0.8
print("Sector fallback replacements:", mask_small.sum())

# 4) Floor extremely tiny numbers to zero
sub["new_house_transaction_amount"] = sub["new_house_transaction_amount"].where(sub["new_house_transaction_amount"] >= 1.0, 0.0)

# 5) Seasonality bump (small fraction of last-known train month per sector)
last_vals = m.sort_values(["sector","month"]).groupby("sector")["amount_new_house_transactions"].last().to_dict()
sub["last_val"] = sub["sector"].map(last_vals).fillna(0.0)
bump = 0.03
mask_bump = sub["last_val"] > 0
sub.loc[mask_bump, "new_house_transaction_amount"] = sub.loc[mask_bump, "new_house_transaction_amount"] * (1 + bump) + sub.loc[mask_bump, "last_val"] * (bump)

# 6) Smooth per-sector across months: need month ordering for test months
# map month_str to month_code same as earlier all_months mapping
sub["month_code"] = sub["month"].map(mo2i).fillna(len(mo2i)).astype(int)
sub = sub.sort_values(["sector","month_code"])
sub["smooth3"] = sub.groupby("sector")["new_house_transaction_amount"].transform(lambda s: s.rolling(3, min_periods=1, center=True).mean())
sub["new_house_transaction_amount"] = sub["smooth3"]

# 7) Final clip by quantiles to remove extremes
q1, q99 = np.nanpercentile(sub["new_house_transaction_amount"].clip(0), [1,99])
sub["new_house_transaction_amount"] = sub["new_house_transaction_amount"].clip(lower=q1*0.5, upper=q99*1.5)
sub["new_house_transaction_amount"] = sub["new_house_transaction_amount"].where(sub["new_house_transaction_amount"] >= 1.0, 0.0)

# 8) Keep only required columns and save
final_sub = sub[["id","new_house_transaction_amount"]].copy()
out_file = os.path.join(OUT_DIR, "submission_v8.csv")
final_sub.to_csv(out_file, index=False)
print("Saved submission:", out_file)

# ---------- Quick diagnostics (OOF direct) ----------
try:
    overall_oof_score = two_stage_score(true_train, oof_wgme)
    print("OOF (WGME) two-stage score on training:", overall_oof_score)
    print("Median true amount (train):", median_true, "Median oof (before scale):", median_oof)
except Exception as e:
    print("OOF diagnostics unavailable:", e)

print("Done. Upload submission_v8.csv to Kaggle.")


INPUT_DIR: /kaggle/input/china-real-estate-demand-prediction OUT_DIR: /kaggle/working LightGBM available: True
Loading base datasets...
Combining train+test for feature creation (important for valid test lags)...
Creating lag/rolling and derived features on combined data...
Final feature count: 201
Created folds: 5
Training LightGBM variants with rolling CV (this may take some minutes)...

=== Variant 0 params: {'num_leaves': 64, 'learning_rate': 0.03, 'min_data_in_leaf': 20} ===
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[190]	valid_0's l1: 0.0312626
  fold0 score: 0.9761
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[213]	valid_0's l1: 0.0183161
  fold1 score: 0.9822
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[304]	valid_0's l1: 0.0102184
  fold2 score: 0.9897
Training until validation scores don't improve for 80 rounds
Early stop