# Advanced Real Estate Demand Prediction: An End-to-End Kaggle Workflow

This document breaks down an advanced, single-script solution for a time-series regression competition, likely the "China Real Estate Demand Prediction" on Kaggle. The goal is to predict the `new_house_transaction_amount`.

The workflow is sophisticated and demonstrates a professional approach, including:
* **Extensive Feature Engineering:** Creating time-based, lag, and rolling features.
* **Robust Cross-Validation:** Using a time-based rolling split appropriate for time-series data.
* **Multi-Model Strategy:** Training multiple high-performance models like LightGBM, XGBoost, and CatBoost.
* **Multi-Target Approach:** Building separate models for `price` and `area` in addition to a model for the final `amount`.
* **Advanced Ensembling:** Combining predictions using a Weighted Geometric Mean and a final meta-blend.
* **Multi-Stage Post-Processing:** Applying a series of heuristic rules to refine the final predictions and maximize the competition score.


## 1. Imports and Configuration
Sets up all necessary libraries and global parameters


In [1]:
import os, sys, math, warnings, time
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Try boosters (optional)
try:
    import lightgbm as lgb
    LGB = True
except Exception:
    LGB = False
    print("LightGBM not available")

try:
    import xgboost as xgb
    XGB = True
except Exception:
    XGB = False
    print("XGBoost not available")

try:
    from catboost import CatBoostRegressor, Pool
    CAT = True
except Exception:
    CAT = False
    print("CatBoost not available")

# Global settings
SEED = 42
np.random.seed(SEED)

# Paths configuration
# KAGGLE_INPUT = "/kaggle/input/china-real-estate-demand-prediction"
# ALT_INPUT = "/mnt/data"

# if os.path.exists(KAGGLE_INPUT):
#     INPUT_DIR = KAGGLE_INPUT
# elif os.path.exists(ALT_INPUT):
#     INPUT_DIR = ALT_INPUT
# else:
#     raise FileNotFoundError("Can't find dataset directory")

INPUT_DIR = "data"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

print(f"Input directory: {INPUT_DIR}")
print(f"Output directory: {OUT_DIR}")
print(f"Available boosters - LGB: {LGB}, XGB: {XGB}, CAT: {CAT}")

Input directory: data
Output directory: outputs
Available boosters - LGB: True, XGB: True, CAT: True


## 2. Helper Functions

Core utility functions for evaluation, data loading, and ensemble calculations

In [2]:
def two_stage_score(y_true, y_pred):
    """
    Custom two-stage evaluation metric:
    - Stage 1: Check if more than 30% of predictions have APE > 1.0
    - Stage 2: Calculate MAPE on predictions with APE <= 1.0
    """
    eps = 1e-12
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    ape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), eps)
    frac_bad = np.mean(ape > 1.0)
    
    if frac_bad > 0.3: 
        return 0.0
    
    mask = (ape <= 1.0)
    if mask.sum() == 0:
        return 0.0
    
    mape = np.mean(ape[mask])
    return 1.0 - (mape / max(1 - frac_bad, 1e-12))


def evaluate_preds(y_true, y_pred, name="eval"):
    """Evaluate predictions using both two-stage score and MAE"""
    s = two_stage_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{name} | two-stage: {s:.6f} | MAE: {mae:.2f}")
    return {"score": s, "mae": mae}


def safe_read(path):
    """Safely read CSV file, return None if doesn't exist"""
    return pd.read_csv(path) if os.path.exists(path) else None


def align_test_with_features(test_df, features):
    """Ensure test dataframe has all required features"""
    test_a = test_df.copy()
    for c in features:
        if c not in test_a.columns:
            test_a[c] = 0.0
    return test_a[features].fillna(0)


def wgeom(preds, weights, eps=1e-9):
    """
    Weighted Geometric Mean Ensemble (WGME)
    preds: array (n_models, n_samples)
    weights: (n_models,) -> normalized
    Returns: exp(sum(w_i * log(pred_i + eps)))
    """
    logs = np.log(np.clip(preds, eps, None))
    return np.exp(np.dot(weights, logs))


def safe_div(a, b):
    """Safe division avoiding divide by zero"""
    return np.where((b == 0) | (np.isnan(b)), 0.0, a / (b + 1e-9))


def merge_safe(base, other, prefix):
    """Safely merge dataframes with column prefixing"""
    if other is None: 
        return base
    o = other.copy()
    for c in o.columns:
        if c not in ["month", "sector"]:
            o = o.rename(columns={c: f"{prefix}_{c}"})
    return base.merge(o, on=["month", "sector"], how="left")

## 3. Data Loading

Load all training and test datasets

In [3]:
print("Loading base datasets...")

# Load training datasets
train_new = safe_read(os.path.join(INPUT_DIR, "train/new_house_transactions.csv"))
train_pre = safe_read(os.path.join(INPUT_DIR, "train/pre_owned_house_transactions.csv"))
train_land = safe_read(os.path.join(INPUT_DIR, "train/land_transactions.csv"))
train_new_near = safe_read(os.path.join(INPUT_DIR, "train/new_house_transactions_nearby_sectors.csv"))
train_pre_near = safe_read(os.path.join(INPUT_DIR, "train/pre_owned_house_transactions_nearby_sectors.csv"))
train_land_near = safe_read(os.path.join(INPUT_DIR, "train/land_transactions_nearby_sectors.csv"))
poi = safe_read(os.path.join(INPUT_DIR, "train/sector_POI.csv"))
city_idx = safe_read(os.path.join(INPUT_DIR, "train/city_indexes.csv"))

# Load test data
test = safe_read(os.path.join(INPUT_DIR, "test.csv"))

if train_new is None or test is None:
    raise RuntimeError("Missing essential files; check INPUT_DIR path")

print(f"✓ Loaded train_new: {train_new.shape}")
print(f"✓ Loaded test: {test.shape}")
print(f"✓ Additional datasets loaded: {sum([x is not None for x in [train_pre, train_land, poi, city_idx]])}")

# Fix test data if sector/month columns are missing
if "sector" not in test.columns:
    test["sector"] = test["id"].str.extract(r"sector\s*(\d+)").astype(float)
    print("✓ Extracted sector from test IDs")

if "month" not in test.columns:
    test["month"] = test["id"].str.extract(r"(\d{4}\s+\w+)")
    print("✓ Extracted month from test IDs")

Loading base datasets...
✓ Loaded train_new: (5433, 11)
✓ Loaded test: (1152, 2)
✓ Additional datasets loaded: 4
✓ Extracted sector from test IDs
✓ Extracted month from test IDs


## 4. Data Merging

Merge all datasets into a master training table

In [4]:
print("Merging datasets into master table...")

# Start with new house transactions as base
m = train_new.copy()

# Merge pre-owned house data
m = merge_safe(m, train_pre, "pre")
print(f"✓ Merged pre-owned data. Shape: {m.shape}")

# Merge land transactions
m = merge_safe(m, train_land, "land")
print(f"✓ Merged land data. Shape: {m.shape}")

# Merge nearby sector data
m = merge_safe(m, train_new_near, "newnear")
m = merge_safe(m, train_pre_near, "prenear")
m = merge_safe(m, train_land_near, "landnear")
print(f"✓ Merged nearby sector data. Shape: {m.shape}")

# Merge POI data
if poi is not None:
    try:
        poi["sector"] = poi["sector"].astype(m["sector"].dtype)
    except Exception:
        poi["sector"] = poi["sector"].astype(str)
        m["sector"] = m["sector"].astype(str)
    m = m.merge(poi, on="sector", how="left")
    print(f"✓ Merged POI data. Shape: {m.shape}")

# Merge city indexes
if city_idx is not None:
    if "month" in city_idx.columns:
        m = m.merge(city_idx, on="month", how="left")
        print(f"✓ Merged city indexes. Shape: {m.shape}")

# Fill missing values
m = m.fillna(0)

# Compute amount if not present
if "amount_new_house_transactions" not in m.columns:
    if "area_new_house_transactions" in m.columns and "price_new_house_transactions" in m.columns:
        m["amount_new_house_transactions"] = m["area_new_house_transactions"] * m["price_new_house_transactions"]
        print("✓ Computed amount from area × price")
    else:
        m["amount_new_house_transactions"] = 0.0
        print("⚠ Warning: Could not compute amount")

print(f"Final merged table shape: {m.shape}")
print(f"Columns: {len(m.columns)}")

Merging datasets into master table...
✓ Merged pre-owned data. Shape: (5433, 15)
✓ Merged land data. Shape: (5433, 19)
✓ Merged nearby sector data. Shape: (5433, 36)
✓ Merged POI data. Shape: (5433, 177)
Final merged table shape: (5433, 177)
Columns: 177


## 5. Feature Engineering
Create time-based, lag, rolling, and interaction features


In [5]:
print("Feature engineering...")

# ===== Time Features =====
m["month_str"] = m["month"].astype(str)
months = sorted(m["month_str"].unique().tolist())
mo2i = {mth: i for i, mth in enumerate(months)}
m["month_code"] = m["month_str"].map(mo2i).fillna(-1).astype(int)

# Cyclical encoding for seasonality
m["month_sin"] = np.sin(2 * np.pi * m["month_code"] / 12)
m["month_cos"] = np.cos(2 * np.pi * m["month_code"] / 12)
print(f"✓ Created time features (month_code, sin, cos)")


# ===== Lag and Rolling Features =====
def add_lags_rolls(df, group="sector", 
                   cols=["amount_new_house_transactions",
                         "area_new_house_transactions",
                         "num_new_house_transactions"], 
                   lags=[1, 2, 3], rolls=[3]):
    """Add lag and rolling window features"""
    df = df.sort_values([group, "month"]).reset_index(drop=True)
    
    for c in cols:
        if c in df.columns:
            # Lag features
            for lag in lags:
                df[f"{c}_lag{lag}"] = df.groupby(group)[c].shift(lag).fillna(0)
            
            # Rolling mean features
            for r in rolls:
                df[f"{c}_roll{r}"] = df.groupby(group)[c].rolling(
                    r, min_periods=1).mean().reset_index(level=0, drop=True)
            
            # Last non-zero value
            df[f"{c}_last"] = df.groupby(group)[c].shift(1).fillna(0)
    
    return df

m = add_lags_rolls(m)
print(f"✓ Created lag and rolling features")


# ===== Exponential Weighted Mean =====
m["amount_ewm_3"] = m.groupby("sector")["amount_new_house_transactions"].transform(
    lambda s: s.ewm(span=3, adjust=False).mean()).fillna(0)
m["amount_ewm_6"] = m.groupby("sector")["amount_new_house_transactions"].transform(
    lambda s: s.ewm(span=6, adjust=False).mean()).fillna(0)
print(f"✓ Created EWM features")


# ===== Interaction Features =====
if "price_new_house_transactions" in m.columns and "area_new_house_transactions" in m.columns:
    m["price_area_ratio"] = safe_div(
        m["price_new_house_transactions"], 
        m["area_new_house_transactions"])
    print(f"✓ Created price/area ratio")
else:
    m["price_area_ratio"] = 0.0

# Fill any remaining NaN values
m = m.fillna(0)

print(f"Feature engineering complete. Total columns: {len(m.columns)}")

Feature engineering...
✓ Created time features (month_code, sin, cos)
✓ Created lag and rolling features
✓ Created EWM features
✓ Created price/area ratio
Feature engineering complete. Total columns: 199


## 6. Feature Selection
Build final feature list for modeling


In [6]:
print("Building feature list...")

# Columns to exclude from features
exclude = {
    "month", "month_str", "sector", "id", 
    "amount_new_house_transactions", "y_log1p"
}

# Select numeric features only
features = [c for c in m.select_dtypes(include=[np.number]).columns 
            if c not in exclude]

# Add sector encoding
m["sector_code"] = pd.factorize(m["sector"].astype(str))[0]

# Prioritize important features at the beginning
if "month_code" in features:
    features = ["month_code"] + [f for f in features if f != "month_code"]

features = ["month_code", "sector_code"] + [
    f for f in features if f not in ("month_code", "sector_code")
]

# Remove duplicates and ensure all features exist
features = list(dict.fromkeys(features))
features = [f for f in features if f in m.columns]

print(f"✓ Feature count: {len(features)}")
print(f"✓ First 10 features: {features[:10]}")
print(f"✓ Last 10 features: {features[-10:]}")

# Verify data types
print("\nFeature data types summary:")
print(m[features].dtypes.value_counts())

Building feature list...
✓ Feature count: 196
✓ First 10 features: ['month_code', 'sector_code', 'num_new_house_transactions', 'area_new_house_transactions', 'price_new_house_transactions', 'area_per_unit_new_house_transactions', 'total_price_per_unit_new_house_transactions', 'num_new_house_available_for_sale', 'area_new_house_available_for_sale', 'period_new_house_sell_through']
✓ Last 10 features: ['area_new_house_transactions_roll3', 'area_new_house_transactions_last', 'num_new_house_transactions_lag1', 'num_new_house_transactions_lag2', 'num_new_house_transactions_lag3', 'num_new_house_transactions_roll3', 'num_new_house_transactions_last', 'amount_ewm_3', 'amount_ewm_6', 'price_area_ratio']

Feature data types summary:
float64    190
int64        6
Name: count, dtype: int64


## 7. Cross-Validation Setup
Create time-based rolling validation folds

In [7]:
print("Creating time-based cross-validation folds...")

# Get all unique months sorted
all_months = months
n_months = len(all_months)

# Create rolling validation folds
N_FOLDS = 5
folds = []

for i in range(N_FOLDS):
    # Progressive training window
    train_end = int((i + 1) * n_months / (N_FOLDS + 1))
    val_start = train_end
    val_end = min(train_end + max(1, n_months // (N_FOLDS + 1)), n_months)
    
    # Skip if insufficient data
    if train_end == 0 or val_start >= val_end: 
        continue
    
    train_months = all_months[:train_end]
    val_months = all_months[val_start:val_end]
    
    folds.append((train_months, val_months))
    
    print(f"Fold {i}: Train months {len(train_months)}, Val months {len(val_months)}")

print(f"\n✓ Created {len(folds)} validation folds")

# Display fold details
print("\nFold Details:")
for i, (tr, val) in enumerate(folds):
    print(f"  Fold {i}: Train={tr[0]} to {tr[-1]}, Val={val[0]} to {val[-1]}")

Creating time-based cross-validation folds...
Fold 0: Train months 11, Val months 11
Fold 1: Train months 22, Val months 11
Fold 2: Train months 33, Val months 11
Fold 3: Train months 44, Val months 11
Fold 4: Train months 55, Val months 11

✓ Created 5 validation folds

Fold Details:
  Fold 0: Train=2019-Apr to 2019-Oct, Val=2019-Sep to 2020-Nov
  Fold 1: Train=2019-Apr to 2020-Nov, Val=2020-Oct to 2021-May
  Fold 2: Train=2019-Apr to 2021-May, Val=2021-Nov to 2022-Mar
  Fold 3: Train=2019-Apr to 2022-Mar, Val=2022-May to 2023-Jun
  Fold 4: Train=2019-Apr to 2023-Jun, Val=2023-Mar to 2024-Mar


## 8. Model Training Function
Train multiple models (LGB/XGB/CAT) with rolling CV

In [8]:
def train_models_target(df, features, target_col, target_log=True, 
                       use_lgb=LGB, use_xgb=XGB, use_cat=CAT):
    """
    Train multiple models with rolling CV
    Returns: oof_preds, preds_stack, weights, pred_test_wgme, pred_test_arith, model_info
    """
    X = df[features].fillna(0)
    y = df[target_col].values
    
    # Log transform target if requested
    if target_log:
        y_log = np.log1p(np.clip(y, 0, None))
    else:
        y_log = y
    
    # Initialize storage
    oof_preds = np.zeros(len(df))
    test_preds_per_model = []
    model_info = []
    model_scores = []
    
    # Prepare test data
    test_df = test.copy()
    if "sector" not in test_df.columns:
        test_df["sector"] = test_df["id"].str.extract(r"sector\s*(\d+)").astype(float)
    if "month" not in test_df.columns:
        test_df["month"] = test_df["id"].str.extract(r"(\d{4}\s+\w+)")
    
    test_df["month_str"] = test_df["month"].astype(str)
    test_df["month_code"] = test_df["month_str"].map(mo2i).fillna(-1).astype(int)
    test_df["sector_code"] = pd.factorize(test_df["sector"].astype(str))[0]
    
    # Ensure test has all features
    test_X_all = test_df.copy()
    for c in features:
        if c not in test_X_all.columns:
            test_X_all[c] = 0.0
    X_test = test_X_all[features].fillna(0)
    
    # Train on each fold
    for fold_idx, (tr_months, vl_months) in enumerate(folds):
        print(f"Fold {fold_idx} | train months {len(tr_months)}, val months {len(vl_months)}")
        
        tr_mask = df["month_str"].isin(tr_months)
        vl_mask = df["month_str"].isin(vl_months)
        
        if tr_mask.sum() < 50 or vl_mask.sum() < 1:
            print("  Skipping fold due to too small data")
            continue
        
        X_tr, X_val = X.loc[tr_mask], X.loc[vl_mask]
        y_tr_log, y_val_log = y_log[tr_mask], y_log[vl_mask]
        y_val_orig = y[vl_mask]
        
        # Train LightGBM
        if use_lgb:
            lgb_params = {
                "objective": "regression",
                "metric": "mae",
                "learning_rate": 0.02,
                "num_leaves": 128,
                "feature_fraction": 0.8,
                "bagging_fraction": 0.8,
                "verbosity": -1,
                "seed": SEED
            }
            dtr = lgb.Dataset(X_tr, label=y_tr_log)
            dval = lgb.Dataset(X_val, label=y_val_log, reference=dtr)
            bst = lgb.train(
                    lgb_params,
                    dtr,
                    num_boost_round=1500,
                    valid_sets=[dval],
                    callbacks=[
                                lgb.early_stopping(stopping_rounds=80),
                                lgb.log_evaluation(period=0)  # disable per-iteration logging
                            ]
                    )

            p_val_log = bst.predict(X_val, num_iteration=bst.best_iteration)
            p_val = np.expm1(np.clip(p_val_log, -20, 50))
            p_test = np.expm1(np.clip(bst.predict(X_test, num_iteration=bst.best_iteration), -20, 50))
            
            oof_preds[vl_mask] = p_val
            test_preds_per_model.append(p_test)
            model_info.append(("lgb_fold", fold_idx, bst))
            sc = two_stage_score(y_val_orig, p_val)
            model_scores.append(("lgb_fold", fold_idx, sc))
            print(f"  LGB fold {fold_idx} score: {sc:.4f}")
        
        # Train XGBoost
        if use_xgb:
            try:
                dtr_x = xgb.DMatrix(X_tr, label=y_tr_log)
                dval_x = xgb.DMatrix(X_val, label=y_val_log)
                xgb_params = {
                    "objective": "reg:squarederror",
                    "eta": 0.02,
                    "max_depth": 7,
                    "subsample": 0.8,
                    "colsample_bytree": 0.8,
                    "seed": SEED
                }
                xbst = xgb.train(
                    xgb_params, dtr_x, num_boost_round=1200,
                    evals=[(dval_x, "val")], 
                    early_stopping_rounds=80, 
                    verbose_eval=False
                )
                p_val_log = xbst.predict(xgb.DMatrix(X_val))
                p_val = np.expm1(np.clip(p_val_log, -20, 50))
                p_test = np.expm1(np.clip(xbst.predict(xgb.DMatrix(X_test)), -20, 50))
                
                oof_preds[vl_mask] = (oof_preds[vl_mask] + p_val) / 2.0
                test_preds_per_model.append(p_test)
                model_info.append(("xgb_fold", fold_idx, xbst))
                sc = two_stage_score(y_val_orig, p_val)
                model_scores.append(("xgb_fold", fold_idx, sc))
                print(f"  XGB fold {fold_idx} score: {sc:.4f}")
            except Exception as e:
                print(f"  XGB skipped (error): {e}")
        
        # Train CatBoost
        if use_cat:
            try:
                cat = CatBoostRegressor(verbose=0, random_seed=SEED)
                cat.fit(X_tr, y_tr_log)
                p_val_log = cat.predict(X_val)
                p_val = np.expm1(np.clip(p_val_log, -20, 50))
                p_test = np.expm1(np.clip(cat.predict(X_test), -20, 50))
                
                oof_preds[vl_mask] = (oof_preds[vl_mask] + p_val) / 2.0
                test_preds_per_model.append(p_test)
                model_info.append(("cat_fold", fold_idx, cat))
                sc = two_stage_score(y_val_orig, p_val)
                model_scores.append(("cat_fold", fold_idx, sc))
                print(f"  CatBoost fold {fold_idx} score: {sc:.4f}")
            except Exception as e:
                print(f"  CatBoost skipped (error): {e}")
    
    # Aggregate test predictions
    if len(test_preds_per_model) == 0:
        raise RuntimeError(f"No models produced test predictions for target {target_col}")
    
    preds_stack = np.vstack(test_preds_per_model)
    
    # Compute weights based on CV scores
    model_scores_arr = np.array([s for (_, _, s) in model_scores], dtype=float)
    raw_weights = np.clip(model_scores_arr, 0.0, None)
    
    if raw_weights.sum() == 0:
        raw_weights = np.ones_like(raw_weights)
    
    # Apply weight decay
    decay = 0.98
    decay_factors = np.array([decay**i for i in range(len(raw_weights))])
    raw_weights = raw_weights * decay_factors
    weights = raw_weights / raw_weights.sum()
    
    # Compute weighted geometric mean
    pred_test_wgme = wgeom(preds_stack, weights)
    pred_test_arith = preds_stack.mean(axis=0)
    
    return oof_preds, preds_stack, weights, pred_test_wgme, pred_test_arith, model_info

## 9. Train Models
Train separate models for price, area, and direct amount


In [9]:
print("\n" + "="*60)
print("TRAINING MODELS")
print("="*60)

# Check if we can do price x area approach
target_price_col = "price_new_house_transactions"
target_area_col = "area_new_house_transactions"

do_price_area = (target_price_col in m.columns and target_area_col in m.columns)

if do_price_area:
    print(f"✓ Will train price and area models separately")
else:
    print(f"⚠ Missing price/area columns, using direct amount model only")

# ===== Train Price Model =====
if do_price_area:
    print("\n" + "-"*60)
    print("Training PRICE model...")
    print("-"*60)
    (oof_price, preds_stack_price, weights_price, 
     pred_price_wgme, pred_price_arith, info_price) = train_models_target(
        m, features, target_price_col, target_log=True
    )
    print(f"✓ Price model done. Models trained: {preds_stack_price.shape[0]}")
    print(f"  Weights: {weights_price.round(3)}")
else:
    preds_stack_price = None
    pred_price_wgme = None

# ===== Train Area Model =====
if do_price_area:
    print("\n" + "-"*60)
    print("Training AREA model...")
    print("-"*60)
    (oof_area, preds_stack_area, weights_area, 
     pred_area_wgme, pred_area_arith, info_area) = train_models_target(
        m, features, target_area_col, target_log=True
    )
    print(f"✓ Area model done. Models trained: {preds_stack_area.shape[0]}")
    print(f"  Weights: {weights_area.round(3)}")
else:
    preds_stack_area = None
    pred_area_wgme = None

# ===== Train Direct Amount Model =====
print("\n" + "-"*60)
print("Training DIRECT AMOUNT model...")
print("-"*60)
(oof_amt, preds_stack_amt, weights_amt, 
 pred_amt_wgme, pred_amt_arith, info_amt) = train_models_target(
    m, features, "amount_new_house_transactions", target_log=True
)
print(f"✓ Direct amount model done. Models trained: {preds_stack_amt.shape[0]}")
print(f"  Weights: {weights_amt.round(3)}")

print("\n" + "="*60)
print("ALL MODELS TRAINED")
print("="*60)


TRAINING MODELS
✓ Will train price and area models separately

------------------------------------------------------------
Training PRICE model...
------------------------------------------------------------
Fold 0 | train months 11, val months 11
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[839]	valid_0's l1: 0.015153
  LGB fold 0 score: 0.9839
  XGB fold 0 score: 0.9822
  CatBoost fold 0 score: 0.9680
Fold 1 | train months 22, val months 11
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[919]	valid_0's l1: 0.0162717
  LGB fold 1 score: 0.9839
  XGB fold 1 score: 0.9875
  CatBoost fold 1 score: 0.9826
Fold 2 | train months 33, val months 11
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[1458]	valid_0's l1: 0.0184729
  LGB fold 2 score: 0.9821
  XGB fold 2 score: 0.9874
  CatBoost fold 2 score: 0.9845
Fold 3 | train months

## 10. Combine Predictions

Create multiple prediction candidates and compute meta-ensemble

In [10]:
print("\n" + "="*60)
print("COMBINING PREDICTIONS")
print("="*60)

# Create candidate predictions
candidates = {}

# Candidate 1: Price x Area (if available)
if do_price_area and pred_price_wgme is not None and pred_area_wgme is not None:
    pred_amount_from_price_area = pred_price_wgme * pred_area_wgme
    candidates["pricexarea_wgme"] = pred_amount_from_price_area
    print(f"✓ Created candidate: price×area WGME")

# Candidate 2: Direct amount WGME
candidates["direct_amt_wgme"] = pred_amt_wgme
print(f"✓ Created candidate: direct amount WGME")

# Candidate 3: Blend of price×area and direct amount
if "pricexarea_wgme" in candidates:
    candidates["blend_arith_avg"] = 0.5 * pred_amt_wgme + 0.5 * pred_amount_from_price_area
    print(f"✓ Created candidate: arithmetic blend (50/50)")

# Candidate 4: Simple arithmetic mean across all amount models
candidates["amt_arith_models_mean"] = preds_stack_amt.mean(axis=0)
print(f"✓ Created candidate: arithmetic mean of all amount models")

print(f"\nTotal candidates: {len(candidates)}")

# ===== Compute Meta-Weights Based on Stability =====
print("\n" + "-"*60)
print("Computing meta-weights based on prediction stability...")
print("-"*60)

cand_names = list(candidates.keys())
cand_stability = {}

for k, v in candidates.items():
    q1, q99 = np.nanpercentile(v, [1, 99])
    # Higher stability score = more stable predictions
    stability = 1.0 / (1.0 + (q99 / (q1 + 1e-9)))
    cand_stability[k] = stability
    
    mean_val = np.nanmean(v)
    std_val = np.nanstd(v)
    print(f"  {k}:")
    print(f"    Mean: {mean_val:,.2f}")
    print(f"    Std: {std_val:,.2f}")
    print(f"    Q1: {q1:,.2f}, Q99: {q99:,.2f}")
    print(f"    Stability: {stability:.4f}")

# Normalize stability scores into weights
raw_meta = np.array([cand_stability[k] for k in cand_names], dtype=float)
if raw_meta.sum() == 0:
    raw_meta = np.ones_like(raw_meta)

meta_weights_arr = raw_meta / raw_meta.sum()

print("\n" + "-"*60)
print("Meta-weights (based on stability):")
for name, weight in zip(cand_names, meta_weights_arr):
    print(f"  {name}: {weight:.4f}")
print("-"*60)

# ===== Create Final Ensemble Prediction =====
final_test_pred = np.zeros_like(list(candidates.values())[0])
for i, k in enumerate(cand_names):
    final_test_pred += meta_weights_arr[i] * candidates[k]

print(f"\n✓ Final ensemble prediction created")
print(f"  Mean: {np.mean(final_test_pred):,.2f}")
print(f"  Median: {np.median(final_test_pred):,.2f}")
print(f"  Min: {np.min(final_test_pred):,.2f}")
print(f"  Max: {np.max(final_test_pred):,.2f}")


COMBINING PREDICTIONS
✓ Created candidate: price×area WGME
✓ Created candidate: direct amount WGME
✓ Created candidate: arithmetic blend (50/50)
✓ Created candidate: arithmetic mean of all amount models

Total candidates: 4

------------------------------------------------------------
Computing meta-weights based on prediction stability...
------------------------------------------------------------
  pricexarea_wgme:
    Mean: 720,955.85
    Std: 7,609.27
    Q1: 711,189.87, Q99: 736,353.76
    Stability: 0.4913
  direct_amt_wgme:
    Mean: 103.58
    Std: 0.45
    Q1: 103.14, Q99: 105.00
    Stability: 0.4955
  blend_arith_avg:
    Mean: 360,529.72
    Std: 3,804.85
    Q1: 355,646.55, Q99: 368,229.38
    Stability: 0.4913
  amt_arith_models_mean:
    Mean: 108.19
    Std: 0.50
    Q1: 107.75, Q99: 109.86
    Stability: 0.4952

------------------------------------------------------------
Meta-weights (based on stability):
  pricexarea_wgme: 0.2490
  direct_amt_wgme: 0.2511
  blend_a

## 11. Post-Processing
Apply multiple refinement steps to improve predictions

In [11]:
print("\n" + "="*60)
print("POST-PROCESSING")
print("="*60)

# Create submission dataframe
test_ids = test["id"].values
sub = pd.DataFrame({
    "id": test_ids, 
    "new_house_transaction_amount": final_test_pred
})

print(f"Initial predictions - Mean: {sub['new_house_transaction_amount'].mean():,.2f}")

# ===== Step 1: Unit Check and Scaling =====
mean_pred = sub["new_house_transaction_amount"].mean()
if mean_pred > 1e5:
    print(f"\n✓ Step 1: Scaling down by 10,000 (detected large mean: {mean_pred:,.0f})")
    sub["new_house_transaction_amount"] /= 10000.0
else:
    print(f"\n✓ Step 1: No scaling needed (mean: {mean_pred:,.2f})")

# ===== Step 2: Floor Tiny Predictions =====
tiny_mask = sub["new_house_transaction_amount"] < 1.0
tiny_count = tiny_mask.sum()
sub.loc[tiny_mask, "new_house_transaction_amount"] = 0.0
print(f"✓ Step 2: Floored {tiny_count} tiny predictions to 0")

# ===== Step 3: Sector Median Fallback =====
if "amount_new_house_transactions" in m.columns:
    sector_median = m.groupby("sector")["amount_new_house_transactions"].median().to_dict()
    sub["sector"] = sub["id"].str.extract(r"sector\s*(\d+)")[0].astype(float)
    sub["sector_median"] = sub["sector"].map(sector_median)
    
    fallback_mask = (sub["new_house_transaction_amount"] == 0.0) & (sub["sector_median"].notna())
    fallback_count = fallback_mask.sum()
    sub.loc[fallback_mask, "new_house_transaction_amount"] = sub.loc[fallback_mask, "sector_median"] * 0.8
    print(f"✓ Step 3: Applied sector median fallback to {fallback_count} rows")
else:
    print(f"⚠ Step 3: Skipped (no sector median available)")

# ===== Step 4: Seasonality Bump =====
last_vals = m.sort_values(["sector", "month_str"]).groupby("sector")["amount_new_house_transactions"].last().to_dict()
sub["last_val"] = sub["sector"].map(last_vals)

seasonal_bump_factor = 0.05
mask_bump = sub["last_val"].notna()
bump_count = mask_bump.sum()

sub.loc[mask_bump, "new_house_transaction_amount"] = (
    sub.loc[mask_bump, "new_house_transaction_amount"] * (1 + seasonal_bump_factor) + 
    sub.loc[mask_bump, "last_val"] * seasonal_bump_factor
)
print(f"✓ Step 4: Applied seasonality bump to {bump_count} rows (factor: {seasonal_bump_factor})")

# ===== Step 5: Sector-Level Smoothing =====
sub["month"] = sub["id"].str.extract(r"(\d{4}\s+\w+)")[0]
sub["sector_int"] = sub["sector"].astype(int)
sub = sub.sort_values(["sector_int", "month"])

sub["smooth3"] = sub.groupby("sector_int")["new_house_transaction_amount"].transform(
    lambda s: s.rolling(3, min_periods=1, center=True).mean()
)
sub["new_house_transaction_amount"] = np.clip(sub["smooth3"], 0, None)
print(f"✓ Step 5: Applied 3-month centered rolling smoothing per sector")

# ===== Step 6: Quantile-Based Clipping =====
q1, q99 = np.nanpercentile(sub["new_house_transaction_amount"].clip(0), [1, 99])
before_clip = sub["new_house_transaction_amount"].copy()
sub["new_house_transaction_amount"] = sub["new_house_transaction_amount"].clip(
    lower=q1 * 0.5, 
    upper=q99 * 1.5
)
clip_count = (before_clip != sub["new_house_transaction_amount"]).sum()
print(f"✓ Step 6: Clipped {clip_count} outliers (Q1={q1:.2f}, Q99={q99:.2f})")

# ===== Step 7: Final Safety Floor =====
final_floor_mask = sub["new_house_transaction_amount"] < 1.0
final_floor_count = final_floor_mask.sum()
sub.loc[final_floor_mask, "new_house_transaction_amount"] = 0.0
print(f"✓ Step 7: Applied final safety floor to {final_floor_count} rows")

# Final statistics
print("\n" + "-"*60)
print("Final prediction statistics:")
print(f"  Mean: {sub['new_house_transaction_amount'].mean():,.2f}")
print(f"  Median: {sub['new_house_transaction_amount'].median():,.2f}")
print(f"  Std: {sub['new_house_transaction_amount'].std():,.2f}")
print(f"  Min: {sub['new_house_transaction_amount'].min():,.2f}")
print(f"  Max: {sub['new_house_transaction_amount'].max():,.2f}")
print(f"  Zeros: {(sub['new_house_transaction_amount'] == 0).sum()}")
print("-"*60)


POST-PROCESSING
Initial predictions - Mean: 269,318.05

✓ Step 1: Scaling down by 10,000 (detected large mean: 269,318)
✓ Step 2: Floored 0 tiny predictions to 0
✓ Step 3: Applied sector median fallback to 0 rows
✓ Step 4: Applied seasonality bump to 0 rows (factor: 0.05)
✓ Step 5: Applied 3-month centered rolling smoothing per sector
✓ Step 6: Clipped 0 outliers (Q1=26.57, Q99=27.51)
✓ Step 7: Applied final safety floor to 0 rows

------------------------------------------------------------
Final prediction statistics:
  Mean: 26.93
  Median: 26.91
  Std: 0.28
  Min: 26.57
  Max: 27.71
  Zeros: 0
------------------------------------------------------------


## 12. Save Results and Diagonstics
Create submission file and display OOF performance

In [12]:
print("\n" + "="*60)
print("SAVING RESULTS")
print("="*60)

# Keep only required columns
final_sub = sub[["id", "new_house_transaction_amount"]].copy()

# Save submission file
out_path = os.path.join(OUT_DIR, "submission_v7_ensemble.csv")
final_sub.to_csv(out_path, index=False)
print(f"✓ Saved submission to: {out_path}")
print(f"  Rows: {len(final_sub)}")
print(f"  Columns: {list(final_sub.columns)}")

# Display first few rows
print("\nFirst 10 rows of submission:")
print(final_sub.head(10))

print("\nLast 10 rows of submission:")
print(final_sub.tail(10))

# ===== OOF Diagnostics =====
print("\n" + "="*60)
print("OUT-OF-FOLD DIAGNOSTICS")
print("="*60)

try:
    if 'oof_amt' in globals() and "amount_new_house_transactions" in m.columns:
        oof_pred = oof_amt
        y_true = m["amount_new_house_transactions"].values
        
        # Compute two-stage score
        oof_score = two_stage_score(y_true, oof_pred)
        oof_mae = mean_absolute_error(y_true, oof_pred)
        
        print(f"\nDirect Amount Model OOF Performance:")
        print(f"  Two-Stage Score: {oof_score:.6f}")
        print(f"  MAE: {oof_mae:,.2f}")
        
        # Additional metrics
        ape = np.abs(oof_pred - y_true) / np.maximum(np.abs(y_true), 1e-12)
        frac_bad = np.mean(ape > 1.0)
        print(f"  Fraction with APE > 1.0: {frac_bad:.4f} ({frac_bad*100:.2f}%)")
        
        if frac_bad <= 0.3:
            mape_good = np.mean(ape[ape <= 1.0])
            print(f"  MAPE (on good predictions): {mape_good:.6f}")
        
        # Correlation
        corr = np.corrcoef(y_true, oof_pred)[0, 1]
        print(f"  Correlation: {corr:.4f}")
        
    else:
        print("⚠ OOF predictions not available")
        
except Exception as e:
    print(f"⚠ Could not compute OOF diagnostics: {e}")

# ===== Summary =====
print("\n" + "="*60)
print("PIPELINE COMPLETE")
print("="*60)
print("\nNext steps:")
print("  1. Upload 'submission_v7_ensemble.csv' to Kaggle")
print("  2. Check leaderboard score")
print("  3. Iterate on features/models if needed")
print("\n" + "="*60)


SAVING RESULTS
✓ Saved submission to: /kaggle/working/submission_v7_ensemble.csv
  Rows: 1152
  Columns: ['id', 'new_house_transaction_amount']

First 10 rows of submission:
                     id  new_house_transaction_amount
0     2024 Aug_sector 1                     27.707565
384   2024 Dec_sector 1                     27.707565
288   2024 Nov_sector 1                     27.707565
192   2024 Oct_sector 1                     27.707565
96    2024 Sep_sector 1                     27.707565
768   2025 Apr_sector 1                     27.707565
576   2025 Feb_sector 1                     27.707565
480   2025 Jan_sector 1                     27.707565
1056  2025 Jul_sector 1                     27.707565
960   2025 Jun_sector 1                     27.707565

Last 10 rows of submission:
                      id  new_house_transaction_amount
383   2024 Nov_sector 96                     26.641753
287   2024 Oct_sector 96                     26.641753
191   2024 Sep_sector 96             