## 1. Imports & Setup

In [1]:
import pandas as pd
import numpy as np
import os, re, warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from tqdm.notebook import tqdm

LGB_AVAILABLE, XGB_AVAILABLE = True, True
SEED = 42
np.random.seed(SEED)

# Helper save/load
def save_df(df, path):
    df.to_csv(path, index=False)
    print(f"💾 Saved: {path} ({df.shape})")

def evaluate_preds(y_true, y_pred, name=""):
    ape = np.abs((y_pred - y_true) / (y_true + 1e-6))
    frac_bad = np.mean(ape > 1)
    if frac_bad > 0.3:
        score = 0.0
    else:
        mape = np.mean(ape[ape <= 1])
        score = 1 - mape / max(1 - frac_bad, 1e-6)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{name} | two-stage: {score:.6f} | MAE: {mae:.4f} | frac_bad: {frac_bad:.4f}")
    return {"score": score, "mae": mae, "frac_bad": frac_bad}


## 2. Load Data

In [2]:
# Define local data path
BASE_DIR = os.getcwd()
DATA_PATH = os.path.join(BASE_DIR, "data")

# Load all major datasets
data = {
    "new_house": pd.read_csv(os.path.join(DATA_PATH, "train", "new_house_transactions.csv")),
    "new_house_near": pd.read_csv(os.path.join(DATA_PATH, "train", "new_house_transactions_nearby_sectors.csv")),
    "pre_owned": pd.read_csv(os.path.join(DATA_PATH, "train", "pre_owned_house_transactions.csv")),
    "land": pd.read_csv(os.path.join(DATA_PATH, "train", "land_transactions.csv")),
    "land_near": pd.read_csv(os.path.join(DATA_PATH, "train", "land_transactions_nearby_sectors.csv")),
    "poi": pd.read_csv(os.path.join(DATA_PATH, "train", "sector_POI.csv")),
    "city_index": pd.read_csv(os.path.join(DATA_PATH, "train", "city_indexes.csv")),
    "test": pd.read_csv(os.path.join(DATA_PATH, "test.csv")),
}

print("✅ Loaded datasets:")
for k, v in data.items():
    print(f"{k:20s} {v.shape}")


✅ Loaded datasets:
new_house            (5433, 11)
new_house_near       (5360, 11)
pre_owned            (5360, 6)
land                 (5896, 6)
land_near            (5025, 6)
poi                  (86, 142)
city_index           (7, 74)
test                 (1152, 2)


## 3. Clean & Merge Core Datasets

In [3]:
# Clean base dataframes
for k in ["new_house", "pre_owned", "land"]:
    df = data[k]
    df = df.drop_duplicates().fillna(0)
    data[k] = df

# Merge datasets on ['month','sector']
m = data["new_house"].merge(data["pre_owned"], on=["month","sector"], suffixes=("", "_pre"), how="left")
m = m.merge(data["land"], on=["month","sector"], suffixes=("", "_land"), how="left")
m = m.merge(data["poi"], on=["sector"], how="left")

# Target transformation
m["y_log1p"] = np.log1p(m["amount_new_house_transactions"])
m = m.fillna(0)
print("✅ Merged master table:", m.shape)


✅ Merged master table: (5433, 161)


## 4. Lag and Rolling Features

In [4]:
# Create temporal lags & rolling windows
def create_lag_features(df, group_col="sector", time_col="month", cols=None, lags=[1,2], rolls=[3]):
    out = df.copy()
    for col in cols:
        for lag in lags:
            out[f"{col}_lag{lag}"] = out.groupby(group_col)[col].shift(lag)
        for r in rolls:
            out[f"{col}_roll{r}"] = out.groupby(group_col)[col].transform(lambda x: x.rolling(r,1).mean())
    return out

lag_cols = ["amount_new_house_transactions","area_new_house_transactions","num_new_house_transactions"]
m = create_lag_features(m, cols=lag_cols)
m = m.fillna(0)
print("✅ Added lag & rolling features:", [f for f in m.columns if "lag" in f or "roll" in f])


✅ Added lag & rolling features: ['amount_new_house_transactions_lag1', 'amount_new_house_transactions_lag2', 'amount_new_house_transactions_roll3', 'area_new_house_transactions_lag1', 'area_new_house_transactions_lag2', 'area_new_house_transactions_roll3', 'num_new_house_transactions_lag1', 'num_new_house_transactions_lag2', 'num_new_house_transactions_roll3']


## 5. Add Ratio Features

In [6]:
# === Cell: Add key ratio/interaction features (auto-detect safe columns) ===

def safe_div(a, b):
    """Safe division that handles missing or zero denominators."""
    return np.where(b != 0, a / b, 0)

def get_col(df, possible_names):
    """Return first column name that exists from a list."""
    for name in possible_names:
        if name in df.columns:
            return name
    return None

# Auto-detect correct land column names
land_amount_col = get_col(m, ["land_transaction_amount", "land_transaction_amount_land"])
construction_col = get_col(m, ["construction_area", "construction_area_land"])

# Create ratio features robustly
m["price_area_ratio"] = safe_div(m.get("price_new_house_transactions", 0), m.get("area_new_house_transactions", 0) + 1)

if land_amount_col and construction_col:
    m["land_value_density"] = safe_div(m[land_amount_col], m[construction_col] + 1)
else:
    m["land_value_density"] = 0
    print("⚠️ land_value_density: skipped (columns not found)")

m["new_vs_pre_owned_price"] = safe_div(
    m.get("price_new_house_transactions", 0),
    m.get("price_pre_owned_house_transactions", 0) + 1
)

# Clean and validate
ratio_features = ["price_area_ratio", "land_value_density", "new_vs_pre_owned_price"]
m[ratio_features] = m[ratio_features].replace([np.inf, -np.inf], 0).fillna(0)

print("✅ Added ratio features:", ratio_features)


⚠️ land_value_density: skipped (columns not found)
✅ Added ratio features: ['price_area_ratio', 'land_value_density', 'new_vs_pre_owned_price']


## 6. Time Weighting

In [7]:
# Apply progressive month weighting (+10% per month)
months = sorted(m["month"].unique().tolist())
month_to_idx = {mo:i for i,mo in enumerate(months)}
m["month_weight"] = m["month"].map(lambda mo: 1.0 + 0.10 * month_to_idx[mo])
print(f"✅ Month weights range: {m['month_weight'].min():.2f}-{m['month_weight'].max():.2f}")


✅ Month weights range: 1.00-7.60


## 7. Feature Selection

In [8]:
# Collect numeric features (excluding target & obvious non-features)
exclude_cols = ["y_log1p", "amount_new_house_transactions", "month", "sector"]
features = [c for c in m.select_dtypes(include=[np.number]).columns if c not in exclude_cols]
print("✅ Final feature count:", len(features))


✅ Final feature count: 170


## 8. LightGBM + XGBoost Ensemble Training

In [9]:
# Safe test alignment
test_df = data["test"].copy()
missing_cols = [c for c in features if c not in test_df.columns]
extra_cols = [c for c in test_df.columns if c not in features]

for c in missing_cols:
    test_df[c] = 0
test_df = test_df.drop(columns=extra_cols, errors="ignore")
X_test = test_df[features].fillna(0)
print(f"✅ Test alignment: {len(features)} features used ({len(missing_cols)} added)")

# Prepare train
X = m[features].fillna(0)
y = m["amount_new_house_transactions"].values
y_log = m["y_log1p"].values

# Rolling folds
folds = []
n_months = len(months)
for i in range(5):
    train_end = int((i + 1) * n_months / 6)
    val_start = train_end
    val_end = min(train_end + n_months // 6, n_months)
    if train_end == 0 or val_start >= val_end:
        continue
    folds.append((months[:train_end], months[val_start:val_end]))
print("✅ Created folds:", len(folds))

# LightGBM params
lgb_params = {
    "objective": "regression",
    "metric": "mae",
    "learning_rate": 0.02,
    "num_leaves": 256,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.75,
    "bagging_fraction": 0.75,
    "lambda_l1": 0.5,
    "lambda_l2": 1.0,
    "verbosity": -1,
    "seed": SEED
}

oof_lgb, oof_xgb = np.zeros(len(X)), np.zeros(len(X))
test_lgb, test_xgb = [], []

for fold, (tr_mo, vl_mo) in enumerate(folds):
    print(f"\n=== Fold {fold} | train {len(tr_mo)} val {len(vl_mo)} ===")
    tr_idx = m["month"].isin(tr_mo)
    vl_idx = m["month"].isin(vl_mo)
    X_tr, X_val = X.loc[tr_idx], X.loc[vl_idx]
    y_tr, y_val = y_log[tr_idx], y[vl_idx]
    w_tr = m.loc[tr_idx, "month_weight"].values

    # LightGBM
    if LGB_AVAILABLE:
        dtr = lgb.Dataset(X_tr, label=y_tr, weight=w_tr)
        dval = lgb.Dataset(X_val, label=y_log[vl_idx])
        lgbm = lgb.train(lgb_params, dtr, num_boost_round=3000,
                         valid_sets=[dtr, dval],
                         callbacks=[lgb.early_stopping(stopping_rounds=100),
                                    lgb.log_evaluation(0)])
        p_val_lgb = np.expm1(np.clip(lgbm.predict(X_val, num_iteration=lgbm.best_iteration), -20, 50))
        p_test_lgb = np.expm1(np.clip(lgbm.predict(X_test, num_iteration=lgbm.best_iteration), -20, 50))
    else:
        p_val_lgb = np.zeros(len(X_val)); p_test_lgb = np.zeros(len(X_test))

    # XGBoost
    if XGB_AVAILABLE:
        dtr_x = xgb.DMatrix(X_tr, label=y_tr, weight=w_tr)
        dval_x = xgb.DMatrix(X_val, label=y_log[vl_idx])
        xgb_params = {
            "objective": "reg:squarederror",
            "eta": 0.02,
            "max_depth": 8,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "lambda": 1.0,
            "alpha": 0.5,
            "seed": SEED
        }
        xgbm = xgb.train(xgb_params, dtr_x, num_boost_round=2000,
                         evals=[(dval_x, "val")],
                         early_stopping_rounds=100, verbose_eval=False)
        p_val_xgb = np.expm1(np.clip(xgbm.predict(xgb.DMatrix(X_val)), -20, 50))
        p_test_xgb = np.expm1(np.clip(xgbm.predict(xgb.DMatrix(X_test)), -20, 50))
    else:
        p_val_xgb = np.zeros(len(X_val)); p_test_xgb = np.zeros(len(X_test))

    p_val_blend = 0.7 * p_val_lgb + 0.3 * p_val_xgb
    oof_lgb[vl_idx], oof_xgb[vl_idx] = p_val_lgb, p_val_xgb
    test_lgb.append(p_test_lgb)
    test_xgb.append(p_test_xgb)
    evaluate_preds(y[vl_idx], p_val_blend, name=f"fold{fold}_blend")

# Blend OOF/test
oof_blend = 0.7 * oof_lgb + 0.3 * oof_xgb
evaluate_preds(y, oof_blend, name="OOF_blend")
test_pred_blend = 0.7 * np.mean(test_lgb, axis=0) + 0.3 * np.mean(test_xgb, axis=0)


✅ Test alignment: 170 features used (170 added)
✅ Created folds: 5

=== Fold 0 | train 11 val 11 ===
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1889]	training's l1: 0.0371586	valid_1's l1: 0.142065
fold0_blend | two-stage: 0.871094 | MAE: 5587.8262 | frac_bad: 0.0057

=== Fold 1 | train 22 val 11 ===
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1043]	training's l1: 0.0313842	valid_1's l1: 0.081758
fold1_blend | two-stage: 0.926946 | MAE: 5210.5523 | frac_bad: 0.0011

=== Fold 2 | train 33 val 11 ===
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1989]	training's l1: 0.0160451	valid_1's l1: 0.0674053
fold2_blend | two-stage: 0.940072 | MAE: 2100.2712 | frac_bad: 0.0011

=== Fold 3 | train 44 val 11 ===
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1359]	training's l1: 0.0169726	v

## 9. Sector Smoothing & Submission

In [10]:
# Apply 3-month rolling smoothing by sector
test_original = data["test"].copy()
sub = pd.DataFrame({
    "id": test_original["id"],
    "new_house_transaction_amount": test_pred_blend
})
sub["month"] = sub["id"].str.extract(r"(\d{4} \w+)")[0]
sub["sector"] = sub["id"].str.extract(r"sector (\d+)")[0].astype(int)

sub = sub.sort_values(["sector","month"])
sub["smooth_pred"] = sub.groupby("sector")["new_house_transaction_amount"].transform(
    lambda s: s.rolling(window=3, min_periods=1, center=True).mean()
)
sub["new_house_transaction_amount"] = np.clip(sub["smooth_pred"], 0, None)
sub = sub[["id","new_house_transaction_amount"]]

save_df(sub, "submission_v3_1_ensemble_smooth.csv")
print("🏁 submission_v3_1_ensemble_smooth.csv ready for Kaggle upload!")


💾 Saved: submission_v3_1_ensemble_smooth.csv ((1152, 2))
🏁 submission_v3_1_ensemble_smooth.csv ready for Kaggle upload!
