## 1. Imports & Setup

In [1]:
import os, gc, warnings, math
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Optional gradient-boosting backends. Each flag records whether the package
# could be imported, so later cells can skip the model that is unavailable.
# Only import-related failures are caught (ImportError, plus OSError for a
# missing shared library); a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated bugs.
try:
    import lightgbm as lgb
    LGB = True
except (ImportError, OSError) as _lgb_err:
    LGB = False
    print("⚠️ LightGBM unavailable:", _lgb_err)
try:
    import xgboost as xgb
    XGB = True
except (ImportError, OSError) as _xgb_err:
    XGB = False
    print("⚠️ XGBoost unavailable:", _xgb_err)

# Reproducibility: one seed shared by numpy and the model parameters below.
SEED = 42
np.random.seed(SEED)

# Input / output locations; the output directory is created if it is missing.
INPUT_DIR, OUT_DIR = "data", "outputs/v5"
os.makedirs(OUT_DIR, exist_ok=True)

def save(df, name):
    """Write `df` as CSV (without the index) to OUT_DIR/`name` and return the path."""
    target = os.path.join(OUT_DIR, name)
    df.to_csv(target, index=False)
    print("💾 Saved:", target)
    return target

def two_stage_score(y_true, y_pred):
    """Two-stage score built on the absolute percentage error (APE).

    Stage 1: if more than 30% of the rows have APE > 1, the score is 0.
    Stage 2: otherwise the score is
        1 - mean(APE over rows with APE <= 1) / fraction of rows with APE <= 1.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground truth and predictions. They are converted to float arrays and
        compared position by position, so lists work and pandas objects are
        not aligned by index label.

    Returns
    -------
    float
        The score; 0.0 for empty input or when no row has APE <= 1.
    """
    # Positional float arrays: avoids list arithmetic errors and pandas
    # label alignment (which would produce NaN errors for differing indexes).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        return 0.0

    eps = 1e-12  # floor on the denominator so a zero target does not divide by zero
    ape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), eps)
    frac_bad = np.mean(ape > 1.0)
    if frac_bad > 0.3:
        return 0.0
    mask = (ape <= 1.0)
    if mask.sum() == 0:
        return 0.0
    mape = np.mean(ape[mask])
    return 1.0 - (mape / mask.mean())

def evaluate(y_true, y_pred, name="eval"):
    """Print and return the two-stage score and the MAE of `y_pred` against `y_true`.

    Returns a `(score, mae)` tuple; `name` only labels the printed line.
    """
    score = two_stage_score(y_true, y_pred)
    abs_err = mean_absolute_error(y_true, y_pred)
    print(f"{name}: score={score:.5f}, MAE={abs_err:.2f}")
    return score, abs_err

print("✅ Libraries loaded successfully.")


✅ Libraries loaded successfully.


## 2. Load base datasets

In [2]:
# Step 1: Load data
# Short dataset key -> CSV path relative to INPUT_DIR.
files = dict(
    new="train/new_house_transactions.csv",
    new_near="train/new_house_transactions_nearby_sectors.csv",
    pre="train/pre_owned_house_transactions.csv",
    pre_near="train/pre_owned_house_transactions_nearby_sectors.csv",
    land="train/land_transactions.csv",
    land_near="train/land_transactions_nearby_sectors.csv",
    poi="train/sector_POI.csv",
    test="test.csv",
)

# Read every file into `data` under its key and report its shape as it loads.
data = {}
for k in files:
    data[k] = pd.read_csv(os.path.join(INPUT_DIR, files[k]))
    print(f"{k}: {data[k].shape}")


new: (5433, 11)
new_near: (5360, 11)
pre: (5360, 6)
pre_near: (5427, 6)
land: (5896, 6)
land_near: (5025, 6)
poi: (86, 142)
test: (1152, 2)


## 3. Clean individual datasets

In [None]:
# Step 2: Cleaning
def clean_df(df):
    """Return a cleaned copy of `df`; the input frame is not modified.

    Steps:
      1. drop exact duplicate rows and renumber the index;
      2. in object columns, convert present values to stripped strings while
         keeping missing values missing;
      3. fill missing values in numeric columns with 0.
    """
    df = df.drop_duplicates().reset_index(drop=True)
    for c in df.select_dtypes(include=['object']).columns:
        col = df[c]
        # `astype(str)` alone would turn NaN/None into the literal strings
        # "nan"/"None"; `where` keeps the original missing marker instead.
        df[c] = col.where(col.isna(), col.astype(str).str.strip())
    for c in df.select_dtypes(include=[np.number]).columns:
        df[c] = df[c].fillna(0)
    return df

# Drop high-null columns in POI.
# This must run BEFORE clean_df: cleaning fills numeric gaps with 0, after
# which the null fraction of those columns is 0 and nothing would be dropped.
nulls = data["poi"].isna().mean()
drop_cols = nulls[nulls > 0.7].index
data["poi"] = data["poi"].drop(columns=drop_cols, errors="ignore")

for k in data:
    data[k] = clean_df(data[k])

# Any POI gaps that survived cleaning are treated as 0.
data["poi"] = data["poi"].fillna(0)

print("✅ Cleaned all datasets.")


## 4. Merge into modeling DataFrame

In [None]:
# Step 3: Merge datasets
# Start from an independent copy of the new-house table so the merges below
# never touch data["new"].
m = data["new"].copy(deep=True)

def merge_with_prefix(base, other, prefix):
    """Left-join `other` onto `base` on ("month", "sector").

    Every non-key column of `other` is renamed to f"{prefix}_{column}" before
    the join. `other` itself is not modified.

    Parameters
    ----------
    base : DataFrame holding "month" and "sector" columns.
    other : DataFrame with the same key columns, or None.
    prefix : str prepended (with an underscore) to the non-key columns.

    Returns
    -------
    `base` unchanged (same object) when `other` is None, else the merged frame.
    """
    if other is None:
        return base
    keys = ["month", "sector"]
    # One rename with a full mapping: renaming column by column in a loop would
    # rename twice when a column already starts with the prefix.
    mapping = {c: f"{prefix}_{c}" for c in other.columns if c not in keys}
    return base.merge(other.rename(columns=mapping), on=keys, how="left")

# Each auxiliary table is joined under a prefix equal to its key in `data`.
for _src in ("pre", "land", "new_near", "pre_near", "land_near"):
    m = merge_with_prefix(m, data[_src], _src)

# POI attributes are per sector (no month), so they join on "sector" only;
# rows without a match in any table end up with 0.
m = m.merge(data["poi"], on="sector", how="left").fillna(0)
print("✅ Merged modeling dataset:", m.shape)


## 5. Feature Engineering (lags, ratios, logs)

In [None]:
# Step 4: Feature engineering
# The lag/rolling features below rely on rows being in time order within each
# sector. Sorting raw month labels orders textual months alphabetically, so
# the labels are parsed to dates first when that is possible.
# NOTE(review): the month format is not visible here; if any label fails to
# parse (or the column is numeric) the plain ordering is used — confirm.
_month_key = None
if not pd.api.types.is_numeric_dtype(m["month"]):
    _parsed = pd.to_datetime(m["month"].astype(str), errors="coerce")
    if _parsed.notna().all():
        _month_key = _parsed

if _month_key is not None:
    m = (
        m.assign(_month_key=_month_key)
         .sort_values(["sector", "_month_key"])
         .drop(columns="_month_key")
         .reset_index(drop=True)
    )
else:
    m = m.sort_values(["sector","month"]).reset_index(drop=True)

def safe_div(a, b):
    """Element-wise a / b as a float ndarray, with 0 wherever b == 0.

    `a` and `b` may be scalars, arrays or pandas Series; they are broadcast
    against each other. The division is only evaluated where b != 0, so no
    epsilon is added to the denominator (an epsilon would bias every quotient
    and still divide by zero when b == -epsilon).
    """
    num, den = np.broadcast_arrays(np.asarray(a, dtype=float), np.asarray(b, dtype=float))
    out = np.zeros(num.shape, dtype=float)
    np.divide(num, den, out=out, where=(den != 0))
    return out

# Ratio features
def _first_present(frame, candidates):
    """Return the first column of `frame` whose name is in `candidates`, else 0."""
    for _name in candidates:
        if _name in frame.columns:
            return frame[_name]
    return 0

m["price_area_ratio"] = safe_div(m["price_new_house_transactions"], m["area_new_house_transactions"])

# merge_with_prefix names joined columns "<prefix>_<column>", so the land and
# pre-owned columns carry a "land_" / "pre_" prefix. The unprefixed names are
# tried first for backward compatibility; the prefixed ones are what the merge
# produces. If none exists the ratio falls back to 0, as before.
# NOTE(review): the raw column names of the land / pre-owned tables are not
# visible here — confirm the prefixed candidates against m.columns.
m["land_value_density"] = safe_div(
    _first_present(m, ["land_transaction_amount", "land_transaction_amount_land"]),
    _first_present(m, ["construction_area", "construction_area_land", "land_construction_area"]),
)
m["new_vs_pre_owned_price"] = safe_div(
    m["price_new_house_transactions"],
    _first_present(m, ["price_pre_owned_house_transactions", "pre_price_pre_owned_house_transactions"]),
)

# Lag features
# Both features look strictly backwards within a sector:
#   <col>_lag1  = value of the previous row,
#   <col>_roll3 = mean of up to three previous rows.
# The rolling window is applied to the shifted series so the current row
# (which for the amount column is the prediction target) is never part of its
# own feature. Rows without history get 0.
for col in ["amount_new_house_transactions","num_new_house_transactions","area_new_house_transactions"]:
    if col in m.columns:
        _by_sector = m.groupby("sector")[col]
        m[f"{col}_lag1"] = _by_sector.shift(1).fillna(0)
        m[f"{col}_roll3"] = _by_sector.transform(
            lambda s: s.shift(1).rolling(3, min_periods=1).mean()
        ).fillna(0)

# Log target
# Negative amounts are floored at 0 before the log1p transform.
_nonneg_amount = m["amount_new_house_transactions"].clip(lower=0)
m["y_log1p"] = np.log1p(_nonneg_amount)
print("✅ Feature engineering done. Shape:", m.shape)


## 6. Feature selection

In [None]:
# Step 5: Feature selection
# Rank numeric columns by absolute Pearson correlation with the target and
# keep the 150 strongest. The target and its log transform are excluded from
# the candidates — otherwise y_log1p would be selected as a feature for a
# model that predicts y_log1p. Columns whose correlation is undefined
# (e.g. constant columns) are dropped.
_target = "amount_new_house_transactions"
_excluded = {_target, "y_log1p"}
num_cols = [c for c in m.select_dtypes(include=[np.number]).columns if c not in _excluded]
corr = m[num_cols].corrwith(m[_target]).abs().dropna().sort_values(ascending=False)
top_feats = corr.index[:150].tolist()  # top 150
print("✅ Selected", len(top_feats), "features.")


## 7. Train ensemble (LightGBM + XGBoost)

In [None]:
# Step 6: Ensemble training
X = m[top_feats].fillna(0)
y_log = m["y_log1p"].values
y_true = m["amount_new_house_transactions"].values

# Load test
test = data["test"].copy()

# The test table carries none of the selected features. Filling them all with
# 0 would make every test row identical, so each test row instead receives the
# last feature values recorded in `m` for its sector (a persistence
# assumption). Sectors are matched on the number found in m["sector"] and in
# the "sector <n>" part of the test id; unmatched rows fall back to 0.
# NOTE(review): "last" means last in the current row order of `m`, which is
# assumed to be chronological within each sector — confirm.
_train_sector_num = m["sector"].astype(str).str.extract(r"(\d+)")[0]
_last_known = (
    m[top_feats]
    .assign(_sector_num=_train_sector_num)
    .dropna(subset=["_sector_num"])
    .groupby("_sector_num")
    .last()
)
_test_sector_num = test["id"].astype(str).str.extract(r"sector (\d+)")[0]
X_test = _last_known.reindex(_test_sector_num.values)[top_feats]
X_test.index = test.index
X_test = X_test.fillna(0)

# LightGBM
# Trained on the log1p target; predictions are mapped back with expm1.
# When LightGBM is not installed the prediction vector is all zeros.
if LGB:
    dtrain = lgb.Dataset(X, label=y_log)
    lgb_params = {
        "objective":"regression","metric":"mae",
        "learning_rate":0.02,"num_leaves":256,"feature_fraction":0.75,
        # bagging_fraction only takes effect together with a non-zero
        # bagging_freq (LightGBM leaves bagging disabled at the default 0).
        "bagging_fraction":0.75,"bagging_freq":1,
        "lambda_l1":0.5,"lambda_l2":1.0,
        "verbosity":-1,"seed":SEED
    }
    print("⚙️ Training LightGBM...")
    lgbm = lgb.train(lgb_params, dtrain, num_boost_round=2000)
    p_lgb = np.expm1(lgbm.predict(X_test))
else:
    p_lgb = np.zeros(len(X_test))

# XGBoost
# Same setup as the other model: fit on the log1p target, invert with expm1,
# and fall back to a zero vector when the package is not installed.
if not XGB:
    p_xgb = np.zeros(len(X_test))
else:
    print("⚙️ Training XGBoost...")
    dtrain_x = xgb.DMatrix(X, label=y_log)
    xgb_params = dict(
        objective="reg:squarederror",
        eta=0.02,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        seed=SEED,
    )
    xgbm = xgb.train(xgb_params, dtrain_x, num_boost_round=1500)
    _dtest_x = xgb.DMatrix(X_test)
    p_xgb = np.expm1(xgbm.predict(_dtest_x))

# Blend
# 70/30 LightGBM/XGBoost when both models were trained. If only one backend
# is installed its predictions are used as-is: mixing in the zero placeholder
# of the missing model would scale every prediction down by 0.7 or 0.3.
if LGB and XGB:
    final_pred = 0.7*p_lgb + 0.3*p_xgb
elif LGB:
    final_pred = p_lgb
elif XGB:
    final_pred = p_xgb
else:
    print("⚠️ Neither LightGBM nor XGBoost is available; predictions are all zero.")
    final_pred = 0.7*p_lgb + 0.3*p_xgb
print("✅ Model training done.")


## 8. Build raw submission (before post-processing)

In [None]:
# Step 7: Pre-fix submission
# Pair each test id with its blended prediction (row order follows `test`).
sub = test[["id"]].assign(new_house_transaction_amount=final_pred)
print("✅ Raw submission built.")


## 9. Smart Post-Processing & Submission

In [None]:
# Step 8: Smart post-processing to improve leaderboard score

print("🔧 Starting V5 Post-Processing")
sub = sub.copy()

# --- 1️⃣ Unit correction ---
# Heuristic: a mean prediction above 1e5 is taken to be on a scale 10,000x
# too large and is divided down.
mean_pred = sub["new_house_transaction_amount"].mean()
if not (mean_pred > 1e5):
    print("✅ Units look consistent (10,000 yuan scale confirmed).")
else:
    print(f"⚠️ Detected unit mismatch (mean={mean_pred:.1f}), dividing by 10,000.")
    sub["new_house_transaction_amount"] = sub["new_house_transaction_amount"] / 10000.0

# --- 2️⃣ Outlier clipping ---
# Bounds are half the 1st percentile and 1.5x the 99th percentile.
_pred = sub["new_house_transaction_amount"]
q_low, q_high = _pred.quantile(0.01), _pred.quantile(0.99)
sub["new_house_transaction_amount"] = _pred.clip(lower=q_low*0.5, upper=q_high*1.5)
print("✅ Clipped outlier predictions.")

# --- 3️⃣ Sector fallback for zeros ---
# Predictions below 1.0 are replaced by 80% of the sector's historical mean.
# The id yields the sector as a digit string ("sector 12" -> "12"), so the
# historical means are keyed by the digits found in m["sector"] as well;
# keying by the raw column values only matches if they are already bare
# digit strings.
if "amount_new_house_transactions" in m.columns:
    _train_sector_num = m["sector"].astype(str).str.extract(r"(\d+)")[0]
    sector_means = m["amount_new_house_transactions"].groupby(_train_sector_num).mean().to_dict()
    sub["sector"] = sub["id"].str.extract(r"sector (\d+)")[0]
    sub["sector_mean"] = sub["sector"].map(sector_means)
    mask = (sub["new_house_transaction_amount"] < 1.0) & (sub["sector_mean"].notna())
    sub.loc[mask, "new_house_transaction_amount"] = sub.loc[mask, "sector_mean"] * 0.8
    print(f"✅ Replaced {mask.sum()} zero predictions with sector means.")

# --- 4️⃣ Smooth per-sector over months ---
# The sector column may not exist yet (it is only created by the fallback step
# when that step runs), so derive it here if needed.
if "sector" not in sub.columns:
    sub["sector"] = sub["id"].str.extract(r"sector (\d+)")[0]
sub["month"] = sub["id"].str.extract(r"(\d{4} \w+)")[0]

# Chronological sort key: year plus a 3-letter month name or a month number
# taken from the id. Rows that cannot be parsed get NaT and are ordered by
# the raw month text instead.
# NOTE(review): assumes ids contain "<year> <month>" — confirm the format.
_ym = sub["id"].astype(str).str.extract(r"(\d{4})[ _-]?([A-Za-z]{3}|\d{1,2})")
_ym_text = _ym[0] + " " + _ym[1]
sub["_month_key"] = pd.to_datetime(_ym_text, format="%Y %b", errors="coerce").fillna(
    pd.to_datetime(_ym_text, format="%Y %m", errors="coerce")
)

_row_order = sub.index  # remembered so the file keeps the input row order
sub = sub.sort_values(["sector", "_month_key", "month"])
sub["smooth"] = sub.groupby("sector")["new_house_transaction_amount"].transform(lambda s: s.rolling(3,1,center=True).mean())
# Rows without a sector are skipped by the groupby and come back as NaN;
# they keep their unsmoothed prediction.
sub["new_house_transaction_amount"] = np.clip(
    sub["smooth"].fillna(sub["new_house_transaction_amount"]), 0, None
)
sub = sub.loc[_row_order]
print("✅ Smoothed predictions over months.")

# --- 5️⃣ Save submission ---
sub = sub[["id","new_house_transaction_amount"]]
save(sub, "submission_v5_fixed.csv")
print("🏁 Submission ready for Kaggle upload!")
