# E-Style Real Estate Price Prediction
Kaggle competition notebook featuring LightGBM with monotonic constraints, type-specific modeling, and RMSLE optimization.

## 1. Setup Libraries & Configuration
Import core libraries, fix random seeds, and define helpers for RMSLE tracking.

In [None]:
# If LightGBM is missing in your environment, uncomment the next line.
# %pip install -q lightgbm

import gc
import math
import random
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

# Notebook-wide configuration shared by the cells below.
SEED = 2025  # base random seed: passed to set_seed and used as run_segment_cv's default
N_SPLITS = 5  # default number of K-fold splits in run_segment_cv
TARGET_COL = "TradePrice"  # name of the regression target column
ID_COL = "Id"  # name of the row identifier column

# Display-only tweaks: show up to 200 columns and render floats with
# thousands separators and four decimals.
pd.set_option("display.max_columns", 200)
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")


def set_seed(seed: int = SEED) -> None:
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


def rmsle(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Both inputs are floored at zero before scoring so that negative values
    never reach the logarithm-based metric.
    """
    truth, estimate = (
        np.clip(values, a_min=0, a_max=None) for values in (y_true, y_pred)
    )
    squared_log_error = mean_squared_log_error(truth, estimate)
    return math.sqrt(squared_log_error)


def memory_info(df: pd.DataFrame) -> str:
    """Format the deep memory footprint of ``df`` in MB, e.g. ``"1.23 MB"``."""
    total_bytes = df.memory_usage(deep=True).sum()
    return "{:,.2f} MB".format(total_bytes / (1024 ** 2))


# Seed the RNGs once, as soon as this cell is executed.
set_seed(SEED)

## 2. Load Datasets
Read raw CSV files with consistent schema handling and sanity checks.

In [None]:
# Paths are resolved relative to the current working directory: inputs are
# read from ../input/<competition>, outputs are written to ../output.
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.parent / "input" / "estyle-community-competition-2025"
OUTPUT_DIR = BASE_DIR.parent / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

train_path = DATA_DIR / "train.csv"
test_path = DATA_DIR / "test.csv"
sample_submission_path = DATA_DIR / "sample_submission.csv"

# Fail fast, with a message naming the file, if any required CSV is missing
# (not only the training file).
for _label, _path in (
    ("train", train_path),
    ("test", test_path),
    ("sample submission", sample_submission_path),
):
    if not _path.exists():
        raise FileNotFoundError(f"Missing {_label} data at {_path}")

# low_memory=False makes pandas read each file in one pass instead of in
# chunks, so column dtypes are inferred from the whole column.
train_df = pd.read_csv(train_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)
sample_submission = pd.read_csv(sample_submission_path, low_memory=False)

print(f"Train shape: {train_df.shape}, memory: {memory_info(train_df)}")
print(f"Test shape:  {test_df.shape}, memory: {memory_info(test_df)}")
print(f"Sample submission shape: {sample_submission.shape}")

## 3. Basic Cleaning & Type Casting
Align numerical dtypes and ensure train/test columns match before feature work.

In [None]:
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column names have surrounding whitespace removed."""
    stripped_names = df.columns.str.strip()
    cleaned = df.copy()
    cleaned.columns = stripped_names
    return cleaned


def cast_boolean_columns(df: pd.DataFrame, bool_cols: List[str]) -> pd.DataFrame:
    """Return a copy of ``df`` with the listed indicator columns cast to nullable ``Int8``.

    Names in ``bool_cols`` that are not columns of ``df`` are ignored.
    """
    result = df.copy()
    for name in bool_cols:
        if name not in result.columns:
            continue
        converted = result[name].astype("Int8")
        result[name] = converted
    return result


def align_train_test(train: pd.DataFrame, test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Clean both frames the same way and report train-only columns.

    Column names are stripped and the known flag columns are cast to nullable
    integers. Columns found in train but not in test (the target aside) are
    printed; nothing is dropped or added.
    """
    flag_names = [
        "AreaIsGreaterFlag",
        "FrontageIsGreaterFlag",
        "TotalFloorAreaIsGreaterFlag",
        "PrewarBuilding",
    ]
    train_clean, test_clean = (
        cast_boolean_columns(standardize_columns(frame), flag_names)
        for frame in (train, test)
    )

    train_only = set(train_clean.columns).difference(test_clean.columns, {TARGET_COL})
    missing_in_test = sorted(train_only)
    if missing_in_test:
        print("Columns present in train but absent in test:", missing_in_test)
    return train_clean, test_clean


# Clean both frames, then print how many train columns there are of each dtype.
train_df, test_df = align_train_test(train_df, test_df)
print("Post-alignment train dtypes summary:\n", train_df.dtypes.value_counts())

## 4. Missing Value Handling & Flag Features
Add a binary `<column>_missing_flag` indicator for every column that has missing values in train or test, then fill gaps in categorical (object) columns with `'unknown'`.

In [None]:
# Work on copies so the aligned frames stay untouched.
train_work, test_work = train_df.copy(), test_df.copy()

# Per-column share of missing values in each frame, worst train columns first.
missing_summary = pd.DataFrame(
    {
        "train_missing_ratio": train_work.isna().mean(),
        "test_missing_ratio": test_work.isna().mean(),
    }
).sort_values("train_missing_ratio", ascending=False)

# Columns with at least one missing value in train or test, in summary order.
missing_columns = [
    col
    for col in missing_summary.index
    if any(
        col in frame.columns and frame[col].isna().any()
        for frame in (train_work, test_work)
    )
]

# Object-typed columns found in either frame, sorted by name.
categorical_columns = sorted(
    {
        *train_work.select_dtypes(include=["object"]).columns,
        *test_work.select_dtypes(include=["object"]).columns,
    }
)

def add_missing_indicators(df: pd.DataFrame, cols: List[str]) -> None:
    """Add an ``int8`` ``<col>_missing_flag`` column to ``df`` in place for each listed column.

    The flag is 1 where the source value is missing and 0 otherwise. Names
    that are not columns of ``df`` are skipped.
    """
    for name in cols:
        if name not in df.columns:
            continue
        is_missing = df[name].isna()
        df[f"{name}_missing_flag"] = is_missing.astype("int8")

def fill_categorical_unknown(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols: List[str]) -> None:
    """Replace missing values with ``"unknown"`` in the listed columns, in place.

    Each column is filled in whichever of the two frames contains it; names
    found in neither frame are ignored.
    """
    for col in cat_cols:
        for frame in (train_df, test_df):
            if col in frame.columns:
                frame[col] = frame[col].fillna("unknown")

def fill_numeric_with_median(train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    """Fill missing numeric values in both frames, in place, with train medians.

    Every numeric column of ``train_df`` except ``TARGET_COL`` is processed.
    The median is always taken from ``train_df`` and is also applied to the
    same column of ``test_df`` when it exists there. A column with no
    observed train value falls back to ``0.0``.
    """
    numeric_cols = sorted(set(train_df.select_dtypes(include=[np.number]).columns))
    for col in numeric_cols:
        if col == TARGET_COL:
            continue
        median_value = train_df[col].median()
        # pd.isna handles both float NaN and pd.NA. The latter is what an
        # all-missing nullable integer column returns, and np.isnan(pd.NA)
        # cannot be used in a boolean context.
        if pd.isna(median_value):
            median_value = 0.0
        train_df[col] = train_df[col].fillna(median_value)
        if col in test_df.columns:
            test_df[col] = test_df[col].fillna(median_value)

# Flags are created before any filling so they still see the original gaps.
add_missing_indicators(train_work, missing_columns)
add_missing_indicators(test_work, missing_columns)
fill_categorical_unknown(train_work, test_work, categorical_columns)
# Numeric imputation is not run at this point; fill_numeric_with_median is
# called further down, after add_domain_features.
# fill_numeric_with_median(train_work, test_work)

print("Missing indicators added for", len(missing_columns), "columns.")
# Show the 12 columns with the highest train missing ratio.
missing_summary.head(12)

In [None]:
# Display the working train columns, including the new *_missing_flag ones.
train_work.columns

## 7. Feature Engineering: DistrictName × BuildingYear Aggregations
Capture localized pricing signals with smoothed mean/median targets by district and construction year.

In [None]:
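# The function below is meant to be called inside the CV loop to prevent data leakage:
# aggregation features should be computed per fold from that fold's training rows only.
# NOTE(review): run_segment_cv, as defined in this notebook, does not call it - confirm.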

def build_district_buildyear_agg(
    train_source: pd.DataFrame,
    smoothing: float = 50.0,
) -> Dict[str, Dict[Tuple[str, int], float]]:
    """Aggregate the target by (DistrictName, rounded BuildingYear).

    Missing districts are grouped under ``"unknown"`` and missing building
    years under ``-1``.

    Args:
        train_source: Frame holding ``DistrictName``, ``BuildingYear`` and
            ``TARGET_COL``. To avoid leakage, pass only rows the model is
            allowed to learn from.
        smoothing: Pseudo-count that pulls each group mean towards the global
            mean; ``0`` disables smoothing.

    Returns:
        A dict with three lookup tables keyed by ``(district, year)``:
        ``"smoothed_mean"``, ``"median"`` and ``"count"``. It also holds the
        scalars ``"global_mean"`` and ``"global_median"`` for unseen keys.
    """
    helper = train_source[["DistrictName", "BuildingYear", TARGET_COL]].copy()
    # Cast to plain objects first: filling a pandas ``category`` column with a
    # label that is not one of its categories raises, and grouping by object
    # keys only yields combinations that actually occur.
    helper["DistrictName"] = helper["DistrictName"].astype(object).fillna("unknown")
    helper["BuildingYearGroup"] = helper["BuildingYear"].fillna(-1).round().astype(int)

    grouped = helper.groupby(["DistrictName", "BuildingYearGroup"])[TARGET_COL].agg(
        ["mean", "median", "count"]
    )
    global_mean = helper[TARGET_COL].mean()
    # Shrink group means towards the global mean; small groups shrink the most.
    grouped["smoothed_mean"] = (
        (grouped["mean"] * grouped["count"]) + (global_mean * smoothing)
    ) / (grouped["count"] + smoothing)

    return {
        "smoothed_mean": grouped["smoothed_mean"].to_dict(),
        "median": grouped["median"].to_dict(),
        "count": grouped["count"].to_dict(),
        "global_mean": global_mean,
        "global_median": helper[TARGET_COL].median(),
    }

# Aggregations are no longer applied in this cell; the stated plan is to build
# them per fold inside the CV loop so validation rows never contribute to them.
# NOTE(review): no call to build_district_buildyear_agg is visible in
# run_segment_cv - confirm the per-fold step exists before trusting this message.

print("Aggregation functions defined. Will be applied inside CV loop to prevent leakage.")

## 8. Categorical Encoding & Label Preparation
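Add domain features, fill numeric gaps with train medians, convert the remaining object columns to categorical dtype, and extract the target column. (The `log1p` target transform used for RMSLE is applied later, inside `run_segment_cv`.)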

In [None]:
def add_domain_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with hand-crafted numeric features.

    Each feature is added only when all of its source columns are present:

    * ``BuildingAge``: ``Year - BuildingYear`` floored at 0, set to missing
      where ``BuildingYear_missing_flag`` equals 1.
    * ``Area_log`` / ``TotalFloorArea_log``: ``log1p`` of the raw value.
    * ``FloorArea_to_Area``: ``TotalFloorArea / (Area + 1e-3)``.
    * ``Frontage_to_sqrtArea``: ``Frontage / (sqrt(Area) + 1e-3)``.
    * ``StationTimeRange``: max minus min time to the nearest station.

    No target-based aggregation features are created here.
    """
    df = df.copy()
    columns = set(df.columns)

    if {"Year", "BuildingYear"} <= columns:
        building_age = (df["Year"] - df["BuildingYear"]).clip(lower=0)
        flag_col = "BuildingYear_missing_flag"
        if flag_col in columns:
            # An age derived from an absent building year is meaningless.
            # mask() returns a NaN-capable result instead of relying on
            # in-place dtype upcasting of the column.
            building_age = building_age.mask(df[flag_col] == 1)
        df["BuildingAge"] = building_age
    if "Area" in columns:
        df["Area_log"] = np.log1p(df["Area"])
    if "TotalFloorArea" in columns:
        df["TotalFloorArea_log"] = np.log1p(df["TotalFloorArea"])
    # The ratio needs both columns; guarding on TotalFloorArea alone raises a
    # KeyError for frames without Area.
    if {"TotalFloorArea", "Area"} <= columns:
        # The small constant avoids division by zero for zero-area rows.
        df["FloorArea_to_Area"] = df["TotalFloorArea"] / (df["Area"] + 1e-3)
    if {"Frontage", "Area"} <= columns:
        df["Frontage_to_sqrtArea"] = df["Frontage"] / (np.sqrt(df["Area"]) + 1e-3)
    if {"MaxTimeToNearestStation", "MinTimeToNearestStation"} <= columns:
        df["StationTimeRange"] = df["MaxTimeToNearestStation"] - df["MinTimeToNearestStation"]

    return df


# add_domain_features returns copies: the train result is bound to a new name
# (train_filtered), while test_work is rebound to its enriched copy.
train_filtered = add_domain_features(train_work)
test_work = add_domain_features(test_work)

# Fill remaining numeric gaps in place, using medians computed on the train frame.
fill_numeric_with_median(train_filtered, test_work)

# Cast every object column found in either frame to pandas ``category`` dtype.
categorical_cols_final = sorted(
    set(train_filtered.select_dtypes(include=["object"]).columns)
    | set(test_work.select_dtypes(include=["object"]).columns)
)
for col in categorical_cols_final:
    if col in train_filtered.columns:
        train_filtered[col] = train_filtered[col].astype("category")
    if col in test_work.columns:
        test_work[col] = test_work[col].astype("category")

# Keep a raw-scale copy of the target column.
train_target = train_filtered[TARGET_COL].copy()
print("Categorical columns prepared:", len(categorical_cols_final))
print("Note: Aggregation features will be created inside CV loop (no leakage)")

## 9. Segment Datasets by Property Type
Split samples into `land only` and `with building` segments to train specialized models.

In [None]:
# Substrings that mark a property type as land; matched case-insensitively.
LAND_KEYWORDS = ["land", "土地", "宅地", "lot", "residential land", "commercial land"]


def detect_land_only(type_series: pd.Series) -> pd.Series:
    """Return a boolean Series that is True where the type label contains a land keyword.

    Values are converted to lowercase strings and tested against the
    alternation of ``LAND_KEYWORDS`` as a regular expression.

    NOTE(review): matching is by substring, so any label containing e.g.
    "land" or "lot" is flagged - confirm that labels describing land sold
    together with a building are not caught unintentionally.
    """
    keyword_regex = "|".join(LAND_KEYWORDS)
    normalized = type_series.astype(str).str.lower()
    return normalized.str.contains(keyword_regex, case=False, na=False)


# Tag each row with its segment: 1 = land keyword matched, 0 = everything else.
train_filtered["is_land_only"] = detect_land_only(train_filtered["Type"]).astype("int8")
test_work["is_land_only"] = detect_land_only(test_work["Type"]).astype("int8")

# Drop category levels that do not occur in the train frame's Type column.
train_filtered["Type"] = train_filtered["Type"].cat.remove_unused_categories()

# Display the share of each segment in the training data.
train_filtered["is_land_only"].value_counts(normalize=True).rename("share").to_frame("share")

## 10. LightGBM Monotonic Constraint Definitions
Enforce domain knowledge (e.g., larger area ⇒ higher price) via monotone constraints per feature.

In [None]:
# Direction of the monotone constraint per feature: 1 = prediction must not
# decrease as the feature grows. Features not listed here are unconstrained.
MONOTONIC_FEATURE_MAP = {
    "Area": 1,
    "Area_log": 1,
    "TotalFloorArea": 1,
    "TotalFloorArea_log": 1,
    "FloorArea_to_Area": 1,
}


def build_monotonic_constraints(feature_names: List[str]) -> str:
    """Build the monotone-constraint string for ``feature_names``.

    The result has one integer per feature, in input order, wrapped in
    parentheses and comma-separated (e.g. ``"(1,0,1)"``). Features missing
    from ``MONOTONIC_FEATURE_MAP`` get ``0``.
    """
    directions = (
        str(int(MONOTONIC_FEATURE_MAP.get(name, 0))) for name in feature_names
    )
    return f"({','.join(directions)})"


## 11. K-Fold Cross-Validation Workflow
Train LightGBM models per segment with RMSLE-focused validation and constraint-aware parameters.

In [None]:
# Columns never used as model inputs: the target, the row id, an aggregation
# count column, and the segment indicator. Names absent from a frame are
# simply never matched by get_feature_columns.
EXCLUDE_FEATURES = {TARGET_COL, ID_COL, "district_buildyear_price_count", "is_land_only"}

# Baseline LightGBM settings shared by every fold and segment. run_segment_cv
# copies this dict and adds monotone_constraints and random_state (and, for the
# full-data retrain, overrides n_estimators).
LIGHTGBM_PARAMS_BASE = {
    "objective": "regression",
    "metric": "rmse",  # evaluated on the log1p-transformed target built in run_segment_cv
    "learning_rate": 0.03,
    "n_estimators": 5000,  # upper bound; CV folds add an early-stopping callback
    "num_leaves": 128,
    "max_depth": -1,
    "subsample": 0.8,
    "subsample_freq": 1,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "min_child_samples": 50,
    "n_jobs": -1,
}


def get_feature_columns(df: pd.DataFrame) -> List[str]:
    """Return the columns of ``df`` not listed in ``EXCLUDE_FEATURES``, in frame order."""
    usable = filter(lambda name: name not in EXCLUDE_FEATURES, df.columns)
    return list(usable)


def run_segment_cv(
    train_segment: pd.DataFrame,
    test_segment: pd.DataFrame,
    segment_name: str,
    seed: int = SEED,
    n_splits: int = N_SPLITS,
    retrain_on_full: bool = True,
) -> Dict[str, object]:
    """Cross-validate LightGBM on one property segment and predict its test rows.

    The target is modelled as ``log1p(TARGET_COL)``. Predictions are mapped
    back with ``expm1`` and floored at zero before being scored with ``rmsle``.

    NOTE: no fold-wise target-aggregation features are built in this function;
    every fold trains on the columns already present in ``train_segment``.

    Args:
        train_segment: Training rows of the segment, including ``TARGET_COL``.
        test_segment: Test rows of the segment. It may be empty, in which case
            no test prediction (and no full-data retrain) is performed.
        segment_name: Label used in log output and result names.
        seed: Seed for the K-fold shuffle; fold ``k`` trains with ``seed + k``.
        n_splits: Number of K-fold splits.
        retrain_on_full: If True, test predictions come from one model refit
            on all training rows for the average best iteration plus 50
            rounds. If False, they are the mean of the per-fold predictions.

    Returns:
        A dict with keys ``"oof"`` (out-of-fold prices, indexed like
        ``train_segment``), ``"test_pred"`` (prices indexed like
        ``test_segment``), ``"score_mean"`` / ``"score_std"`` (fold RMSLE),
        ``"feature_importances"`` (gain per feature and fold) and
        ``"avg_best_iteration"``.
    """
    features = get_feature_columns(train_segment)
    cat_features = [col for col in features if str(train_segment[col].dtype) == "category"]
    monotone_constraints = build_monotonic_constraints(features)

    X = train_segment[features]
    y = np.log1p(train_segment[TARGET_COL].values)
    X_test = test_segment[features]
    # The caller may pass a segment without test rows; predicting on a
    # zero-row frame is skipped because LightGBM rejects empty input.
    has_test_rows = len(test_segment) > 0

    def _to_price(log_pred: np.ndarray) -> np.ndarray:
        """Invert the log1p target transform and floor the result at zero."""
        return np.maximum(np.expm1(log_pred), 0)

    oof_pred = np.zeros(len(train_segment))
    test_pred = np.zeros(len(test_segment))
    fold_scores = []
    feature_importances = []
    best_iterations = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        params = LIGHTGBM_PARAMS_BASE.copy()
        params.update({"monotone_constraints": monotone_constraints, "random_state": seed + fold})

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric="rmse",
            categorical_feature=cat_features,
            callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)],
        )

        best_iterations.append(model.best_iteration_)

        val_pred = model.predict(X_valid, num_iteration=model.best_iteration_)
        oof_pred[valid_idx] = _to_price(val_pred)

        # Fold averaging: each fold contributes 1/n_splits of the test forecast.
        if not retrain_on_full and has_test_rows:
            fold_test_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            test_pred += _to_price(fold_test_pred) / n_splits

        fold_score = rmsle(train_segment.iloc[valid_idx][TARGET_COL].values, oof_pred[valid_idx])
        fold_scores.append(fold_score)

        fold_importance = pd.DataFrame({
            "feature": features,
            "importance": model.booster_.feature_importance(importance_type="gain"),
            "fold": fold,
            "segment": segment_name,
        })
        feature_importances.append(fold_importance)

        gc.collect()

    avg_best_iteration = int(np.mean(best_iterations))

    # Retrain on full data if requested (pointless without test rows to score).
    if retrain_on_full and has_test_rows:
        print(f"  Retraining {segment_name} on full training data...")

        params_full = LIGHTGBM_PARAMS_BASE.copy()
        params_full.update({
            "monotone_constraints": monotone_constraints,
            "random_state": seed,
            # No validation set is available here, so the round count is fixed
            # from the CV folds, with 50 extra rounds on top.
            "n_estimators": avg_best_iteration + 50,
        })

        final_model = lgb.LGBMRegressor(**params_full)
        final_model.fit(
            X, y,
            categorical_feature=cat_features,
            callbacks=[lgb.log_evaluation(200)],
        )

        test_pred = _to_price(final_model.predict(X_test))
    elif retrain_on_full:
        print(f"  No test rows for {segment_name}; skipping full-data retrain.")

    result = {
        "oof": pd.Series(oof_pred, index=train_segment.index, name=f"oof_{segment_name}"),
        "test_pred": pd.Series(test_pred, index=test_segment.index, name=f"pred_{segment_name}"),
        "score_mean": np.mean(fold_scores),
        "score_std": np.std(fold_scores),
        "feature_importances": pd.concat(feature_importances, ignore_index=True),
        "avg_best_iteration": avg_best_iteration,
    }
    print(f"Segment {segment_name}: RMSLE {result['score_mean']:.5f} ± {result['score_std']:.5f}")
    if retrain_on_full:
        print(f"  Avg best iteration: {result['avg_best_iteration']}")
    return result


## 12. Train Segment Models on Full Data
Execute segmented cross-validation, gather OOF predictions, and summarize feature importance.

In [None]:
# Choose how test predictions are produced:
# - retrain_on_full=True: refit one model on all segment rows after CV
# - retrain_on_full=False: average the predictions of the CV fold models
USE_FULL_RETRAIN = True

# Maps the is_land_only value of a row to a readable segment name.
segment_mapping = {1: "land_only", 0: "with_building"}
segment_results = {}
all_feature_importances = []

# Containers aligned to the full frames; rows of skipped segments stay NaN.
oof_series = pd.Series(index=train_filtered.index, dtype=float)
test_predictions_series = pd.Series(index=test_work.index, dtype=float)

for segment_value, segment_name in segment_mapping.items():
    train_segment = train_filtered[train_filtered["is_land_only"] == segment_value].copy()
    test_segment = test_work[test_work["is_land_only"] == segment_value].copy()

    if train_segment.empty:
        print(f"Segment {segment_name} has no training records; skipping.")
        continue

    if test_segment.empty:
        print(f"Segment {segment_name} has no test records; predictions will remain NaN.")

    result = run_segment_cv(train_segment, test_segment, segment_name, retrain_on_full=USE_FULL_RETRAIN)
    segment_results[segment_name] = result

    # Write the segment's predictions back into the full-length containers.
    oof_series.loc[train_segment.index] = result["oof"]
    if not test_segment.empty:
        test_predictions_series.loc[test_segment.index] = result["test_pred"]

    all_feature_importances.append(result["feature_importances"])

# Score only the rows that received an out-of-fold prediction.
valid_oof = oof_series.dropna()
overall_rmsle_score = rmsle(train_filtered.loc[valid_oof.index, TARGET_COL].values, valid_oof.values)
print(f"\n{'='*60}")
print(f"Overall RMSLE across segments: {overall_rmsle_score:.5f}")
print(f"Approach: {'Full data retrain' if USE_FULL_RETRAIN else 'CV fold averaging'}")
print(f"{'='*60}")

# Mean gain importance per (segment, feature), averaged over folds.
feature_importance_summary = (
    pd.concat(all_feature_importances, ignore_index=True)
    .groupby(["segment", "feature"], as_index=False)["importance"]
    .mean()
)

oof_series.head()

## 13. Inference on Test Segments & Blending
Combine segment-wise predictions into a single test forecast vector.

In [None]:
# Test rows whose segment produced no prediction are still NaN at this point;
# give them the median training price instead.
fallback_prediction = train_filtered[TARGET_COL].median()
test_predictions_series = test_predictions_series.fillna(fallback_prediction)

test_predictions_series.describe()

## 14. Create Submission File
Export the blended predictions in the official submission format.

In [None]:
# Build the submission in test_work row order; predictions are floored at zero.
submission_df = pd.DataFrame({
    ID_COL: test_work[ID_COL].values,
    TARGET_COL: np.maximum(test_predictions_series.loc[test_work.index].values, 0),
})

# The file name records which prediction approach produced it.
suffix = "full_retrain" if USE_FULL_RETRAIN else "cv_avg"
submission_path = OUTPUT_DIR / f"submission_lightgbm_monotonic_{suffix}.csv"
submission_df.to_csv(submission_path, index=False)

print(f"Submission saved to {submission_path}")
print(f"Approach: {'Full data retrain' if USE_FULL_RETRAIN else 'CV fold averaging'}")
submission_df.head()

In [None]:
# Average the per-segment importances per feature and keep the 20 largest.
top_features = (
    feature_importance_summary.groupby("feature", as_index=False)["importance"].mean()
    .sort_values("importance", ascending=False)
    .head(20)
)
top_features

## 15. Model Comparison & Diagnostics
Compare predictions and feature importance between approaches.

In [None]:
# Summary statistics: overall score and the approach used for test predictions.
print("="*60)
print("MODEL SUMMARY")
print("="*60)
print(f"\nLocal CV RMSLE: {overall_rmsle_score:.5f}")
print(f"Training approach: {'Full data retrain' if USE_FULL_RETRAIN else 'CV fold averaging'}")

# Per-segment fold RMSLE (mean ± std) and average early-stopping iteration.
print("\n" + "="*60)
print("SEGMENT SCORES")
print("="*60)
for seg_name, seg_result in segment_results.items():
    print(f"\n{seg_name}:")
    print(f"  RMSLE: {seg_result['score_mean']:.5f} ± {seg_result['score_std']:.5f}")
    if 'avg_best_iteration' in seg_result:
        print(f"  Avg best iteration: {seg_result['avg_best_iteration']}")

# Compare the distributions of OOF predictions, test predictions and the target.
print("\n" + "="*60)
print("PREDICTION STATISTICS")
print("="*60)
print("\nOOF predictions:")
print(oof_series.describe())
print("\nTest predictions:")
print(test_predictions_series.describe())
print("\nTrain target:")
print(train_filtered[TARGET_COL].describe())

### Experiment Notes
**To compare both approaches:**
1. Run with `USE_FULL_RETRAIN = True` → generates `submission_lightgbm_monotonic_full_retrain.csv`
2. Change to `USE_FULL_RETRAIN = False` → generates `submission_lightgbm_monotonic_cv_avg.csv`
3. Submit both to Kaggle and compare public LB scores

**Expected differences:**
- Full retrain: May score slightly worse on the LB (i.e., a higher RMSLE), but uses all training data
- CV average: More robust ensemble, better generalization on unseen data

## 16. Data Leakage Fix Validation
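Print the leakage-fix checklist, then sanity-check the segment distribution and the submission. Note that this cell only prints these statements; it does not itself inspect any aggregation features.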

In [None]:
# NOTE(review): the checklist lines below are static text; nothing in this cell
# checks them, and run_segment_cv as defined in this notebook shows no per-fold
# aggregation step - confirm before relying on these statements.
print("="*60)
print("DATA LEAKAGE FIX VALIDATION")
print("="*60)
print("\n✓ Aggregation features are now computed inside each CV fold")
print("✓ Each fold uses only its training data to build aggregations")
print("✓ Validation and test data use aggregations from training fold only")
print("\nExpected impact:")
print("- Local CV score may INCREASE (worse) - this is expected!")
print("- Public LB score should DECREASE (better) - reduced overfitting")
print("- CV-LB gap should be SMALLER - more reliable validation")
# Row counts per segment value (1 = land only, 0 = otherwise) in each frame.
print("\nSegment distribution check:")
print("Train segments:", train_filtered["is_land_only"].value_counts().to_dict())
print("Test segments:", test_work["is_land_only"].value_counts().to_dict())

# Basic integrity checks on the final test predictions.
print("\nSubmission validation:")
print(f"Total test predictions: {len(test_predictions_series)}")
print(f"Non-null predictions: {test_predictions_series.notna().sum()}")
print(f"Negative predictions: {(test_predictions_series < 0).sum()}")
print(f"NaN predictions: {test_predictions_series.isna().sum()}")

# Compare submission ids with the test frame row by row and summarise prices.
if len(submission_df) > 0:
    print(f"\nSubmission file check:")
    print(f"IDs match test: {(submission_df['Id'].values == test_work['Id'].values).all()}")
    print(f"Price range: {submission_df['TradePrice'].min():.0f} - {submission_df['TradePrice'].max():.0f}")
    print(f"Mean price: {submission_df['TradePrice'].mean():.0f}")
    print(f"Median price: {submission_df['TradePrice'].median():.0f}")