# E-Style Real Estate Price Prediction
Kaggle competition notebook featuring LightGBM with monotonic constraints, type-specific modeling, and RMSLE optimization.

## 1. Setup Libraries & Configuration
Import core libraries, fix random seeds, and define helpers for RMSLE tracking.

In [22]:
# If LightGBM is missing in your environment, uncomment the next line.
# %pip install -q lightgbm

import gc
import math
import random
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

SEED = 2025
N_SPLITS = 5
TARGET_COL = "TradePrice"
ID_COL = "Id"

pd.set_option("display.max_columns", 200)
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")


def set_seed(seed: int = SEED) -> None:
    """Fix all relevant random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)


def rmsle(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute RMSLE while protecting against negative predictions."""
    y_true = np.clip(y_true, a_min=0, a_max=None)
    y_pred = np.clip(y_pred, a_min=0, a_max=None)
    return math.sqrt(mean_squared_log_error(y_true, y_pred))


def memory_info(df: pd.DataFrame) -> str:
    """Return a human-readable memory usage string for quick diagnostics."""
    usage_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)
    return f"{usage_mb:,.2f} MB"


set_seed(SEED)

## 2. Load Datasets
Read raw CSV files with consistent schema handling and sanity checks.

In [23]:
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.parent / "input" / "estyle-community-competition-2025"
OUTPUT_DIR = BASE_DIR.parent / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

train_path = DATA_DIR / "train.csv"
test_path = DATA_DIR / "test.csv"
sample_submission_path = DATA_DIR / "sample_submission.csv"

if not train_path.exists():
    raise FileNotFoundError(f"Missing train data at {train_path}")

train_df = pd.read_csv(train_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)
sample_submission = pd.read_csv(sample_submission_path, low_memory=False)

print(f"Train shape: {train_df.shape}, memory: {memory_info(train_df)}")
print(f"Test shape:  {test_df.shape}, memory: {memory_info(test_df)}")
print(f"Sample submission shape: {sample_submission.shape}")

Train shape: (300000, 35), memory: 298.86 MB
Test shape:  (600000, 34), memory: 593.08 MB
Sample submission shape: (600000, 2)


## 3. Basic Cleaning & Type Casting
Align numerical dtypes and ensure train/test columns match before feature work.

In [24]:
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Strip whitespace from column names to avoid subtle mismatches."""
    df = df.copy()
    df.columns = df.columns.str.strip()
    return df


def cast_boolean_columns(df: pd.DataFrame, bool_cols: List[str]) -> pd.DataFrame:
    """Ensure boolean indicator columns are stored as integers for LightGBM."""
    df = df.copy()
    for col in bool_cols:
        if col in df.columns:
            df[col] = df[col].astype("Int8")
    return df


def align_train_test(train: pd.DataFrame, test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Run basic normalization and confirm schema alignment."""
    bool_columns = [
        "AreaIsGreaterFlag",
        "FrontageIsGreaterFlag",
        "TotalFloorAreaIsGreaterFlag",
        "PrewarBuilding",
    ]
    train_clean = cast_boolean_columns(standardize_columns(train), bool_columns)
    test_clean = cast_boolean_columns(standardize_columns(test), bool_columns)

    missing_in_test = sorted(set(train_clean.columns) - set(test_clean.columns) - {TARGET_COL})
    if missing_in_test:
        print("Columns present in train but absent in test:", missing_in_test)
    return train_clean, test_clean


train_df, test_df = align_train_test(train_df, test_df)
print("Post-alignment train dtypes summary:\n", train_df.dtypes.value_counts())

Post-alignment train dtypes summary:
 object     17
float64     8
int64       6
Int8        4
Name: count, dtype: int64


## 4. Missing Value Handling & Flag Features
Impute categorical gaps with `'unknown'` and add binary indicators for all missing entries.

In [25]:
train_work = train_df.copy()
test_work = test_df.copy()

missing_summary = (
    pd.DataFrame({
        "train_missing_ratio": train_work.isna().mean(),
        "test_missing_ratio": test_work.isna().mean(),
    })
    .sort_values("train_missing_ratio", ascending=False)
)

missing_columns = [
    col
    for col in missing_summary.index
    if (train_work[col].isna().any() if col in train_work.columns else False)
    or (test_work[col].isna().any() if col in test_work.columns else False)
]

categorical_columns = sorted(
    set(train_work.select_dtypes(include=["object"]).columns)
    | set(test_work.select_dtypes(include=["object"]).columns)
)

def add_missing_indicators(df: pd.DataFrame, cols: List[str]) -> None:
    for col in cols:
        if col in df.columns:
            df[f"{col}_missing_flag"] = df[col].isna().astype("int8")

def fill_categorical_unknown(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_cols: List[str]) -> None:
    for col in cat_cols:
        if col in train_df.columns:
            train_df[col] = train_df[col].fillna("unknown")
        if col in test_df.columns:
            test_df[col] = test_df[col].fillna("unknown")

def fill_numeric_with_median(train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    numeric_cols = sorted(set(train_df.select_dtypes(include=[np.number]).columns))
    for col in numeric_cols:
        if col == TARGET_COL:
            continue
        median_value = train_df[col].median()
        if np.isnan(median_value):
            median_value = 0.0
        train_df[col] = train_df[col].fillna(median_value)
        if col in test_df.columns:
            test_df[col] = test_df[col].fillna(median_value)

add_missing_indicators(train_work, missing_columns)
add_missing_indicators(test_work, missing_columns)
fill_categorical_unknown(train_work, test_work, categorical_columns)
# fill_numeric_with_median(train_work, test_work)

print("Missing indicators added for", len(missing_columns), "columns.")
missing_summary.head(12)

Missing indicators added for 22 columns.


Unnamed: 0,train_missing_ratio,test_missing_ratio
Remarks,0.9341,0.9345
Renovation,0.7386,0.738
FloorPlan,0.7226,0.7218
Purpose,0.652,0.6515
TotalFloorArea,0.6219,0.6233
Frontage,0.379,0.3801
Use,0.3572,0.3583
BuildingYear,0.3478,0.3483
Structure,0.3427,0.3441
Breadth,0.3322,0.3328


In [26]:
train_work.columns

Index(['Id', 'Type', 'Region', 'MunicipalityCode', 'Prefecture',
       'Municipality', 'DistrictName', 'NearestStation',
       'TimeToNearestStation', 'MinTimeToNearestStation',
       'MaxTimeToNearestStation', 'TradePrice', 'FloorPlan', 'Area',
       'AreaIsGreaterFlag', 'LandShape', 'Frontage', 'FrontageIsGreaterFlag',
       'TotalFloorArea', 'TotalFloorAreaIsGreaterFlag', 'BuildingYear',
       'PrewarBuilding', 'Structure', 'Use', 'Purpose', 'Direction',
       'Classification', 'Breadth', 'CityPlanning', 'CoverageRatio',
       'FloorAreaRatio', 'Year', 'Quarter', 'Renovation', 'Remarks',
       'Remarks_missing_flag', 'Renovation_missing_flag',
       'FloorPlan_missing_flag', 'Purpose_missing_flag',
       'TotalFloorArea_missing_flag', 'Frontage_missing_flag',
       'Use_missing_flag', 'BuildingYear_missing_flag',
       'Structure_missing_flag', 'Breadth_missing_flag',
       'Classification_missing_flag', 'Direction_missing_flag',
       'LandShape_missing_flag', 'Regio

## 5. High Missing Columns Pruning
Drop highly sparse fields while keeping their missingness indicators for signal preservation.

In [27]:
HIGH_MISSING_THRESHOLD = 0.85

high_missing_cols = [
    col
    for col, ratios in missing_summary.iterrows()
    if ratios["train_missing_ratio"] >= HIGH_MISSING_THRESHOLD
    and col not in {TARGET_COL}
]

train_work.drop(columns=[col for col in high_missing_cols if col in train_work.columns], inplace=True)
test_work.drop(columns=[col for col in high_missing_cols if col in test_work.columns], inplace=True)

print(f"Dropped {len(high_missing_cols)} sparse columns: {high_missing_cols}")

Dropped 1 sparse columns: ['Remarks']


## 6. Outlier Detection & Removal
Remove extreme target values using log-scale quantile clipping to stabilize training.

In [28]:
def trim_outliers_log(df: pd.DataFrame, target_col: str, lower_quantile: float = 0.005, upper_quantile: float = 0.995) -> pd.DataFrame:
    """Remove rows with extreme targets on the log scale."""
    log_target = np.log1p(df[target_col])
    low, high = log_target.quantile([lower_quantile, upper_quantile])
    mask = log_target.between(low, high)
    trimmed = df.loc[mask].copy()
    print(
        f"Outlier trimming retained {mask.sum()} of {len(mask)} rows "
        f"({mask.sum() / len(mask):.2%})."
    )
    return trimmed


train_filtered = trim_outliers_log(train_work, TARGET_COL)
print("Filtered train shape:", train_filtered.shape)

Outlier trimming retained 297074 of 300000 rows (99.02%).
Filtered train shape: (297074, 56)


## 7. Feature Engineering: DistrictName × BuildingYear Aggregations
Capture localized pricing signals with smoothed mean/median targets by district and construction year.

In [29]:
def build_district_buildyear_agg(train_source: pd.DataFrame) -> Dict[str, Dict[Tuple[str, int], float]]:
    helper = train_source[[ID_COL, "DistrictName", "BuildingYear", TARGET_COL]].copy()
    helper["DistrictName"] = helper["DistrictName"].fillna("unknown")
    helper["BuildingYearGroup"] = helper["BuildingYear"].fillna(-1).round().astype(int)

    grouped = helper.groupby(["DistrictName", "BuildingYearGroup"])[TARGET_COL].agg(["mean", "median", "count"])
    global_mean = helper[TARGET_COL].mean()
    grouped["smoothed_mean"] = (
        (grouped["mean"] * grouped["count"]) + (global_mean * 50)
    ) / (grouped["count"] + 50)

    return {
    "smoothed_mean": grouped["smoothed_mean"].to_dict(),
    "median": grouped["median"].to_dict(),
    "count": grouped["count"].to_dict(),
    "global_mean": global_mean,
    "global_median": helper[TARGET_COL].median(),
}


def apply_agg_features(
    df: pd.DataFrame,
    agg_lookup: Dict[str, Dict[Tuple[str, int], float]],
    district_source: pd.Series,
    building_year_source: pd.Series,
) -> pd.DataFrame:
    df = df.copy()
    district_values = df[ID_COL].map(district_source).fillna("unknown")
    building_year_values = df[ID_COL].map(building_year_source).fillna(-1).round().astype(int)
    keys = list(zip(district_values, building_year_values))

    df["district_buildyear_price_mean"] = [
        agg_lookup["smoothed_mean"].get(key, agg_lookup["global_mean"])
        for key in keys
    ]
    df["district_buildyear_price_median"] = [
        agg_lookup["median"].get(key, agg_lookup["global_median"])
        for key in keys
    ]
    df["district_buildyear_price_count"] = [
        agg_lookup["count"].get(key, 0.0)
        for key in keys
    ]
    return df


agg_lookup = build_district_buildyear_agg(train_df)
train_filtered = apply_agg_features(
    train_filtered,
    agg_lookup,
    district_source=train_df.set_index(ID_COL)["DistrictName"],
    building_year_source=train_df.set_index(ID_COL)["BuildingYear"],
)
test_work = apply_agg_features(
    test_work,
    agg_lookup,
    district_source=test_df.set_index(ID_COL)["DistrictName"],
    building_year_source=test_df.set_index(ID_COL)["BuildingYear"],
)

train_filtered[[
    "district_buildyear_price_mean",
    "district_buildyear_price_median",
    "district_buildyear_price_count",
]].describe()

Unnamed: 0,district_buildyear_price_mean,district_buildyear_price_median,district_buildyear_price_count
count,297074.0,297074.0,297074.0
mean,42678586.4915,30193923.4092,16.5103
std,10758608.1179,41727357.1724,35.6271
min,8522413.0007,11000.0,1.0
25%,40049274.2073,14000000.0,2.0
50%,42169896.6779,25000000.0,6.0
75%,43026739.1637,37000000.0,17.0
max,579902822.3681,6019500000.0,433.0


## 8. Categorical Encoding & Label Preparation
Convert remaining object columns to categorical dtype and define target transformations for RMSLE.

In [30]:
def add_domain_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if {"Year", "BuildingYear"}.issubset(df.columns):
        df["BuildingAge"] = (df["Year"] - df["BuildingYear"]).clip(lower=0)
        flag_col = "BuildingYear_missing_flag"
        if flag_col in df.columns:
            df.loc[df[flag_col] == 1, "BuildingAge"] = np.nan
    if "Area" in df.columns:
        df["Area_log"] = np.log1p(df["Area"])
    if "TotalFloorArea" in df.columns:
        df["TotalFloorArea_log"] = np.log1p(df["TotalFloorArea"])
        df["FloorArea_to_Area"] = df["TotalFloorArea"] / (df["Area"] + 1e-3)
    if {"Frontage", "Area"}.issubset(df.columns):
        df["Frontage_to_sqrtArea"] = df["Frontage"] / (np.sqrt(df["Area"]) + 1e-3)
    if {"MaxTimeToNearestStation", "MinTimeToNearestStation"}.issubset(df.columns):
        df["StationTimeRange"] = df["MaxTimeToNearestStation"] - df["MinTimeToNearestStation"]
    df["district_buildyear_price_count_log"] = np.log1p(df.get("district_buildyear_price_count", 0.0))
    return df


train_filtered = add_domain_features(train_filtered)
test_work = add_domain_features(test_work)

fill_numeric_with_median(train_filtered, test_work)

categorical_cols_final = sorted(
    set(train_filtered.select_dtypes(include=["object"]).columns)
    | set(test_work.select_dtypes(include=["object"]).columns)
)
for col in categorical_cols_final:
    if col in train_filtered.columns:
        train_filtered[col] = train_filtered[col].astype("category")
    if col in test_work.columns:
        test_work[col] = test_work[col].astype("category")

train_target = train_filtered[TARGET_COL].copy()
print("Categorical columns prepared:", len(categorical_cols_final))

Categorical columns prepared: 16


## 9. Segment Datasets by Property Type
Split samples into `land only` and `with building` segments to train specialized models.

In [31]:
LAND_KEYWORDS = ["land", "土地", "宅地", "lot", "residential land", "commercial land"]


def detect_land_only(type_series: pd.Series) -> pd.Series:
    type_str = type_series.astype(str).str.lower()
    pattern = "|".join(LAND_KEYWORDS)
    land_mask = type_str.str.contains(pattern, case=False, na=False)
    return land_mask


train_filtered["is_land_only"] = detect_land_only(train_filtered["Type"]).astype("int8")
test_work["is_land_only"] = detect_land_only(test_work["Type"]).astype("int8")

train_filtered["Type"] = train_filtered["Type"].cat.remove_unused_categories()

train_filtered["is_land_only"].value_counts(normalize=True).rename("share").to_frame("share")

Unnamed: 0_level_0,share
is_land_only,Unnamed: 1_level_1
1,0.7109
0,0.2891


## 10. LightGBM Monotonic Constraint Definitions
Enforce domain knowledge (e.g., larger area ⇒ higher price) via monotone constraints per feature.

In [32]:
MONOTONIC_FEATURE_MAP = {
    "Area": 1,
    "Area_log": 1,
    "TotalFloorArea": 1,
    "TotalFloorArea_log": 1,
    "FloorArea_to_Area": 1,
}


def build_monotonic_constraints(feature_names: List[str]) -> str:
    """Return LightGBM-compatible monotone constraint string."""
    constraints = [MONOTONIC_FEATURE_MAP.get(name, 0) for name in feature_names]
    return "(" + ",".join(str(int(val)) for val in constraints) + ")"


## 11. K-Fold Cross-Validation Workflow
Train LightGBM models per segment with RMSLE-focused validation and constraint-aware parameters.

In [None]:
EXCLUDE_FEATURES = {TARGET_COL, ID_COL, "district_buildyear_price_count", "is_land_only"}

LIGHTGBM_PARAMS_BASE = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.03,
    "n_estimators": 5000,
    "num_leaves": 128,
    "max_depth": -1,
    "subsample": 0.8,
    "subsample_freq": 1,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "min_child_samples": 50,
    "n_jobs": -1,
}


def get_feature_columns(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col not in EXCLUDE_FEATURES]


def run_segment_cv(
    train_segment: pd.DataFrame,
    test_segment: pd.DataFrame,
    segment_name: str,
    seed: int = SEED,
    n_splits: int = N_SPLITS,
    retrain_on_full: bool = True,
) -> Dict[str, object]:
    features = get_feature_columns(train_segment)
    cat_features = [col for col in features if str(train_segment[col].dtype) == "category"]
    monotone_constraints = build_monotonic_constraints(features)

    X = train_segment[features]
    y = np.log1p(train_segment[TARGET_COL].values)
    X_test = test_segment[features]

    oof_pred = np.zeros(len(train_segment))
    test_pred = np.zeros(len(test_segment))
    fold_scores = []
    feature_importances = []
    best_iterations = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        params = LIGHTGBM_PARAMS_BASE.copy()
        params.update({"monotone_constraints": monotone_constraints, "random_state": seed + fold})

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric="rmse",
            categorical_feature=cat_features,
            callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)],
        )

        best_iterations.append(model.best_iteration_)
        
        val_pred = model.predict(X_valid, num_iteration=model.best_iteration_)
        oof_pred[valid_idx] = np.maximum(np.expm1(val_pred), 0)

        if not retrain_on_full:
            test_pred += np.maximum(
                np.expm1(model.predict(X_test, num_iteration=model.best_iteration_)),
                0,
            ) / n_splits

        fold_score = rmsle(train_segment.iloc[valid_idx][TARGET_COL].values, oof_pred[valid_idx])
        fold_scores.append(fold_score)

        fold_importance = pd.DataFrame({
            "feature": features,
            "importance": model.booster_.feature_importance(importance_type="gain"),
            "fold": fold,
            "segment": segment_name,
        })
        feature_importances.append(fold_importance)

        gc.collect()

    # Retrain on full data if requested
    if retrain_on_full:
        print(f"  Retraining {segment_name} on full training data...")
        avg_best_iteration = int(np.mean(best_iterations))
        
        params_full = LIGHTGBM_PARAMS_BASE.copy()
        params_full.update({
            "monotone_constraints": monotone_constraints,
            "random_state": seed,
            "n_estimators": avg_best_iteration + 50,
        })
        
        final_model = lgb.LGBMRegressor(**params_full)
        final_model.fit(
            X, y,
            categorical_feature=cat_features,
            callbacks=[lgb.log_evaluation(200)],
        )
        
        test_pred = np.maximum(
            np.expm1(final_model.predict(X_test)),
            0,
        )

    result = {
        "oof": pd.Series(oof_pred, index=train_segment.index, name=f"oof_{segment_name}"),
        "test_pred": pd.Series(test_pred, index=test_segment.index, name=f"pred_{segment_name}"),
        "score_mean": np.mean(fold_scores),
        "score_std": np.std(fold_scores),
        "feature_importances": pd.concat(feature_importances, ignore_index=True),
        "avg_best_iteration": int(np.mean(best_iterations)),
    }
    print(f"Segment {segment_name}: RMSLE {result['score_mean']:.5f} ± {result['score_std']:.5f}")
    if retrain_on_full:
        print(f"  Avg best iteration: {result['avg_best_iteration']}")
    return result


## 12. Train Segment Models on Full Data
Execute segmented cross-validation, gather OOF predictions, and summarize feature importance.

In [None]:
# Choose between two approaches:
# - retrain_on_full=True: Retrain on all data after CV (recommended, uses more data)
# - retrain_on_full=False: Average predictions from CV folds (more robust to overfitting)
USE_FULL_RETRAIN = True

segment_mapping = {1: "land_only", 0: "with_building"}
segment_results = {}
all_feature_importances = []

oof_series = pd.Series(index=train_filtered.index, dtype=float)
test_predictions_series = pd.Series(index=test_work.index, dtype=float)

for segment_value, segment_name in segment_mapping.items():
    train_segment = train_filtered[train_filtered["is_land_only"] == segment_value].copy()
    test_segment = test_work[test_work["is_land_only"] == segment_value].copy()

    if train_segment.empty:
        print(f"Segment {segment_name} has no training records; skipping.")
        continue

    if test_segment.empty:
        print(f"Segment {segment_name} has no test records; predictions will remain NaN.")

    result = run_segment_cv(train_segment, test_segment, segment_name, retrain_on_full=USE_FULL_RETRAIN)
    segment_results[segment_name] = result

    oof_series.loc[train_segment.index] = result["oof"]
    if not test_segment.empty:
        test_predictions_series.loc[test_segment.index] = result["test_pred"]

    all_feature_importances.append(result["feature_importances"])

valid_oof = oof_series.dropna()
overall_rmsle_score = rmsle(train_filtered.loc[valid_oof.index, TARGET_COL].values, valid_oof.values)
print(f"\n{'='*60}")
print(f"Overall RMSLE across segments: {overall_rmsle_score:.5f}")
print(f"Approach: {'Full data retrain' if USE_FULL_RETRAIN else 'CV fold averaging'}")
print(f"{'='*60}")

feature_importance_summary = (
    pd.concat(all_feature_importances, ignore_index=True)
    .groupby(["segment", "feature"], as_index=False)["importance"]
    .mean()
)

oof_series.head()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11716
[LightGBM] [Info] Number of data points in the train set: 168942, number of used features: 59
[LightGBM] [Info] Start training from score 16.867719
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 0.438631
[400]	valid_0's rmse: 0.42783
[600]	valid_0's rmse: 0.425909
[800]	valid_0's rmse: 0.425459
[1000]	valid_0's rmse: 0.425255
Early stopping, best iteration is:
[994]	valid_0's rmse: 0.42523
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005972 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11720
[LightGBM] [Info] Number of data points in the t

## 13. Inference on Test Segments & Blending
Combine segment-wise predictions into a single test forecast vector.

In [None]:
fallback_prediction = train_filtered[TARGET_COL].median()
test_predictions_series = test_predictions_series.fillna(fallback_prediction)

test_predictions_series.describe()

count         600,000.0000
mean       32,205,097.8513
std        38,794,397.0271
min           161,661.7147
25%        14,175,259.3401
50%        25,155,741.6207
75%        37,617,874.9476
max     2,856,886,828.2736
dtype: float64

## 14. Create Submission File
Export the blended predictions in the official submission format.

In [None]:
submission_df = pd.DataFrame({
    ID_COL: test_work[ID_COL].values,
    TARGET_COL: np.maximum(test_predictions_series.loc[test_work.index].values, 0),
})

suffix = "full_retrain" if USE_FULL_RETRAIN else "cv_avg"
submission_path = OUTPUT_DIR / f"submission_lightgbm_monotonic_{suffix}.csv"
submission_df.to_csv(submission_path, index=False)

print(f"Submission saved to {submission_path}")
print(f"Approach: {'Full data retrain' if USE_FULL_RETRAIN else 'CV fold averaging'}")
submission_df.head()

Submission saved to /Users/estyle-155/Documents/kaggle/kaggle_estyle/output/submission_lightgbm_monotonic.csv


Unnamed: 0,Id,TradePrice
0,300001,18366669.9948
1,300002,96312334.3856
2,300003,260533802.7713
3,300004,9878918.0471
4,300005,14471425.5912


In [None]:
top_features = (
    feature_importance_summary.groupby("feature", as_index=False)["importance"].mean()
    .sort_values("importance", ascending=False)
    .head(20)
)
top_features

Unnamed: 0,feature,importance
62,district_buildyear_price_median,999235.0289
0,Area,152159.542
33,Municipality,115328.4142
8,CityPlanning,105606.8525
35,NearestStation,88177.957
61,district_buildyear_price_mean,66229.3815
16,DistrictName,49255.3865
60,district_buildyear_price_count_log,29243.2269
6,BuildingYear,28581.8356
2,Area_log,24083.3926


## 15. Model Comparison & Diagnostics
Compare predictions and feature importance between approaches.

In [None]:
# Summary statistics
print("="*60)
print("MODEL SUMMARY")
print("="*60)
print(f"\nLocal CV RMSLE: {overall_rmsle_score:.5f}")
print(f"Training approach: {'Full data retrain' if USE_FULL_RETRAIN else 'CV fold averaging'}")

print("\n" + "="*60)
print("SEGMENT SCORES")
print("="*60)
for seg_name, seg_result in segment_results.items():
    print(f"\n{seg_name}:")
    print(f"  RMSLE: {seg_result['score_mean']:.5f} ± {seg_result['score_std']:.5f}")
    if 'avg_best_iteration' in seg_result:
        print(f"  Avg best iteration: {seg_result['avg_best_iteration']}")

print("\n" + "="*60)
print("PREDICTION STATISTICS")
print("="*60)
print("\nOOF predictions:")
print(oof_series.describe())
print("\nTest predictions:")
print(test_predictions_series.describe())
print("\nTrain target:")
print(train_filtered[TARGET_COL].describe())

### Experiment Notes
**To compare both approaches:**
1. Run with `USE_FULL_RETRAIN = True` → generates `submission_lightgbm_monotonic_full_retrain.csv`
2. Change to `USE_FULL_RETRAIN = False` → generates `submission_lightgbm_monotonic_cv_avg.csv`
3. Submit both to Kaggle and compare public LB scores

**Expected differences:**
- Full retrain: May have slightly lower LB score but uses all training data
- CV average: More robust ensemble, better generalization on unseen data