In [14]:
import json
import os
from pathlib import Path

SECRETS_PATH = Path("../.secrets/kaggle.json")

# Fail fast when the credentials file is absent; nothing below can work without it.
if not SECRETS_PATH.exists():
    raise FileNotFoundError("Missing Kaggle credentials at ../.secrets/kaggle.json")

with SECRETS_PATH.open() as f:
    creds = json.load(f)

# Export the credentials as environment variables (username first, then key).
for env_name, field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = creds[field]


In [15]:
from typing import Dict, Iterable, List, Optional

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler

# Let pandas show up to 120 columns and 160 characters per line when displaying frames.
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)
# Seed reused by the CV splitter and the seeded estimators defined in this notebook.
RANDOM_STATE = 42


In [16]:
DATA_DIR = Path("../data")
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")
# Submission files are written next to the input data.
SUBMISSIONS_DIR = DATA_DIR

# Stop immediately if either input file is missing (train is checked first).
for required_path, complaint in (
    (TRAIN_PATH, "Expected train.csv under ../data"),
    (TEST_PATH, "Expected test.csv under ../data"),
):
    if not required_path.exists():
        raise FileNotFoundError(complaint)


In [17]:
# Read both CSVs the same way, using the "Id" column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path).set_index("Id") for csv_path in (TRAIN_PATH, TEST_PATH)
)

# Separate the predictors from the "SalePrice" target.
X = train_df.drop(columns="SalePrice")
y = train_df["SalePrice"]

train_df.shape, test_df.shape


((1460, 80), (1459, 79))

In [18]:
def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with aggregate, ratio, log and indicator features added.

    The input frame is left untouched.  Every source column is optional: a
    column that is absent from ``df`` is replaced by an index-aligned constant
    column (0, or 1 for the ratio denominators that use that default), so
    indicator features such as ``HasPool`` evaluate to 0 instead of failing
    on a plain Python scalar.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding some or all of the raw columns read below.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the engineered columns appended.
    """
    result = df.copy()

    def col(name: str, default=0) -> pd.Series:
        """Return ``result[name]``, or an index-aligned fallback when the column is absent."""
        if name in result.columns:
            return result[name]
        if isinstance(default, pd.Series):
            return default
        # A Series (not a bare scalar) keeps `.astype(...)` and comparisons working.
        return pd.Series(default, index=result.index)

    def safe_div(num, den):
        """Divide after nudging the denominator by 1e-6 to avoid division by zero."""
        return num / (den + 1e-6)

    # --- Size aggregates -------------------------------------------------
    result["TotalSF"] = col("TotalBsmtSF") + col("1stFlrSF") + col("2ndFlrSF")
    result["TotalBath"] = (
        col("FullBath")
        + 0.5 * col("HalfBath")
        + col("BsmtFullBath")
        + 0.5 * col("BsmtHalfBath")
    )

    # --- Ages relative to the sale year ----------------------------------
    result["AgeAtSale"] = col("YrSold") - col("YearBuilt")
    result["SinceRemodel"] = col("YrSold") - col("YearRemodAdd")
    result["IsRemodeled"] = (col("YearBuilt") != col("YearRemodAdd")).astype(int)

    result["TotalPorchSF"] = (
        col("OpenPorchSF")
        + col("EnclosedPorch")
        + col("3SsnPorch")
        + col("ScreenPorch")
        + col("WoodDeckSF")
    )

    # --- Presence indicators ---------------------------------------------
    result["HasPool"] = (col("PoolArea") > 0).astype(int)
    result["HasGarage"] = (col("GarageArea") > 0).astype(int)
    result["HasFireplace"] = (col("Fireplaces") > 0).astype(int)

    # --- Ratios and log transforms ---------------------------------------
    living_to_lot = safe_div(col("GrLivArea"), col("LotArea", 1))
    result["LotRatio"] = living_to_lot
    result["LogLotArea"] = np.log1p(col("LotArea"))
    result["LogGrLivArea"] = np.log1p(col("GrLivArea"))
    result["QualityScore"] = col("OverallQual") * col("OverallCond")
    # NOTE(review): LivLotRatio is computed exactly like LotRatio; kept so the
    # output columns stay the same for downstream cells.
    result["LivLotRatio"] = living_to_lot
    result["BedBathRatio"] = safe_div(
        col("BedroomAbvGr"),
        col("FullBath") + 0.5 * col("HalfBath") + 1,
    )
    result["RoomsPerSF"] = safe_div(col("TotRmsAbvGrd"), col("GrLivArea", 1))
    result["GarageCarsToArea"] = safe_div(col("GarageCars"), col("GarageArea", 1))
    # YearBuilt is only the fallback when the GarageYrBlt column is absent;
    # missing values inside an existing GarageYrBlt column stay NaN.
    result["GarageAge"] = col("YrSold") - col("GarageYrBlt", col("YearBuilt"))
    # NOTE(review): TotalPorchSF already contains WoodDeckSF, so the deck area is
    # counted twice here; left unchanged to keep the feature values stable.
    result["OutdoorSF"] = result["TotalPorchSF"] + col("WoodDeckSF")
    result["IsNew"] = (col("YearBuilt") >= col("YrSold") - 1).astype(int)
    result["RemodelAgeRatio"] = safe_div(result["SinceRemodel"], result["AgeAtSale"] + 1)
    result["BsmtFinRatio"] = safe_div(
        col("BsmtFinSF1") + col("BsmtFinSF2"),
        col("TotalBsmtSF") + 1,
    )
    result["TotalFunctionalSF"] = result["TotalSF"] + result["TotalPorchSF"]
    result["LogTotalSF"] = np.log1p(result["TotalSF"])

    return result


In [19]:
# Apply the same feature engineering to the training predictors and to the test set.
X_fe = add_engineered_features(X)
test_fe = add_engineered_features(test_df)

# Display both shapes as a quick check that the column counts match.
X_fe.shape, test_fe.shape


((1460, 103), (1459, 103))

In [20]:
# Partition the columns by dtype: numeric ones on one side, everything else
# (treated as categorical) on the other.
numeric_features = list(X_fe.select_dtypes(include=[np.number]).columns)
categorical_features = list(X_fe.select_dtypes(exclude=[np.number]).columns)

len(numeric_features), len(categorical_features)


(60, 43)

In [21]:
# Numeric columns: median imputation, Yeo-Johnson power transform, then standard scaling.
numeric_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("power", PowerTransformer(method="yeo-johnson")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: most-frequent imputation, then dense one-hot encoding.
# Categories unseen during fitting are ignored at transform time.
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column group to its pipeline; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ],
    remainder="drop",
)

# 5 folds repeated twice, i.e. ten fits per evaluated model.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=RANDOM_STATE)
cv


RepeatedKFold(n_repeats=2, n_splits=5, random_state=42)

In [22]:
def evaluate_pipeline(name: str, pipeline: Pipeline, X_data: pd.DataFrame, y_data: pd.Series, splitter: RepeatedKFold) -> Dict[str, float]:
    """Cross-validate ``pipeline`` and summarise its per-fold errors.

    A fresh clone of ``pipeline`` is fitted on every split produced by
    ``splitter``.  Hold-out predictions are floored at 1 before scoring so the
    ``log1p``-based error is computed on positive values.

    Returns a dict holding the model ``name`` under ``"model"`` together with
    the mean and population standard deviation (``ddof=0``) of RMSE and
    log-RMSE, and the mean R^2, across folds.
    """
    per_fold: List[Dict[str, float]] = []

    for fit_rows, holdout_rows in splitter.split(X_data, y_data):
        estimator = clone(pipeline)
        estimator.fit(X_data.iloc[fit_rows], y_data.iloc[fit_rows])

        actual = y_data.iloc[holdout_rows]
        predicted = np.clip(estimator.predict(X_data.iloc[holdout_rows]), 1, None)

        per_fold.append(
            {
                "rmse": float(np.sqrt(mean_squared_error(actual, predicted))),
                "log_rmse": float(np.sqrt(mean_squared_error(np.log1p(actual), np.log1p(predicted)))),
                "r2": float(r2_score(actual, predicted)),
            }
        )

    scores = pd.DataFrame(per_fold)
    rmse, log_rmse = scores["rmse"], scores["log_rmse"]
    return {
        "model": name,
        "rmse_mean": rmse.mean(),
        "rmse_std": rmse.std(ddof=0),
        "log_rmse_mean": log_rmse.mean(),
        "log_rmse_std": log_rmse.std(ddof=0),
        "r2_mean": scores["r2"].mean(),
    }


def evaluate_pipelines(pipelines: Dict[str, Pipeline], X_data: pd.DataFrame, y_data: pd.Series, splitter: RepeatedKFold) -> pd.DataFrame:
    """Evaluate every pipeline in ``pipelines`` and rank the results.

    A progress line is printed before each model is evaluated.  The returned
    frame has one row per model, sorted by mean log-RMSE and then mean RMSE
    (ascending), with a fresh 0-based index.
    """
    total = len(pipelines)

    def announce_and_score(idx: int, name: str, candidate: Pipeline):
        print(f"[{idx}/{total}] Evaluating {name}...", flush=True)
        return evaluate_pipeline(name, candidate, X_data, y_data, splitter)

    summaries = [
        announce_and_score(idx, name, candidate)
        for idx, (name, candidate) in enumerate(pipelines.items(), start=1)
    ]
    ranked = pd.DataFrame(summaries).sort_values(["log_rmse_mean", "rmse_mean"])
    return ranked.reset_index(drop=True)


In [23]:
def make_hgb(**kwargs) -> HistGradientBoostingRegressor:
    """Build a ``HistGradientBoostingRegressor`` from this notebook's defaults.

    Any keyword argument overrides the matching default and is forwarded to the
    estimator constructor.  ``random_state`` defaults to ``RANDOM_STATE`` and is
    part of the overridable defaults, so ``make_hgb(random_state=...)`` works
    instead of raising ``TypeError`` for a duplicated keyword.
    """
    params = {
        "random_state": RANDOM_STATE,
        "learning_rate": 0.05,
        "max_depth": 5,
        "max_iter": 700,
        "min_samples_leaf": 14,
        "l2_regularization": 0.1,
        "max_leaf_nodes": None,
        **kwargs,  # caller-supplied values win over the defaults above
    }
    return HistGradientBoostingRegressor(**params)


def make_gbr(**kwargs) -> GradientBoostingRegressor:
    """Build a ``GradientBoostingRegressor`` from this notebook's defaults.

    Any keyword argument overrides the matching default and is forwarded to the
    estimator constructor.  ``random_state`` defaults to ``RANDOM_STATE`` and is
    part of the overridable defaults, so ``make_gbr(random_state=...)`` works
    instead of raising ``TypeError`` for a duplicated keyword.
    """
    params = {
        "random_state": RANDOM_STATE,
        "n_estimators": 2000,
        "learning_rate": 0.04,
        "max_depth": 3,
        "subsample": 0.9,
        "min_samples_leaf": 2,
        "max_features": None,
        **kwargs,  # caller-supplied values win over the defaults above
    }
    return GradientBoostingRegressor(**params)


def make_rf(**kwargs) -> RandomForestRegressor:
    """Build a ``RandomForestRegressor`` from this notebook's defaults.

    Any keyword argument overrides the matching default and is forwarded to the
    estimator constructor.  ``random_state`` (``RANDOM_STATE``) and ``n_jobs``
    (``-1``) are part of the overridable defaults, so passing either one works
    instead of raising ``TypeError`` for a duplicated keyword.
    """
    params = {
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        "n_estimators": 1000,
        "max_depth": None,
        "max_features": "sqrt",
        "min_samples_leaf": 1,
        "bootstrap": True,
        **kwargs,  # caller-supplied values win over the defaults above
    }
    return RandomForestRegressor(**params)


def make_extratrees(**kwargs) -> ExtraTreesRegressor:
    """Build an ``ExtraTreesRegressor`` from this notebook's defaults.

    Any keyword argument overrides the matching default and is forwarded to the
    estimator constructor.  ``random_state`` (``RANDOM_STATE``) and ``n_jobs``
    (``-1``) are part of the overridable defaults, so passing either one works
    instead of raising ``TypeError`` for a duplicated keyword.
    """
    params = {
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        "n_estimators": 1200,
        "max_depth": None,
        "max_features": 0.6,
        "min_samples_leaf": 1,
        "bootstrap": False,
        **kwargs,  # caller-supplied values win over the defaults above
    }
    return ExtraTreesRegressor(**params)



In [24]:
def _with_preprocessing(estimator, *extra_steps) -> Pipeline:
    """Build a ``preprocess -> [extra steps] -> model`` pipeline around ``estimator``.

    ``extra_steps`` are ``(name, transformer)`` pairs inserted between the
    shared preprocessor and the final ``"model"`` step.
    """
    return Pipeline(steps=[("preprocess", preprocessor), *extra_steps, ("model", estimator)])


model_registry: Dict[str, Pipeline] = {
    # Histogram-based gradient boosting variants.
    "HGB_baseline": _with_preprocessing(make_hgb()),
    "HGB_deeper": _with_preprocessing(make_hgb(max_depth=6, max_iter=900, min_samples_leaf=12)),
    "HGB_shrinkage": _with_preprocessing(
        make_hgb(learning_rate=0.03, max_iter=1200, min_samples_leaf=20, max_depth=4, l2_regularization=0.2)
    ),
    "HGB_with_pca": _with_preprocessing(
        make_hgb(max_depth=5, max_iter=600, min_samples_leaf=18),
        ("dim_red", PCA(n_components=150, svd_solver="randomized", random_state=RANDOM_STATE)),
    ),
    # Classic gradient boosting.
    "GBR_depth3": _with_preprocessing(make_gbr()),
    "GBR_depth4": _with_preprocessing(
        make_gbr(n_estimators=1600, learning_rate=0.05, max_depth=4, min_samples_leaf=3, subsample=0.85)
    ),
    # Bagged tree ensembles.
    "RF_unbounded": _with_preprocessing(make_rf()),
    "RF_regularized": _with_preprocessing(
        make_rf(n_estimators=800, max_depth=24, max_features=0.4, min_samples_leaf=2)
    ),
    "ExtraTrees_wide": _with_preprocessing(make_extratrees()),
    # Kernel method.
    "KernelRidge_rbf": _with_preprocessing(KernelRidge(alpha=0.4, kernel="rbf", gamma=5e-4)),
    # Wrappers around histogram gradient boosting.
    "Bagging_HGB": _with_preprocessing(
        BaggingRegressor(
            estimator=make_hgb(max_depth=4, learning_rate=0.06, max_iter=800, min_samples_leaf=18),
            n_estimators=12,
            max_features=0.9,
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
    ),
    "LogTarget_HGB": _with_preprocessing(
        TransformedTargetRegressor(
            regressor=make_hgb(max_depth=5, learning_rate=0.045, max_iter=900, min_samples_leaf=18),
            func=np.log1p,
            inverse_func=np.expm1,
        )
    ),
}

# Stacked ensemble: three tree-based learners feeding a shallow gradient-boosting meta-model.
stack_estimators = [
    ("hgb", make_hgb(max_depth=5, learning_rate=0.045, max_iter=850, min_samples_leaf=16)),
    ("gbr", make_gbr(n_estimators=1800, learning_rate=0.045)),
    ("rf", make_rf(n_estimators=900, max_depth=30, max_features=0.5)),
]
stack_final = make_gbr(n_estimators=800, learning_rate=0.06, max_depth=2)
model_registry["Stacking_Trees"] = _with_preprocessing(
    StackingRegressor(
        estimators=stack_estimators,
        final_estimator=stack_final,
        n_jobs=-1,
        passthrough=False,
    )
)

# Weighted average of three tree-based learners.
voting_estimators = [
    ("hgb", make_hgb(max_depth=5, learning_rate=0.05, max_iter=700, min_samples_leaf=14)),
    ("gbr", make_gbr(n_estimators=1500, learning_rate=0.05)),
    ("extratrees", make_extratrees(n_estimators=1000, max_features=0.65)),
]
model_registry["Voting_TopTrees"] = _with_preprocessing(
    VotingRegressor(estimators=voting_estimators, weights=[0.4, 0.3, 0.3])
)

len(model_registry)


14

In [25]:
# Cross-validate every registered pipeline; the returned frame is sorted by
# mean log-RMSE (ties broken by mean RMSE), best model first.
results_df = evaluate_pipelines(model_registry, X_fe, y, cv)
results_df


[1/14] Evaluating HGB_baseline...
[2/14] Evaluating HGB_deeper...
[3/14] Evaluating HGB_shrinkage...
[4/14] Evaluating HGB_with_pca...
[5/14] Evaluating GBR_depth3...
[6/14] Evaluating GBR_depth4...
[7/14] Evaluating RF_unbounded...
[8/14] Evaluating RF_regularized...
[9/14] Evaluating ExtraTrees_wide...
[10/14] Evaluating KernelRidge_rbf...
[11/14] Evaluating Bagging_HGB...
[12/14] Evaluating LogTarget_HGB...
[13/14] Evaluating Stacking_Trees...
[14/14] Evaluating Voting_TopTrees...


Unnamed: 0,model,rmse_mean,rmse_std,log_rmse_mean,log_rmse_std,r2_mean
0,Bagging_HGB,27399.551344,6357.185771,0.125295,0.01558,0.872584
1,Voting_TopTrees,26891.805115,7207.40272,0.125962,0.016404,0.873663
2,GBR_depth3,26850.105434,7685.488876,0.127057,0.015513,0.872124
3,GBR_depth4,26999.786581,6780.784243,0.1285,0.015043,0.873631
4,HGB_baseline,28193.519974,6890.107412,0.131251,0.017017,0.863069
5,LogTarget_HGB,27655.919103,5724.381698,0.131467,0.014711,0.871659
6,HGB_shrinkage,28360.440354,6010.141091,0.131557,0.01379,0.864029
7,HGB_deeper,28209.324614,7098.184741,0.131582,0.017709,0.862389
8,Stacking_Trees,32975.57315,9325.965034,0.136671,0.018095,0.812638
9,ExtraTrees_wide,29455.191587,7018.788362,0.138933,0.014702,0.851372


In [26]:
# Keep the six best-ranked rows (results_df is already sorted by mean log-RMSE).
top_models = results_df.head(6)
top_models


Unnamed: 0,model,rmse_mean,rmse_std,log_rmse_mean,log_rmse_std,r2_mean
0,Bagging_HGB,27399.551344,6357.185771,0.125295,0.01558,0.872584
1,Voting_TopTrees,26891.805115,7207.40272,0.125962,0.016404,0.873663
2,GBR_depth3,26850.105434,7685.488876,0.127057,0.015513,0.872124
3,GBR_depth4,26999.786581,6780.784243,0.1285,0.015043,0.873631
4,HGB_baseline,28193.519974,6890.107412,0.131251,0.017017,0.863069
5,LogTarget_HGB,27655.919103,5724.381698,0.131467,0.014711,0.871659


In [27]:
# results_df is already ranked by mean log-RMSE, so its first row is the best on that metric.
best_log_model = results_df["model"].iloc[0]
# Re-rank by plain RMSE to find the best model on the untransformed scale.
best_rmse_model = results_df.sort_values("rmse_mean")["model"].iloc[0]

best_log_model, best_rmse_model


('Bagging_HGB', 'GBR_depth3')

In [28]:
def train_and_save(model_name: str, pipeline: Pipeline) -> Path:
    """Fit a clone of ``pipeline`` on the full training set and write a submission CSV.

    Parameters
    ----------
    model_name : str
        Used to build the output file name ``<model_name>_submission.csv``.
    pipeline : Pipeline
        Unfitted pipeline; it is cloned, so the passed object is not modified.

    Returns
    -------
    Path
        Location of the written CSV (columns ``Id`` and ``SalePrice``) under
        ``SUBMISSIONS_DIR``.
    """
    fitted = clone(pipeline)
    fitted.fit(X_fe, y)
    # Floor predictions at 1 exactly as evaluate_pipeline does, so the submitted
    # values are treated the same way as the ones cross-validation scored.
    predictions = np.clip(fitted.predict(test_fe), 1, None)
    submission = pd.DataFrame({"Id": test_fe.index, "SalePrice": predictions})
    output_path = SUBMISSIONS_DIR / f"{model_name}_submission.csv"
    submission.to_csv(output_path, index=False)
    return output_path


In [29]:
# Fit the four best-ranked models on all training data and write one submission file each.
submission_candidates = results_df["model"].head(4).tolist()
submission_paths = {
    candidate: train_and_save(candidate, model_registry[candidate])
    for candidate in submission_candidates
}
submission_paths


{'Bagging_HGB': PosixPath('../data/Bagging_HGB_submission.csv'),
 'Voting_TopTrees': PosixPath('../data/Voting_TopTrees_submission.csv'),
 'GBR_depth3': PosixPath('../data/GBR_depth3_submission.csv'),
 'GBR_depth4': PosixPath('../data/GBR_depth4_submission.csv')}

In [30]:
def blend_and_save(blend_name: str, model_names: List[str], weights: Optional[List[float]] = None) -> Path:
    """Average the test predictions of several registered models and write a submission CSV.

    Each named pipeline from ``model_registry`` is cloned and refitted on the
    full training set.  All inputs are validated before any model is fitted,
    so a bad argument does not waste a training run.

    Parameters
    ----------
    blend_name : str
        Used to build the output file name ``<blend_name>_submission.csv``.
    model_names : List[str]
        Keys of ``model_registry`` to blend; must not be empty.
    weights : Optional[List[float]]
        One weight per model, normalised to sum to 1.  ``None`` means a plain mean.

    Returns
    -------
    Path
        Location of the written CSV (columns ``Id`` and ``SalePrice``).

    Raises
    ------
    ValueError
        If ``weights`` and ``model_names`` differ in length, ``model_names`` is
        empty, or the weights do not have a finite, non-zero sum.
    KeyError
        If a name is not present in ``model_registry``.
    """
    if weights is not None and len(weights) != len(model_names):
        raise ValueError("weights must match the number of models")
    if not model_names:
        raise ValueError("model_names must contain at least one model")

    unknown = [name for name in model_names if name not in model_registry]
    if unknown:
        raise KeyError(f"Unknown model name(s): {unknown}")

    normalized = None
    if weights is not None:
        raw = np.asarray(weights, dtype=float)
        weight_sum = raw.sum()
        # A zero or non-finite sum would otherwise turn every blended value into NaN/inf.
        if not np.isfinite(weight_sum) or weight_sum == 0:
            raise ValueError("weights must have a finite, non-zero sum")
        normalized = raw / weight_sum

    preds = []
    for name in model_names:
        fitted = clone(model_registry[name])
        fitted.fit(X_fe, y)
        # Floor at 1 exactly as evaluate_pipeline does before scoring.
        preds.append(np.clip(fitted.predict(test_fe), 1, None))

    stacked = np.vstack(preds)  # one row per model, one column per test row
    if normalized is None:
        blended = stacked.mean(axis=0)
    else:
        blended = np.average(stacked, axis=0, weights=normalized)

    blend_path = SUBMISSIONS_DIR / f"{blend_name}_submission.csv"
    pd.DataFrame({"Id": test_fe.index, "SalePrice": blended}).to_csv(blend_path, index=False)
    return blend_path


In [31]:
# Equal-weight blend of the three best-ranked single models.
top3_blend_path = blend_and_save("blend_top3", submission_candidates[:3])
# Weighted blend of all four candidates; better-ranked models get larger weights.
weighted_blend_path = blend_and_save("blend_weighted", submission_candidates, weights=[0.4, 0.3, 0.2, 0.1])
top3_blend_path, weighted_blend_path


(PosixPath('../data/blend_top3_submission.csv'),
 PosixPath('../data/blend_weighted_submission.csv'))

In [42]:
# !kaggle competitions submit \
#     -c house-prices-advanced-regression-techniques \
#     -f "../data/{best_log_model}_submission.csv" \
#     -m "Final model - log metric ({best_log_model})"

# !kaggle competitions submit \
#     -c house-prices-advanced-regression-techniques \
#     -f "../data/{best_rmse_model}_submission.csv" \
#     -m "Final model - rmse metric ({best_rmse_model})"

# for candidate in submission_candidates:
#     print(candidate)
#     !kaggle competitions submit \
#         -c house-prices-advanced-regression-techniques \
#         -f "../data/{candidate}_submission.csv" \
#         -m "Final model - single run ({candidate})"

# !kaggle competitions submit \
#     -c house-prices-advanced-regression-techniques \
#     -f "../data/blend_top3_submission.csv" \
#     -m "Final model - blended top3"

# !kaggle competitions submit \
#     -c house-prices-advanced-regression-techniques \
#     -f "../data/blend_weighted_submission.csv" \
#     -m "Final model - weighted blend"



![](../images/final.png)

# Final placement: 1174 / 6075 (roughly the top 20%)

![](../images/placement.png)