In [74]:
import os
import json

# Read the Kaggle API credentials from the local secrets file and export them
# as environment variables (the `kaggle` CLI call at the end of the notebook is
# expected to pick these up -- confirm against the CLI documentation).
with open("../.secrets/kaggle.json") as secrets_file:
    creds = json.load(secrets_file)

for env_var, cred_field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_var] = creds[cred_field]

In [75]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler

# Seed shared by every stochastic step (train/validation split, PCA, models).
RANDOM_STATE = 42

# Data locations, relative to the notebook's working directory.
DATA_DIR = Path("..") / "data"
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")


In [76]:
# Load both splits with the "Id" column as the row index.
train_df = pd.read_csv(TRAIN_PATH, index_col="Id")
test_df = pd.read_csv(TEST_PATH, index_col="Id")

# Separate the predictors from the regression target.
X = train_df.drop(columns=["SalePrice"])
y = train_df["SalePrice"]

train_df.shape, test_df.shape


((1460, 80), (1459, 79))

In [77]:
def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with twelve derived columns appended.

    Added columns:

    * ``TotalSF``      -- TotalBsmtSF + 1stFlrSF + 2ndFlrSF
    * ``TotalBath``    -- FullBath + BsmtFullBath + 0.5 * (HalfBath + BsmtHalfBath)
    * ``AgeAtSale``    -- YrSold - YearBuilt
    * ``SinceRemodel`` -- YrSold - YearRemodAdd
    * ``IsRemodeled``  -- 1 where YearBuilt != YearRemodAdd, else 0
    * ``TotalPorchSF`` -- OpenPorchSF + EnclosedPorch + 3SsnPorch + ScreenPorch
    * ``HasPool`` / ``HasGarage`` / ``HasFireplace`` -- 1 where the matching
      PoolArea / GarageArea / Fireplaces value is > 0, else 0
    * ``LotRatio``     -- GrLivArea / (LotArea + 1e-6)
    * ``LogLotArea`` / ``LogGrLivArea`` -- ``log1p`` of the raw column

    A source column that is absent from ``df`` is replaced by a constant
    column (0 everywhere, except ``LotArea`` in the ``LotRatio`` denominator,
    which falls back to 1). Values of columns that are present are used
    unchanged; no NaN filling happens here.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the derived columns added.
    """
    result = df.copy()

    def col(name: str, default: float = 0) -> pd.Series:
        # Always hand back a Series aligned to the frame's index. The previous
        # `df.get(name, 0)` returned a bare Python scalar when the column was
        # absent, so `(0 > 0).astype(int)` raised AttributeError for the
        # Has* / IsRemodeled flags.
        if name in df.columns:
            return df[name]
        return pd.Series(default, index=df.index)

    result["TotalSF"] = col("TotalBsmtSF") + col("1stFlrSF") + col("2ndFlrSF")
    result["TotalBath"] = (
        col("FullBath")
        + 0.5 * col("HalfBath")
        + col("BsmtFullBath")
        + 0.5 * col("BsmtHalfBath")
    )
    result["AgeAtSale"] = col("YrSold") - col("YearBuilt")
    result["SinceRemodel"] = col("YrSold") - col("YearRemodAdd")
    result["IsRemodeled"] = (col("YearBuilt") != col("YearRemodAdd")).astype(int)
    result["TotalPorchSF"] = (
        col("OpenPorchSF")
        + col("EnclosedPorch")
        + col("3SsnPorch")
        + col("ScreenPorch")
    )
    result["HasPool"] = (col("PoolArea") > 0).astype(int)
    result["HasGarage"] = (col("GarageArea") > 0).astype(int)
    result["HasFireplace"] = (col("Fireplaces") > 0).astype(int)
    # The small epsilon keeps the ratio finite when LotArea is 0.
    result["LotRatio"] = col("GrLivArea") / (col("LotArea", 1) + 1e-6)
    result["LogLotArea"] = np.log1p(col("LotArea"))
    result["LogGrLivArea"] = np.log1p(col("GrLivArea"))
    return result

# Run the same feature engineering over the training predictors and the test set.
X_fe, test_fe = (add_engineered_features(frame) for frame in (X, test_df))

X_fe.shape, test_fe.shape


((1460, 91), (1459, 91))

In [78]:
# Partition the columns by dtype; each group gets its own preprocessing branch.
numeric_features = list(X_fe.select_dtypes(include=[np.number]).columns)
categorical_features = list(X_fe.select_dtypes(exclude=[np.number]).columns)

# Hold out 20% of the rows for validation, seeded so the split is repeatable.
split_parts = train_test_split(X_fe, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_valid, y_train, y_valid = split_parts

len(numeric_features), len(categorical_features), X_train.shape, X_valid.shape


(48, 43, (1168, 91), (292, 91))

In [79]:
# Numeric branch: median imputation, Yeo-Johnson power transform, standardisation.
numeric_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("power", PowerTransformer(method="yeo-johnson")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; categories unseen at fit time are ignored at transform time.
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

# Baseline HistGradientBoosting hyper-parameters shared by both variants below.
gb_params = {
    "max_depth": 6,
    "learning_rate": 0.05,
    "max_iter": 400,
    "min_samples_leaf": 15,
    "l2_regularization": 0.1,
    "random_state": RANDOM_STATE,
}

pca_step = PCA(n_components=120, svd_solver="randomized", random_state=RANDOM_STATE)

# Two baselines that differ only in the optional PCA step before the model.
pipelines = {
    "HGB_no_pca": Pipeline(
        [
            ("preprocess", preprocessor),
            ("model", HistGradientBoostingRegressor(**gb_params)),
        ]
    ),
    "HGB_with_pca": Pipeline(
        [
            ("preprocess", preprocessor),
            ("dim_red", pca_step),
            ("model", HistGradientBoostingRegressor(**gb_params)),
        ]
    ),
}

results = []
trained_pipelines = {}

y_valid_log = np.log(y_valid)
for name, pipe in pipelines.items():
    # Fit a clone so the templates stored in `pipelines` stay unfitted.
    fitted_pipe = clone(pipe).fit(X_train, y_train)
    preds = fitted_pipe.predict(X_valid)
    mse = mean_squared_error(y_valid, preds)
    # Floor predictions at 1 so the logarithm is always defined.
    log_mse = mean_squared_error(y_valid_log, np.log(np.clip(preds, 1, None)))
    results.append(
        {
            "model": name,
            "rmse": float(np.sqrt(mse)),
            "log_rmse": float(np.sqrt(log_mse)),
            "r2": float(r2_score(y_valid, preds)),
        }
    )
    trained_pipelines[name] = fitted_pipe

# Leaderboard ordered by log-RMSE, with plain RMSE as the tie-breaker.
results_df = pd.DataFrame(results).sort_values(
    by=["log_rmse", "rmse"], ignore_index=True
)
results_df


Unnamed: 0,model,rmse,log_rmse,r2
0,HGB_no_pca,28968.920459,0.141854,0.890592
1,HGB_with_pca,31789.077158,0.143903,0.868253


In [80]:
additional_pipelines = {}

# Tuned HistGradientBoosting variants
hgb_grid = [
    {"max_depth": 5, "learning_rate": 0.06, "max_iter": 450, "min_samples_leaf": 18, "l2_regularization": 0.05},
    {"max_depth": 4, "learning_rate": 0.08, "max_iter": 500, "min_samples_leaf": 12, "l2_regularization": 0.2},
]

# Random forest candidates
rf_grid = [
    {"n_estimators": 800, "max_depth": None, "max_features": "sqrt", "min_samples_leaf": 1},
    {"n_estimators": 600, "max_depth": 28, "max_features": 0.4, "min_samples_leaf": 2},
]

# Classic GradientBoostingRegressor settings
cgb_grid = [
    {"n_estimators": 1500, "learning_rate": 0.05, "max_depth": 3, "subsample": 0.85, "min_samples_leaf": 3},
    {"n_estimators": 1000, "learning_rate": 0.07, "max_depth": 4, "subsample": 0.9, "min_samples_leaf": 4},
]

# Elastic Net baselines (linear with regularisation)
# NOTE(review): the recorded run printed a warning from the coordinate-descent
# solver for this family; consider a larger max_iter or alpha -- confirm.
elastic_grid = [
    {"alpha": 0.001, "l1_ratio": 0.1},
    {"alpha": 0.01, "l1_ratio": 0.5},
]

# One row per model family: (label prefix, estimator class, fixed kwargs, grid).
# A single loop replaces the four copy-pasted construction loops; labels and
# insertion order are unchanged (HGB_tuned_*, RF_*, GBR_*, ElasticNet_*).
candidate_families = [
    ("HGB_tuned", HistGradientBoostingRegressor, {}, hgb_grid),
    ("RF", RandomForestRegressor, {"n_jobs": -1}, rf_grid),
    ("GBR", GradientBoostingRegressor, {}, cgb_grid),
    ("ElasticNet", ElasticNet, {"max_iter": 5000}, elastic_grid),
]
for prefix, estimator_cls, fixed_kwargs, grid in candidate_families:
    for idx, params in enumerate(grid, start=1):
        estimator = estimator_cls(random_state=RANDOM_STATE, **fixed_kwargs, **params)
        additional_pipelines[f"{prefix}_{idx}"] = Pipeline(
            steps=[("preprocess", preprocessor), ("model", estimator)]
        )

pipelines.update(additional_pipelines)

# `results` was created by an earlier cell and is appended to here. Drop any
# rows left by a previous execution of this cell so that re-running it does
# not duplicate models in the leaderboard.
results = [row for row in results if row["model"] not in additional_pipelines]

for name, pipe in additional_pipelines.items():
    # Fit a clone so the templates stored in `pipelines` stay unfitted.
    fitted_pipe = clone(pipe)
    fitted_pipe.fit(X_train, y_train)
    preds = fitted_pipe.predict(X_valid)
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds)))
    r2 = float(r2_score(y_valid, preds))
    # Floor predictions at 1 so the logarithm is always defined.
    safe_preds = np.clip(preds, 1, None)
    log_rmse = float(
        np.sqrt(mean_squared_error(np.log(y_valid), np.log(safe_preds)))
    )
    results.append({"model": name, "rmse": rmse, "log_rmse": log_rmse, "r2": r2})
    trained_pipelines[name] = fitted_pipe

# Leaderboard ordered by log-RMSE, with plain RMSE as the tie-breaker.
results_df = (
    pd.DataFrame(results)
    .sort_values(["log_rmse", "rmse"])
    .reset_index(drop=True)
)
results_df


  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,model,rmse,log_rmse,r2
0,HGB_tuned_2,28420.980764,0.137322,0.894691
1,GBR_1,25625.298579,0.137922,0.91439
2,HGB_tuned_1,28334.386534,0.140462,0.895332
3,GBR_2,25574.928586,0.140558,0.914726
4,HGB_no_pca,28968.920459,0.141854,0.890592
5,HGB_with_pca,31789.077158,0.143903,0.868253
6,RF_2,28915.40102,0.151315,0.890995
7,RF_1,31001.461463,0.159115,0.8747
8,ElasticNet_2,31728.894687,0.176556,0.868751
9,ElasticNet_1,31288.285083,0.206648,0.872371


In [81]:
# Same leaderboard, with log-RMSE shown first.
kaggle_columns = ["model", "log_rmse", "rmse", "r2"]
results_kaggle = results_df.sort_values(by=["log_rmse", "rmse"], ignore_index=True)
results_kaggle[kaggle_columns]


Unnamed: 0,model,log_rmse,rmse,r2
0,HGB_tuned_2,0.137322,28420.980764,0.894691
1,GBR_1,0.137922,25625.298579,0.91439
2,HGB_tuned_1,0.140462,28334.386534,0.895332
3,GBR_2,0.140558,25574.928586,0.914726
4,HGB_no_pca,0.141854,28968.920459,0.890592
5,HGB_with_pca,0.143903,31789.077158,0.868253
6,RF_2,0.151315,28915.40102,0.890995
7,RF_1,0.159115,31001.461463,0.8747
8,ElasticNet_2,0.176556,31728.894687,0.868751
9,ElasticNet_1,0.206648,31288.285083,0.872371


In [82]:
def train_and_save(model_name: str, pipeline: Pipeline) -> Path:
    """Refit ``pipeline`` on the full training set and write a submission CSV.

    A clone of ``pipeline`` is fitted on the notebook-level ``X_fe`` / ``y``,
    used to predict ``test_fe``, and the result is written to
    ``DATA_DIR / "<model_name>_submission.csv"`` with the columns ``Id`` and
    ``SalePrice``. The ``pipeline`` passed in is left unfitted.

    Parameters
    ----------
    model_name : str
        Label used as the filename prefix.
    pipeline : Pipeline
        Estimator template to clone and fit.

    Returns
    -------
    Path
        Location of the CSV that was written.
    """
    full_pipeline = clone(pipeline)
    full_pipeline.fit(X_fe, y)
    # Floor predictions at 1, mirroring the clip applied before the log in the
    # validation cells, so the submitted values are the same ones that the
    # log-RMSE used for model selection would score.
    predictions = np.clip(full_pipeline.predict(test_fe), 1, None)
    submission_df = pd.DataFrame({"Id": test_fe.index, "SalePrice": predictions})
    submission_path = DATA_DIR / f"{model_name}_submission.csv"
    submission_df.to_csv(submission_path, index=False)
    return submission_path


In [83]:
# Select the winners by sorting explicitly, rather than trusting whatever row
# order an earlier cell left `results_df` in.
best_by_log = results_df.sort_values(["log_rmse", "rmse"]).iloc[0]
best_by_rmse = results_df.sort_values("rmse").iloc[0]

best_log_name = best_by_log["model"]
best_rmse_name = best_by_rmse["model"]

log_submission_path = train_and_save(best_log_name, pipelines[best_log_name])
# When both metrics pick the same model, the second refit would only rewrite
# the same file, so reuse the path instead of training again.
rmse_submission_path = (
    log_submission_path
    if best_rmse_name == best_log_name
    else train_and_save(best_rmse_name, pipelines[best_rmse_name])
)

{
    "best_log_model": best_log_name,
    "log_submission": log_submission_path,
    "best_rmse_model": best_rmse_name,
    "rmse_submission": rmse_submission_path,
}


{'best_log_model': 'HGB_tuned_2',
 'log_submission': PosixPath('../data/HGB_tuned_2_submission.csv'),
 'best_rmse_model': 'GBR_2',
 'rmse_submission': PosixPath('../data/GBR_2_submission.csv')}

In [84]:
# Example: submit the log-optimised model
!kaggle competitions submit \
    -c house-prices-advanced-regression-techniques \
    -f "../data/{best_log_name}_submission.csv" \
    -m "Improved model ({best_log_name})"

# Example: submit the RMSE-optimised model
!kaggle competitions submit \
    -c house-prices-advanced-regression-techniques \
    -f "../data/{best_rmse_name}_submission.csv" \
    -m "Improved model ({best_rmse_name})"



100%|██████████████████████████████████████| 33.7k/33.7k [00:00<00:00, 69.2kB/s]
100%|██████████████████████████████████████| 33.7k/33.7k [00:00<00:00, 72.2kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques

![](../images/tuned.png)