# House Prices — Model Training, Cross-Validation, and Best Model Selection

**Goal:** Build a leakage-safe preprocessing pipeline, train multiple regression models, compare them using CV RMSE, select the best model, and save artifacts for GitHub.


In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, cross_val_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import joblib


In [20]:
DATA_DIR = Path("../data/raw")
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

target = "SalePrice"

X = train.drop(columns=[target]).copy()
y = train[target].copy()

print("Train:", train.shape, "Test:", test.shape)


Train: (1460, 81) Test: (1459, 80)


In [None]:
# House Prices is typically right-skewed.
# Using log1p stabilizes variance and improves linear model performance.
y_log = np.log1p(y)
#Remove Id from features It’s an identifier, not a real predictive feature.
X = train.drop(columns=[target, "Id"], errors="ignore")
test_X = test.drop(columns=["Id"], errors="ignore")
print("Features shape:", X.shape)

In [22]:
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric:", len(numeric_cols), "Categorical:", len(categorical_cols))


Numeric: 37 Categorical: 43


In [23]:
# Numeric: median imputation
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

# Categorical: most-frequent imputation + one-hot
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)


In [24]:
# We use RMSE for regression; sklearn uses "neg_root_mean_squared_error"
# so we negate it back to positive RMSE.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def cv_rmse(model):
    pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])
    scores = cross_val_score(pipe, X, y_log, scoring="neg_root_mean_squared_error", cv=cv)
    rmse = -scores
    return float(rmse.mean()), float(rmse.std())


In [25]:
models = {
    "LinearRegression": LinearRegression(),
    "Ridge(alpha=10)": Ridge(alpha=10, random_state=42),
    "Lasso(alpha=0.0005)": Lasso(alpha=0.0005, random_state=42, max_iter=10000),
    "RandomForest(n=400)": RandomForestRegressor(
        n_estimators=400, random_state=42, n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}


In [26]:
results = []

for name, model in models.items():
    mean_rmse, std_rmse = cv_rmse(model)
    results.append({"model": name, "rmse_mean": mean_rmse, "rmse_std": std_rmse})
    print(f"{name:20s} RMSE: {mean_rmse:.5f} ± {std_rmse:.5f}")

results_df = pd.DataFrame(results).sort_values("rmse_mean")
results_df


LinearRegression     RMSE: 0.15275 ± 0.03508
Ridge(alpha=10)      RMSE: 0.19657 ± 0.04509
Lasso(alpha=0.0005)  RMSE: 0.14157 ± 0.04396
RandomForest(n=400)  RMSE: 0.14506 ± 0.01925
GradientBoosting     RMSE: 0.13405 ± 0.02019


Unnamed: 0,model,rmse_mean,rmse_std
4,GradientBoosting,0.134053,0.020193
2,Lasso(alpha=0.0005),0.14157,0.043963
3,RandomForest(n=400),0.145063,0.01925
0,LinearRegression,0.152754,0.035083
1,Ridge(alpha=10),0.196569,0.045089


In [27]:
best_row = results_df.iloc[0]
best_name = best_row["model"]
best_model = models[best_name]

print("Best model:", best_name)
print("CV RMSE:", best_row["rmse_mean"], "±", best_row["rmse_std"])

best_pipeline = Pipeline(steps=[("preprocess", preprocess), ("model", best_model)])
best_pipeline.fit(X, y_log)


Best model: GradientBoosting
CV RMSE: 0.13405294633631998 ± 0.020192847766544102


Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'LowQualFinSF',
                                                   'GrLivArea', 'BsmtF...
                                 

In [28]:
OUT_DIR = Path("../outputs")
REPORT_DIR = Path("../reports")
OUT_DIR.mkdir(exist_ok=True)
REPORT_DIR.mkdir(exist_ok=True)

# Save model
model_path = OUT_DIR / "model.joblib"
joblib.dump(best_pipeline, model_path)

# Save metrics report
report = {
    "best_model": best_name,
    "cv_rmse_mean": float(best_row["rmse_mean"]),
    "cv_rmse_std": float(best_row["rmse_std"]),
    "all_models": results,
    "target_transform": "log1p(SalePrice)",
    "cv": {"n_splits": 5, "shuffle": True, "random_state": 42},
}

metrics_path = REPORT_DIR / "metrics.json"
metrics_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

print("Saved model:", model_path)
print("Saved metrics:", metrics_path)


Saved model: ..\outputs\model.joblib
Saved metrics: ..\reports\metrics.json
