# House Prices Baseline Model

This notebook establishes a baseline model and tests the end-to-end ML pipeline including preprocessing, feature engineering, and model training.

## 1. Import Dependencies

In [1]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

## 2. Load and Prepare Data

In [2]:
train_df = pd.read_csv(
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/HOUSE PRICES - ADVANCED REGRESSION TECHNIQUES/data/02-preprocessed/train_preprocessed.csv"
)
test_df = pd.read_csv(
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/HOUSE PRICES - ADVANCED REGRESSION TECHNIQUES/data/02-preprocessed/test_preprocessed.csv"
)

print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

Train set shape: (1439, 81)
Test set shape: (1459, 80)


## 3. Remove Outliers

In [3]:
outlier_indices = [
    598,
    955,
    935,
    1299,
    250,
    314,
    336,
    707,
    379,
    1183,
    692,
    186,
    441,
    524,
    739,
    636,
    1062,
    1191,
    496,
    198,
    1338,
]
train_df = train_df[train_df.Id.isin(outlier_indices) == False]
print(f"Train set after outlier removal: {train_df.shape}")

Train set after outlier removal: (1439, 81)


## 4. Handle Missing Values

In [4]:
categorical_fillna = [
    "Alley",
    "Fence",
    "MasVnrType",
    "FireplaceQu",
    "GarageCond",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "BsmtExposure",
    "BsmtQual",
    "BsmtCond",
    "BsmtFinType2",
]
for col in categorical_fillna:
    train_df[col].fillna("No", inplace=True)
    test_df[col].fillna("No", inplace=True)

train_df["BsmtFinType1"].fillna("Unf", inplace=True)
test_df["BsmtFinType1"].fillna("Unf", inplace=True)
train_df["Electrical"].fillna("SBrkr", inplace=True)
test_df["Electrical"].fillna("SBrkr", inplace=True)
train_df["LotFrontage"].fillna(0, inplace=True)
test_df["LotFrontage"].fillna(0, inplace=True)
train_df["MasVnrArea"].fillna(0, inplace=True)
test_df["MasVnrArea"].fillna(0, inplace=True)

print(f"Missing values handled")

Missing values handled


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna('No', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna('No', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behav

## 5. Feature Engineering

In [5]:
columns_to_drop = [
    "PoolQC",
    "MiscFeature",
    "Alley",
    "Fence",
    "GarageYrBlt",
    "GarageCond",
    "BsmtFinType2",
]
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

train_df["houseage"] = train_df["YrSold"] - train_df["YearBuilt"]
test_df["houseage"] = test_df["YrSold"] - test_df["YearBuilt"]
train_df["houseremodelage"] = train_df["YrSold"] - train_df["YearRemodAdd"]
test_df["houseremodelage"] = test_df["YrSold"] - test_df["YearRemodAdd"]
train_df["totalsf"] = (
    train_df["1stFlrSF"] + train_df["2ndFlrSF"] + train_df["BsmtFinSF1"] + train_df["BsmtFinSF2"]
)
test_df["totalsf"] = (
    test_df["1stFlrSF"] + test_df["2ndFlrSF"] + test_df["BsmtFinSF1"] + test_df["BsmtFinSF2"]
)
train_df["totalbaths"] = (
    train_df["BsmtFullBath"]
    + train_df["FullBath"]
    + 0.5 * (train_df["BsmtHalfBath"] + train_df["HalfBath"])
)
test_df["totalbaths"] = (
    test_df["BsmtFullBath"]
    + test_df["FullBath"]
    + 0.5 * (test_df["BsmtHalfBath"] + test_df["HalfBath"])
)
train_df["totalporchsf"] = (
    train_df["OpenPorchSF"]
    + train_df["3SsnPorch"]
    + train_df["EnclosedPorch"]
    + train_df["ScreenPorch"]
    + train_df["WoodDeckSF"]
)
test_df["totalporchsf"] = (
    test_df["OpenPorchSF"]
    + test_df["3SsnPorch"]
    + test_df["EnclosedPorch"]
    + test_df["ScreenPorch"]
    + test_df["WoodDeckSF"]
)

columns_to_drop_fe = [
    "Id",
    "YrSold",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "GrLivArea",
    "TotalBsmtSF",
    "BsmtFullBath",
    "FullBath",
    "BsmtHalfBath",
    "HalfBath",
    "OpenPorchSF",
    "3SsnPorch",
    "EnclosedPorch",
    "ScreenPorch",
    "WoodDeckSF",
]
train_df = train_df.drop(columns=columns_to_drop_fe)
test_id = test_df["Id"].copy()
test_df = test_df.drop(columns=[col for col in columns_to_drop_fe if col in test_df.columns])

if "GarageArea" in train_df.columns:
    train_df = train_df.drop(columns=["GarageArea"])
if "GarageArea" in test_df.columns:
    test_df = test_df.drop(columns=["GarageArea"])

print("Features engineered")

Features engineered


## 5.1 Save Engineered Features

In [6]:
train_df.to_csv(
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/HOUSE PRICES - ADVANCED REGRESSION TECHNIQUES/data/03-features/train_features.csv",
    index=False,
)
test_df.to_csv(
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/HOUSE PRICES - ADVANCED REGRESSION TECHNIQUES/data/03-features/test_features.csv",
    index=False,
)
print("Engineered features saved to 03-features folder")

Engineered features saved to 03-features folder


## 6. Target Variable Transformation

In [7]:
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])
print("SalePrice log-transformed")

SalePrice log-transformed


## 7. Define Feature Groups

In [8]:
ordinal_encoder_cols = [
    "LotShape",
    "LandContour",
    "Utilities",
    "LandSlope",
    "BsmtQual",
    "BsmtFinType1",
    "CentralAir",
    "Functional",
    "FireplaceQu",
    "GarageFinish",
    "GarageQual",
    "PavedDrive",
    "ExterCond",
    "KitchenQual",
    "BsmtExposure",
    "HeatingQC",
    "ExterQual",
    "BsmtCond",
]
one_hot_encoder_cols = [
    "Street",
    "LotConfig",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "RoofStyle",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "Foundation",
    "Electrical",
    "SaleType",
    "MSZoning",
    "SaleCondition",
    "Heating",
    "GarageType",
    "RoofMatl",
]
numeric_cols = train_df.select_dtypes(include=["int64", "float64"]).columns.drop("SalePrice")
print(
    f"Numeric: {len(numeric_cols)}, Ordinal: {len(ordinal_encoder_cols)}, OneHot: {len(one_hot_encoder_cols)}"
)

Numeric: 21, Ordinal: 18, OneHot: 19


## 8. Create Preprocessing Pipelines

In [9]:
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)
ordinal_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)
onehot_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)
col_transformer = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, numeric_cols),
        ("ordinal", ordinal_pipeline, ordinal_encoder_cols),
        ("onehot", onehot_pipeline, one_hot_encoder_cols),
    ],
    remainder="passthrough",
    n_jobs=-1,
)
print("Pipelines created")

Pipelines created


## 9. Prepare Data

In [10]:
X = train_df.drop("SalePrice", axis=1)
y = train_df["SalePrice"]
X_preprocessed = col_transformer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=25
)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")

X_train: (1151, 193), X_test: (288, 193)


## 10. Baseline Model

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print(f"Linear Regression RMSE: {rmse_lr:.6f}")

Linear Regression RMSE: 0.133129


## 11. Random Forest Tuning

In [12]:
rfr = RandomForestRegressor(random_state=13)
param_grid_rfr = {
    "max_depth": [5, 10, 15],
    "n_estimators": [100, 250, 500],
    "min_samples_split": [3, 5, 10],
}
rfr_cv = GridSearchCV(rfr, param_grid_rfr, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
rfr_cv.fit(X_train, y_train)
rmse_rfr = np.sqrt(-1 * rfr_cv.best_score_)
print(f"RF RMSE: {rmse_rfr:.6f}")

RF RMSE: 0.136380


## 12. Gradient Boosting Tuning

In [13]:
gbr = GradientBoostingRegressor(random_state=13)
param_grid_gbr = {
    "max_depth": [3, 5, 7],
    "n_estimators": [100, 300, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "min_samples_leaf": [5, 10, 20],
}
gbr_cv = GridSearchCV(gbr, param_grid_gbr, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
gbr_cv.fit(X_train, y_train)
rmse_gbr = np.sqrt(-1 * gbr_cv.best_score_)
print(f"GBR RMSE: {rmse_gbr:.6f}")

GBR RMSE: 0.118121


## 13. XGBoost Tuning

In [14]:
xgb = XGBRegressor(random_state=13)
param_grid_xgb = {
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 300, 500],
    "max_depth": [3, 5, 7],
    "subsample": [0.8, 0.9, 1.0],
}
xgb_cv = GridSearchCV(xgb, param_grid_xgb, cv=3, scoring="neg_mean_squared_error", n_jobs=-1)
xgb_cv.fit(X_train, y_train)
rmse_xgb = np.sqrt(-1 * xgb_cv.best_score_)
print(f"XGB RMSE: {rmse_xgb:.6f}")

XGB RMSE: 0.119380


## 14. Voting Regressor

In [15]:
vr = VotingRegressor(
    [("gbr", gbr_cv.best_estimator_), ("xgb", xgb_cv.best_estimator_)], weights=[1, 1]
)
vr.fit(X_train, y_train)
y_pred_vr = vr.predict(X_test)
rmse_vr = np.sqrt(mean_squared_error(y_test, y_pred_vr))
print(f"Voting RMSE: {rmse_vr:.6f}")

Voting RMSE: 0.121672


## 15. Stacking Regressor

In [16]:
estimators = [
    ("gbr", gbr_cv.best_estimator_),
    ("xgb", xgb_cv.best_estimator_),
    ("rfr", rfr_cv.best_estimator_),
]
stackreg = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=1.0), cv=5)
stackreg.fit(X_train, y_train)
y_pred_stack = stackreg.predict(X_test)
rmse_stack = np.sqrt(mean_squared_error(y_test, y_pred_stack))
print(f"Stacking RMSE: {rmse_stack:.6f}")

Stacking RMSE: 0.121442


## 16. Model Comparison

In [17]:
results = pd.DataFrame(
    {
        "Model": [
            "Linear Regression",
            "Random Forest",
            "Gradient Boosting",
            "XGBoost",
            "Voting",
            "Stacking",
        ],
        "RMSE": [rmse_lr, rmse_rfr, rmse_gbr, rmse_xgb, rmse_vr, rmse_stack],
    }
)
results = results.sort_values("RMSE").reset_index(drop=True)
print("\\nModel Comparison:")
print(results)

\nModel Comparison:
               Model      RMSE
0  Gradient Boosting  0.118121
1            XGBoost  0.119380
2           Stacking  0.121442
3             Voting  0.121672
4  Linear Regression  0.133129
5      Random Forest  0.136380


## 17. Generate Predictions

In [18]:
X_test_preprocessed = col_transformer.transform(test_df)
y_pred_test = stackreg.predict(X_test_preprocessed)
y_pred_test = np.exp(y_pred_test) - 1
print(f"Predictions shape: {y_pred_test.shape}")

Predictions shape: (1459,)


## 18. Create Submission

In [19]:
submission = pd.DataFrame({"Id": test_id, "SalePrice": y_pred_test})
submission.to_csv(
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/HOUSE PRICES - ADVANCED REGRESSION TECHNIQUES/data/04-predictions/submission.csv",
    index=False,
)
print("Submission saved")
print(submission.head())

Submission saved
     Id      SalePrice
0  1461  128310.204680
1  1462  168082.754369
2  1463  175835.094094
3  1464  196900.160106
4  1465  189916.623593
