In [170]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [120]:
# Both CSVs are read with keep_default_na=False so pandas does not turn its
# default missing-value markers (such as the text "NA") into NaN; the ordinal
# maps used later in this notebook have an explicit 'NA' key.
df_train, df_test = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in ('../data/train_cleaned.csv', '../data/test.csv')
)

In [121]:
# Force TotalBsmtSF to a numeric dtype: values that cannot be parsed become
# NaN (errors="coerce") and are then replaced by 0.
df_test["TotalBsmtSF"] = pd.to_numeric(df_test["TotalBsmtSF"], errors="coerce").fillna(0)


In [123]:
# Test-set columns that must be numeric before feature engineering.
cols_should_be_numeric = [
    "GarageArea",
    "GarageCars",
    "BsmtUnfSF",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "LotFrontage",
    "MasVnrArea",
]

# Coerce each column (unparseable entries -> NaN) and fill the gaps with 0.
for col in cols_should_be_numeric:
    df_test[col] = pd.to_numeric(df_test[col], errors="coerce").fillna(0)


In [None]:
# Model input columns, in the order they are fed to the estimators.
selected_features = [
    "OverallQual", "TotalSF", "GrLivArea", "HouseAge",
    "GarageArea", "GarageCars",
    "KitchenQual", "BsmtQual",
    "YearRemodAdd", "LotArea", "TotalBsmtFinSF",
    "LotFrontage", "MasVnrArea", "TotalPorchSF",
]

In [129]:
def build_features(df):
    """Return a copy of ``df`` with four engineered columns added.

    Added (or overwritten, if already present) columns:

    * ``HouseAge``       = ``YrSold - YearBuilt``
    * ``TotalSF``        = ``TotalBsmtSF + 1stFlrSF + 2ndFlrSF``
    * ``TotalPorchSF``   = sum of the four porch columns plus ``WoodDeckSF``
    * ``TotalBsmtFinSF`` = ``BsmtFinSF1 + BsmtFinSF2``

    The input frame itself is left untouched.
    """
    house_age = df["YrSold"] - df["YearBuilt"]
    total_sf = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
    total_porch_sf = (
        df["OpenPorchSF"] + df["EnclosedPorch"] +
        df["3SsnPorch"] + df["ScreenPorch"] + df["WoodDeckSF"]
    )
    total_bsmt_fin_sf = df["BsmtFinSF1"] + df["BsmtFinSF2"]

    # assign() works on a copy and appends the columns in keyword order.
    return df.assign(
        HouseAge=house_age,
        TotalSF=total_sf,
        TotalPorchSF=total_porch_sf,
        TotalBsmtFinSF=total_bsmt_fin_sf,
    )

# Add the engineered columns to both frames, then keep only the model inputs.
df_test, df_train = build_features(df_test), build_features(df_train)
df_test_selected, df_train_selected = df_test[selected_features], df_train[selected_features]

In [197]:
from sklearn.preprocessing import RobustScaler
import numpy as np

def preprocess_for_model(
    df,
    scaler=None,
    fit=False,
    use_scaler=True
):
    """Convert an engineered feature frame into the numeric matrix used by the models.

    Steps, in order: keep the 14 selected features, ordinal-encode
    ``KitchenQual`` and ``BsmtQual``, apply ``log1p`` to the area-like
    columns and, when ``use_scaler`` is true, scale the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every selected feature column. It is not modified.
    scaler : object, optional
        Scaler exposing ``fit_transform`` / ``transform``. When omitted (and
        ``use_scaler`` is true) a new ``RobustScaler`` is created.
    fit : bool, default False
        If true the scaler is fitted on this data (``fit_transform``);
        otherwise only ``transform`` is called.
    use_scaler : bool, default True
        If false, scaling is skipped entirely.

    Returns
    -------
    tuple
        ``(X.values, None)`` when ``use_scaler`` is false, otherwise
        ``(X_scaled, scaler)``.

    Raises
    ------
    KeyError
        If any selected feature column is missing from ``df``.
    """
    selected_features = [
        "OverallQual",
        "TotalSF",
        "GrLivArea",
        "HouseAge",
        "GarageArea",
        "GarageCars",
        "KitchenQual",
        "BsmtQual",
        "YearRemodAdd",
        "LotArea",
        "TotalBsmtFinSF",
        "LotFrontage",
        "MasVnrArea",
        "TotalPorchSF"
    ]

    log_transform_cols = [
        "LotFrontage",
        "LotArea",
        "MasVnrArea",
        "GrLivArea",
        "TotalSF",
        "TotalPorchSF",
        "TotalBsmtFinSF"
    ]

    qual_map = {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}

    # Take an explicit copy of the column subset: assigning into a bare
    # df[list] selection triggers SettingWithCopyWarning on pandas < 3.
    X = df[selected_features].copy()

    X['TotalBsmtFinSF'] = X['TotalBsmtFinSF'].astype(float)

    # Ordinal encoding. Both columns are cast to str first, and categories
    # missing from qual_map become 0 instead of NaN, matching what
    # preprocess_input does at inference time.
    for col in ('KitchenQual', 'BsmtQual'):
        X[col] = X[col].astype(str).map(qual_map).fillna(0).astype(float)

    for col in log_transform_cols:
        X[col] = np.log1p(X[col])

    if not use_scaler:
        return X.values, None

    if scaler is None:
        scaler = RobustScaler()

    if fit:
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)

    return X_scaled, scaler


In [176]:
# Target is modelled in log space: log1p(SalePrice).
X, y = df_train_selected, np.log1p(df_train["SalePrice"])

# 80/20 hold-out split with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training fold only and reused on the validation fold.
X_tr_scaled, scaler = preprocess_for_model(X_tr, fit=True)
X_val_scaled, _ = preprocess_for_model(X_val, scaler=scaler, fit=False)

In [177]:
# Candidate regressors, all trained on the scaled training fold.
models = {}
models["LR"] = LinearRegression()
models["EN"] = ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42)
models["XGB"] = XGBRegressor(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# y_val is in log space; convert it back to prices once, outside the loop.
y_true = np.expm1(y_val)

for name, model in models.items():
    model.fit(X_tr_scaled, y_tr)

    # Log-space predictions are bounded to [0, 14] before being inverted with expm1.
    preds_log = np.clip(model.predict(X_val_scaled), 0, 14)
    preds = np.expm1(preds_log)

    rmse = np.sqrt(mean_squared_error(y_true, preds))
    print(f"{name} RMSE: {rmse:.2f}")

LR RMSE: 31526.83
EN RMSE: 31527.11
XGB RMSE: 27889.87


In [178]:
# Pull the fitted XGB estimator out of the comparison dict and persist it
# together with the scaler that was fitted on the training fold.
XGB = models["XGB"]

joblib.dump(value=XGB, filename="../models/model_xgb.pkl")
joblib.dump(value=scaler, filename="../models/scaler.pkl")

['../models/scaler.pkl']

In [179]:
# Preprocessing settings saved next to the model: which columns to keep,
# which ones get log1p, and the ordinal scale for the quality columns.
preprocess_config = dict(
    selected_features=[
        "OverallQual", "TotalSF", "GrLivArea", "HouseAge",
        "GarageArea", "GarageCars", "KitchenQual", "BsmtQual",
        "YearRemodAdd", "LotArea", "TotalBsmtFinSF",
        "LotFrontage", "MasVnrArea", "TotalPorchSF",
    ],
    log_transform_cols=[
        "LotFrontage", "LotArea", "MasVnrArea",
        "GrLivArea", "TotalSF", "TotalPorchSF", "TotalBsmtFinSF",
    ],
    # Both quality columns share the same scale; each gets its own dict.
    ordinal_maps={
        column: {
            grade: rank
            for rank, grade in enumerate(('NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'))
        }
        for column in ("KitchenQual", "BsmtQual")
    },
)

joblib.dump(preprocess_config, "../models/preprocess_config.pkl")

['../models/preprocess_config.pkl']

In [None]:

def preprocess_input(df, scaler, config):
    """Apply the saved preprocessing recipe to new data and scale it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding every column named in ``config["selected_features"]``
        and every key of ``config["ordinal_maps"]``. It is not modified.
    scaler : object
        Already-fitted scaler exposing ``transform``.
    config : dict
        Mapping with the keys ``"selected_features"``,
        ``"log_transform_cols"`` and ``"ordinal_maps"``.

    Returns
    -------
    The value returned by ``scaler.transform`` for the prepared frame.

    Raises
    ------
    KeyError
        If ``df`` lacks a required column; the message lists all of them.
    """
    required = [*config["ordinal_maps"], *config["selected_features"]]
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Same exception type as the bare column lookup, but names every absent column.
        raise KeyError(f"missing required columns: {missing}")

    X = df.copy()

    # Ordinal encoding; categories absent from the mapping are encoded as 0.
    for col, mapping in config["ordinal_maps"].items():
        X[col] = X[col].astype(str).map(mapping).fillna(0)

    # Select features. The explicit copy avoids SettingWithCopyWarning on
    # pandas < 3 when the log-transformed columns are assigned below.
    X = X[config["selected_features"]].copy()

    # Log transform
    for col in config["log_transform_cols"]:
        X[col] = np.log1p(X[col])

    # Scaling
    X_scaled = scaler.transform(X)

    return X_scaled


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# The random forest is trained on the unscaled matrices (use_scaler=False).
X_tr_rf, _ = preprocess_for_model(X_tr, use_scaler=False)
X_val_rf, _ = preprocess_for_model(X_val, use_scaler=False)

model_rf = RandomForestRegressor(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)
model_rf.fit(X_tr_rf, y_tr)

# y_tr / y_val are log1p(SalePrice), so this RMSE is measured in log space.
y_pred = model_rf.predict(X_val_rf)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("RF RMSE (log space):", rmse)


RF RMSE (log space): 0.15524299285538853


In [199]:
# Undo the log1p transform on predictions and targets, then score in price units.
y_pred_price, y_true_price = np.expm1(y_pred), np.expm1(y_val)

rmse_real = np.sqrt(mean_squared_error(y_true_price, y_pred_price))
print("RF RMSE (real price):", rmse_real)


RF RMSE (real price): 30478.81582945116


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# XGBoost is tuned on the unscaled matrices.
X_tr_xgb, _ = preprocess_for_model(X_tr, use_scaler=False)
X_val_xgb, _ = preprocess_for_model(X_val, use_scaler=False)

# Base estimator; the search below varies only the keys of param_dist.
model_xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=800,
    tree_method="hist",
    random_state=42,
)

param_dist = dict(
    max_depth=[2, 3, 4, 5],
    learning_rate=[0.01, 0.03, 0.05],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.2],
)

# 40 sampled candidates, 5-fold CV, scored by (negated) RMSE.
xgb_search = RandomizedSearchCV(
    estimator=model_xgb,
    param_distributions=param_dist,
    n_iter=40,
    scoring="neg_root_mean_squared_error",
    cv=5,
    random_state=42,
    n_jobs=1,
    error_score="raise",  # debug only
)
xgb_search.fit(X_tr_xgb, y_tr)

# best_score_ is the negated RMSE, so flip the sign for display.
print("Best RMSE:", -xgb_search.best_score_)
print("Best Params:", xgb_search.best_params_)


Best RMSE: 0.1381472317783789
Best Params: {'subsample': 0.9, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.03, 'gamma': 0, 'colsample_bytree': 0.7}


In [192]:
import numpy as np
from sklearn.preprocessing import RobustScaler

def preprocess_for_model(
    df,
    preprocess_config,
    scaler=None,
    fit=False,
    use_scaler=True
):
    """Config-driven preprocessing: encode, select, log-transform, optionally scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``preprocess_config``. It is
        not modified.
    preprocess_config : dict
        Mapping with the keys ``"selected_features"`` (columns to keep, in
        order), ``"log_transform_cols"`` (columns passed through ``log1p``)
        and ``"ordinal_maps"`` (``{column: {category: code}}``).
    scaler : object, optional
        Scaler exposing ``fit_transform`` / ``transform``. When omitted (and
        ``use_scaler`` is true) a new ``RobustScaler`` is created.
    fit : bool, default False
        If true the scaler is fitted on this data; otherwise it is only applied.
    use_scaler : bool, default True
        If false, scaling is skipped entirely.

    Returns
    -------
    tuple
        ``(X.values, None)`` when ``use_scaler`` is false, otherwise
        ``(X_scaled, scaler)``.

    Raises
    ------
    KeyError
        If a column named in the config is missing from ``df``.
    """
    X = df.copy()

    selected_features = preprocess_config["selected_features"]
    log_transform_cols = preprocess_config["log_transform_cols"]
    ordinal_maps = preprocess_config["ordinal_maps"]

    # Every column listed in ordinal_maps is cast to str before mapping
    # (rather than a hard-coded pair of names), and categories missing from
    # the mapping become 0 instead of NaN, matching preprocess_input.
    for col, mapping in ordinal_maps.items():
        X[col] = X[col].astype(str).map(mapping).fillna(0).astype(float)

    # Explicit copy of the subset: assigning into a bare df[list] selection
    # triggers SettingWithCopyWarning on pandas < 3.
    X = X[selected_features].copy()

    for col in log_transform_cols:
        X[col] = np.log1p(X[col])

    if not use_scaler:
        return X.values, None

    if scaler is None:
        scaler = RobustScaler()

    if fit:
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)

    return X_scaled, scaler


In [193]:
# Refit on the full training set using the hyper-parameters found by the search.
best_params = xgb_search.best_params_
y_full = np.log1p(df_train["SalePrice"])

df_train_feat = build_features(df_train)
X_xgb_full, _ = preprocess_for_model(df_train_feat, preprocess_config, use_scaler=False)

final_xgb = XGBRegressor(
    **best_params,
    objective="reg:squarederror",
    n_estimators=800,
    random_state=42,
    tree_method="hist",
)
final_xgb.fit(X_xgb_full, y_full)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [194]:
import json

# Store the tuned hyper-parameters as indented JSON and the refitted model as a pickle.
with open("../models/xgb_best_params_v1.json", "w") as f:
    f.write(json.dumps(best_params, indent=4))
joblib.dump(value=final_xgb, filename="../models/tuned_xgb_model_v1.pkl")

['../models/tuned_xgb_model_v1.pkl']