In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load the processed dataset; the path is relative to the current working directory.
PROCESSED_DATA_CSV = "../data/processed/processed_data.csv"
df = pd.read_csv(PROCESSED_DATA_CSV)


In [3]:
# Absolute correlation of every column with SalePrice, strongest first.
# `corr` is not referenced again in the cells visible here; the feature list
# below is a hard-coded literal.
# NOTE(review): the correlations use the full frame, before the train/test
# split performed in a later cell. If this ranking drove the list below, the
# held-out rows influenced feature choice -- confirm.
corr = (
    df.corr()["SalePrice"]
    .abs()
    .sort_values(ascending=False)
)

# Columns used as model inputs (order is preserved when indexing the frame).
selected_features = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Cars',
    'Garage Area',
    'Total Bsmt SF',
    '1st Flr SF',
    'Exter Qual_TA',
    'Year Built',
    'Full Bath',
    'Garage Yr Blt',
    'Year Remod/Add',
    'Kitchen Qual_TA',
    'Foundation_PConc',
    'Mas Vnr Area',
    'TotRms AbvGrd',
    'Fireplaces',
    'BsmtFin Type 1_GLQ',
    'Bsmt Qual_TA',
    'Exter Qual_Gd',
]


In [4]:
# Predictors are the selected columns; the target is the SalePrice column.
X, y = df[selected_features], df["SalePrice"]

# Hold out a 0.33 fraction of the rows for testing; random_state=42 makes the
# split repeatable.
split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split


In [5]:
# Random forest regressor: 300 trees, fixed seed, n_jobs=-1 (all processors,
# per scikit-learn's convention).
rf_params = {"n_estimators": 300, "random_state": 42, "n_jobs": -1}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# Evaluate on the held-out split; RMSE is the square root of the MSE.
y_pred = rf.predict(X_test)
rf_r2 = r2_score(y_test, y_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RF R²:", rf_r2)
print("RF RMSE:", rf_rmse)


RF R²: 0.8864639115508427
RF RMSE: 27706.49070841771


In [6]:
# Gradient boosting regressor: 300 stages of depth-3 trees, learning rate 0.05,
# each stage fitted with subsample=0.8, fixed seed.
gbr_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "subsample": 0.8,
    "random_state": 42,
}
gbr = GradientBoostingRegressor(**gbr_params)
gbr.fit(X_train, y_train)

# Evaluate on the held-out split; RMSE is the square root of the MSE.
y_pred = gbr.predict(X_test)
gbr_r2 = r2_score(y_test, y_pred)
gbr_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("GB R²:", gbr_r2)
print("GB RMSE:", gbr_rmse)


GB R²: 0.8822433586094264
GB RMSE: 28216.767715807528


In [7]:
# Histogram-based gradient boosting regressor: up to 400 iterations, trees
# capped at depth 6, learning rate 0.05, fixed seed.
hgb_params = {
    "max_depth": 6,
    "learning_rate": 0.05,
    "max_iter": 400,
    "random_state": 42,
}
hgb = HistGradientBoostingRegressor(**hgb_params)
hgb.fit(X_train, y_train)

# Evaluate on the held-out split; RMSE is the square root of the MSE.
y_pred = hgb.predict(X_test)
hgb_r2 = r2_score(y_test, y_pred)
hgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("HGB R²:", hgb_r2)
print("HGB RMSE:", hgb_rmse)


HGB R²: 0.8727433352364194
HGB RMSE: 29332.888147636353
