# Random Forest Regressor

Purpose: load processed data, fit RandomForest, evaluate with CV, tune key hyperparameters, and save model.


# Data Loading

In [1]:
from pathlib import Path
import pandas as pd

ROOT = Path("..")
PROC = ROOT / "data" / "train" / "housing_train_processed.csv"
df = pd.read_csv(PROC)
y = df["median_house_value"].values
X = df.drop(columns=["median_house_value"]).values
print(X.shape)


(16512, 24)


# Model Fitting

In [2]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model.fit(X, y)
print("Train R^2:", model.score(X, y))


Train R^2: 0.974453572649535


# Cross-Validation

In [3]:
from sklearn.model_selection import cross_val_score
import numpy as np
scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5, n_jobs=-1)
rmse = np.sqrt(-scores)
print("CV RMSE:", rmse)
print("Mean RMSE:", rmse.mean(), "±", rmse.std())


CV RMSE: [49528.23694705 49143.73782072 49973.82756057 51031.49380669
 50216.1774395 ]
Mean RMSE: 49978.694714905665 ± 642.630979193054


# Hyperparameter Tuning

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

param_grid = {
    "n_estimators": [100, 200],   
    "max_depth": [None, 15],   
    "max_features": [8, 12]       
}

gs = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,     
    n_jobs=-1
)
gs.fit(X, y)

print("Best params:", gs.best_params_)
print("Best RMSE:", np.sqrt(-gs.best_score_))
best_model = gs.best_estimator_


Best params: {'max_depth': None, 'max_features': 8, 'n_estimators': 200}
Best RMSE: 49467.451813550775


# Model Saving

In [5]:
import joblib, pathlib
final_model = best_model if "best_model" in locals() else model
out = pathlib.Path(".") / "random_forest_model.pkl"
joblib.dump(final_model, out)
out


PosixPath('random_forest_model.pkl')