Imports

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Load the California housing data into a DataFrame.
housing = fetch_california_housing()
base = pd.DataFrame(housing.data, columns=housing.feature_names)

# Attach the target and three engineered ratio features in one step instead of
# mutating the frame column by column. The keyword order fixes the column
# order: target first, then the ratios.
df = base.assign(
    MedHouseVal=housing.target,
    Rooms_per_Occup=base["AveRooms"] / base["AveOccup"],
    Bedrooms_Ratio=base["AveBedrms"] / base["AveRooms"],
    Income_per_Room=base["MedInc"] / base["AveRooms"],
)

# NOTE: a bare `df.head()` used to sit here, but an expression in the middle
# of a cell is evaluated and discarded -- only a cell's last expression is
# rendered. To preview the frame, put `df.head()` at the end of its own cell.

# Features are every column except the target.
X = df.drop(columns="MedHouseVal")
y = df["MedHouseVal"]

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train:", X_train.shape)
print("X_test :", X_test.shape)

# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The linear model is trained on the scaled features; the tree-based models
# are trained on the unscaled ones.
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train)

print("Models trained: lin_reg, dt, rf")

X_train: (16512, 11)
X_test : (4128, 11)
Models trained: lin_reg, dt, rf


Baseline RF performance

In [4]:
# Score the untuned random forest (`rf`) on the held-out test split.
y_pred_rf_base = rf.predict(X_test)

mse_base = mean_squared_error(y_test, y_pred_rf_base)
rmse_base = np.sqrt(mse_base)  # RMSE is the square root of the MSE
mae_base = mean_absolute_error(y_test, y_pred_rf_base)
r2_base = r2_score(y_test, y_pred_rf_base)

# Report the three metrics, one per line, under a heading.
print("Baseline Random Forest")
for label, score in (
    ("RMSE:", rmse_base),
    ("MAE :", mae_base),
    ("R2  :", r2_base),
):
    print(label, score)

Baseline Random Forest
RMSE: 0.49973410738222784
MAE : 0.3260529798934111
R2  : 0.8094227205186303


RandomizedSearchCV setup

In [5]:
# Candidate values the randomized search draws from for each random-forest
# hyperparameter.
param_dist = {
    "n_estimators": [200, 400, 600],
    "max_depth": [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False]
}

# Base estimator whose hyperparameters are tuned; the seed keeps each
# candidate forest reproducible.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# 25 sampled candidates x 3 CV folds = 75 fits.
# NOTE(review): both the forest and the search request n_jobs=-1; this nested
# parallelism may oversubscribe CPU cores -- confirm on the target machine.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=25,
    cv=3,
    scoring="neg_root_mean_squared_error",
    random_state=42,
    n_jobs=-1,
    verbose=2
)

random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
# The scorer is the *negated* RMSE, so flip the sign to report a positive RMSE.
print("Best CV RMSE:", -random_search.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best parameters: {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
Best CV RMSE: 0.5145731626278368


Evaluate tuned model

In [6]:
# Take the best estimator found by the randomized search and score it on the
# same held-out test split used for the baseline.
best_rf = random_search.best_estimator_
y_pred_rf_tuned = best_rf.predict(X_test)

mse_tuned = mean_squared_error(y_test, y_pred_rf_tuned)
rmse_tuned = np.sqrt(mse_tuned)  # RMSE is the square root of the MSE
mae_tuned = mean_absolute_error(y_test, y_pred_rf_tuned)
r2_tuned = r2_score(y_test, y_pred_rf_tuned)

# Report the three metrics, one per line, under a heading.
print("Tuned Random Forest")
for label, score in (
    ("RMSE:", rmse_tuned),
    ("MAE :", mae_tuned),
    ("R2  :", r2_tuned),
):
    print(label, score)

Tuned Random Forest
RMSE: 0.5005727942259073
MAE : 0.32624549053768986
R2  : 0.8087825049432252


Improvement table

In [7]:
# Side-by-side test-set metrics for the baseline and the tuned random forest,
# one row per model.
metric_rows = [
    ("Baseline RF", rmse_base, mae_base, r2_base),
    ("Tuned RF", rmse_tuned, mae_tuned, r2_tuned),
]
improvement = pd.DataFrame(metric_rows, columns=["Model", "RMSE", "MAE", "R2"])
improvement

Unnamed: 0,Model,RMSE,MAE,R2
0,Baseline RF,0.499734,0.326053,0.809423
1,Tuned RF,0.500573,0.326245,0.808783
