Task five - Model Evaluation & Comparison

Import Libraries and Metrics

In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


Load Dataset

In [6]:
# Fetch the California housing data and assemble a single frame: one column
# per feature, with the target values stored under "MedHouseVal".
housing = fetch_california_housing()
df = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
).assign(MedHouseVal=housing.target)

Feature Engineering

In [7]:
# Derive three ratio features. Every ratio reads only columns that already
# exist in the frame, so none of them depends on another derived column.
rooms = df["AveRooms"]
derived_features = {
    "Rooms_per_Occup": rooms / df["AveOccup"],
    "Bedrooms_Ratio": df["AveBedrms"] / rooms,
    "Income_per_Room": df["MedInc"] / rooms,
}
for column_name, column_values in derived_features.items():
    df[column_name] = column_values

# Separate the target column from the feature matrix.
target_column = "MedHouseVal"
y = df[target_column]
X = df.drop(columns=target_column)


Split

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition the same on every run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Scaling (Linear Regression only)

In [9]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Train Models

In [10]:
# Build the three regressors first, then fit them in order.
lin_reg = LinearRegression()
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

# Linear regression is trained on the standardized features; the two
# tree-based models are trained on the unscaled feature frame.
training_plan = (
    (lin_reg, X_train_scaled),
    (dt, X_train),
    (rf, X_train),
)
for estimator, train_features in training_plan:
    estimator.fit(train_features, y_train)


Predictions

In [14]:
# The linear model was fit on standardized inputs, so it is given the
# standardized test features.
y_pred_lr = lin_reg.predict(X_test_scaled)

# Both tree-based models were fit on the unscaled frame and predict from it.
y_pred_dt, y_pred_rf = (tree_model.predict(X_test) for tree_model in (dt, rf))

Evaluate Linear Regression

In [15]:
# Score the linear regression predictions against the held-out targets.
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)  # RMSE is the square root of the MSE
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Print the model name followed by one "label value" line per metric.
print("Linear Regression")
for metric_label, metric_value in (
    ("RMSE:", rmse_lr),
    ("MAE :", mae_lr),
    ("R2  :", r2_lr),
):
    print(metric_label, metric_value)

Linear Regression
RMSE: 0.6751121427647729
MAE : 0.4857618541524641
R2  : 0.6521876659298534


Evaluate Decision Tree

In [16]:
# Score the decision tree predictions against the held-out targets.
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)  # RMSE is the square root of the MSE
mae_dt = mean_absolute_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Print the model name followed by one "label value" line per metric.
print("Decision Tree")
for metric_label, metric_value in (
    ("RMSE:", rmse_dt),
    ("MAE :", mae_dt),
    ("R2  :", r2_dt),
):
    print(metric_label, metric_value)

Decision Tree
RMSE: 0.7222367492259621
MAE : 0.45751519622093023
R2  : 0.6019365476015596


Evaluate Random Forest

In [17]:
# Score the random forest predictions against the held-out targets.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # RMSE is the square root of the MSE
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print the model name followed by one "label value" line per metric.
print("Random Forest")
for metric_label, metric_value in (
    ("RMSE:", rmse_rf),
    ("MAE :", mae_rf),
    ("R2  :", r2_rf),
):
    print(metric_label, metric_value)

Random Forest
RMSE: 0.4997341073822279
MAE : 0.32605297989341114
R2  : 0.8094227205186303


Comparison Table 

In [18]:
# One row per model, in the order the models were evaluated:
# (name, RMSE, MAE, R2).
model_scores = [
    ("Linear Regression", rmse_lr, mae_lr, r2_lr),
    ("Decision Tree", rmse_dt, mae_dt, r2_dt),
    ("Random Forest", rmse_rf, mae_rf, r2_rf),
]
results = pd.DataFrame(model_scores, columns=["Model", "RMSE", "MAE", "R2"])

# Ascending sort puts the lowest-RMSE model first; the original row labels
# are kept, so each row can still be traced back to `results`.
results_sorted = results.sort_values("RMSE", ascending=True)
results_sorted

Unnamed: 0,Model,RMSE,MAE,R2
2,Random Forest,0.499734,0.326053,0.809423
0,Linear Regression,0.675112,0.485762,0.652188
1,Decision Tree,0.722237,0.457515,0.601937


Pick Best Model

In [19]:
# results_sorted is ordered by ascending RMSE, so its first row is the
# model with the lowest RMSE. Look that row up by its index label.
top_label = results_sorted.index[0]
best_row = results_sorted.loc[top_label]
print("Best model based on lowest RMSE:", best_row, sep="\n")

Best model based on lowest RMSE:
Model    Random Forest
RMSE          0.499734
MAE           0.326053
R2            0.809423
Name: 2, dtype: object
