# 1. Loading and Preprocessing

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [5]:
from sklearn.datasets import fetch_california_housing as df

In [7]:
housing = df()

In [9]:
housing_data = pd.DataFrame(housing.data, columns=housing.feature_names)
housing_data['Target'] = housing.target

In [11]:
housing_data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [13]:
df = housing_data

In [15]:
df.isnull().sum() # there is no missing values so we dont need to do any imputation

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64

In [17]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [19]:
X = df.drop(columns=['Target'])
y = df['Target']

In [21]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [23]:
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
# all columns is standardized to bring them into a common scale

# 2. Regression Algorithm Implementation

In [55]:
# Linear Regression - helps assess the overall trend in the data.
l=LinearRegression()
l.fit(X_scaled, y)
y_pred_l = l.predict(X_scaled)
r2_l= r2_score(y, y_pred_l)
rmse_l= mean_squared_error(y, y_pred_l, squared=False)

In [59]:
# Decision Tree Regressor - captures non-linear relationships and automatically selects important features without requiring scaling
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_scaled, y)
y_pred_dt = dt.predict(X_scaled)
r2_dt = r2_score(y, y_pred_dt)
rmse_dt = mean_squared_error(y, y_pred_dt, squared=False)

In [None]:
# Random Forest Regressor - improves prediction accuracy by combining multiple decision trees and reducing overfitting.
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_scaled, y)
y_pred_rf = rf.predict(X_scaled)
r2_rf = r2_score(y, y_pred_rf)
rmse_rf = mean_squared_error(y, y_pred_rf, squared=False)

In [31]:
# Gradient Boosting Regressor # helpfull for predicting housing prices due to its ability to minimize errors iteratively.
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=0)
gbr.fit(X_scaled, y)
y_pred_gbr = gbr.predict(X_scaled)
r2_gbr = r2_score(y, y_pred_gbr)
rmse_gbr = mean_squared_error(y, y_pred_gbr, squared=False)

# 3. Model Evaluation and Comparison

In [39]:
# Linear Regression
y_pred_l = l.predict(X_scaled)
mse_l = mean_squared_error(y, y_pred_l)
mae_l = mean_absolute_error(y, y_pred_l)
r2_l = r2_score(y, y_pred_l)

In [41]:
# Decision Tree
y_pred_dt = dt.predict(X_scaled)
mse_dt = mean_squared_error(y, y_pred_dt)
mae_dt = mean_absolute_error(y, y_pred_dt)
r2_dt = r2_score(y, y_pred_dt)

In [43]:
# Random Forest
y_pred_rf = rf.predict(X_scaled)
mse_rf = mean_squared_error(y, y_pred_rf)
mae_rf = mean_absolute_error(y, y_pred_rf)
r2_rf = r2_score(y, y_pred_rf)

In [45]:
# Gradient Boosting
y_pred_gbr = gbr.predict(X_scaled)
mse_gbr = mean_squared_error(y, y_pred_gbr)
mae_gbr = mean_absolute_error(y, y_pred_gbr)
r2_gbr = r2_score(y, y_pred_gbr)

In [47]:
results = {
    "Linear Regression": [mse_l, mae_l, r2_l],
    "Decision Tree": [mse_dt, mae_dt, r2_dt],
    "Random Forest": [mse_rf, mae_rf, r2_rf],
    "Gradient Boosting": [mse_gbr, mae_gbr, r2_gbr]
}

In [49]:
print(f"{'Model':<20}{'MSE':<15}{'MAE':<15}{'R² Score'}")
print("-" * 60)
for model, metrics in results.items():
    print(f"{model:<20}{metrics[0]:<15.4f}{metrics[1]:<15.4f}{metrics[2]:.4f}")

Model               MSE            MAE            R² Score
------------------------------------------------------------
Linear Regression   0.5243         0.5312         0.6062
Decision Tree       0.0000         0.0000         1.0000
Random Forest       0.0347         0.1198         0.9739
Gradient Boosting   0.2619         0.3562         0.8033


In [51]:
# Best Performer - Decision Tree
# Worst Performer - Linear Regression