In [92]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [96]:
#  Loading and Preprocessing : Load the California Housing dataset using the fetch_california_housing function from sklearn. 

from sklearn.datasets import fetch_california_housing

# Convert the dataset into a pandas DataFrame for easier handling.

data=fetch_california_housing()
df=pd.DataFrame(data.data, columns=data.feature_names)

df.head(6)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25


In [62]:
df['MedHouseVal'] = data.target

In [63]:
df.shape

(20640, 9)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [65]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [66]:
# Handle missing values (if any)

df.isnull().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [67]:
#  perform necessary feature scaling (e.g., standardization).
scaler= StandardScaler()
scaled_features= scaler.fit_transform(df.drop('MedHouseVal',axis=1))
X =pd.DataFrame(scaled_features, columns=data.feature_names)
y=df['MedHouseVal']
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844
2,1.782699,1.856182,1.155620,-0.049016,-0.820777,-0.025843,1.038503,-1.332827
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818
...,...,...,...,...,...,...,...,...
20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.049110,1.801647,-0.758826
20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818722
20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071735,1.778237,-0.823713
20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778237,-0.873626


In [90]:
X_train, X_test ,y_train, y_test =train_test_split(X, y,train_size=0.8,random_state=42)

In [68]:

#  Regression Algorithm Implementation : Implement the following regression algorithms: Linear Regression Decision Tree Regressor Random Forest Regressor Gradient Boosting Regressor Support Vector Regressor (SVR) For each algorithm:
# Provide a brief explanation of how it works.

lr = LinearRegression() #object creation
dt = DecisionTreeRegressor(random_state=42)
rf =RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
svr = SVR()


In [69]:
lr.fit(X_train,y_train)  #fit each model using the training data
dt.fit(X_train,y_train)
rf.fit(X_train,y_train)
gb.fit(X_train,y_train)
svr.fit(X_train,y_train)

In [70]:
# Evaluation and Comparison : Evaluate the performance of each algorithm using the following metrics: 
# Mean Squared Error (MSE) Mean Absolute Error (MAE) R-squared Score (R²)

def evaluate(model, X_test,y_test):
    y_pred = model.predict(X_test)
    mse= mean_squared_error(y_test,y_pred)
    mae= mean_absolute_error(y_test, y_pred)
    r2= r2_score(y_test, y_pred)
    return mse,mae,r2

models={
    'Linear regression': lr,
    'Decision tree': dt,
    'Random Forest':rf,
    'Gradient Boosting':gb,
    'SVR':svr
}

In [71]:

results = {name:evaluate(model, X_test,y_test) for name,model in models.items()}
results_df =pd.DataFrame(results,index=['MSE','MAE','R2']).T

print('\n Model evaluation Results \n',results_df)


 Model evaluation Results 
                         MSE       MAE        R2
Linear regression  0.555892  0.533200  0.575788
Decision tree      0.505221  0.456285  0.614455
Random Forest      0.256213  0.327628  0.804478
Gradient Boosting  0.293999  0.371650  0.775643
SVR                0.355198  0.397763  0.728941


In [72]:
# Compare the results of all models and identify: The best-performing algorithm with justification. 
# The worst-performing algorithm with reasoning.



 
##

Best-Performing Model: ***Random Forest Regressor***

Reason:
Lowest MSE (0.256) → smallest average squared error.

Lowest MAE (0.327) → smallest average absolute error.

Highest R2 (0.804) → explains 80.4%(approximately) of the variance in the target variable.

This performance reflects its ensemble strength: combining multiple trees to reduce variance and avoid overfitting.

Performs exceptionally well on datasets with non-linear relationships and complex interactions like housing data.




Worst-Performing Model: ***Linear Regression***

Reason:
Highest MSE (0.555) and MAE (0.533) → the most error-prone.

Lowest R2 (0.575) → explains only 57.6%(appr) of the variance.

Linear regression assumes a straight-line relationship between input features and output, which is too simplistic for this real-world dataset.

Insights:
Tree-based ensemble methods (Random Forest, Gradient Boosting) are significantly better at capturing complex relationships.

SVR performs decently, better than Decision Tree and Linear Regression, but not quite as strong as ensembles.

    

###  Conclusion:
Tree-based ensemble models like **Random Forest** and **Gradient Boosting** significantly outperform simpler models. For predicting house prices in California, ensemble models are highly recommended due to their ability to capture complex relationships in the data.