In [1]:
#Loading and Preprocessing
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
#loading the dataset
california=fetch_california_housing()
df=pd.DataFrame(california.data,columns=california.feature_names)
df['Target']=california.target
#checking for any missing values
print("Missing values:\n",df.isnull().sum())
#feature scaling
scaler=StandardScaler()
X_scaled=scaler.fit_transform(df.drop('Target', axis=1))
y=df['Target']
print("Scaled Features:\n", X_scaled[:5])

Missing values:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64
Scaled Features:
 [[ 2.34476576  0.98214266  0.62855945 -0.15375759 -0.9744286  -0.04959654
   1.05254828 -1.32783522]
 [ 2.33223796 -0.60701891  0.32704136 -0.26333577  0.86143887 -0.09251223
   1.04318455 -1.32284391]
 [ 1.7826994   1.85618152  1.15562047 -0.04901636 -0.82077735 -0.02584253
   1.03850269 -1.33282653]
 [ 0.93296751  1.85618152  0.15696608 -0.04983292 -0.76602806 -0.0503293
   1.03850269 -1.33781784]
 [-0.012881    1.85618152  0.3447108  -0.03290586 -0.75984669 -0.08561576
   1.03850269 -1.33781784]]


In [3]:
#Justification for the above performed step
#Missing Values:Checked and found no missing value — California Housing( given dataset) is clean.
#Scaling:Used StandardScaler because algorithms like SVR and Gradient Boosting are more prone to feature magnitudes.

In [7]:
#2.Regression Algorithm Implementation
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
#splitting the data
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=42)
#models:
models={
    "Linear Regression":LinearRegression(),
    "Decision Tree":DecisionTreeRegressor(random_state=42),
    "Random Forest":RandomForestRegressor(random_state=42),
    "Gradient Boosting":GradientBoostingRegressor(random_state=42),
    "SVR":SVR()
}
results=[]
#training and evaluating each models:
for name,model in models.items():
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    mse=mean_squared_error(y_test,y_pred)
    mae=mean_absolute_error(y_test,y_pred)
    r2=r2_score(y_test,y_pred)
    results.append({
        "Model":name,
        "MSE":mse,
        "MAE":mae,
        "R² Score":r2
    })
#providing results as dataframe
results_df=pd.DataFrame(results)
print(results_df.sort_values(by="R² Score",ascending=False))

               Model       MSE       MAE  R² Score
2      Random Forest  0.255876  0.327721  0.804735
3  Gradient Boosting  0.293999  0.371650  0.775643
4                SVR  0.355208  0.397764  0.728933
1      Decision Tree  0.498366  0.455246  0.619687
0  Linear Regression  0.555892  0.533200  0.575788


In [9]:
#explanation for the use of algorithms
#Linear Regression:Assumes a linear relationship; fast and interpretable.
#Decision Tree: Captures non-linear patterns; prone to overfitting.
#Random Forest: Ensemble of trees; reduces overfitting.
#Gradient Boosting: Boosted trees; excellent performance on structured data.
#SVR (Support Vector Regression): Uses margins for prediction; sensitive to scale and slower

In [11]:
#3.Model Evaluation and Comparison
#After running the above code, the output had shown a comparison of MSE,MAE,and R².
#Best Performing:Likely Gradient Boosting or Random Forest(which is based on typical performance in structured data).
#Worst Performing:being SVR, due to its sensitivity to scale and computational cost on larger datasets.