## 1.Loading and PreProcessing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

In [4]:
# load dataset
California_housing = fetch_california_housing()

#covert to a dataframe
df = pd.DataFrame(California_housing.data, columns=California_housing.feature_names)
df['Target'] = California_housing.target

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
# Check missing values:
print("Missing Values:")
print(df.isnull().sum())

#to handle missing values :

df.fillna(df.mean(), inplace=True)

print("Missing Values:")
print(df.isnull().sum())

Missing Values:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64
Missing Values:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


In [8]:
# to initialize the scaler :
scaler = StandardScaler()

# to scale the features:
scaled_features = scaler.fit_transform(df.iloc[:,:-1])

# convert scaled features back to a dataframe:
scaled_df = pd.DataFrame(scaled_features, columns=California_housing.feature_names)
scaled_df['Target']=df['Target']

scaled_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835,4.526
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844,3.585
2,1.782699,1.856182,1.15562,-0.049016,-0.820777,-0.025843,1.038503,-1.332827,3.521
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818,3.413
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818,3.422


#### Handling Missing Values


##### .Many machine learning models cannot handle missing values and will fail or produce errors if they remain.
##### .Replacing missing values with the mean is a simple and effective method when then missing values are few and the data is continuous.
##### .It prevents losing rows of data 

#### Separating Features and Target

##### . ML models need to be trained on input variables separately from the output the model should predict.
##### . This separation is essential before scaling and model training.

#### Feature Scaling 

##### . The dataset contains features with very different numeric ranges(e.g.,population count vs median income).

##### . Prevents large-value features from dominating the model.Improves numerical stability and models convergence,often improve prediction accuracy.

We handled missing data by replacing missing values with the column mean to avoid losing data ensure smooth model training. We then Standardized all numeric features using StandardizedScaler to normalize the features distributions, which prevents scale-dependent features from dominating the model and improves the stability and performance of regression algorithms.

## 2.Regression Algorithm Implementation.

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [36]:
# split dataset into features and target:
X = scaled_df.iloc[:,:-1]
y = scaled_df['Target']

#split into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
!pip install --upgrade scikit-learn



In [50]:
#linear regression:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

#predictions:
lr_predictions = lr_model.predict(X_test)

#Evaluation:
lr_rmse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

print("Linear Regression RMSE:",lr_rmse)
print("Linear Regession R2 Score:",lr_r2)

Linear Regression RMSE: 0.5558915986952442
Linear Regession R2 Score: 0.575787706032451


In [52]:
# Decision Tree Regressor:
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

#predictions
dt_predictions = dt_model.predict(X_test)

#evaluation:
dt_rmse = mean_squared_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)

print("Decision Tree RMSE:", dt_rmse)
print("Decision Tree R2 Score:", dt_r2)

Decision Tree RMSE: 0.4942716777366763
Decision Tree R2 Score: 0.6228111330554302


In [54]:
# Random Forest Regressor:
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

#predictions
rf_predictions = rf_model.predict(X_test)

#evaluation
rf_rmse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print("Random Forest RMSE:",rf_rmse)
print("Random Forest R2 Score:",rf_r2)

Random Forest RMSE: 0.25549776668540763
Random Forest R2 Score: 0.805024407701793


In [56]:
# gradient Boosting Regressor:
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

#predictions:
gb_predictions = gb_model.predict(X_test)

#Evaluation:
gb_rmse = mean_squared_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)


print("Gradient Boosting RMSE:", gb_rmse)
print("Gradient Boosting R2 Score:", gb_r2)



Gradient Boosting RMSE: 0.29399901242474274
Gradient Boosting R2 Score: 0.7756433164710084


In [58]:
# Support Vector Regressor:
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)

#predictions
svr_predictions = svr_model.predict(X_test)

#evaluation:
svr_rmse = mean_squared_error(y_test, svr_predictions)
svr_r2 = r2_score(y_test, svr_predictions)

print("SVR RMSE:", svr_rmse)
print("SVR R2 Score:", svr_r2)

SVR RMSE: 0.3551984619989429
SVR R2 Score: 0.7289407597956454


In [60]:
results = {
    "Model":["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVR"],
    "RMSE":[lr_rmse, dt_rmse, rf_rmse, gb_rmse, svr_rmse],
    "R2 Score":[lr_r2, dt_r2, rf_r2, gb_r2, svr_r2]
}

results_df = pd.DataFrame(results)
print(results_df)
            

               Model      RMSE  R2 Score
0  Linear Regression  0.555892  0.575788
1      Decision Tree  0.494272  0.622811
2      Random Forest  0.255498  0.805024
3  Gradient Boosting  0.293999  0.775643
4                SVR  0.355198  0.728941


#### Linear Regression:

##### .First a straight line through the data.Assumes a linear relationship between features(X) and the target (y).
##### .When the relationship between variables is roughly linear. Works best if the dataset is clean and not too complex.
##### . If your datasets has non-linear patterns,performance will be poor (higher MSE,lower R^2).


#### Decision Tree Regressor:

##### .Splits the data into branches based on rules(like yes/no conditions)
##### .Creates a tree of decisions to predict the target.
##### .Good for non-linear relationships and easy to interpret But can overfit if not pruned or limited in depth.

#### Random Forest Regressor:

##### .Builds many decision trees,each trained on different random samples and takes the average prediction of all trees.
##### .Reduces overfitting compared to a single tree and handles non-linear data well.
##### .Good general performance on many datasets. Often gives one of the best results,which seems to match your output.

#### Gradient Boosting Regressor:

##### .Builds trees sequentially,where each new tree focuses on fixing the errors of the previous one."Boosts" performance gradually.
##### .Powerful for complex datasets, Often performs even better than random forest when tuned well.But more sensitive to parameter settings and noise.

#### Support Vector Regression(SVR)

##### .Tries to fit data within a margin(tube) rather than minimizing error directly.
##### .Can Model non-linear data using kernel functions.
##### .Works well with smaller datasets.Good when relationships are non-linear but not extremely complex.But be slow and require Scaling.

### Summary

From the results Gradient Boosting or Random Forest likely performed the best. They handle non-linear relationships well,they handle complex interactions between features and they reduce overfitting (Random Forest) or Correct errors step-by-step(Gradient-Boosting).

if the dataset isn't perfectly linear (which is very common in real-world data),these models will generally outperform Linear Regression and SVR.


The Random Forest and Gradient Boosting regressors work well for this dataset because they are able to model complex and non-linear relationships between the features and the target variable. Random Forest reduces overfitting by averaging predictions across many decisions trees, while Gradient Boosting improves performance by sequentially correcting the errors of previous models.This makes tham more accurate and robust compared to linear Regression, Which assumes a purely linear relationship that may not exist in the data.

### 3.Model Evaluation and Comparison.

In [74]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

In [78]:
#linear regression evaluation
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)


print("Linear Regression Metrics:")
print("MSE:", lr_mse)
print("MAE:", lr_mae)
print("R2:" ,lr_r2)

Linear Regression Metrics:
MSE: 0.5558915986952442
MAE: 0.5332001304956565
R2: 0.575787706032451


In [80]:
# Decision Tree Regressor Evaluation:
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_mae = mean_absolute_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)

print("Decision Tree Regressor Metrics:")
print("MSE:", dt_mse)
print("MAE:", dt_mae)
print("R²:", dt_r2)

Decision Tree Regressor Metrics:
MSE: 0.4942716777366763
MAE: 0.4537843265503876
R²: 0.6228111330554302


In [82]:
# Random Forest Regressor Evaluation:
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print("Random Forest Regressor Metrics:")
print("MSE:", rf_mse)
print("MAE:", rf_mae)
print("R²:", rf_r2)

Random Forest Regressor Metrics:
MSE: 0.25549776668540763
MAE: 0.32761306601259704
R²: 0.805024407701793


In [84]:
# Gradient Boosting Regressor Evaluation:
gb_mse = mean_squared_error(y_test, gb_predictions)
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

print("Gradient Boosting Regressor Metrics:")
print("MSE:", gb_mse)
print("MAE:", gb_mae)
print("R²:", gb_r2)

Gradient Boosting Regressor Metrics:
MSE: 0.29399901242474274
MAE: 0.37165044848436773
R²: 0.7756433164710084


In [86]:
# Support Vector Regressor Evaluation:
svr_mse = mean_squared_error(y_test, svr_predictions)
svr_mae = mean_absolute_error(y_test, svr_predictions)
svr_r2 = r2_score(y_test, svr_predictions)

print("Support Vector Regressor Metrics:")
print("MSE:", svr_mse)
print("MAE:", svr_mae)
print("R²:", svr_r2)

Support Vector Regressor Metrics:
MSE: 0.3551984619989429
MAE: 0.397763096343787
R²: 0.7289407597956454


In [88]:
#  create a DataFrame to compare results:
results = {
    "Model": ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVR"],
    "MSE": [lr_mse, dt_mse, rf_mse, gb_mse, svr_mse],
    "MAE": [lr_mae, dt_mae, rf_mae, gb_mae, svr_mae],
    "R²": [lr_r2, dt_r2, rf_r2, gb_r2, svr_r2],
}

comparison_df = pd.DataFrame(results)
print(comparison_df)

               Model       MSE       MAE        R²
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
2      Random Forest  0.255498  0.327613  0.805024
3  Gradient Boosting  0.293999  0.371650  0.775643
4                SVR  0.355198  0.397763  0.728941


#### Model Performance Analysis:

##### The objective for error metrics like MSE and MAE is to be as low as possible ,with 0 indicating a perfect model. For R^2, which measures the proportion of variance explained,the goal is to be close to 1(or

### Conclusion

#### Best Performing Algorithm:Random Forest Regressor

##### The Random Forest model performs best across all metrics:

##### .It has the lowest MSE(0.255), meaning the average squared difference between predicted and actual values is minimized, penalizing larger errors efffectively.
##### .I t has the lowest MAE(0.327), indicating its predictions are , on average , the closest in absloute distance to the actual data points.
##### .It achieves the highest R^2 Score (0.805 or 80.5%), meaning it explains the largest proportion of the variance in the largest variable compared to all other models.


#### Worst Performing Alogorithm: Linear Regression

##### The Linear Regression model is the worst performer across all metrics:

##### .It has the highest MSE(0.555) and highest MAE(0.533), indicating its predictions are the least accurate and furtheat from the actual values on average.
##### .It has the lowest R^2score (0.575),suggesting it explains the least amount of the target variables variance compared to the other four models tested