In [1]:
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### LOADING AND PREPROCESSING

In [3]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()

# Convert the dataset into a pandas DataFrame
X = pd.DataFrame(data.data, columns=data.feature_names)
Y = pd.Series(data.target)

# Display the first few rows of the dataset
print("Initial Dataset:")
print(X.head())
print("\nTarget Dataset:")
print(Y.head())

Initial Dataset:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

Target Dataset:
0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
dtype: float64


##### -  loaded the California Housing dataset using fetch_california_housing from scikit-learn.
##### - The dataset was converted into a Pandas DataFrame for easier manipulation and analysis.

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [34]:
X.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [35]:
Y.describe()

count    20640.000000
mean         2.068558
std          1.153956
min          0.149990
25%          1.196000
50%          1.797000
75%          2.647250
max          5.000010
dtype: float64

In [36]:
Y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 20640 entries, 0 to 20639
Series name: None
Non-Null Count  Dtype  
--------------  -----  
20640 non-null  float64
dtypes: float64(1)
memory usage: 161.4 KB


##### - used X.info() and Y.info() to get information about the features and structure of the dataset.
##### - used X.describe() and Y.describe() to get a statistical summary of the dataset.

In [39]:
missing_values = X.isnull().sum()

In [40]:
print(missing_values)

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64


###### isnull() function IS USED TO DETECT MISSING VALUES.THERE IS NO MISSING VALUES

In [41]:
# Step 2: Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

print("Scaled Dataset:")
print(scaled_X[:10])

Scaled Dataset:
[[ 2.34476576  0.98214266  0.62855945 -0.15375759 -0.9744286  -0.04959654
   1.05254828 -1.32783522]
 [ 2.33223796 -0.60701891  0.32704136 -0.26333577  0.86143887 -0.09251223
   1.04318455 -1.32284391]
 [ 1.7826994   1.85618152  1.15562047 -0.04901636 -0.82077735 -0.02584253
   1.03850269 -1.33282653]
 [ 0.93296751  1.85618152  0.15696608 -0.04983292 -0.76602806 -0.0503293
   1.03850269 -1.33781784]
 [-0.012881    1.85618152  0.3447108  -0.03290586 -0.75984669 -0.08561576
   1.03850269 -1.33781784]
 [ 0.08744664  1.85618152 -0.26972966  0.01466934 -0.89407076 -0.08961842
   1.03850269 -1.33781784]
 [-0.11136631  1.85618152 -0.2009177  -0.3066332  -0.29271158 -0.0907249
   1.03382082 -1.33781784]
 [-0.39513665  1.85618152 -0.25523193 -0.07354166 -0.23707923 -0.12347647
   1.03382082 -1.33781784]
 [-0.94235915  1.06160074 -0.45870257  0.04425393 -0.19380963 -0.1004992
   1.03382082 -1.34280914]
 [-0.09446958  1.85618152 -0.18528316 -0.22468709  0.1108437  -0.08650142
   1

##### Using 'StandardScaler()' function,WE CAN ENSURE THE EFFICIENCY OF MACHIENE LEARNING MODEL

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(scaled_X, Y, test_size=0.2, random_state=42)

print("X: ", X_train.shape)
print("X:", X_test.shape)
print("Y", Y_train.shape)
print("Y:", Y_test.shape)

X:  (16512, 8)
X: (4128, 8)
Y (16512,)
Y: (4128,)


In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regression Algorithms 
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

# Train and evaluate each model
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    Y_pred

    # Calculate performance metrics
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    # Display results
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R² Score: {r2}")


Linear Regression
Mean Squared Error: 0.5558915986952442
Mean Absolute Error: 0.5332001304956565
R² Score: 0.575787706032451

Decision Tree Regressor
Mean Squared Error: 0.4942716777366763
Mean Absolute Error: 0.4537843265503876
R² Score: 0.6228111330554302

Random Forest Regressor
Mean Squared Error: 0.25549776668540763
Mean Absolute Error: 0.32761306601259704
R² Score: 0.805024407701793

Gradient Boosting Regressor
Mean Squared Error: 0.29399901242474274
Mean Absolute Error: 0.37165044848436773
R² Score: 0.7756433164710084

Support Vector Regressor
Mean Squared Error: 0.35519846199894217
Mean Absolute Error: 0.39776309634378626
R² Score: 0.7289407597956459


####  
Comparison of Regression Models for California Housing Dataset

Model 1: Linear Regression

. How it works: Linear Regression models the relationship between features and the target by fitting a linear equation.
. WHY: Suitable for linear relationships between features and the target. May not capture complex interactions.

Model 2: Decision Tree Regressor

.How it works: Decision Tree splits data into subsets using feature thresholds to reduce prediction error.
. why: Captures non-linear relationships and feature interactions. Effective for housing prices influenced by non-linear factors.

Model 3: Random Forest Regressor

. How it works: Random Forest combines multiple decision trees to produce stable and accurate predictions.
. why: Handles non-linearity and high-dimensional data well. Suitable for large feature spaces and complex relationships.

Model 4: Gradient Boosting Regressor

. How it works: Gradient Boosting builds models sequentially to correct errors and optimize a loss function.
. why: Excels in capturing complex relationships with fine-tuned models. Suitable for high-accuracy goals.

Model 5: Support Vector Regressor (SVR)

. How it works: SVR finds a hyperplane that fits data points within a specified margin of tolerance.
. why: Works well with small- to medium-sized datasets and handles non-linear relationships. May struggle with larger datasets due to computational complexity.


#### Best Performing Model: Random Forest Regressor

Mean Squared Error (MSE): 0.2555 

Mean Absolute Error (MAE): 0.3276 

R² Score: 0.8050 


 .Highest R² score (best fit to data)
 
 .Lowest MSE and MAE (most accurate predictions)
 
  This combination of high accuracy and low error makes it the top-performing model.

Worst Performing Model: Linear Regression

Mean Squared Error (MSE): 0.5559 

Mean Absolute Error (MAE): 0.5332

R² Score: 0.5758 


The Linear Regression model underperformed with:

. Lowest R² score (least variance explained)

. Highest MSE and MAE (least accurate predictions)

This makes it the worst-performing model for this dataset.