# ML Assignment 3: Regression
## California Housing Price Prediction
### Objective:
To apply regression techniques in supervised learning using the California Housing dataset and evaluate the performance of different models.

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## 1. Loading and Preprocessing the Data

In [2]:
# Fetch the California Housing data and assemble the feature columns
# plus a 'Target' column into a single DataFrame.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Target=data.target)

# Print the column summary (dtypes, non-null counts), then preview the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# Count missing values per column (the printed output shows zero for every column).
print(df.isna().sum())

# Separate the target column from the feature columns.
y = df['Target']
X = df.drop(columns=['Target'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features: fit the scaler on the training rows only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


## 2. Implementing Regression Algorithms

In [4]:
# Initialize models.
# SVR is the only model this notebook trains on standardised features, so it is
# wrapped in a pipeline that applies StandardScaler before the regressor. The
# pipeline fits the scaler on whatever data it is trained on, which lets every
# model share one fit/predict path instead of being special-cased by its name.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': make_pipeline(StandardScaler(), SVR()),
}

# Train each model on the training split and score it on the held-out test split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {'MSE': mse, 'MAE': mae, 'R² Score': r2}

# Convert results to a DataFrame: one row per model, one column per metric.
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,MSE,MAE,R² Score
Linear Regression,0.555892,0.5332,0.575788
Decision Tree Regressor,0.495235,0.454679,0.622076
Random Forest Regressor,0.255368,0.327543,0.805123
Gradient Boosting Regressor,0.293997,0.371643,0.775645
Support Vector Regressor,0.357004,0.398599,0.727563


## 3. Model Evaluation and Comparison

In [5]:
# Pick the models with the highest and lowest R² score from the results table.
r2_scores = results_df['R² Score']
best_model = r2_scores.idxmax()
worst_model = r2_scores.idxmin()

print(f'Best performing model: {best_model}')
print(f'Worst performing model: {worst_model}')

Best performing model: Random Forest Regressor
Worst performing model: Linear Regression


### Conclusion:
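- The model with the highest R² score, the Random Forest Regressor (R² ≈ 0.805), is the best at predicting house prices on this test split.
- The model with the lowest R² score, Linear Regression (R² ≈ 0.576), is the least effective for this dataset.
- Mean Squared Error (MSE) and Mean Absolute Error (MAE) measure the size of the prediction errors, so lower values are better; here both rank the models in the same order as R².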

In [6]:
# Write the metrics table to disk; the index (model names) is kept as the first column.
results_df.to_csv(
    'regression_results.csv',
    index=True,
)