# REGRESSION

### Objective:
#### The objective of this assignment is to evaluate your understanding of regression techniques in supervised learning by applying them to a real-world dataset.

## California Housing Regression Assignment

### -------------------------
### 1. Load and Preprocess
### -------------------------

In [61]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [62]:
# Load dataset
california = fetch_california_housing()
df = pd.DataFrame(california.data, columns=california.feature_names)
df['MedHouseVal'] = california.target



In [63]:
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [64]:
# Feature scaling (standardization)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('MedHouseVal', axis=1))
scaled_df = pd.DataFrame(scaled_features, columns=california.feature_names)
scaled_df['MedHouseVal'] = df['MedHouseVal']



In [67]:
# Preview the preprocessed data
print("\nFirst 5 rows of the preprocessed dataset:")
print(scaled_df.head())


First 5 rows of the preprocessed dataset:
     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  MedHouseVal  
0  -1.327835        4.526  
1  -1.322844        3.585  
2  -1.332827        3.521  
3  -1.337818        3.413  
4  -1.337818        3.422  


### -------------------------
### 2. Regression Models
### -------------------------


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [41]:
# Train-test split
X = scaled_df.drop('MedHouseVal', axis=1)
y = scaled_df['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
# Function to evaluate models
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Evaluation:")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}\n")
    return mse, mae, r2

In [45]:
# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR()
}

In [47]:
# Function to evaluate models
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Evaluation:")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}\n")
    return mse, mae, r2


In [49]:
# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR()
}


In [51]:
# Evaluate and store results
results = {}
for name, model in models.items():
    print(f"Running {name}...")
    mse, mae, r2 = evaluate_model(model)
    results[name] = {"MSE": mse, "MAE": mae, "R²": r2}

Running Linear Regression...
LinearRegression Evaluation:
MSE: 0.5559, MAE: 0.5332, R²: 0.5758

Running Decision Tree...
DecisionTreeRegressor Evaluation:
MSE: 0.4943, MAE: 0.4538, R²: 0.6228

Running Random Forest...
RandomForestRegressor Evaluation:
MSE: 0.2555, MAE: 0.3276, R²: 0.8050

Running Gradient Boosting...
GradientBoostingRegressor Evaluation:
MSE: 0.2940, MAE: 0.3717, R²: 0.7756

Running SVR...
SVR Evaluation:
MSE: 0.3552, MAE: 0.3978, R²: 0.7289



In [57]:
# Convert to DataFrame and display
results_df = pd.DataFrame(results).T
print("\nModel Evaluation Results Summary:\n")
print(results_df)


Model Evaluation Results Summary:

                        MSE       MAE        R²
Linear Regression  0.555892  0.533200  0.575788
Decision Tree      0.494272  0.453784  0.622811
Random Forest      0.255498  0.327613  0.805024
Gradient Boosting  0.293999  0.371650  0.775643
SVR                0.355198  0.397763  0.728941



###  -------------------------
###  3. Summary & Conclusion
###  -------------------------

In [56]:
# Best and worst models
best_model = results_df['R²'].idxmax()
worst_model = results_df['R²'].idxmin()

print(f"\n✅ Best Performing Model: {best_model}")
print(f"❌ Worst Performing Model: {worst_model}")



✅ Best Performing Model: Random Forest
❌ Worst Performing Model: Linear Regression
