In [1]:
pip install scikit-learn pandas numpy


Note: you may need to restart the kernel to use updated packages.


In [14]:
# Importing necessary libraries
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

In [16]:
# Load the California Housing dataset
data = fetch_california_housing(as_frame=True)
df = data.frame  # Convert to pandas DataFrame

# Display the first 5 rows to understand the dataset
print(df.head())

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(df.drop('MedHouseVal', axis=1))  # Features
y = df['MedHouseVal']  # Target variable

# Split data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  
Missing values:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64
Training data shape: (16512, 8)
Test data shape: (4128, 8)


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize and train Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = linear_model.predict(X_test)

# Evaluate performance
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Performance:")
print(f"Mean Squared Error: {mse_lr:.4f}")
print(f"Mean Absolute Error: {mae_lr:.4f}")
print(f"R-squared Score: {r2_lr:.4f}")


Linear Regression Performance:
Mean Squared Error: 0.5559
Mean Absolute Error: 0.5332
R-squared Score: 0.5758


In [20]:
from sklearn.tree import DecisionTreeRegressor

# Initialize and train Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

# Evaluate performance
mse_dt = mean_squared_error(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print("\nDecision Tree Regressor Performance:")
print(f"Mean Squared Error: {mse_dt:.4f}")
print(f"Mean Absolute Error: {mae_dt:.4f}")
print(f"R-squared Score: {r2_dt:.4f}")



Decision Tree Regressor Performance:
Mean Squared Error: 0.4943
Mean Absolute Error: 0.4538
R-squared Score: 0.6228


In [22]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Regressor Performance:")
print(f"Mean Squared Error: {mse_rf:.4f}")
print(f"Mean Absolute Error: {mae_rf:.4f}")
print(f"R-squared Score: {r2_rf:.4f}")



Random Forest Regressor Performance:
Mean Squared Error: 0.2555
Mean Absolute Error: 0.3276
R-squared Score: 0.8050


In [24]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize and train Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1)
gb_model.fit(X_train, y_train)

# Make predictions
y_pred_gb = gb_model.predict(X_test)

# Evaluate performance
mse_gb = mean_squared_error(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("\nGradient Boosting Regressor Performance:")
print(f"Mean Squared Error: {mse_gb:.4f}")
print(f"Mean Absolute Error: {mae_gb:.4f}")
print(f"R-squared Score: {r2_gb:.4f}")



Gradient Boosting Regressor Performance:
Mean Squared Error: 0.2940
Mean Absolute Error: 0.3717
R-squared Score: 0.7756


In [26]:
from sklearn.svm import SVR

# Initialize and train Support Vector Regressor
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr_model.fit(X_train, y_train)

# Make predictions
y_pred_svr = svr_model.predict(X_test)

# Evaluate performance
mse_svr = mean_squared_error(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print("\nSupport Vector Regressor Performance:")
print(f"Mean Squared Error: {mse_svr:.4f}")
print(f"Mean Absolute Error: {mae_svr:.4f}")
print(f"R-squared Score: {r2_svr:.4f}")



Support Vector Regressor Performance:
Mean Squared Error: 0.3552
Mean Absolute Error: 0.3978
R-squared Score: 0.7289


In [28]:
# Function to calculate and display evaluation metrics
def evaluate_model(y_test, y_pred, model_name):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} Performance:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"R-squared Score: {r2:.4f}\n")
    return mse, mae, r2

# Evaluate all models
results = {}
results['Linear Regression'] = evaluate_model(y_test, y_pred_lr, "Linear Regression")
results['Decision Tree'] = evaluate_model(y_test, y_pred_dt, "Decision Tree")
results['Random Forest'] = evaluate_model(y_test, y_pred_rf, "Random Forest")
results['Gradient Boosting'] = evaluate_model(y_test, y_pred_gb, "Gradient Boosting")
results['Support Vector Regressor'] = evaluate_model(y_test, y_pred_svr, "Support Vector Regressor")


Linear Regression Performance:
Mean Squared Error: 0.5559
Mean Absolute Error: 0.5332
R-squared Score: 0.5758

Decision Tree Performance:
Mean Squared Error: 0.4943
Mean Absolute Error: 0.4538
R-squared Score: 0.6228

Random Forest Performance:
Mean Squared Error: 0.2555
Mean Absolute Error: 0.3276
R-squared Score: 0.8050

Gradient Boosting Performance:
Mean Squared Error: 0.2940
Mean Absolute Error: 0.3717
R-squared Score: 0.7756

Support Vector Regressor Performance:
Mean Squared Error: 0.3552
Mean Absolute Error: 0.3978
R-squared Score: 0.7289



In [30]:
# Compare results and identify the best and worst-performing models
comparison_df = pd.DataFrame(results, index=['MSE', 'MAE', 'R-squared']).T
print(comparison_df)

# Highlight the best and worst models
best_model = comparison_df['R-squared'].idxmax()
worst_model = comparison_df['R-squared'].idxmin()
print(f"Best-performing model: {best_model}")
print(f"Worst-performing model: {worst_model}")


                               MSE       MAE  R-squared
Linear Regression         0.555892  0.533200   0.575788
Decision Tree             0.494272  0.453784   0.622811
Random Forest             0.255498  0.327613   0.805024
Gradient Boosting         0.293999  0.371650   0.775643
Support Vector Regressor  0.355198  0.397763   0.728941
Best-performing model: Random Forest
Worst-performing model: Linear Regression
