In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the California housing data as one DataFrame (features plus target)
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Separate the target column from the predictors
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data loaded and scaled successfully.")


Data loaded and scaled successfully.


In [2]:
# Plain-text preview of the first five rows (features and the MedHouseVal target)
print(df.head(n=5))

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


1. Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a plain linear regression on the standardized training features
linear_reg = LinearRegression()
linear_reg.fit(X_train_scaled, y_train)

# Predict the held-out rows and score the predictions
y_pred_linear = linear_reg.predict(X_test_scaled)
r2_linear = r2_score(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)

# Report both metrics to four decimal places
print(
    "--- Linear Regression ---",
    f"Mean Squared Error (MSE): {mse_linear:.4f}",
    f"R-squared (R2): {r2_linear:.4f}",
    sep="\n",
)


--- Linear Regression ---
Mean Squared Error (MSE): 0.5559
R-squared (R2): 0.5758


2. Polynomial Regression

In [4]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled features with degree-2 polynomial terms; the constant
# bias column is left out (include_bias=False)
poly_features = PolynomialFeatures(degree=2, include_bias=False)
poly_features.fit(X_train_scaled)
X_train_poly = poly_features.transform(X_train_scaled)
X_test_poly = poly_features.transform(X_test_scaled)

# "Polynomial regression" here is a linear model fitted on the expanded features
polynomial_reg = LinearRegression()
polynomial_reg.fit(X_train_poly, y_train)

# Predict on the expanded test features and score the result
y_pred_poly = polynomial_reg.predict(X_test_poly)
r2_poly = r2_score(y_test, y_pred_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)

print(
    "\n--- Polynomial Regression (Degree 2) ---",
    f"Mean Squared Error (MSE): {mse_poly:.4f}",
    f"R-squared (R2): {r2_poly:.4f}",
    sep="\n",
)



--- Polynomial Regression (Degree 2) ---
Mean Squared Error (MSE): 0.4643
R-squared (R2): 0.6457


3. Ridge Regression

In [5]:
from sklearn.linear_model import Ridge

# Ridge regression; alpha is the regularization strength
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train_scaled, y_train)

# Score the model on the held-out split
y_pred_ridge = ridge_reg.predict(X_test_scaled)
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print(
    "\n--- Ridge Regression ---",
    f"Mean Squared Error (MSE): {mse_ridge:.4f}",
    f"R-squared (R2): {r2_ridge:.4f}",
    sep="\n",
)



--- Ridge Regression ---
Mean Squared Error (MSE): 0.5559
R-squared (R2): 0.5758


4. Lasso Regression

In [6]:
from sklearn.linear_model import Lasso

# Lasso regression with a deliberately small alpha (the notebook's author
# notes that a small value tends to work better on this dataset)
lasso_reg = Lasso(alpha=0.01)
lasso_reg.fit(X_train_scaled, y_train)

# Score the model on the held-out split
y_pred_lasso = lasso_reg.predict(X_test_scaled)
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

print(
    "\n--- Lasso Regression ---",
    f"Mean Squared Error (MSE): {mse_lasso:.4f}",
    f"R-squared (R2): {r2_lasso:.4f}",
    sep="\n",
)



--- Lasso Regression ---
Mean Squared Error (MSE): 0.5483
R-squared (R2): 0.5816


5. Elastic Net Regression

In [7]:
from sklearn.linear_model import ElasticNet

# Elastic Net: alpha is the overall penalty strength and l1_ratio is the
# mixing parameter between the L1 and L2 penalties
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic_net.fit(X_train_scaled, y_train)

# Score the model on the held-out split
y_pred_elastic = elastic_net.predict(X_test_scaled)
r2_elastic = r2_score(y_test, y_pred_elastic)
mse_elastic = mean_squared_error(y_test, y_pred_elastic)

print(
    "\n--- Elastic Net Regression ---",
    f"Mean Squared Error (MSE): {mse_elastic:.4f}",
    f"R-squared (R2): {r2_elastic:.4f}",
    sep="\n",
)



--- Elastic Net Regression ---
Mean Squared Error (MSE): 0.5500
R-squared (R2): 0.5803


6. Feature Engineering
This example shows how to create a simple new feature, rooms_per_household (computed as AveRooms / AveOccup), and then train a linear model on the augmented data. Feature engineering like this is typically performed before splitting and scaling the data.

In [8]:
# Fetch the dataset again under separate names so this experiment starts clean
housing_fe = fetch_california_housing(as_frame=True)
df_fe = housing_fe.frame

# Engineered feature: the ratio of AveRooms to AveOccup.
# NOTE(review): if AveRooms is already a per-household average and AveOccup is
# occupants per household, this ratio is rooms per occupant rather than rooms
# per household -- confirm against the dataset description before renaming.
rooms = df_fe['AveRooms']
occupancy = df_fe['AveOccup']
df_fe['rooms_per_household'] = rooms / occupancy

# Separate the target from the (now augmented) predictors
y_fe = df_fe['MedHouseVal']
X_fe = df_fe.drop(columns='MedHouseVal')

# Same 80/20 split and seed as the baseline split earlier in the notebook
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(
    X_fe, y_fe, test_size=0.2, random_state=42
)

# Scaling statistics come from the training split only
scaler_fe = StandardScaler()
scaler_fe.fit(X_train_fe)
X_train_fe_scaled = scaler_fe.transform(X_train_fe)
X_test_fe_scaled = scaler_fe.transform(X_test_fe)

# Fit a linear model on the augmented features and score it on the test split
linear_reg_fe = LinearRegression()
linear_reg_fe.fit(X_train_fe_scaled, y_train_fe)
y_pred_fe = linear_reg_fe.predict(X_test_fe_scaled)
r2_fe = r2_score(y_test_fe, y_pred_fe)
mse_fe = mean_squared_error(y_test_fe, y_pred_fe)

print(
    "\n--- Linear Regression with Feature Engineering ---",
    f"Mean Squared Error (MSE): {mse_fe:.4f}",
    f"R-squared (R2): {r2_fe:.4f}",
    sep="\n",
)



--- Linear Regression with Feature Engineering ---
Mean Squared Error (MSE): 0.4724
R-squared (R2): 0.6395


7. Model Evaluation

In [9]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def evaluate_model(model, X_test_data, y_test_true):
    """Print MSE, RMSE and R-squared for a fitted model on held-out data.

    Args:
        model: Fitted estimator; only its ``predict`` method is called.
        X_test_data: Feature matrix handed to ``model.predict``.
        y_test_true: Ground-truth targets the predictions are scored against.

    Returns:
        None. The three metrics are printed, each with four decimal places.
    """
    predictions = model.predict(X_test_data)

    # RMSE is derived from the MSE rather than recomputed from the predictions
    squared_error = mean_squared_error(y_test_true, predictions)
    root_error = np.sqrt(squared_error)
    r_squared = r2_score(y_test_true, predictions)

    print(
        f"Mean Squared Error (MSE): {squared_error:.4f}",
        f"Root Mean Squared Error (RMSE): {root_error:.4f}",
        f"R-squared (R2): {r_squared:.4f}",
        sep="\n",
    )

# Usage example: report test-set metrics for the Ridge model fitted earlier
print("\n--- Evaluating Ridge Model on Test Set ---")
evaluate_model(model=ridge_reg, X_test_data=X_test_scaled, y_test_true=y_test)



--- Evaluating Ridge Model on Test Set ---
Mean Squared Error (MSE): 0.5559
Root Mean Squared Error (RMSE): 0.7456
R-squared (R2): 0.5758
