In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [4]:
# Dataset: the California Housing data that is included in scikit-learn.
# The loader returns an object exposing the feature matrix (`.data`),
# the feature column names (`.feature_names`) and the target (`.target`).
housing = fetch_california_housing()

# Build the working DataFrame in a single expression: the feature columns
# first, then the regression target appended as 'MedHouseVal'.
df = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
).assign(MedHouseVal=housing.target)

##### --- 1. Baseline Model ---


In [8]:

# Baseline design matrix: every column of `df` except the target.
X_baseline = df.drop(columns='MedHouseVal')
# Regression target used by the baseline experiment.
y = df.loc[:, 'MedHouseVal']

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_b, X_test_b, y_train, y_test = train_test_split(
    X_baseline,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training rows only,
# and the fitted scaler is then applied to both the training and test rows.
scaler = StandardScaler().fit(X_train_b)
X_train_b_scaled = scaler.transform(X_train_b)
X_test_b_scaled = scaler.transform(X_test_b)

In [12]:
# Fit a linear regression on the standardised baseline training features.
# The trailing `.fit(...)` expression is kept as the cell's last statement,
# so the notebook still displays the fitted estimator.
model_baseline = LinearRegression()
model_baseline.fit(X=X_train_b_scaled, y=y_train)

In [14]:
# Score the baseline model on the held-out test rows.
y_pred_b = model_baseline.predict(X_test_b_scaled)
baseline_r2 = r2_score(y_true=y_test, y_pred=y_pred_b)

# Report the result: the same two output lines, emitted by a single print call.
print(
    "--- Baseline Model ---",
    f"R-squared (R2) score: {baseline_r2:.4f}",
    sep="\n",
)

--- Baseline Model ---
R-squared (R2) score: 0.5758


##### Step 2: Feature Engineering

In [18]:
# Take an independent (deep) copy of `df` to hold the engineered features,
# so the new columns are added to the copy rather than to `df` itself.
df_engineered = df.copy(deep=True)

In [20]:
# Add the four engineered columns in one chained `assign` call. Each lambda
# receives the frame being built, and the columns are appended in this order.
df_engineered = df_engineered.assign(
    # --- Technique 1: Interaction (ratio) terms ---
    # Hypothesis: rooms relative to the number of people in a block is useful.
    # NOTE(review): this divides AveRooms by Population. Per the scikit-learn
    # dataset description AveRooms is a per-household average and Population is
    # the block-group total, so this is not literally "rooms per person"
    # (AveRooms / AveOccup would be) -- confirm the intended feature.
    Rooms_per_Pop=lambda frame: frame['AveRooms'] / frame['Population'],
    # Hypothesis: the value of bedrooms depends on the total number of rooms,
    # so use the share of rooms that are bedrooms.
    Bedrms_per_Room=lambda frame: frame['AveBedrms'] / frame['AveRooms'],

    # --- Technique 2: Polynomial feature ---
    # Hypothesis: the effect of median income is not linear
    # (a jump from 1 to 2 differs from a jump from 14 to 15), so add its square.
    MedInc_sq=lambda frame: frame['MedInc'] ** 2,

    # --- Technique 3: Binning ---
    # Hypothesis: 'HouseAge' may work better as a category ("New", "Old", ...).
    # pd.qcut cuts at the quartiles, giving four labelled quantile-based bins.
    # NOTE(review): the quartile edges are computed on all rows, i.e. before
    # the train/test split performed later in the notebook.
    Age_Bin=lambda frame: pd.qcut(
        frame['HouseAge'],
        q=4,
        labels=['New', 'Medium', 'Old', 'Very Old'],
    ),
)


In [22]:
# One-hot encode the categorical 'Age_Bin' column. `drop_first=True` omits the
# indicator for the first category, leaving Age_Medium / Age_Old / Age_Very Old.
age_dummies = pd.get_dummies(df_engineered['Age_Bin'], prefix='Age', drop_first=True)

# Append the indicator columns and, in the same chain, remove the two columns
# they replace: the raw 'HouseAge' and the intermediate 'Age_Bin'.
df_engineered = pd.concat(
    objs=[df_engineered, age_dummies],
    axis='columns',
).drop(columns=['HouseAge', 'Age_Bin'])

# Show the first rows of the engineered frame under a heading.
print("\n--- Engineered DataFrame Head ---", df_engineered.head(), sep="\n")


--- Engineered DataFrame Head ---
   MedInc  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude  \
0  8.3252  6.984127   1.023810       322.0  2.555556     37.88    -122.23   
1  8.3014  6.238137   0.971880      2401.0  2.109842     37.86    -122.22   
2  7.2574  8.288136   1.073446       496.0  2.802260     37.85    -122.24   
3  5.6431  5.817352   1.073059       558.0  2.547945     37.85    -122.25   
4  3.8462  6.281853   1.081081       565.0  2.181467     37.85    -122.25   

   MedHouseVal  Rooms_per_Pop  Bedrms_per_Room  MedInc_sq  Age_Medium  \
0        4.526       0.021690         0.146591  69.308955       False   
1        3.585       0.002598         0.155797  68.913242        True   
2        3.521       0.016710         0.129516  52.669855       False   
3        3.413       0.010425         0.184458  31.844578       False   
4        3.422       0.011118         0.172096  14.793254       False   

   Age_Old  Age_Very Old  
0    False          True  
1    Fals

##### --- 2. Engineered Model ---


In [25]:
# Separate the engineered frame into the target and the predictor columns.
y_engineered = df_engineered['MedHouseVal']
X_engineered = df_engineered.drop(columns=['MedHouseVal'])

In [27]:
# Split the engineered data with the same test_size and random_state
# that the baseline split uses.
# Note: this rebinds `y_train` / `y_test`, which the baseline cells also assign.
X_train_e, X_test_e, y_train, y_test = train_test_split(
    X_engineered,
    y_engineered,
    test_size=0.2,
    random_state=42,
)

# The engineered feature set has different columns, so it gets its own scaler
# object, fitted on the engineered training rows only.
scaler_e = StandardScaler()
scaler_e.fit(X_train_e)
X_train_e_scaled, X_test_e_scaled = (
    scaler_e.transform(part) for part in (X_train_e, X_test_e)
)


In [29]:
# Fit a fresh linear regression on the standardised engineered training features.
# The trailing `.fit(...)` expression is kept as the cell's last statement,
# so the notebook still displays the fitted estimator.
model_engineered = LinearRegression()
model_engineered.fit(X=X_train_e_scaled, y=y_train)

In [31]:
# Score the engineered-feature model on the held-out test rows.
y_pred_e = model_engineered.predict(X_test_e_scaled)
engineered_r2 = r2_score(y_true=y_test, y_pred=y_pred_e)

# Heading (preceded by a blank line) and score, written with one print call.
print(f"\n--- Engineered Model ---\nNew R-squared (R2) score: {engineered_r2:.4f}")


--- Engineered Model ---
New R-squared (R2) score: 0.6015


##### Compare Results

In [34]:
# Side-by-side summary of the two R2 scores and their signed difference.
# The labels are padded to a common width so the numbers line up in a column.
print("\n".join([
    "\n--- Model Comparison ---",
    f"Baseline R2 Score:   {baseline_r2:.4f}",
    f"Engineered R2 Score: {engineered_r2:.4f}",
    f"Improvement:         {engineered_r2 - baseline_r2:+.4f}",
]))


--- Model Comparison ---
Baseline R2 Score:   0.5758
Engineered R2 Score: 0.6015
Improvement:         +0.0258


We see a noticeable improvement in the $R^2$ score (from 0.5758 to 0.6015)! We didn't add any new data or use a more complex model; we used the same `LinearRegression` model. The "lift" comes entirely from our feature engineering. By creating features that have a more direct, linear-like relationship with the house price (like `MedInc_sq` or `Rooms_per_Pop`), we made the linear model's job easier. We provided it with more context and captured non-linear patterns that the original features alone couldn't describe.