## 1. Import Libraries

In [1]:
# We import needed libraries
from pathlib import Path

# We will save objects later
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 2. Load Data

In [2]:
# Location of the raw housing CSV, relative to this notebook
data_path = '../data/raw/housing.csv'
# Parse the CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)
# Preview the first five rows
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## 3. Check Missing Values

In [3]:
# Tally the missing entries in every column, then display the tally
missing_per_column = df.isnull().sum()
missing_per_column

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## 4. Fill Missing total_bedrooms With Median

In [7]:
# We compute the median of total_bedrooms from the training rows only, so that
# no statistic derived from the held-out test rows leaks into preprocessing.
# The row positions are obtained by running the same split as section 8:
# with an identical row count, test_size and random_state, train_test_split
# selects the same rows. Keep these two arguments in sync with section 8.
train_positions = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)[0]
bedrooms_median = df['total_bedrooms'].iloc[train_positions].median()
# We fill missing values (in every row) with this train-only median
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_median)
# Confirm no missing now
df['total_bedrooms'].isna().sum()

np.int64(0)

## 5. Create Engineered Features

In [9]:
# Each new ratio column is described as (numerator column, denominator column)
ratio_specs = {
    'rooms_per_household': ('total_rooms', 'households'),        # average rooms per household
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),      # bedrooms per room
    'population_per_household': ('population', 'households'),    # population per household
}
# Add the three ratio columns to df in place, in the order listed above
for new_col, (numerator_col, denominator_col) in ratio_specs.items():
    df[new_col] = df[numerator_col] / df[denominator_col]
# Preview only the freshly added columns
df[list(ratio_specs)].head()

Unnamed: 0,rooms_per_household,bedrooms_per_room,population_per_household
0,6.984127,0.146591,2.555556
1,6.238137,0.155797,2.109842
2,8.288136,0.129516,2.80226
3,5.817352,0.184458,2.547945
4,6.281853,0.172096,2.181467


## 6. One-Hot Encode ocean_proximity (drop_first=True)

In [10]:
# Replace ocean_proximity with indicator columns; drop_first=True leaves out
# the indicator for the first category level
df_encoded = pd.get_dummies(
    data=df,
    columns=['ocean_proximity'],
    drop_first=True,
)
# Preview the encoded frame
df_encoded.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,6.984127,0.146591,2.555556,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,6.238137,0.155797,2.109842,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,8.288136,0.129516,2.80226,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,0.184458,2.547945,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,0.172096,2.181467,False,False,True,False


## 7. Separate Features and Target

In [11]:
# Name of the column the model should predict
target_col = 'median_house_value'
# Pull out the target first, then keep every other column as a feature
y = df_encoded[target_col]
X = df_encoded.drop(target_col, axis=1)
# Display both shapes as a pair
(X.shape, y.shape)

((20640, 15), (20640,))

## 8. Train Test Split

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
# Display the shapes of the two feature partitions
(X_train.shape, X_test.shape)

((16512, 15), (4128, 15))

## 9. Scale Numeric Features With StandardScaler

In [13]:
# Only columns with a numeric dtype are standardized
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
# Fit the scaler on the training rows only
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
# Work on copies so the unscaled splits stay available
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
# Overwrite just the numeric columns with their scaled values
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])
# Preview the scaled training features
X_train_scaled.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14196,1.272587,-1.372811,0.34849,0.222569,0.211228,0.768276,0.322906,-0.326196,-0.174916,-0.211785,0.051376,False,False,False,True
8267,0.709162,-0.876696,1.618118,0.340293,0.593094,-0.098901,0.672027,-0.035843,-0.402835,0.342185,-0.117362,False,False,False,True
17445,-0.447603,-0.460146,-1.95271,-0.342597,-0.495226,-0.449818,-0.430461,0.144701,0.088216,-0.661658,-0.03228,False,False,False,True
14265,1.232698,-1.382172,0.586545,-0.56149,-0.409306,-0.007434,-0.380587,-1.017864,-0.600015,0.783032,0.077507,False,False,False,True
2271,-0.108551,0.532084,1.142008,-0.119565,-0.256559,-0.485877,-0.314962,-0.171488,0.349007,-0.550364,-0.068832,True,False,False,False


## 10. Save Preprocessing Objects (Include Engineered Features)

In [14]:
# We save median, list of final columns, numeric columns, engineered feature names, and scaler
engineered_features = ['rooms_per_household','bedrooms_per_room','population_per_household']
preprocess_artifacts = {
    'bedrooms_median': bedrooms_median,                   # fill value used for missing total_bedrooms
    'final_columns': X_train_scaled.columns.tolist(),     # feature columns, in training order
    'numeric_columns': numeric_cols.tolist(),             # columns the scaler was fitted on
    'engineered_features': engineered_features,           # names of the derived ratio columns
    'scaler': scaler                                      # fitted StandardScaler
}
# Create the output folder first: joblib.dump does not create missing parent
# directories, so without this a fresh checkout lacking ../models fails here
artifacts_path = Path('../models/preprocessing.pkl')
artifacts_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(preprocess_artifacts, artifacts_path)
print('Saved preprocessing objects with engineered features to models/preprocessing.pkl')

Saved preprocessing objects with engineered features to models/preprocessing.pkl


## 11. Simple Summary

In [15]:
# One-line recap of the preprocessing steps performed in this notebook
summary_line = 'Filled missing bedrooms, created engineered features, one-hot encoded, scaled numeric features, and saved artifacts.'
print(summary_line)

Filled missing bedrooms, created engineered features, one-hot encoded, scaled numeric features, and saved artifacts.
