## Import Libraries

## 📚 1. Import Libraries

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

---
## 📊 2. Exploratory Data Analysis (EDA)

### 2.1 Load Data

In [2]:
# Read the raw housing CSV (relative path), report its dimensions, and
# leave the first rows as the cell's displayed output.
housing_csv = '../data/raw/housing.csv'
df = pd.read_csv(housing_csv)
print(f'Dataset shape: {df.shape}')
df.head()

Dataset shape: (20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### 2.2 Dataset Overview

In [3]:
# Print each column's dtype and non-null count plus the frame's memory usage;
# a non-null count below the row count flags a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


### 2.3 Statistical Summary

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### 2.4 Check Missing Values

In [5]:
# Tally NaN entries per column; any non-zero count marks a column that
# must be imputed or dropped before modelling.
null_counts = df.isnull().sum()
print('Missing values:')
print(null_counts)

Missing values:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


---
## 🧹 3. Data Preprocessing

### 3.1 Handle Missing Values

In [6]:
# Fill the gaps in `total_bedrooms` with the column median. The median is
# kept in `bedrooms_median` so the same value can be reused outside this cell.
# NOTE(review): the median is computed over the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows contribute to
# the imputation value. Consider computing it on the training split only.
bedroom_counts = df['total_bedrooms']
bedrooms_median = bedroom_counts.median()
df['total_bedrooms'] = bedroom_counts.fillna(bedrooms_median)
print(f'Missing values after imputation: {df.isnull().sum().sum()}')

Missing values after imputation: 0


---
## 🔧 4. Feature Engineering

### 4.1 Create Derived Features

In [7]:
# Derive per-household and per-room ratios from the raw count columns.
# The new columns are appended to `df` in the order they are assigned.
df['rooms_per_household'] = df['total_rooms'].div(df['households'])
df['bedrooms_per_room'] = df['total_bedrooms'].div(df['total_rooms'])
df['population_per_household'] = df['population'].div(df['households'])

# Single print call; sep='\n' puts each item on its own line.
print(
    'Engineered features created:',
    '  - rooms_per_household',
    '  - bedrooms_per_room',
    '  - population_per_household',
    sep='\n',
)

Engineered features created:
  - rooms_per_household
  - bedrooms_per_room
  - population_per_household


### 4.2 Encode Categorical Variables

In [8]:
# One-hot encode `ocean_proximity`. drop_first=True omits the first category
# level, so k categories become k-1 indicator columns and the omitted level
# acts as the implicit baseline.
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own; a second
# run fails because the `ocean_proximity` column no longer exists.
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)
# The count printed here still includes the target column `median_house_value`,
# which is only separated from the features in a later cell.
print(f'Total features after encoding: {df.shape[1]}')

Total features after encoding: 16


### 4.3 Separate Features and Target

In [9]:
# Pull out the target vector, then build the feature matrix as every
# remaining column of `df`.
y = df['median_house_value']
X = df.drop('median_house_value', axis=1)

print(
    f'Features shape: {X.shape}',
    f'Target shape: {y.shape}',
    sep='\n',
)

Features shape: (20640, 15)
Target shape: (20640,)


### 4.4 Train/Test Split

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f'Training set: {X_train.shape[0]} samples',
    f'Test set: {X_test.shape[0]} samples',
    sep='\n',
)

Training set: 16512 samples
Test set: 4128 samples


### 4.5 Feature Scaling

In [11]:
# Standardise every column except the one-hot `ocean_proximity_*` indicators,
# which are left untouched.
is_dummy = X_train.columns.str.startswith('ocean_proximity_')
numeric_cols = X_train.columns[~is_dummy].tolist()

# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so no test-set statistics enter the scaling.
scaler = StandardScaler().fit(X_train[numeric_cols])

# Work on copies so the unscaled splits remain available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

print(f'Scaled {len(numeric_cols)} numeric features')

Scaled 11 numeric features


---
## 🤖 5. Model Training

### 5.1 Define Evaluation Metrics

In [12]:
def compute_metrics(y_true, y_pred):
    """Compute R², MAE and RMSE for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    tuple
        ``(r2, mae, rmse)`` in that order.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root of the MSE explicitly instead of passing
    # `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, whereas this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return r2, mae, rmse

### 5.2 Train Linear Regression

In [13]:
# Ordinary least-squares baseline: fit on the scaled training split and
# score on the held-out test split.
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)
lin_pred = lin_reg.predict(X_test_scaled)
lin_r2, lin_mae, lin_rmse = compute_metrics(y_test, lin_pred)

print(f'Linear Regression → R²: {lin_r2:.4f}, MAE: ${lin_mae:,.0f}, RMSE: ${lin_rmse:,.0f}')

Linear Regression â†’ RÂ²: 0.5970, MAE: $50,889, RMSE: $72,669




### 5.3 Train Ridge Regression

In [14]:
# L2-regularised linear model with penalty strength alpha=1.0, fit on the
# scaled training split and scored on the held-out test split.
# NOTE(review): per the scikit-learn docs, `random_state` is only used by the
# 'sag'/'saga' solvers; it is presumably kept here for uniformity. Confirm.
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(X_train_scaled, y_train)
ridge_pred = ridge.predict(X_test_scaled)
ridge_r2, ridge_mae, ridge_rmse = compute_metrics(y_test, ridge_pred)

print(f'Ridge Regression → R²: {ridge_r2:.4f}, MAE: ${ridge_mae:,.0f}, RMSE: ${ridge_rmse:,.0f}')

Ridge Regression â†’ RÂ²: 0.5969, MAE: $50,899, RMSE: $72,679




### 5.4 Train Lasso Regression

In [15]:
# L1-regularised linear model with penalty strength alpha=100, fit on the
# scaled training split and scored on the held-out test split.
# max_iter is raised to 10000, presumably so the coordinate-descent solver
# converges. Confirm it is still needed.
lasso = Lasso(alpha=100, random_state=42, max_iter=10000)
lasso.fit(X_train_scaled, y_train)
lasso_pred = lasso.predict(X_test_scaled)
lasso_r2, lasso_mae, lasso_rmse = compute_metrics(y_test, lasso_pred)

print(f'Lasso Regression → R²: {lasso_r2:.4f}, MAE: ${lasso_mae:,.0f}, RMSE: ${lasso_rmse:,.0f}')

Lasso Regression â†’ RÂ²: 0.5976, MAE: $50,886, RMSE: $72,620




---
## 📈 6. Model Evaluation

### 6.1 Compare Model Performance

In [16]:
# One row per fitted model: (name, R², MAE, RMSE), all measured on the test split.
# NOTE(review): ranking the models on the same test split that is used to
# report their scores makes the winner's reported score slightly optimistic.
score_rows = [
    ('Linear Regression', lin_r2, lin_mae, lin_rmse),
    ('Ridge Regression', ridge_r2, ridge_mae, ridge_rmse),
    ('Lasso Regression', lasso_r2, lasso_mae, lasso_rmse),
]
results = pd.DataFrame(score_rows, columns=['Model', 'RÂ²', 'MAE', 'RMSE'])

# Best model first: later code reads the winner from row 0.
results = results.sort_values('RÂ²', ascending=False)

rule = '='*60
print('\n' + rule)
print('MODEL PERFORMANCE COMPARISON')
print(rule)
print(results.to_string(index=False))
print(rule)


MODEL PERFORMANCE COMPARISON
            Model       RÂ²          MAE         RMSE
 Lasso Regression 0.597559 50886.499646 72619.712168
Linear Regression 0.597018 50888.660016 72668.538379
 Ridge Regression 0.596898 50898.811574 72679.324274


### 6.2 Save Best Model

In [17]:
# Row 0 is the winner; this relies on `results` having been sorted
# best-first by the comparison cell.
best_name = results.iloc[0]['Model']
model_map = {
    'Linear Regression': lin_reg,
    'Ridge Regression': ridge,
    'Lasso Regression': lasso
}
best_model = model_map[best_name]

# Bundle the estimator with the fitted scaler, the column lists, the
# imputation median and the engineered-feature names computed earlier in
# the notebook, so they are persisted together in one file.
artifact = {
    'model_name': best_name,
    'model': best_model,
    'scaler': scaler,
    'numeric_columns': numeric_cols,
    'final_columns': X_train_scaled.columns.tolist(),
    'bedrooms_median': bedrooms_median,
    'engineered_features': ['rooms_per_household', 'bedrooms_per_room', 'population_per_household']
}

# Create the output directory first: joblib.dump raises FileNotFoundError
# when the parent directory is missing (e.g. on a fresh clone).
model_path = Path('../models/best_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(artifact, model_path)
print(f'\n✓ Best model ({best_name}) saved to models/best_model.pkl')


âœ“ Best model (Lasso Regression) saved to models/best_model.pkl
