In [1]:
import warnings
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

LOAD DATASET

In [4]:
# Read the raw housing table from a CSV file in the working directory.
DATA_PATH = "housing_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# Show the number of missing values per column before any cleaning.
print(df.isnull().sum().to_string())

# Fill the gaps in total_bedrooms with the column median; the filled series
# is assigned back to the column instead of relying on an inplace fill.
bedrooms_median = df["total_bedrooms"].median()
df["total_bedrooms"] = df["total_bedrooms"].fillna(bedrooms_median)

# One-hot encode ocean_proximity (first level dropped) and swap the dummy
# columns in for the original categorical column.
prox_dummies = pd.get_dummies(
    df["ocean_proximity"],
    prefix="prox",
    drop_first=True,
)
df_without_prox = df.drop(columns="ocean_proximity")
df = pd.concat([df_without_prox, prox_dummies], axis=1)

# Cap each count column at its own 99th percentile.
# NOTE(review): the median above and these caps are computed on the full
# frame, before the train/test split done later in the notebook, so test
# rows influence the preprocessing statistics.
# NOTE(review): this cell drops ocean_proximity from df, so it cannot be
# re-run without reloading the data first.
for column in ("total_rooms", "total_bedrooms", "population", "households"):
    upper_cap = df[column].quantile(0.99)
    df[column] = df[column].clip(upper=upper_cap)
print("Capped outliers at 99th pct for room/population cols")

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
Capped outliers at 99th pct for room/population cols


FEATURE ENGINEERING

In [6]:
# Ratio features: each new column is numerator / denominator, row by row.
# The dict preserves the order in which the columns are appended to df.
RATIO_FEATURES = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}
for feature_name, (numerator, denominator) in RATIO_FEATURES.items():
    df[feature_name] = df[numerator] / df[denominator]

# No column may contain NaN after imputation and feature construction.
assert df.isnull().sum().sum() == 0, "Still have NaNs!"
# A zero denominator produces +/-inf rather than NaN, which the check above
# does not catch; verify the new ratio columns are finite as well.
assert np.isfinite(df[list(RATIO_FEATURES)].to_numpy(dtype=float)).all(), \
    "Ratio features contain non-finite values (zero denominator?)"

print(f"\n── Describe ──\n{df.describe().round(2).to_string()}")


── Describe ──
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  population  households  median_income  median_house_value  rooms_per_household  bedrooms_per_room  population_per_household
count   20640.00  20640.00            20640.00     20640.00        20640.00    20640.00    20640.00       20640.00            20640.00             20640.00           20640.00                  20640.00
mean     -119.57     35.63               28.64      2587.24          529.09     1403.61      492.21           3.87           206855.82                 5.42               0.21                      3.05
std         2.00      2.14               12.59      1857.01          370.68      973.48      336.97           1.90           115395.62                 2.47               0.06                      8.66
min      -124.35     32.54                1.00         2.00            1.00        3.00        1.00           0.50            14999.00                 0.85               0.04      

DATASET SPLIT

In [8]:
# Column to predict; every other column of df is used as a feature.
TARGET = "median_house_value"
FEATURES = [name for name in df.columns if name != TARGET]

X = df[FEATURES]
y = df[TARGET]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
Xs_train = scaler.fit_transform(X_train)
Xs_test = scaler.transform(X_test)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train: {n_train:,}  |  Test: {n_test:,}")
print(f"Features ({len(FEATURES)}): {FEATURES}")

Train: 16,512  |  Test: 4,128
Features (15): ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'prox_INLAND', 'prox_ISLAND', 'prox_NEAR BAY', 'prox_NEAR OCEAN', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household']


MODEL TRAINING

In [9]:
# Ordinary least squares on the standardised training features.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(Xs_train, y_train)
print("Model trained")

Model trained


MODEL EVALUATION

In [10]:
# Predictions on both splits (inputs are the standardised feature matrices).
y_pred_train = model.predict(Xs_train)
y_pred_test = model.predict(Xs_test)

# Root mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Coefficient of determination.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# Mean absolute error, computed directly from the residuals.
mae_train = np.abs(y_train - y_pred_train).mean()
mae_test = np.abs(y_test - y_pred_test).mean()

# Fixed-width summary table: one row per metric, train vs. test.
print(f"\n{'Metric':<22}{'Train':>14}{'Test':>14}")
print(f"{'RMSE ($)':<22}{rmse_train:>14,.0f}{rmse_test:>14,.0f}")
print(f"{'R²':<22}{r2_train:>14.4f}{r2_test:>14.4f}")
print(f"{'MAE ($)':<22}{mae_train:>14,.0f}{mae_test:>14,.0f}")


Metric                         Train          Test
RMSE ($)                      66,755        72,087
R²                            0.6666        0.6034
MAE ($)                       48,443        50,751


COEFFICIENTS

In [11]:
# Pair each feature with its fitted coefficient, largest (most positive) first.
coef_df = (
    pd.DataFrame({"Feature": FEATURES, "Coefficient": model.coef_})
    .sort_values("Coefficient", ascending=False)
    .reset_index(drop=True)
)

print("\n── Standardised Coefficients ──")
print(f"  {'Feature':<32}{'Coeff ($)':>12}  Bar")

# Bars are scaled so the largest absolute coefficient spans 20 blocks;
# every row gets at least one block.
max_c = coef_df["Coefficient"].abs().max()
for feature, coefficient in zip(coef_df["Feature"], coef_df["Coefficient"]):
    magnitude = abs(coefficient)
    bar_length = max(1, int(magnitude / max_c * 20))
    bar = "█" * bar_length
    sign = "+" if coefficient >= 0 else "-"
    print(f"  {feature:<32}{sign}{magnitude:>11,.0f}  {bar}")

print(f"\n  Intercept: ${model.intercept_:,.0f}")


── Standardised Coefficients ──
  Feature                            Coeff ($)  Bar
  median_income                   +     78,410  ████████████████████
  households                      +     46,540  ███████████
  bedrooms_per_room               +     16,346  ████
  housing_median_age              +     14,145  ███
  rooms_per_household             +      7,408  █
  total_bedrooms                  +      6,554  █
  total_rooms                     +      5,896  █
  prox_ISLAND                     +      1,976  █
  population_per_household        +      1,649  █
  prox_NEAR OCEAN                 +        390  █
  prox_NEAR BAY                   -      2,339  █
  prox_INLAND                     -     16,072  ████
  population                      -     53,486  █████████████
  longitude                       -     57,118  ██████████████
  latitude                        -     58,124  ██████████████

  Intercept: $207,195


SAVE MODEL

In [12]:
# Persist the fitted model together with the scaler and the feature order,
# bundled in a single dict, to one file.
MODEL_PATH = "housing_lr_model.pkl"
artifact = {
    "model": model,
    "scaler": scaler,
    "features": FEATURES,
}
joblib.dump(artifact, MODEL_PATH)
print("Model saved")

Model saved


PREDICTION EXAMPLE

In [14]:
# Score the first few held-out rows and compare against the true values.
N_EXAMPLES = 10
sample = X_test.iloc[:N_EXAMPLES].copy()
preds = model.predict(scaler.transform(sample))
actuals = y_test.iloc[:N_EXAMPLES].values

# Signed error (prediction minus actual) and the same as a percentage of actual.
errors = preds - actuals
pct_err = errors / actuals * 100

print(f"\n{len(sample)} Example Predictions")
print(f"  {'#':<4}{'Actual ($)':>12}{'Predicted ($)':>14}{'Error ($)':>11}{'Err %':>8}")
for i, (a, p, e, pe) in enumerate(zip(actuals, preds, errors, pct_err), start=1):
    # The '+' flag in the format spec puts the sign directly in front of the
    # digits and counts it inside the field width, so positive and negative
    # rows line up under the header (11 chars for the error, 7 + '%' for the
    # percentage). Prepending a separate sign string left the '+' detached
    # from the number and made negative rows one character narrower.
    print(f"  {i:<4}{a:>12,.0f}{p:>14,.0f}{e:>+11,.0f}{pe:>+7.1f}%")


10 Example Predictions
  #     Actual ($) Predicted ($)  Error ($)   Err %
  1         47,700        56,869+     9,169+   19.2%
  2         45,800       123,902+    78,102+  170.5%
  3        500,001       283,282  -216,719  -43.3%
  4        218,600       260,707+    42,107+   19.3%
  5        278,000       259,437   -18,563   -6.7%
  6        158,700       150,999    -7,701   -4.9%
  7        198,200       308,835+   110,635+   55.8%
  8        157,500       241,263+    83,763+   53.2%
  9        340,000       263,350   -76,650  -22.5%
  10       446,600       411,333   -35,267   -7.9%
