In [1]:
# PART 0: Import Libraries & Load Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv(r"C:\Users\manje\OneDrive\Desktop\housd prediction model\House Price India.csv")
df.head()


Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810145,42491,5,2.5,3650,9050,2.0,0,4,5,...,1921,0,122003,52.8645,-114.557,2880,5400,2,58,2380000
1,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,...,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
2,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,...,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
3,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,...,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
4,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,...,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000


In [2]:
# PART 1: BASELINE MODELS

target = "Price"

y = df[target].values

def get_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2

# Mean Baseline
mean_pred = np.full_like(y, y.mean())
mean_rmse, mean_mae, mean_r2 = get_metrics(y, mean_pred)

# Median Baseline
median_pred = np.full_like(y, np.median(y))
median_rmse, median_mae, median_r2 = get_metrics(y, median_pred)

print("Mean Baseline:", mean_rmse, mean_mae, mean_r2)
print("Median Baseline:", median_rmse, median_mae, median_r2)


Mean Baseline: 367519.81108222017 233142.52777017784 -3.5282887722587475e-13
Median Baseline: 378126.633543771 221062.11354309166 -0.058554023383986964


In [3]:
# PART 2: PREPROCESSING

df = df[df[target].notna()].reset_index(drop=True)

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove("Price")

cat_cols = df.select_dtypes(include=['object']).columns.tolist()
low_cardinality_cat = [c for c in cat_cols if df[c].nunique() <= 50]

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
for c in low_cardinality_cat:
    df[c] = df[c].fillna("Missing")

X = df[numeric_cols + low_cardinality_cat]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = Pipeline([("ohe", OneHotEncoder(handle_unknown='ignore'))])

preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, low_cardinality_cat)
])


In [4]:
# PART 3: SIMPLE LINEAR REGRESSION (SLR)

corr = df[numeric_cols + ["Price"]].corr()["Price"].abs().sort_values(ascending=False)
best_single = corr.index[0]

print("Best predictor for SLR =", best_single)

X_train_slr = X_train[[best_single]].values
X_test_slr = X_test[[best_single]].values

scaler = StandardScaler().fit(X_train_slr)
X_train_slr_scaled = scaler.transform(X_train_slr)
X_test_slr_scaled = scaler.transform(X_test_slr)

slr = LinearRegression().fit(X_train_slr_scaled, y_train)
y_pred_slr = slr.predict(X_test_slr_scaled)

slr_rmse, slr_mae, slr_r2 = get_metrics(y_test, y_pred_slr)
print("SLR:", slr_rmse, slr_mae, slr_r2)


Best predictor for SLR = Price


KeyError: "None of [Index(['Price'], dtype='object')] are in the [columns]"

In [5]:
# PART 4: MULTIPLE LINEAR REGRESSION (MLR)

mlr = Pipeline([
    ("pre", preprocess),
    ("model", LinearRegression())
])

mlr.fit(X_train, y_train)
y_pred_mlr = mlr.predict(X_test)

mlr_rmse, mlr_mae, mlr_r2 = get_metrics(y_test, y_pred_mlr)

# Adjusted R2
p = preprocess.fit_transform(X_train).shape[1]
n = len(y_test)
adj_r2 = 1 - (1 - mlr_r2) * (n - 1) / (n - p - 1)

print("MLR:", mlr_rmse, mlr_mae, mlr_r2, adj_r2)


MLR: 187067.05983154147 105220.86360163798 0.7516722570534733 0.7497890408022414


In [7]:
pip install statsmodels


Collecting statsmodels
  Downloading statsmodels-0.14.5-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Downloading statsmodels-0.14.5-cp312-cp312-win_amd64.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   -- ------------------------------------- 0.5/9.6 MB 4.2 MB/s eta 0:00:03
   ------- -------------------------------- 1.8/9.6 MB 5.6 MB/s eta 0:00:02
   ---------- ----------------------------- 2.6/9.6 MB 5.0 MB/s eta 0:00:02
   --------------------- ------------------ 5.2/9.6 MB 7.2 MB/s eta 0:00:01
   --------------------------------- ------ 8.1/9.6 MB 8.8 MB/s eta 0:00:01
   ---------------------------------------- 9.6/9.6 MB 8.7 MB/s eta 0:00:00
Downloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-1.0.2 statsmodels-0.14.5
Note: you may need to restart the kernel to


[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# PART 5: VIF CHECK

from statsmodels.stats.outliers_influence import variance_inflation_factor

X_numeric = X_train[numeric_cols]
vif_df = pd.DataFrame()
vif_df["feature"] = numeric_cols
vif_df["VIF"] = [variance_inflation_factor(X_numeric.values, i) for i in range(len(numeric_cols))]

vif_df.sort_values("VIF", ascending=False)


Unnamed: 0,feature,VIF
4,living area,inf
12,Area of the basement,inf
11,Area of the house(excluding basement),inf
0,id,49118740.0
10,grade of the house,3.436989
3,number of bathrooms,3.35746
18,living_area_renov,3.002322
13,Built Year,2.4076
19,lot_area_renov,2.114348
5,lot area,2.07098


In [9]:
# PART 6: REGULARIZED MODELS

models = {
    "Ridge": Ridge(),
    "Lasso": Lasso(max_iter=5000),
    "ElasticNet": ElasticNet(max_iter=5000)
}

params = {
    "Ridge": {"model__alpha": [0.1, 1, 10]},
    "Lasso": {"model__alpha": [0.001, 0.01, 0.1]},
    "ElasticNet": {"model__alpha": [0.01, 0.1], "model__l1_ratio": [0.3, 0.5]}
}

results = {}

for name, model in models.items():
    pipe = Pipeline([("pre", preprocess), ("model", model)])
    grid = GridSearchCV(pipe, params[name], cv=3, scoring="neg_root_mean_squared_error")
    grid.fit(X_train, y_train)
    
    best = grid.best_estimator_
    pred = best.predict(X_test)
    rmse, mae, r2 = get_metrics(y_test, pred)
    
    results[name] = (rmse, mae, r2, grid.best_params_)

results


{'Ridge': (np.float64(187149.35981931802),
  np.float64(105497.43319296658),
  0.751453705845912,
  {'model__alpha': 10}),
 'Lasso': (np.float64(187162.32683698056),
  np.float64(105542.34707157659),
  0.7514192625986356,
  {'model__alpha': 0.1}),
 'ElasticNet': (np.float64(187069.238887749),
  np.float64(105186.20518532387),
  0.7516664717135862,
  {'model__alpha': 0.01, 'model__l1_ratio': 0.3})}

In [10]:
# PART 7: CROSS VALIDATION

cv = KFold(5, shuffle=True, random_state=42)

cv_results = cross_validate(
    mlr, X, y,
    cv=cv,
    scoring=("neg_root_mean_squared_error", "r2"),
    return_train_score=False
)

print("CV RMSE:", -cv_results["test_neg_root_mean_squared_error"].mean())
print("CV R2:", cv_results["test_r2"].mean())


CV RMSE: 186647.30992803985
CV R2: 0.7416957262343822


In [12]:
# PART 8: CHAMPION MODEL SELECTION (FIXED VERSION)

leaderboard = {
    "Mean Baseline": mean_rmse,
    "Median Baseline": median_rmse,
    "MLR": mlr_rmse,
    "Ridge": results["Ridge"][0],
    "Lasso": results["Lasso"][0],
    "ElasticNet": results["ElasticNet"][0]
}

# --- Only add SLR if it exists ---
try:
    leaderboard[f"SLR ({best_single})"] = slr_rmse
except:
    print("SLR was not computed — skipping SLR in leaderboard.")

# Convert to series and sort
leaderboard_series = pd.Series(leaderboard).sort_values()

print("\nLeaderboard (sorted by RMSE):")
print(leaderboard_series)

# Pick champion
champion = leaderboard_series.index[0]
print("\nChampion Model =", champion)


SLR was not computed — skipping SLR in leaderboard.

Leaderboard (sorted by RMSE):
MLR                187067.059832
ElasticNet         187069.238888
Ridge              187149.359819
Lasso              187162.326837
Mean Baseline      367519.811082
Median Baseline    378126.633544
dtype: float64

Champion Model = MLR


In [14]:
# PART 9: SAVE ARTIFACTS
import os
import json
ART_DIR = r"C:\Users\manje\OneDrive\Desktop\housd prediction model\artifacts"
os.makedirs(ART_DIR, exist_ok=True)

# Save champion model
joblib.dump(mlr, ART_DIR + r"\champion_model.joblib")

# Save predictions
pred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred_mlr})
pred_df.to_csv(ART_DIR + r"\houseprice_predictions_test.csv", index=False)

# Save metrics
metrics_dict = {
    "RMSE": mlr_rmse,
    "MAE": mlr_mae,
    "R2": mlr_r2,
    "Adj_R2": adj_r2
}
with open(ART_DIR + r"\test_metrics.json", "w") as f:
    json.dump(metrics_dict, f, indent=4)

print("Artifacts saved.")


Artifacts saved.
