In [6]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path

In [None]:
# Location of the SQLite database that holds the resale price table.
data_dir = Path("./data")
db_path = data_dir / "hdb_prices.db"

# Read the whole `resale_prices` table into a DataFrame.  The try/finally
# guarantees the connection is closed even when the query raises (for example
# when the table does not exist), so a failed cell does not leave an open
# handle behind in the kernel.
conn = sqlite3.connect(db_path)
try:
    resale_df = pd.read_sql("SELECT * FROM resale_prices", conn)
finally:
    conn.close()

   _id    month        town flat_type      flat_model block  \
0    1  2012-03  ANG MO KIO    2 ROOM        Improved   172   
1    2  2012-03  ANG MO KIO    2 ROOM        Improved   510   
2    3  2012-03  ANG MO KIO    3 ROOM  New Generation   610   
3    4  2012-03  ANG MO KIO    3 ROOM  New Generation   474   
4    5  2012-03  ANG MO KIO    3 ROOM  New Generation   604   

         street_name storey_range  floor_area_sqm lease_commence_date  \
0   ANG MO KIO AVE 4     06 TO 10            45.0                1986   
1   ANG MO KIO AVE 8     01 TO 05            44.0                1980   
2   ANG MO KIO AVE 4     06 TO 10            68.0                1980   
3  ANG MO KIO AVE 10     01 TO 05            67.0                1984   
4   ANG MO KIO AVE 5     06 TO 10            67.0                1980   

   resale_price  
0      250000.0  
1      265000.0  
2      315000.0  
3      320000.0  
4      321000.0  


In [8]:
# Display the column names of the table loaded from SQLite.
resale_df.columns

Index(['_id', 'month', 'town', 'flat_type', 'flat_model', 'block',
       'street_name', 'storey_range', 'floor_area_sqm', 'lease_commence_date',
       'resale_price'],
      dtype='object')

In [None]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import joblib
import optuna

# ----------------------------------------------------------------------
# 1. Load data
# ----------------------------------------------------------------------
DATA_PATH = "resale_price.csv"   # <--- change if necessary

# ----------------------------------------------------------------------
# 2. Basic cleaning / column drops
# ----------------------------------------------------------------------
# Columns listed here are removed right after reading; errors="ignore" means
# a column that is absent from the file is skipped instead of raising.
drop_cols = ["block", "street_name", "remaining_lease", "lease_commence_date"]
df = pd.read_csv(DATA_PATH).drop(columns=drop_cols, errors="ignore")

# ----------------------------------------------------------------------
# 3. Feature engineering
# ----------------------------------------------------------------------
# 3a. Map storey_range to low / medium / high
def map_storey(s: str) -> str:
    """
    Bucket a storey-range label into 'low', 'medium' or 'high'.

    Only the lower bound of the range is used, i.e. the integer in front of
    ' TO ' (1 for '01 TO 03', 22 for '22 TO 24', 40 for '40 TO 42'):

    * lower bound <= 6   -> 'low'
    * lower bound <= 12  -> 'medium'
    * anything above     -> 'high'

    Returns 'unknown' when the lower bound cannot be extracted or parsed.
    """
    try:
        head, *_ = s.split(" TO ")
        first_floor = int(head)
    except Exception:
        return "unknown"

    # Ascending upper limits; the first one that fits decides the bucket.
    for ceiling, label in ((6, "low"), (12, "medium")):
        if first_floor <= ceiling:
            return label
    return "high"

# Cast to str first so map_storey always receives a string.
df["storey_group"] = df["storey_range"].astype(str).map(map_storey)

# 3b. Convert month to datetime then to numeric "days since earliest"
month_ts = pd.to_datetime(df["month"], errors="coerce")  # unparseable -> NaT
df["month"] = month_ts
df["month_int"] = (month_ts - month_ts.min()).dt.days

# 3c. Final feature / target lists
target = "resale_price"
num_features = ["floor_area_sqm", "month_int"]
cat_features = ["town", "flat_type", "flat_model", "storey_group"]

# Numeric columns come first, categorical columns after.
y = df[target]
X = df[[*num_features, *cat_features]]

# ----------------------------------------------------------------------
# 4. Pre-processing pipeline
# ----------------------------------------------------------------------
# Numeric columns are forwarded untouched; categorical columns are one-hot
# encoded, with categories unseen during fit ignored at transform time.
categorical_tf = OneHotEncoder(handle_unknown="ignore")
numeric_tf = "passthrough"

preprocessor = ColumnTransformer(
    [("num", numeric_tf, num_features), ("cat", categorical_tf, cat_features)]
)

# ----------------------------------------------------------------------
# 5. Train / test split
# ----------------------------------------------------------------------
# Hold out 15% of the rows as the test set; the fixed seed keeps the
# partition identical between runs.
splits = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = splits

# ----------------------------------------------------------------------
# 6. Optuna hyper-parameter search
# ----------------------------------------------------------------------
def objective(trial: optuna.Trial):
    """
    Optuna objective: validation RMSE of one XGBoost pipeline configuration.

    A hyper-parameter set is sampled from ``trial``, a preprocessing +
    ``XGBRegressor`` pipeline is fitted on part of the training data, and the
    pipeline is scored on a validation slice carved out of the training data.
    The held-out test set is deliberately not used here: selecting
    hyper-parameters on the same rows that are later used for the final report
    would make that report optimistically biased.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        RMSE on the validation slice (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000, step=200),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
        "n_jobs": -1,
    }

    # Fixed seed: every trial is compared on exactly the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.15, random_state=42
    )

    model = xgb.XGBRegressor(**params)
    pipe = Pipeline(steps=[("prep", preprocessor), ("reg", model)])
    pipe.fit(X_fit, y_fit)
    preds = pipe.predict(X_val)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
    # argument was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
    return rmse

# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every Restart & Run All.  TPESampler is Optuna's default sampler for
# single-objective studies, so the search algorithm is unchanged.
study = optuna.create_study(
    direction="minimize",
    study_name="xgb_resale",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)

# ----------------------------------------------------------------------
# 7. Retrain on full training set with best parameters
# ----------------------------------------------------------------------
# Best sampled hyper-parameters plus the fixed settings that were not part of
# the search space.
best_params = {**study.best_params, "random_state": 42, "n_jobs": -1}

final_model = xgb.XGBRegressor(**best_params)
pipe_final = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", final_model),
    ]
)
pipe_final.fit(X_train, y_train)

# ----------------------------------------------------------------------
# 8. Final evaluation
# ----------------------------------------------------------------------
# Score the final pipeline on the held-out test set.
test_preds = pipe_final.predict(X_test)
print("Test MAE :", mean_absolute_error(y_test, test_preds))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Test RMSE:", float(np.sqrt(mean_squared_error(y_test, test_preds))))

# ----------------------------------------------------------------------
# 9. Save model
# ----------------------------------------------------------------------
MODEL_PATH = "resale_xgb_tuned.pkl"
# Persist the fitted pipeline (preprocessing + regressor) as a single artifact
# and report the absolute location it was written to.
joblib.dump(value=pipe_final, filename=MODEL_PATH)
print(f"Model saved to {os.path.abspath(MODEL_PATH)}")