# 1. Load files

In [14]:
"""
Train a CatBoost model for UK housing prices on 2016â€“2017 data.

Steps:
1. Load cleaned data
2. Filter to years 2016â€“2017
3. Optionally keep only the newest N rows
4. Train/val/test split (80 / 10 / 10)
5. Train CatBoostRegressor with categorical features
6. Evaluate on val + test
7. Save model
"""

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor

# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
CLEANED_PATH = "cleaned_uk_housing2.csv"
TARGET_COL = "price"

USE_NEWEST_ONLY = True       # set to False if you want ALL 2016â€“2017 rows
MAX_ROWS = 1000000          # newest N rows within 2016â€“2017

RANDOM_STATE = 42
# ------------------------------------------------------------
# 1. LOAD CLEANED DATA
# ------------------------------------------------------------
if not os.path.exists(CLEANED_PATH):
    raise FileNotFoundError(f"File not found: {CLEANED_PATH}")

df = pd.read_csv(CLEANED_PATH)
print(f"âœ… Loaded cleaned data: {df.shape}")

if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in the dataset.")


âœ… Loaded cleaned data: (21158869, 14)


# 2. FEATURE ENGINEERING + FILTER (2016â€“2017)

In [None]:


# Ensure date_of_transfer exists and build numeric timestamp
if "date_of_transfer" not in df.columns:
    raise ValueError("Column 'date_of_transfer' not found in the dataset.")

df["date_of_transfer"] = pd.to_datetime(df["date_of_transfer"], errors="coerce")
df["date_numeric"] = df["date_of_transfer"].astype("int64") // 10**9

# Year / month
if "year" not in df.columns:
    df["year"] = df["date_of_transfer"].dt.year
if "month" not in df.columns:
    df["month"] = df["date_of_transfer"].dt.month

df["year"] = pd.to_numeric(df["year"], errors="coerce")

# Choose features for the model
selected_features = [
    "district",
    "town",
    "county",
    "month",
    "year",
    "property_type",
    "tenure",
    "new_build_flag",
    "date_numeric",
]

cols_to_use = [c for c in selected_features if c in df.columns] + [TARGET_COL]
df_small = df[cols_to_use].copy()

# Ensure new_build_flag is categorical if present
if "new_build_flag" in df_small.columns:
    df_small["new_build_flag"] = df_small["new_build_flag"].astype("object")

# Filter to 2016â€“2017
df_small = df_small[df_small["year"].isin([2016, 2017])].copy()
print(f"âœ… Rows after year filter (2016â€“2017): {df_small.shape}")

# Sort oldest â†’ newest using year, month, and date_numeric as tie-breaker
sort_cols = [c for c in ["year", "month", "date_numeric"] if c in df_small.columns]
df_small = df_small.sort_values(by=sort_cols, ascending=True)

# Optionally keep only the newest MAX_ROWS
if USE_NEWEST_ONLY and len(df_small) > MAX_ROWS:
    df_small = df_small.tail(MAX_ROWS).reset_index(drop=True)
    print(f"âœ… Using newest {MAX_ROWS:,} rows from 2016â€“2017: {df_small.shape}")
else:
    df_small = df_small.reset_index(drop=True)
    print("âœ… Using all rows within 2016â€“2017:", df_small.shape)


âœ… Rows after year filter (2016â€“2017): (1170866, 10)
âœ… Using newest 1,000,000 rows from 2016â€“2017: (1000000, 10)


# 3. TRAIN / VAL / TEST SPLIT (80 / 10 / 10)

In [None]:

X = df_small.drop(columns=[TARGET_COL])
y = df_small[TARGET_COL]

# 80% train+val, 20% test
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Within trainval, make explicit val chunk (~10% overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1111, random_state=RANDOM_STATE
)
# 0.1111 of 0.8 â‰ˆ 0.0889, so roughly 80 / 10 / 10

print("\n=== SPLIT SUMMARY ===")
print("Train:", X_train.shape)
print("Val:  ", X_val.shape)
print("Test: ", X_test.shape)





=== SPLIT SUMMARY ===
Train: (711120, 9)
Val:   (88880, 9)
Test:  (200000, 9)


# 4. CATEGORICAL FEATURES FOR CATBOOST

In [None]:
# Categorical columns: CatBoost can handle them natively
cat_feature_names = []
for col in ["district", "town", "county", "property_type", "tenure", "new_build_flag"]:
    if col in X_train.columns:
        cat_feature_names.append(col)

print("\nCategorical features for CatBoost:", cat_feature_names)



Categorical features for CatBoost: ['district', 'town', 'county', 'property_type', 'tenure', 'new_build_flag']


# 5. TRAIN CATBOOST REGRESSOR

In [None]:

model = CatBoostRegressor(
    loss_function="RMSE",
    eval_metric="RMSE",
    depth=8,
    learning_rate=0.05,
    n_estimators=3000,
    subsample=0.8,
    random_state=RANDOM_STATE,
    l2_leaf_reg=3.0,
    od_type="Iter",   # early stopping type
    od_wait=100,      # rounds to wait before stopping
    use_best_model=True,
    verbose=False     # training logs off; fit() can still override with verbose
)
print("\n Training CatBoost on raw price (no log-transform)...")
model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_feature_names,
    verbose=100  # print every 100 iterations
)



ðŸš€ Training CatBoost on raw price (no log-transform)...
0:	learn: 91056.1892742	test: 91303.9437515	best: 91303.9437515 (0)	total: 248ms	remaining: 12m 22s
100:	learn: 59275.4400136	test: 59188.2923955	best: 59188.2923955 (100)	total: 20.7s	remaining: 9m 53s
200:	learn: 58366.7779498	test: 58297.6319716	best: 58297.6319716 (200)	total: 40.6s	remaining: 9m 25s


KeyboardInterrupt: 

# 6. EVALUATION

In [None]:

def eval_regression(y_true, y_pred, name=""):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"\n=== {name} ===")
    print(f"MAE:  Â£{mae:,.2f}")
    print(f"RMSE: Â£{rmse:,.2f}")
    print(f"RÂ²:   {r2:.4f}")
    return mae, rmse, r2

# Validation
val_pred = model.predict(X_val)
eval_regression(y_val, val_pred, "Validation (CatBoost)")

# Test
test_pred = model.predict(X_test)
mae_test, rmse_test, r2_test = eval_regression(y_test, test_pred, "TEST (CatBoost)")

avg_price = y_test.mean()
mae_percent = (mae_test / avg_price) * 100
print(f"\nAverage actual house price (test): Â£{avg_price:,.2f}")
print(f"MAE â‰ˆ {mae_percent:.2f}% of average price.")


=== Validation (CatBoost) ===
MAE:  Â£42,425.34
RMSE: Â£57,089.25
RÂ²:   0.6281

=== TEST (CatBoost) ===
MAE:  Â£42,309.31
RMSE: Â£56,954.98
RÂ²:   0.6281

Average actual house price (test): Â£198,689.62
MAE â‰ˆ 21.29% of average price.


# 7 Save model

In [None]:
import pickle

# Save as Pickle
PKL_PATH = "uk_housing_price_catboost.pkl"
with open(PKL_PATH, "wb") as f:
    pickle.dump(model, f)

print(f"\nðŸ’¾ Saved CatBoost model to: {PKL_PATH}")

# Also save native CatBoost format (optional but recommended)
CBM_PATH = "uk_housing_price_catboost.cbm"
model.save_model(CBM_PATH)
print(f"ðŸ’¾ Saved CatBoost model in native format to: {CBM_PATH}")


ðŸ’¾ Saved CatBoost model to: uk_housing_price_catboost.pkl
ðŸ’¾ Saved CatBoost model in native format to: uk_housing_price_catboost.cbm
