#### 1. Imports and Set-up


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Show every column when a DataFrame is displayed and use seaborn's grid theme.
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

# Local directory (relative to the working directory) used as the MLflow store.
tracking_uri = "../logs/mlruns"
# Pre-create the ".trash" sub-folder inside the store.
# NOTE(review): presumably required by MLflow's file-based store -- confirm.
os.makedirs(os.path.join(tracking_uri, ".trash"), exist_ok=True)

# Point MLflow at the local store and make this experiment the active one.
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("house_price_prediction")

#### 2. Load and prep data


In [None]:
import sys
import os
from pathlib import Path
import yaml


# Make the directory one level above the working directory importable so that
# the `src` package resolves when the notebook is run from notebooks/.
project_root = os.path.abspath(
    os.path.join("..")
)  # from notebooks/ up one level

if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df
from src.data_loading.preprocessing.imputation import impute_missing_values


# ROOT is the directory that CONTAINS "house_price_prediction_project" (it is
# joined with that folder name below): two levels above this file's directory
# when __file__ exists, otherwise two levels above the working directory.
ROOT = (
    Path(__file__).resolve().parents[2]
    if "__file__" in globals()
    else Path.cwd().parents[1]
)
CONFIG_PATH = (
    ROOT
    / "house_price_prediction_project"
    / "config"
    / "preprocessing_config.yaml"
)

# Load the preprocessing settings (safe_load: plain YAML data only).
with open(CONFIG_PATH) as f:
    CONFIG = yaml.safe_load(f)

# Load the raw listings matched by the glob, then clean and impute them using
# the "preprocessing" section of the config.
df_raw = load_data_from_json("../data/parsed_json/*.json")
df_clean = preprocess_df(
    df_raw,
    drop_raw=CONFIG["preprocessing"]["drop_raw"],
    numeric_cols=CONFIG["preprocessing"]["numeric_cols"],
)
df_clean = impute_missing_values(
    df_clean, CONFIG["preprocessing"]["imputation"]
)
# Drop price_num NaNs for the training of the model
df_clean = df_clean[df_clean["price_num"].notna()]
# Independent working copy; df_clean itself is reused by later cells.
df = df_clean.copy()

In [None]:
# Count the missing values left in each column after preprocessing/imputation.
df.isna().sum()

In [None]:
from src.features.data_prep_for_modelling.data_preparation import prepare_data

# Feature configuration consumed by prepare_data.
FEATURES_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "features.yaml"
)

# Scaled features
# prepare_data returns seven values, unpacked here as train/test features,
# train/test targets, the scaler, and the validation features/target.
# NOTE(review): X_val is not given a "_scaled" name -- confirm whether
# prepare_data scales it as well.
X_train_scaled, X_test_scaled, y_train, y_test, scaler, X_val, y_val = (
    prepare_data(
        df, config_path=FEATURES_CONFIG_PATH, model_name="linear_regression"
    )
)

# # Raw features (no scaling)
# X_train_raw, X_test_raw, y_train, y_test, _, X_val_raw, y_val_raw = prepare_data(
#     df, config_path=FEATURES_CONFIG_PATH, model_name="linear_regression", scale=False
# )

#### 3. Code for evaluating and logging models


#### 4. Linear Regression


In [None]:
from src.model.evaluate import ModelEvaluator
from src.model.mlflow_logger import MLFlowLogger

# Shared helpers reused by the following cells: one evaluates models, the other
# logs them to MLflow.
evaluator = ModelEvaluator()
logger = MLFlowLogger()

lr_model = LinearRegression()

# evaluate() returns five values, unpacked here as the trained model, the
# train/val/test predictions and a results object that is passed to the logger.
# No validation set is supplied in this call.
trained_lr, y_train_pred, y_val_pred, y_test_pred,  lr_results = evaluator.evaluate(
    lr_model,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
    model_name="LinearRegression",
)
logger.log_model(trained_lr, "LinearRegression", lr_results)

#### 5. Random Forest Regression


In [None]:
from src.features.feature_engineering.encoding import encode_energy_label

# Re-prepare the data with the "random_forest" feature set; this overwrites
# y_train / y_test / scaler / X_val / y_val from the linear-regression cell.
X_train, X_test, y_train, y_test, scaler, X_val, y_val = prepare_data(
    df_clean, config_path=FEATURES_CONFIG_PATH, model_name="random_forest"
)
# Fit the energy-label encoder on the training split only ...
X_train, energy_encoder = encode_energy_label(X_train, fit=True)

# ... and reuse that fitted encoder for the test split.
# NOTE(review): X_val is not encoded here; it is not used by the RF cell below.
X_test, _ = encode_energy_label(X_test, encoder=energy_encoder, fit=False)

In [None]:
# Sanity check: list the distinct encoded energy-label values in the train split.
X_train.energy_label_encoded.unique()

In [None]:
# Baseline random forest with scikit-learn defaults (no random_state is set,
# so results vary between runs).
rf_model = RandomForestRegressor()

# Same five-value evaluate() contract as the linear-regression cell;
# no validation set is supplied.
trained_rf, y_train_pred, y_val_pred, y_test_pred, rf_results = evaluator.evaluate(
    rf_model,
    X_train,
    y_train,
    X_test,
    y_test,
    model_name="RandomForestRegression",
)
logger.log_model(trained_rf, "RandomForestRegression", rf_results)

#### 6. XGBoost model


In [None]:
from src.model.utils import load_model_config

# Re-prepare the data with the "XGBoost" feature set and encode the energy
# label (encoder fitted on train, reused for test).
X_train, X_test, y_train, y_test, scaler, X_val, y_val = prepare_data(
    df_clean, config_path=FEATURES_CONFIG_PATH, model_name="XGBoost"
)
X_train, energy_encoder = encode_energy_label(X_train, fit=True)

X_test, _ = encode_energy_label(X_test, encoder=energy_encoder, fit=False)

# Model configuration file; also reused by later cells.
MODEL_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"
)

# load_model_config returns two values, used here as constructor keyword
# arguments and as fit_params for the evaluator.
model_params, fit_params = load_model_config(
    MODEL_CONFIG_PATH, model_name="xgb"
)

# scikit-learn style XGBoost regressor built from the configured parameters.
xgb_model = xgb.XGBRegressor(**model_params)

trained_xgb, y_train_pred, y_val_pred, y_test_pred, xgb_results = evaluator.evaluate(
    xgb_model,
    X_train,
    y_train,
    X_test,
    y_test,
    fit_params=fit_params,
    model_name="XGBoostRegression",
)
logger.log_model(trained_xgb, "XGBoostRegression", xgb_results)

# xgb_model, results = evaluate_model(
#     xgb_model,
#     X_train,
#     y_train,
#     X_test,
#     y_test,
# )
# log_to_mlflow(xgb_model, "XGBoost_Regression", results)

#### 7. XGBoost with early stopping and more tuning


In [None]:
# Re-prepare the data with the "XGBoostEarlyStopping" feature set.
X_train, X_test, y_train, y_test, scaler, X_val, y_val = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name="XGBoostEarlyStopping",
)
# Fit the energy-label encoder on train, then apply it to test AND validation
# (the validation split is passed to the evaluator in this cell).
X_train, energy_encoder = encode_energy_label(X_train, fit=True)

X_test, _ = encode_energy_label(X_test, encoder=energy_encoder, fit=False)
X_val, _ = encode_energy_label(X_val, encoder=energy_encoder, fit=False)

xgb_model_params, xgb_fit_params = load_model_config(
    MODEL_CONFIG_PATH, "xgb_with_early_stopping"
)

# Fresh evaluator (constructed without a target transform).
evaluator = ModelEvaluator()


# Here the first argument is the parameter mapping rather than an estimator,
# combined with use_xgb_train=True.
# NOTE(review): presumably the evaluator then calls xgb.train with the
# validation split for early stopping -- confirm in ModelEvaluator.
trained_xgb, y_train_pred, y_val_pred, y_test_pred, xgb_results = evaluator.evaluate(
    xgb_model_params,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    fit_params=xgb_fit_params,
    use_xgb_train=True,
    model_name="xgb_with_early_stopping",
)

# The logger is given the same use_xgb_train flag as the evaluator.
logger.log_model(
    trained_xgb, "xgb_with_early_stopping", xgb_results, use_xgb_train=True
)

#### 7. Compare models using MLflow


In [None]:
# Same experiment name as the one activated in the set-up cell.
experiment_name = "house_price_prediction"

# Look the experiment up by name to obtain its id.
# NOTE(review): get_experiment_by_name returns None for an unknown name, in
# which case the attribute access below raises AttributeError.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id


# All runs of the experiment as a DataFrame (one row per run).
runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

In [None]:
# Columns of runs_df to compare across runs (metrics on the original price scale).
metrics_of_interest = [
    # Original scale
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_mae",
    "metrics.test_mae",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mape",
    "metrics.test_mape",

    # Log / transformed scale
    # "metrics.train_rmse_trans",
    # "metrics.test_rmse_trans",
    # "metrics.train_mae_trans",
    # "metrics.test_mae_trans",
    # "metrics.train_r2_trans",
    # "metrics.test_r2_trans",
    # "metrics.train_mape_trans",
    # "metrics.test_mape_trans",
]
# Select and sort in a single expression so the result is a fresh DataFrame.
# Sorting a column selection of runs_df with inplace=True triggers pandas'
# SettingWithCopyWarning.
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_r2", ascending=False)

# Best test R² first.
comparison_df

In [None]:
# Take the top row after sorting by test R² in descending order
# (higher R² is better).
best_model = comparison_df.sort_values(
    "metrics.test_r2", ascending=False
).iloc[0]
print("Best model based on test R²:")
print(best_model)

#### 8. Hyperparameter tuning with Optuna


In [None]:
from functools import partial
from src.model.objectives_optuna import objective_xgb, objective_rf

# Search-space definition consumed by the Optuna objectives.
HYPERPARAM_CONFIG_PATH = (
    ROOT
    / "house_price_prediction_project"
    / "config"
    / "hyperparameters_optuna.yaml"
)

# XGBoost study: seeded TPE sampler for reproducible suggestions and a median
# pruner that starts pruning after 10 warm-up steps.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
# Bind everything except the `trial` argument that Optuna supplies.
objective_xgb_partial = partial(
    objective_xgb,
    df=df_clean,
    features_config=FEATURES_CONFIG_PATH,
    hyperparam_config=HYPERPARAM_CONFIG_PATH,
    model_name="XGBoostEarlyStopping",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
study_xgb.optimize(objective_xgb_partial, n_trials=30)

print("Best XGBoost params:", study_xgb.best_params)
# NOTE(review): the objective receives the validation split, so this value is
# presumably a validation RMSE rather than a test RMSE -- confirm in
# objective_xgb before relabelling.
print("Best XGBoost Test RMSE:", study_xgb.best_value)

# RandomForest study: seeded like the XGBoost study so that the sequence of
# suggested hyperparameters is reproducible (previously a fresh random seed
# was used on every run).
study_rf = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
objective_rf_partial = partial(
    objective_rf,
    df=df_clean,
    features_config=FEATURES_CONFIG_PATH,
    hyperparam_config=HYPERPARAM_CONFIG_PATH,
    model_name="random_forest_optuna",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
study_rf.optimize(objective_rf_partial, n_trials=30)

print("Best RF params:", study_rf.best_params)
print("Best RF Test RMSE:", study_rf.best_value)

#### 9. RF and Xgboost with best parameters


In [None]:
# Rebuild a random forest from the best hyperparameters found by the RF study.
best_rf = RandomForestRegressor(**study_rf.best_params)
# Evaluate with the validation split supplied; use_xgb_train=False because
# this is a scikit-learn estimator, not a parameter mapping.
trained_rf, y_train_pred, y_val_pred, y_test_pred, results_rf = evaluator.evaluate(
    best_rf,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    use_xgb_train=False,
    model_name="RF_Optuna",
)

logger.log_model(trained_rf, "RF_Optuna", results_rf, use_xgb_train=False)

In [None]:
from src.features.feature_engineering.encoding import (
    encode_energy_labels_train_test_val,
)

# Re-prepare the data with the "XGBoostEarlyStopping" feature set
# (the returned scaler is not needed here).
X_train, X_test, y_train, y_test, _, X_val, y_val = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name="XGBoostEarlyStopping",
)

# Encode the energy label for all three splits in one call.
X_train, X_test, X_val, enc = encode_energy_labels_train_test_val(
    X_train, X_test, X_val
)


xgb_model_params, xgb_fit_params = load_model_config(
    MODEL_CONFIG_PATH, "xgb_with_early_stopping"
)
# Overlay the hyperparameters found by the Optuna study on top of the
# configured base parameters. Without this the run logged below as
# "..._optuna" was trained with exactly the same configuration as the plain
# "xgb_with_early_stopping" run and the tuning result was never used.
xgb_model_params = {**xgb_model_params, **study_xgb.best_params}

evaluator = ModelEvaluator()

# Only the trained model and the results are needed; predictions are discarded.
trained_xgb, _, _, _, xgb_results = evaluator.evaluate(
    xgb_model_params,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    fit_params=xgb_fit_params,
    use_xgb_train=True,
    model_name="xgb_with_early_stopping_optuna",
)

logger.log_model(
    trained_xgb, "xgb_with_early_stopping_optuna", xgb_results, use_xgb_train=True
)

#### 10. Let's see how outliers skew RMSE


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot target distribution
# (a box plot makes the extreme prices in the test target visible)
sns.boxplot(y=y_test)
plt.show()

# Optional: scatter of predictions vs true values
# NOTE(review): y_test_pred is whatever the most recently executed evaluation
# cell assigned -- confirm it belongs to the model you intend to inspect.
plt.scatter(y_test, y_test_pred, alpha=0.5)
plt.xlabel("True")
plt.ylabel("Predicted")
plt.show()

In [None]:
# Compute residuals
# (positive residual = the model under-predicted the true value)
residuals = y_test - y_test_pred

# Summary stats
# A mean far from the median, or a large min/max, points to a few extreme errors.
print("Residuals summary:")
print("Min:", np.min(residuals))
print("Max:", np.max(residuals))
print("Median:", np.median(residuals))
print("Mean:", np.mean(residuals))
print("Std:", np.std(residuals))

# Plot histogram
plt.hist(residuals, bins=50)
plt.title("Residuals Distribution")
plt.xlabel("Residual")
plt.ylabel("Count")
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error

# MAE does not square the errors, so it is less dominated by a few extreme
# residuals than RMSE; compare train vs. test.
# NOTE(review): uses the y_train_pred / y_test_pred currently in the notebook
# namespace -- confirm they come from the intended model.
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE:  {test_mae:.2f}")

So outliers are skewing the RMSE statistic quite heavily. Hence, I will log-transform the target and do the same analysis.


In [None]:
# Evaluator configured with a log1p target transform and its inverse, expm1.
# NOTE(review): presumably the model is fitted on the transformed target and
# predictions are mapped back to the price scale -- confirm in ModelEvaluator.
evaluator = ModelEvaluator(
    target_transform=np.log1p,
    inverse_transform=np.expm1
)

# Reuses the default-parameter rf_model estimator created earlier in the
# notebook; no validation split or model_name is passed.
trained_rf, y_train_pred, y_val_pred, y_test_pred, results = evaluator.evaluate(
    rf_model, X_train, y_train, X_test, y_test
)
logger.log_model(trained_rf, "RF_LogTransform_Evaluator", results)


In [None]:
# Evaluator configured with a log1p target transform and its inverse, expm1.
evaluator = ModelEvaluator(
    target_transform=np.log1p,
    inverse_transform=np.expm1,
)

# Reuses the scikit-learn style xgb_model estimator created in the XGBoost cell.
trained_xgb, y_train_pred, y_val_pred, y_test_pred, results = evaluator.evaluate(
    xgb_model, X_train, y_train, X_test, y_test
)
# Log the XGBoost model that was just trained. The original logged `trained_rf`
# here, which stored the random forest under the XGBoost run name.
logger.log_model(trained_xgb, "XGB_LogTransform_Evaluator", results)


In [None]:
# Compute residuals
# NOTE(review): y_test_pred is whatever the most recently executed evaluation
# cell assigned -- confirm it belongs to the intended model.
residuals = y_test - y_test_pred

# Define extreme outliers: e.g., top 5% of absolute residuals
threshold = np.percentile(np.abs(residuals), 95)
# Boolean mask: True for listings whose absolute error reaches the threshold.
outliers_mask = np.abs(residuals) >= threshold

plt.figure(figsize=(8, 6))

# Plot non-outliers
plt.scatter(
    y_test[~outliers_mask],
    y_test_pred[~outliers_mask],
    alpha=0.5,
    label="Normal listings",
)

# Highlight extreme residuals
plt.scatter(
    y_test[outliers_mask],
    y_test_pred[outliers_mask],
    color="red",
    label="Extreme listings",
)

# Diagonal line (perfect prediction)
# drawn from the origin to the largest actual-or-predicted value
max_val = max(y_test.max(), y_test_pred.max())
plt.plot(
    [0, max_val],
    [0, max_val],
    color="black",
    linestyle="--",
    label="Perfect prediction",
)

plt.xlabel("Actual Price (€)")
plt.ylabel("Predicted Price (€)")
plt.title("Predicted vs Actual Prices with Extreme Listings Highlighted")
plt.legend()
plt.show()

This is a clear visualization to see how predictions behave across the entire range and highlight the extreme listings that inflate RMSE. Large RMSE is not a deal breaker since:

**Statistical justification**
Skewed distribution: My dataset has a few extremely expensive houses that are far from the mean. RMSE is sensitive to large errors because it squares residuals, so these few points dominate the metric.

MAE is more robust: By reporting MAE alongside RMSE, I show the typical prediction error for most listings, which is a fairer assessment of model performance.

Log-transform mitigates skew: Training on log1p(y) reduces the influence of outliers and stabilizes variance, producing a more reliable model for the bulk of the data.

**Practical/business justification**

The extreme listings (multi-million € homes) are rare. The model performs well on 99% of listings, which is what matters for most users or business decisions.

Trying to perfectly predict the top 1–5% of luxury listings would:

Require specialized models or additional data

Complicate the pipeline

Increase overfitting risk

Reporting MAE and residual distributions communicates clearly that errors on extreme listings exist, but are expected and do not invalidate the model.


#### 11. New approach and moving with optuna (will be changed)


In [None]:
# XGBoost with log-transform

# Bind everything except the `trial` argument that Optuna supplies.
objective_xgb_partial = partial(
    objective_xgb,
    df=df_clean,
    features_config=FEATURES_CONFIG_PATH,
    hyperparam_config=HYPERPARAM_CONFIG_PATH,
    model_name="XGBoostEarlyStopping",
    use_log=True,  # log-transform is applied
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
# Seeded TPE sampler for reproducible suggestions; median pruner with
# 10 warm-up steps.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
study_xgb.optimize(objective_xgb_partial, n_trials=30)


# Random Forest with log-transform
objective_rf_partial = partial(
    objective_rf,
    df=df_clean,
    features_config=FEATURES_CONFIG_PATH,
    hyperparam_config=HYPERPARAM_CONFIG_PATH,
    model_name="random_forest_optuna",
    use_log=True,  # log-transform is applied
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

# Seeded like the XGBoost study so that the sequence of suggested
# hyperparameters is reproducible (previously a fresh random seed was used on
# every run).
study_rf = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf_partial, n_trials=30)

In [None]:
# Initialize evaluator with log-transform if used in Optuna
# (the studies in the previous cell were run with use_log=True)
evaluator = ModelEvaluator(
    target_transform=np.log1p,
    inverse_transform=np.expm1,
)

# --- Random Forest ---
# Rebuild the estimator from the best hyperparameters of the RF study.
best_rf = RandomForestRegressor(**study_rf.best_params)
trained_rf, y_train_pred, y_val_pred, y_test_pred, results_rf = evaluator.evaluate(
    best_rf,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    use_xgb_train=False,  # RF uses scikit-learn API
    model_name="RF_Optuna_Log",  # name for logging
)

logger.log_model(trained_rf, "RF_Optuna_Log", results_rf, use_xgb_train=False)


# --- XGBoost ---
# NOTE(review): only the tuned values are passed; the base parameters from
# model_config.yaml are not merged in here -- confirm this is intended.
best_xgb_params = study_xgb.best_params
# Note: For XGBoost, we pass params dict to evaluator and set use_xgb_train=True
# y_train_pred / y_val_pred / y_test_pred are overwritten by this second call.
trained_xgb, y_train_pred, y_val_pred, y_test_pred, results_xgb = evaluator.evaluate(
    best_xgb_params,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    use_xgb_train=True,  # use xgb.train
    model_name="XGB_Optuna_Log",
    fit_params={"num_boost_round": 1000, "early_stopping_rounds": 50},
)

logger.log_model(trained_xgb, "XGB_Optuna_Log", results_xgb, use_xgb_train=True)


In [None]:
# Same experiment name as the one activated in the set-up cell.
experiment_name = "house_price_prediction"

# NOTE(review): get_experiment_by_name returns None for an unknown name, in
# which case the attribute access below raises AttributeError.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id


# All runs of the experiment as a DataFrame (one row per run).
runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

# Columns of runs_df to compare across runs (metrics on the original price scale).
metrics_of_interest = [
    # Original scale
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_mae",
    "metrics.test_mae",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mape",
    "metrics.test_mape",

    # Log / transformed scale
    # "metrics.train_rmse_trans",
    # "metrics.test_rmse_trans",
    # "metrics.train_mae_trans",
    # "metrics.test_mae_trans",
    # "metrics.train_r2_trans",
    # "metrics.test_r2_trans",
    # "metrics.train_mape_trans",
    # "metrics.test_mape_trans",
]
# Select and sort in a single expression so the result is a fresh DataFrame.
# Sorting a column selection of runs_df with inplace=True triggers pandas'
# SettingWithCopyWarning.
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_mae", ascending=True)

# Lowest test MAE first.
comparison_df

In [None]:
# Take the top row after sorting by test MAE in ascending order
# (lower MAE is better).
best_model = comparison_df.sort_values(
    "metrics.test_mae", ascending=True
).iloc[0]
print("Best model based on test MAE:")
print(best_model)

#### 12. Extra feature engineering


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# -----------------------
# Helper Functions
# -----------------------
def to_float(value):
    """Extract a float from a free-text value.

    Every character that is not a digit or a dot is stripped from
    ``str(value)`` before conversion (so signs, units and currency symbols
    are discarded).

    Args:
        value: Any scalar; typically a string such as ``"€ 150 per month"``.

    Returns:
        The parsed float, or ``np.nan`` when the value is missing, contains no
        digits, or the remaining text is not a valid number (for example
        ``"1.2.3"`` or a lone ``"."``).
    """
    if pd.isna(value):
        return np.nan
    cleaned = re.sub(r"[^\d\.]", "", str(value))
    try:
        return float(cleaned)
    except ValueError:
        # Empty or malformed remainder: treat as missing instead of aborting
        # the whole column-wise .apply() with an exception.
        return np.nan

def auto_log_transform(df, numeric_cols, threshold_skew=0.5):
    """Return the names of strictly positive, skewed columns.

    Intended to be called on the TRAIN split only. A column qualifies when all
    of its values are greater than zero and the absolute value of its skewness
    exceeds ``threshold_skew``. The input order of ``numeric_cols`` is kept.
    """

    def _qualifies(name):
        column = df[name]
        # Skewness is only evaluated for columns without zero/negative values.
        if not (column > 0).all():
            return False
        return abs(column.skew()) > threshold_skew

    return [name for name in numeric_cols if _qualifies(name)]

def extract_floor(x):
    """Return the first integer found in ``x`` as the floor level.

    Missing values, "N/A", "Begane grond" and text without any digits all
    map to 0.
    """
    missing_or_ground = pd.isna(x) or x in ["N/A", "Begane grond"]
    if not missing_or_ground:
        digits = re.search(r"(\d+)", str(x))
        if digits is not None:
            return int(digits.group(1))
    return 0

def simplify_roof(roof):
    """Map a raw roof description to a coarse category.

    Args:
        roof: Raw roof description; may be missing or a non-string value.

    Returns:
        "Unknown" for missing values and "N/A"; otherwise the label of the
        first matching keyword, checked in this order: "Plat dak" -> "Flat",
        "Zadeldak" -> "Saddle", "Samengesteld dak" -> "Composite",
        "Mansarde" -> "Mansard"; "Other" when nothing matches.
    """
    if pd.isna(roof) or roof == "N/A":
        return "Unknown"
    # Coerce to text (as simplify_location does) so that a non-string value
    # falls through to "Other" instead of raising TypeError on `in`.
    text = str(roof)
    keyword_to_label = (
        ("Plat dak", "Flat"),
        ("Zadeldak", "Saddle"),
        ("Samengesteld dak", "Composite"),
        ("Mansarde", "Mansard"),
    )
    for keyword, label in keyword_to_label:
        if keyword in text:
            return label
    return "Other"

def simplify_ownership(x):
    """Map a raw ownership description to a coarse category.

    Args:
        x: Raw ownership text; may be missing or a non-string value.

    Returns:
        "Unknown" for missing or blank values, "Full" when the text contains
        "Volle eigendom", "Municipal" when it contains both "Erfpacht" and
        "Gemeentelijk", "Leasehold" when it contains "Erfpacht" only, and
        "Other" otherwise.
    """
    if pd.isna(x):
        return "Unknown"
    # Coerce to text (as extract_lease_years does) so that a non-string value
    # falls through to "Other" instead of raising AttributeError on .strip().
    text = str(x)
    if text.strip() == "":
        return "Unknown"
    if "Volle eigendom" in text:
        return "Full"
    if "Erfpacht" in text:
        # Municipal leasehold is the more specific case, so test it first.
        return "Municipal" if "Gemeentelijk" in text else "Leasehold"
    return "Other"

def extract_lease_years(x, current_year=2025):
    """Return the remaining leasehold years parsed from an ownership text.

    The text is searched for "einddatum erfpacht: DD-MM-YYYY". The result is
    the end year minus ``current_year``, floored at 0. ``np.nan`` is returned
    for missing or blank values, for full ownership ("Volle eigendom") and
    when no end date is present.
    """
    if pd.isna(x):
        return np.nan
    text = str(x)
    if "Volle eigendom" in text or text.strip() == "":
        return np.nan
    found = re.search(r"einddatum erfpacht: (\d{2})-(\d{2})-(\d{4})", text)
    if found is None:
        return np.nan
    # Only the year (third group) contributes to the result.
    end_year = int(found.group(3))
    return max(end_year - current_year, 0)

def simplify_location(x):
    """Map a raw location description to a coarse category.

    Matching is case-insensitive and the first keyword found wins, in this
    order: "centrum", "woonwijk", "vrij uitzicht", "park". Missing values give
    "Unknown"; text without any keyword gives "Other".
    """
    if pd.isna(x):
        return "Unknown"
    lowered = str(x).lower()
    keyword_to_label = (
        ("centrum", "Central"),
        ("woonwijk", "Residential"),
        ("vrij uitzicht", "OpenView"),
        ("park", "Park"),
    )
    for keyword, label in keyword_to_label:
        if keyword in lowered:
            return label
    return "Other"

def drop_low_variance_dummies(df, threshold=0.95):
    """Drop columns dominated by a single value.

    A column is removed when the share of its most frequent value (missing
    values counted as a value) is at least ``threshold``.

    Returns:
        A tuple ``(reduced_df, dropped_column_names)``.
    """
    low_var_cols = []
    for name in df.columns:
        # value_counts sorts by frequency, so the first entry is the top share.
        shares = df[name].value_counts(normalize=True, dropna=False)
        if shares.iloc[0] >= threshold:
            low_var_cols.append(name)
    reduced = df.drop(columns=low_var_cols)
    return reduced, low_var_cols

# -----------------------
# 0. Split FIRST
# -----------------------

# Work on a copy of the cleaned frame; the target is price_num and every
# other column is a candidate feature.
df = df.copy()
y = df["price_num"]
X = df.drop(columns=["price_num"]).copy()

# First hold out 20% as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ... then take 12.5% of the remaining 80% (= 10% of all rows) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, random_state=42
)  # 70/10/20

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# -----------------------
# 1. Numeric features
# -----------------------
# Numeric columns that are parsed to float and median-imputed.
numeric_features = [
    "size_num", "contribution_vve_num", "external_storage_num", "living_area",
    "nr_rooms", "bathrooms", "toilets", "num_facilities",
    "inhabitants_in_neighborhood", "families_with_children_pct",
    "price_per_m2_neighborhood",
]

for col in numeric_features:
    # Parse to float, then learn the imputation median from TRAIN only.
    X_train[col] = X_train[col].apply(to_float)
    median_val = X_train[col].median()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the column selection: that chained in-place form warns in pandas 2.x and
    # leaves X_train unchanged once copy-on-write is active.
    X_train[col] = X_train[col].fillna(median_val)

    # Validation and test reuse the train median to avoid leakage.
    X_val[col] = X_val[col].apply(to_float).fillna(median_val)
    X_test[col] = X_test[col].apply(to_float).fillna(median_val)

# Log-transform based only on TRAIN skewness
log_candidates = auto_log_transform(X_train, numeric_features, threshold_skew=0.5)
log_cols = []
for col in log_candidates:
    # The same set of columns is transformed in all three splits.
    X_train[f"log_{col}"] = np.log1p(X_train[col])
    X_val[f"log_{col}"] = np.log1p(X_val[col])
    X_test[f"log_{col}"] = np.log1p(X_test[col])
    log_cols.append(f"log_{col}")

print("Log-transformed cols:", log_cols)

# # -----------------------
# # Log-transform target (price)
# # -----------------------
# target_skew = y_train.skew()
# print(f"Train target skewness: {target_skew:.2f}")

# # Apply log1p if skewed
# if target_skew > 0.5:
#     y_train_log = np.log1p(y_train)
#     y_val_log   = np.log1p(y_val)
#     y_test_log  = np.log1p(y_test)
#     print("Target log-transformed")
# else:
#     y_train_log, y_val_log, y_test_log = y_train, y_val, y_test
#     print("Target not transformed")

# -----------------------
# 2. Binary flags
# -----------------------
# Indicator columns expected to exist in the cleaned data.
# NOTE(review): "has_n/a" looks like a flag derived from an "N/A" facility
# value -- confirm it is meant to be a feature.
binary_flags = [
    "has_mechanische_ventilatie", "has_tv_kabel", "has_lift",
    "has_natuurlijke_ventilatie", "has_n/a", "has_schuifpui",
    "has_glasvezelkabel", "has_frans_balkon", "has_buitenzonwering",
    "has_zonnepanelen",
]

# Missing flags are treated as 0, and every flag is cast to int in all splits.
for col in binary_flags:
    X_train[col] = X_train[col].fillna(0).astype(int)
    X_val[col] = X_val[col].fillna(0).astype(int)
    X_test[col] = X_test[col].fillna(0).astype(int)

# -----------------------
# 3. Direct numeric features
# -----------------------
# Numeric columns used as model features without a log transform. The last
# four (floor_level, lease_years_remaining, backyard_num, balcony_flag) are
# created further down in this cell.
# NOTE(review): "bedrooms" and "year_of_construction" are not parsed or
# imputed in this cell -- confirm they are already clean.
direct_numeric_features = [
    "bedrooms", "year_of_construction", "contribution_vve_num", "size_num",
    "external_storage_num", "living_area", "nr_rooms", "bathrooms", "toilets",
    "num_facilities", "floor_level", "lease_years_remaining", "backyard_num",
    "balcony_flag",
]

# -----------------------
# 4. Energy label encoding
# -----------------------
# Energy labels ordered from worst ("G") to best ("A++++"); the list position
# becomes the ordinal code assigned by the encoder below.
energy_order = ["G","F","E","D","C","B","A","A+","A++","A+++","A++++"]

def clean_energy(series):
    """Return ``series`` with 0, "N/A" and missing values replaced by "G"."""
    placeholder_to_label = {0: "G", "N/A": "G"}
    replaced = series.replace(placeholder_to_label)
    return replaced.fillna("G")

# Normalise placeholder/missing energy labels to "G" in every split.
X_train["energy_label"] = clean_energy(X_train["energy_label"])
X_val["energy_label"]   = clean_energy(X_val["energy_label"])
X_test["energy_label"]  = clean_energy(X_test["energy_label"])

# Ordinal encoding with the explicit worst-to-best order; fitted on train and
# applied unchanged to validation and test.
encoder_energy = OrdinalEncoder(categories=[energy_order], dtype=int)
X_train["energy_label_encoded"] = encoder_energy.fit_transform(X_train[["energy_label"]])
X_val["energy_label_encoded"]   = encoder_energy.transform(X_val[["energy_label"]])
X_test["energy_label_encoded"]  = encoder_energy.transform(X_test[["energy_label"]])

# The raw label column is no longer needed once encoded.
X_train.drop(columns=["energy_label"], inplace=True)
X_val.drop(columns=["energy_label"], inplace=True)
X_test.drop(columns=["energy_label"], inplace=True)

# -----------------------
# 5. Categorical (OHE, fit on train)
# -----------------------
def fit_ohe(series, prefix):
    """One-hot encode the TRAIN series, dropping the first category.

    Returns:
        A tuple ``(dummies, columns)`` where ``columns`` is the column index
        of the dummy frame, to be reused when encoding other splits.
    """
    train_dummies = pd.get_dummies(data=series, prefix=prefix, drop_first=True)
    learned_columns = train_dummies.columns
    return train_dummies, learned_columns

def apply_ohe(series, cols, prefix):
    """One-hot encode a validation/test series onto the TRAIN dummy columns.

    Args:
        series: Categorical values of the split to encode.
        cols: Dummy column names learned from the train split.
        prefix: Prefix used when the train dummies were created.

    Returns:
        A frame with exactly the columns ``cols`` in that order. Categories
        that do not occur in this split are filled with 0, and categories
        that were not kept for train are discarded.
    """
    # Encode ALL categories present in this split. Using drop_first=True here
    # would drop this split's own first category, which may be a category that
    # was kept for train; its rows would then wrongly be encoded as all zeros.
    dummies = pd.get_dummies(series, prefix=prefix)
    # Align to the train layout: add absent columns as 0, drop the extras.
    return dummies.reindex(columns=cols, fill_value=0)

# Each categorical below follows the same pattern: derive/clean the column on
# train, learn the dummy columns with fit_ohe, then encode validation and test
# onto those learned columns with apply_ohe.

# postal district
# (first three characters of the cleaned postal code)
X_train["postal_district"] = X_train["postal_code_clean"].str[:3]
postal_ohe, postal_cols = fit_ohe(X_train["postal_district"], "district")
X_val_postal = apply_ohe(X_val["postal_code_clean"].str[:3], postal_cols, "district")
X_test_postal = apply_ohe(X_test["postal_code_clean"].str[:3], postal_cols, "district")

# status
# (missing values become the literal category "N/A")
X_train["status"] = X_train["status"].fillna("N/A")
status_ohe, status_cols = fit_ohe(X_train["status"], "status")
X_val_status = apply_ohe(X_val["status"].fillna("N/A"), status_cols, "status")
X_test_status = apply_ohe(X_test["status"].fillna("N/A"), status_cols, "status")

# roof type
X_train["roof_type_simple"] = X_train["roof_type"].apply(simplify_roof)
roof_ohe, roof_cols = fit_ohe(X_train["roof_type_simple"], "roof")
X_val_roof = apply_ohe(X_val["roof_type"].apply(simplify_roof), roof_cols, "roof")
X_test_roof = apply_ohe(X_test["roof_type"].apply(simplify_roof), roof_cols, "roof")

# ownership
X_train["ownership_simple"] = X_train["ownership_type"].apply(simplify_ownership)
ownership_ohe, ownership_cols = fit_ohe(X_train["ownership_simple"], "ownership")
X_val_ownership = apply_ohe(X_val["ownership_type"].apply(simplify_ownership), ownership_cols, "ownership")
X_test_ownership = apply_ohe(X_test["ownership_type"].apply(simplify_ownership), ownership_cols, "ownership")

# location
X_train["location_simple"] = X_train["location"].apply(simplify_location)
location_ohe, location_cols = fit_ohe(X_train["location_simple"], "location")
X_val_location = apply_ohe(X_val["location"].apply(simplify_location), location_cols, "location")
X_test_location = apply_ohe(X_test["location"].apply(simplify_location), location_cols, "location")

# garden
# (missing values become the literal category "None")
X_train["garden"] = X_train["garden"].fillna("None")
garden_ohe, garden_cols = fit_ohe(X_train["garden"], "garden")
X_val_garden = apply_ohe(X_val["garden"].fillna("None"), garden_cols, "garden")
X_test_garden = apply_ohe(X_test["garden"].fillna("None"), garden_cols, "garden")

# -----------------------
# 6. Numeric additions
# -----------------------
# Floor level: first integer found in "located_on" (0 when missing/ground floor).
X_train["floor_level"] = X_train["located_on"].apply(extract_floor)
X_val["floor_level"] = X_val["located_on"].apply(extract_floor)
X_test["floor_level"] = X_test["located_on"].apply(extract_floor)

# Remaining lease years parsed from the ownership text; rows without a lease
# end date (including full ownership) become 0.
X_train["lease_years_remaining"] = X_train["ownership_type"].apply(extract_lease_years).fillna(0)
X_val["lease_years_remaining"] = X_val["ownership_type"].apply(extract_lease_years).fillna(0)
X_test["lease_years_remaining"] = X_test["ownership_type"].apply(extract_lease_years).fillna(0)

# Backyard value parsed to float; missing/unparseable becomes 0.
X_train["backyard_num"] = X_train["backyard"].apply(to_float).fillna(0)
X_val["backyard_num"] = X_val["backyard"].apply(to_float).fillna(0)
X_test["backyard_num"] = X_test["backyard"].apply(to_float).fillna(0)

# Balcony indicator: 1 when any value other than missing/"N/A" is present.
X_train["balcony_flag"] = X_train["balcony"].apply(lambda x: 0 if pd.isna(x) or x=="N/A" else 1)
X_val["balcony_flag"] = X_val["balcony"].apply(lambda x: 0 if pd.isna(x) or x=="N/A" else 1)
X_test["balcony_flag"] = X_test["balcony"].apply(lambda x: 0 if pd.isna(x) or x=="N/A" else 1)

# -----------------------
# 7. Combine features
# -----------------------
# Non-dummy feature columns: log-transformed numerics, binary flags, direct
# numerics and the ordinal energy label.
model_features = log_cols + binary_flags + direct_numeric_features + ["energy_label_encoded"]

# Combine OHE features
# (the same six categoricals, in the same order, for every split)
ohe_all_train = pd.concat([postal_ohe, status_ohe, roof_ohe, ownership_ohe, location_ohe, garden_ohe], axis=1)
ohe_all_val   = pd.concat([X_val_postal, X_val_status, X_val_roof, X_val_ownership, X_val_location, X_val_garden], axis=1)
ohe_all_test  = pd.concat([X_test_postal, X_test_status, X_test_roof, X_test_ownership, X_test_location, X_test_garden], axis=1)

# Drop low-variance columns (decide on TRAIN only)
# and remove the same columns from validation and test so the layouts match.
ohe_reduced_train, dropped_cols = drop_low_variance_dummies(ohe_all_train, threshold=0.95)
ohe_all_val = ohe_all_val.drop(columns=dropped_cols, errors="ignore")
ohe_all_test = ohe_all_test.drop(columns=dropped_cols, errors="ignore")

print(f"Dropped {len(dropped_cols)} low-variance columns:", dropped_cols)

# Final matrices
# (column-wise concatenation; rows are aligned on the DataFrame index)
X_train_final = pd.concat([X_train[model_features], ohe_reduced_train], axis=1)
X_val_final   = pd.concat([X_val[model_features], ohe_all_val], axis=1)
X_test_final  = pd.concat([X_test[model_features], ohe_all_test], axis=1)

print("Final shapes:")
print("Train:", X_train_final.shape)
print("Val:", X_val_final.shape)
print("Test:", X_test_final.shape)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use the processed train set
# (dummy columns and binary flags are left out of the correlation analysis)
numeric_cols = log_cols + direct_numeric_features + ["energy_label_encoded"]

# Ensure numeric columns are numeric
# (values that cannot be parsed become NaN)
X_numeric = X_train_final[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Correlation matrix
corr_matrix = X_numeric.corr()

# Heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation matrix for numeric features (Train set)")
plt.show()

# Identify highly correlated pairs
# Only the upper triangle (j > i) is scanned, so each pair is reported once.
high_corr = []
cols = corr_matrix.columns
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        corr_val = corr_matrix.iloc[i, j]
        if abs(corr_val) > 0.9:
            high_corr.append((cols[i], cols[j], corr_val))

print("Highly correlated numeric pairs (|r|>0.9):")
for pair in high_corr:
    print(pair)


In [None]:
# Raw numeric columns removed from the final matrices.
# NOTE(review): presumably chosen because their log_ counterparts are kept --
# confirm against the "Log-transformed cols" output, since a column listed
# here that was never log-transformed is lost entirely.
cols_to_drop = [
    "size_num",
    "living_area",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "num_facilities",
    # "external_storage_num",
]

# Drop in place from all three splits so their column layouts stay identical.
X_train_final.drop(columns=cols_to_drop, inplace=True)
X_val_final.drop(columns=cols_to_drop, inplace=True)
X_test_final.drop(columns=cols_to_drop, inplace=True)

print("Train shape:", X_train_final.shape)
print("validation shape:", X_val_final.shape)
print("Test shape:", X_test_final.shape)


## 13. Baseline models after feature engineering


#### Baseline Random Forest


In [None]:
# No separate validation set is needed here, so the baseline is fitted on
# train + validation and scored on the test split.
X_train_full = pd.concat([X_train_final, X_val_final], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

evaluator = ModelEvaluator(
    target_transform=np.log1p,     # log-transform target for training
    inverse_transform=np.expm1     # convert predictions back to original scale
)

# Fixed seed so the logged baseline can be reproduced on a notebook re-run
# (42 matches the seed used for the Optuna sampler and KFold in this notebook).
rf_model = RandomForestRegressor(random_state=42)

trained_rf, y_train_pred, y_val_pred, y_test_pred, rf_results = evaluator.evaluate(
    rf_model,
    X_train_full,
    y_train_full,
    X_test_final,
    y_test,
    model_name="RandomForestRegression",
)
logger.log_model(trained_rf, "Random_Forest_Regression_feature_eng", rf_results)

#### Baseline XGBoost with early stopping


In [None]:
# Model and fit parameters for this baseline are read from the YAML model config.
MODEL_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)
model_params, fit_params = load_model_config(
    MODEL_CONFIG_PATH, model_name="xgb_with_early_stopping"
)

# Keyword arguments for the evaluator; the validation split is passed alongside
# the fit parameters, and use_xgb_train=True ensures early stopping is used.
xgb_eval_kwargs = {
    "X_val": X_val_final,
    "y_val": y_val,
    "fit_params": fit_params,
    "model_name": "XGBoostEarlyStopping",
    "use_xgb_train": True,
}
xgb_outputs = evaluator.evaluate(
    model_params,
    X_train_final,
    y_train,
    X_test_final,
    y_test,
    **xgb_eval_kwargs,
)
trained_xgb, y_train_pred, y_val_pred, y_test_pred, xgb_results = xgb_outputs

# Log model
logger.log_model(trained_xgb, "XGBoostRegressionFeatureEng", xgb_results)


## 14. Optuna tuning after feature eng


In [None]:
# Arguments that both objectives receive unchanged; only model_name differs.
shared_objective_kwargs = dict(
    df=df_clean,
    features_config=FEATURES_CONFIG_PATH,
    hyperparam_config=HYPERPARAM_CONFIG_PATH,
    X_train=X_train_final,
    y_train=y_train,
    X_val=X_val_final,
    y_val=y_val,
    use_log=True,
    use_extended_features=True,
)

# XGBoost study
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)

objective_xgb_partial = partial(
    objective_xgb,
    model_name="XGBoostEarlyStopping",
    **shared_objective_kwargs,
)
study_xgb.optimize(objective_xgb_partial, n_trials=30)

# NOTE(review): the objective is given the validation split, so the value
# labelled "Test RMSE" below is presumably a validation score — confirm
# against objective_xgb / objective_rf before relabelling.
print("Best XGBoost params:", study_xgb.best_params)
print("Best XGBoost Test RMSE:", study_xgb.best_value)

# RandomForest study
# Seeded with its own sampler so the suggested trials are repeatable, matching
# the XGBoost study above (previously this study used an unseeded default).
study_rf = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
objective_rf_partial = partial(
    objective_rf,
    model_name="random_forest_optuna",
    **shared_objective_kwargs,
)
study_rf.optimize(objective_rf_partial, n_trials=30)

print("Best RF params:", study_rf.best_params)
print("Best RF Test RMSE:", study_rf.best_value)

In [None]:
# # Run Optuna for XGBoost
# sampler = optuna.samplers.TPESampler(seed=42)
# pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)

# study_xgb = optuna.create_study(
#     direction="minimize", sampler=sampler, pruner=pruner
# )
# study_xgb.optimize(
#     lambda trial: objective_xgb(trial, X_train, y_train, X_test, y_test),
#     n_trials=200,
# )

# print("Best XGBoost params:", study_xgb.best_params)
# print("Best XGBoost Test MAE:", study_xgb.best_value)

# # Run Optuna for Random Forest
# study_rf = optuna.create_study(
#     direction="minimize", sampler=sampler, pruner=pruner
# )
# study_rf.optimize(
#     lambda trial: objective_rf(trial, X_train, y_train, X_test, y_test),
#     n_trials=100,
# )

# print("Best RF params:", study_rf.best_params)
# print("Best RF Test MAE:", study_rf.best_value)

## 15. Best params run after feature eng


In [None]:
# Evaluator that trains on log1p(target) and reports on the original scale,
# mirroring the log-transform used during the Optuna search.
evaluator = ModelEvaluator(target_transform=np.log1p, inverse_transform=np.expm1)

# Data arguments shared by both evaluate() calls below.
split_args = (X_train_final, y_train, X_test_final, y_test)
validation_kwargs = {"X_val": X_val_final, "y_val": y_val}

# --- Random Forest ---
best_rf = RandomForestRegressor(**study_rf.best_params)
rf_outputs = evaluator.evaluate(
    best_rf,
    *split_args,
    use_xgb_train=False,  # RF uses scikit-learn API
    model_name="RandomForestRegression",  # name for logging
    **validation_kwargs,
)
trained_rf, y_train_pred, y_val_pred, y_test_pred, results_rf = rf_outputs

logger.log_model(trained_rf, "RF_LogTransform_Optuna_feature_eng", results_rf, use_xgb_train=False)


# --- XGBoost ---
# For XGBoost the params dict itself is handed to the evaluator, with
# use_xgb_train=True so that xgb.train is used.
best_xgb_params = study_xgb.best_params
xgb_outputs = evaluator.evaluate(
    best_xgb_params,
    *split_args,
    use_xgb_train=True,  # use xgb.train
    model_name="XGBoostEarlyStopping",
    fit_params={"num_boost_round": 1000, "early_stopping_rounds": 50},
    **validation_kwargs,
)
trained_xgb, y_train_pred, y_val_pred, y_test_pred, results_xgb = xgb_outputs

logger.log_model(trained_xgb, "XGB_Optuna_LogTransformed_feature_eng", results_xgb, use_xgb_train=True)


In [None]:
# # Convert log1p targets
# dtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
# dtest = xgb.DMatrix(X_test, label=np.log1p(y_test))

# best_params = study_xgb.best_params
# best_params.update(
#     {"objective": "reg:squarederror", "seed": 42, "tree_method": "hist"}
# )

# best_xgb, results_xgb = evaluate_xgb_dmatrix(
#     best_params,
#     dtrain,
#     dtest,
#     y_train_orig=y_train,
#     y_test_orig=y_test,
#     run_name="XGB_Optuna_LogTransformed_feature_eng",
# )

In [None]:
# # Prepare log-transformed targets
# y_train_log = np.log1p(y_train)
# y_test_log = np.log1p(y_test)

# # Refit best RF on log targets and evaluate
# best_rf = RandomForestRegressor(**study_rf.best_params)
# best_rf, results_rf, y_test_pred = evaluate_model_log(
#     best_rf, X_train, y_train_log, X_test, y_test_log
# )

# # Log to MLflow
# log_to_mlflow(best_rf, "RF_LogTransform_Optuna_feature_eng", results_rf)

In [None]:
# Build a table comparing all runs of the experiment on the main metrics.
experiment_name = "house_price_prediction"

experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # readable message instead of an AttributeError on the next line.
    raise ValueError(
        f"MLflow experiment {experiment_name!r} not found at "
        f"{mlflow.get_tracking_uri()}"
    )
experiment_id = experiment.experiment_id


runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

metrics_of_interest = [
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mae",
    "metrics.test_mae",
    "metrics.train_mape",
    "metrics.test_mape",
]

# sort_values returns a new frame, so nothing is modified in place on a slice
# of runs_df (the in-place sort raised SettingWithCopyWarning).
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_mae", ascending=True)
comparison_df

In [None]:
# Show the column labels of the run-comparison table.
comparison_df.columns

I have chosen run_id 33688ff883c54d2fb4a14cbef2ae617a because its combination of statistics looks the best: it has one of the highest R² scores on both the train and test sets, and its RMSE and MAE are among the lowest.


In [None]:
# Print the tracking URI that MLflow is currently configured with.
print(mlflow.get_tracking_uri())

In [None]:
from mlflow.tracking import MlflowClient

# Run picked from the comparison table.
run_id = "33688ff883c54d2fb4a14cbef2ae617a"
client = MlflowClient()

# List what is stored under the "xgb_model" artifact path of that run; the path
# has to match the artifact_path used when the model was logged.
artifacts = client.list_artifacts(run_id, path="xgb_model")
for artifact in artifacts:
    print(artifact.path)

In [None]:
# Load the XGBoost model logged under the chosen run and predict with it.
run_id = "33688ff883c54d2fb4a14cbef2ae617a"
model_path = f"runs:/{run_id}/xgb_model"

# Load the model
loaded_model = mlflow.xgboost.load_model(model_path)

# Make predictions
# NOTE(review): dtest is not created in this cell; it must already exist as a
# DMatrix built from the same feature columns the logged model was trained on
# — confirm before running.
# NOTE(review): if the logged model was fitted on log1p(price), y_pred is on the
# log scale and needs np.expm1 — confirm.
y_pred = loaded_model.predict(dtest)  # dtest = xgb.DMatrix(X_test)

In [None]:
# Fetch the chosen run (run_id is set in an earlier cell) and print what was logged for it.
run = mlflow.get_run(run_id)
print(run.data.metrics)  # Train/test RMSE, R2, etc.
print(run.data.params)  # Model hyperparameters

In [None]:
# Register the chosen run's "xgb_model" artifact in the MLflow model registry.
from mlflow.tracking import MlflowClient

# NOTE(review): client is created here but not used in this cell; registration
# goes through mlflow.register_model below.
client = MlflowClient()

# Register model
# run_id comes from an earlier cell.
model_uri = f"runs:/{run_id}/xgb_model"
registered_model_name = "RealEstate_XGB"
model_version = mlflow.register_model(model_uri, registered_model_name)

print(
    f"Model registered as {registered_model_name}, version {model_version.version}"
)
# The best model is now saved in the registry under registered_model_name.

#### 16. Cross validation for this best model


In [None]:
# End any MLflow run that is still active before the cross-validation cell
# below opens its own run.
mlflow.end_run()

In [None]:
# K-fold cross-validation of the XGBoost parameters in best_params; per-fold and
# aggregated metrics plus one predicted-vs-actual plot per fold are logged to a
# single MLflow run.

# Parameters
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

cv_metrics = []

# NOTE(review): the models above were evaluated on X_train_final, while this
# uses X_train — confirm this is the intended feature matrix.
# NOTE(review): best_params is not assigned in the active cells visible here
# (only best_xgb_params is) — confirm it is defined before running.
X_np = X_train.values  # or X_train_scaled if scaling used
y_np = y_train.values

with mlflow.start_run(run_name="XGB_CV_last_model"):

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_np)):
        fold_no = fold + 1

        # Fold-local names: the notebook-level X_val / y_val / y_val_pred hold
        # the real validation split and must not be overwritten by a CV fold.
        X_tr, X_fold_val = X_np[train_idx], X_np[val_idx]
        y_tr, y_fold_val = y_np[train_idx], y_np[val_idx]

        # Train on log1p(target), as in the earlier model runs.
        dtrain = xgb.DMatrix(X_tr, label=np.log1p(y_tr))
        dval = xgb.DMatrix(X_fold_val, label=np.log1p(y_fold_val))

        xgb_model = xgb.train(
            best_params,
            dtrain,
            num_boost_round=500,
            evals=[(dval, "eval")],
            early_stopping_rounds=50,
            verbose_eval=False,
        )

        # Predict and back-transform to the original price scale
        y_fold_pred = np.expm1(xgb_model.predict(dval))

        # Metrics
        rmse = np.sqrt(mean_squared_error(y_fold_val, y_fold_pred))
        mae = mean_absolute_error(y_fold_val, y_fold_pred)
        r2 = r2_score(y_fold_val, y_fold_pred)
        cv_metrics.append({"rmse": rmse, "mae": mae, "r2": r2})

        # Log metrics per fold
        mlflow.log_metric(f"fold_{fold_no}_rmse", rmse)
        mlflow.log_metric(f"fold_{fold_no}_mae", mae)
        mlflow.log_metric(f"fold_{fold_no}_r2", r2)

        # Plot predicted vs actual
        fig = plt.figure(figsize=(6, 6))
        sns.scatterplot(x=y_fold_val, y=y_fold_pred)
        plt.plot(
            [y_fold_val.min(), y_fold_val.max()],
            [y_fold_val.min(), y_fold_val.max()],
            "r--",
        )
        plt.xlabel("Actual")
        plt.ylabel("Predicted")
        plt.title(f"Fold {fold_no} Predicted vs Actual")

        # Render to the buffer BEFORE plt.show(): once the figure has been
        # shown it is released, and saving afterwards writes an empty image.
        buf = io.BytesIO()
        fig.savefig(buf, format="png")
        buf.seek(0)
        img = Image.open(buf)
        mlflow.log_image(img, f"fold_{fold_no}_pred_vs_actual.png")

        plt.show()  # show inline
        plt.close(fig)

    # Aggregate metrics
    mean_rmse = np.mean([m["rmse"] for m in cv_metrics])
    std_rmse = np.std([m["rmse"] for m in cv_metrics])
    mean_mae = np.mean([m["mae"] for m in cv_metrics])
    std_mae = np.std([m["mae"] for m in cv_metrics])
    mean_r2 = np.mean([m["r2"] for m in cv_metrics])
    std_r2 = np.std([m["r2"] for m in cv_metrics])

    # Log aggregated metrics while the CV run is still active; logging after
    # the with-block would implicitly start a separate, unnamed run.
    mlflow.log_metric("CV_mean_rmse", mean_rmse)
    mlflow.log_metric("CV_std_rmse", std_rmse)
    mlflow.log_metric("CV_mean_mae", mean_mae)
    mlflow.log_metric("CV_std_mae", std_mae)
    mlflow.log_metric("CV_mean_r2", mean_r2)
    mlflow.log_metric("CV_std_r2", std_r2)

print(f"CV RMSE: {mean_rmse:.2f} ± {std_rmse:.2f}")
print(f"CV MAE: {mean_mae:.2f} ± {std_mae:.2f}")
print(f"CV R²: {mean_r2:.3f} ± {std_r2:.3f}")

In [None]:
# Re-fetch the chosen run (run_id is set in an earlier cell) and print its logged data.
run = mlflow.get_run(run_id)
print(run.data.metrics)  # Train/test RMSE, R2, etc.
print(run.data.params)  # Model hyperparameters

In [None]:
# Collect out-of-fold predictions over the same KFold splitter (kf, X_np, y_np
# come from the cross-validation cell) and plot them together with residuals.

# Per-fold arrays, concatenated once after the loop
fold_actuals = []
fold_predictions = []

# Collect predictions from each fold
for train_idx, val_idx in kf.split(X_np):
    # Fold-local names: the notebook-level X_val / y_val / y_val_pred hold the
    # real validation split and must not be overwritten by a CV fold.
    X_tr, X_fold_val = X_np[train_idx], X_np[val_idx]
    y_tr, y_fold_val = y_np[train_idx], y_np[val_idx]

    dtrain = xgb.DMatrix(X_tr, label=np.log1p(y_tr))
    dval = xgb.DMatrix(X_fold_val, label=np.log1p(y_fold_val))

    xgb_model = xgb.train(
        best_params,
        dtrain,
        num_boost_round=500,
        evals=[(dval, "eval")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # Back-transform from the log1p training scale to prices
    y_fold_pred = np.expm1(xgb_model.predict(dval))

    fold_actuals.append(y_fold_val)
    fold_predictions.append(y_fold_pred)

# Flatten to 1-D arrays; these names are used by the price-range cell below
y_vals_all = np.concatenate(fold_actuals)
y_preds_all = np.concatenate(fold_predictions)
residuals = y_vals_all - y_preds_all

# Predicted vs Actual
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y_vals_all, y=y_preds_all)
plt.plot(
    [y_vals_all.min(), y_vals_all.max()],
    [y_vals_all.min(), y_vals_all.max()],
    "r--",
    label="Perfect Prediction",
)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("CV: Predicted vs Actual (all folds)")
plt.legend()
plt.show()

# Residual plot
plt.figure(figsize=(6, 4))
sns.histplot(residuals, kde=True, bins=20, color="skyblue")
plt.xlabel("Residual (Actual - Predicted)")
plt.title("CV Residuals Distribution")
plt.show()

#### Generating price range (for pipeline)


In [None]:
import pandas as pd
import numpy as np

# Build, per predicted-price bin, the 5th and 95th percentile of the CV
# residuals (actual - predicted); y_preds_all and residuals come from the
# out-of-fold prediction cell.

# Number of bins
n_bins = 10

# Create a DataFrame
df = pd.DataFrame({"pred": y_preds_all, "residual": residuals})

# Use qcut instead of cut to ensure roughly equal-sized bins
# (duplicates="drop" merges bins whose quantile edges coincide).
df["pred_bin"] = pd.qcut(df["pred"], q=n_bins, duplicates="drop")

# Compute 5th and 95th percentiles per bin.
# observed=True is stated explicitly: pred_bin is categorical, and relying on
# the implicit default for categorical groupers is deprecated in pandas.
bin_ranges = (
    df.groupby("pred_bin", observed=True)["residual"]
    .agg(
        lower_bound=lambda x: np.percentile(x, 5),
        upper_bound=lambda x: np.percentile(x, 95),
    )
    .reset_index()
)


# Function to get price range
def get_price_range(pred_price, ranges=None):
    """Return a (lower, upper) price range around a predicted price.

    The prediction is matched to the bin whose interval contains it, and that
    bin's residual bounds are added to the prediction.

    Args:
        pred_price: Predicted price, on the same scale as the bin edges.
        ranges: Optional table with ``pred_bin`` (interval objects exposing
            ``left``/``right``), ``lower_bound`` and ``upper_bound`` columns.
            Defaults to the notebook-level ``bin_ranges``.

    Returns:
        Tuple ``(pred_price + lower_bound, pred_price + upper_bound)``. A
        prediction outside every bin uses the bin with the closest edge, so
        it still gets an uncertainty range instead of a zero-width one.
        ``(pred_price, pred_price)`` is returned only for an empty table.
    """
    table = bin_ranges if ranges is None else ranges
    rows = list(
        zip(table["pred_bin"], table["lower_bound"], table["upper_bound"])
    )
    if not rows:
        return pred_price, pred_price

    for interval, lower, upper in rows:
        # Both edges are treated as inclusive; on a shared edge the first
        # listed bin wins.
        if interval.left <= pred_price <= interval.right:
            return pred_price + lower, pred_price + upper

    def _edge_distance(row):
        # Distance from the prediction to the nearer edge of the row's bin.
        interval = row[0]
        return min(
            abs(pred_price - interval.left),
            abs(pred_price - interval.right),
        )

    # Outside all bins: fall back to the closest bin.
    _, lower, upper = min(rows, key=_edge_distance)
    return pred_price + lower, pred_price + upper


# Example: look up the price range for a single predicted price and print it
# with the bounds rounded to whole units.
example_pred = 500_000
lb, ub = get_price_range(example_pred)
print(f"Predicted price: {example_pred}, Range: {lb:.0f} - {ub:.0f}")

TODO: still need to figure out how to log these price ranges.
