#### 1. Imports and Set-up


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Display settings: do not truncate the column list when printing DataFrames,
# and use seaborn's "whitegrid" theme for all plots.
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

# Send all MLflow runs of this notebook to the local tracking directory and
# group them under a single experiment.
mlflow.set_tracking_uri("../logs/mlruns")
mlflow.set_experiment("house_price_prediction")

#### 2. Load and prep data


In [None]:
import sys
import os
from pathlib import Path
import yaml


# Resolve the parent of the current working directory (the notebook is
# expected to run from notebooks/) and make sure it is first on the import
# path so that the `src` package can be imported.
project_root = os.path.abspath("..")

if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df
from src.data_loading.preprocessing.imputation import impute_missing_values


# ROOT is the directory that contains the "house_price_prediction_project"
# folder: parents[2] of this file when executed as a script, otherwise
# parents[1] of the current working directory (i.e. two levels above the
# directory the notebook is run from).
ROOT = (
    Path(__file__).resolve().parents[2]
    if "__file__" in globals()
    else Path.cwd().parents[1]
)
CONFIG_PATH = (
    ROOT
    / "house_price_prediction_project"
    / "config"
    / "preprocessing_config.yaml"
)

with open(CONFIG_PATH) as f:
    CONFIG = yaml.safe_load(f)

# Load the raw listings, clean them and impute missing values as configured.
df_raw = load_data_from_json("../data/parsed_json/*.json")
df_clean = preprocess_df(
    df_raw,
    drop_raw=CONFIG["preprocessing"]["drop_raw"],
    numeric_cols=CONFIG["preprocessing"]["numeric_cols"],
)
df_clean = impute_missing_values(
    df_clean, CONFIG["preprocessing"]["imputation"]
)

# Keep only rows with a known target for model training. The filter and the
# copy are done in one step: a separate `df = df_clean.copy()` afterwards
# would overwrite the filtered frame and bring the NaN-price rows back.
df = df_clean[df_clean["price_num"].notna()].copy()

In [None]:
df.isna().sum()

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# def clean_year(year):
#     if isinstance(year, str):
#         if year.startswith("Voor"):  # e.g., "Voor 1906"
#             return int(year.split()[-1]) - 1  # use 1905
#         elif year.startswith("Na"):  # e.g., "Na 2020"
#             return int(year.split()[-1]) + 1  # use 2021
#         elif year.isdigit():
#             return int(year)
#         else:
#             return None  # invalid string
#     elif isinstance(year, (int, float)):
#         return int(year)
#     else:
#         return None


# df["year_of_construction"] = df["year_of_construction"].apply(clean_year)
# df["year_of_construction"] = df["year_of_construction"].fillna(
#     df["year_of_construction"].median()
# )

In [None]:
# numeric_cols = ["bedrooms", "nr_rooms", "bathrooms", "toilets"]

# df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

In [None]:
# Numeric predictors used for the linear-regression baseline.
linear_features = [
    "size_num",
    "bedrooms",
    "year_of_construction",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "contribution_vve_num",
    "external_storage_num",
    "inhabitants_in_neighborhood",
    "families_with_children_pct",
    "price_per_m2_neighborhood",
]
target = "price_num"

# Treat the "N/A" placeholder as missing, then fill every missing value with 0.
X = df[linear_features].replace("N/A", np.nan).fillna(0)
y = df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### 3. Code for evaluating and logging models


In [None]:
def evaluate_model(
    model, X_train, y_train, X_test, y_test, metrics=None, fit_params=None
):
    """
    Fit a model, predict on both splits, and return evaluation metrics.

    Parameters
    ----------
    model : object exposing ``fit(X, y, **fit_params)`` and ``predict(X)``.
    X_train, y_train : training features and target.
    X_test, y_test : hold-out features and target.
    metrics : dict, optional
        Maps a metric name to a callable ``func(y_true, y_pred)``. When
        omitted, RMSE ("rmse"), MAE ("mae") and R² ("r2") are used.
    fit_params : dict, optional
        Extra keyword arguments forwarded to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, results)`` where ``results`` has one ``train_<name>`` and
        one ``test_<name>`` entry per metric.
    """
    if fit_params is None:
        fit_params = {}
    if metrics is None:
        # The MAE key is spelled "mae" so that the logged metric names
        # (train_mae / test_mae) match the other evaluation helpers and the
        # run-comparison cells in this notebook.
        metrics = {
            "rmse": lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)),
            "mae": mean_absolute_error,
            "r2": r2_score,
        }

    model.fit(X_train, y_train, **fit_params)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    results = {}
    for name, func in metrics.items():
        results[f"train_{name}"] = func(y_train, y_train_pred)
        results[f"test_{name}"] = func(y_test, y_test_pred)

    return model, results


def log_to_mlflow(model, model_name, results):
    """
    Record a fitted model and its scores in a new MLflow run.

    The run is named ``model_name``. Inside it the model is logged with the
    sklearn flavour under ``"<model_name>_model"``, ``results`` is logged as
    metrics and, if the model exposes ``get_params``, its parameters are
    logged too. A one-line summary is printed at the end.
    """
    artifact_name = f"{model_name}_model"

    with mlflow.start_run(run_name=model_name):
        mlflow.sklearn.log_model(model, artifact_name)
        mlflow.log_metrics(results)

        exposes_params = hasattr(model, "get_params")
        if exposes_params:
            hyperparams = model.get_params()
            mlflow.log_params(hyperparams)

        print(f"{model_name} -> {results}")

#### 4. Linear Regression


In [None]:
# Baseline: linear regression on the standardized numeric features,
# evaluated on both splits and logged to MLflow.
lr, lr_results = evaluate_model(
    LinearRegression(), X_train_scaled, y_train, X_test_scaled, y_test
)
log_to_mlflow(lr, "Linear_Regression", lr_results)

#### 5. Random Forest Regression


In [None]:
all_features = [
    "size_num",
    "bedrooms",
    "energy_label",
    "year_of_construction",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "contribution_vve_num",
    "external_storage_num",
    "has_mechanische_ventilatie",
    "has_tv_kabel",
    "has_lift",
    "has_natuurlijke_ventilatie",
    "has_n/a",
    "has_schuifpui",
    "has_glasvezelkabel",
    "has_frans_balkon",
    "has_buitenzonwering",
    "has_zonnepanelen",
    "has_airconditioning",
    "has_balansventilatie",
    "has_dakraam",
    "has_alarminstallatie",
    "has_domotica",
    "has_rookkanaal",
    "has_elektra",
    "has_sauna",
    "has_zonnecollectoren",
    "has_cctv",
    "has_rolluiken",
    "has_stromend_water",
    "has_satellietschotel",
    "num_facilities",
    "inhabitants_in_neighborhood",
    "families_with_children_pct",
    "price_per_m2_neighborhood",
]

# Treat the "N/A" placeholder as missing, then fill every missing value with 0.
X = df[all_features].replace("N/A", np.nan).fillna(0)

# After the fill above a missing energy label is 0; map it to the lowest
# class "G" so that it is a valid category for the encoder.
X["energy_label"] = X["energy_label"].replace({0: "G"})
# Categories listed from worst to best; the ordinal code follows this order.
energy_order = [
    "G",
    "F",
    "E",
    "D",
    "C",
    "B",
    "A",
    "A+",
    "A++",
    "A+++",
    "A++++",
]
encoder = OrdinalEncoder(categories=[energy_order])
X["energy_label_encoded"] = encoder.fit_transform(X[["energy_label"]])
X = X.drop(columns=["energy_label"])

y = df[target]

# Same 80/20 split and seed as for the linear baseline, now on all features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [None]:
X.energy_label_encoded.unique()

In [None]:
# Random forest on the unscaled full feature set, evaluated and logged.
rf, rf_results = evaluate_model(
    RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    X_train,
    y_train,
    X_test,
    y_test,
)
log_to_mlflow(rf, "Random_Forest_Regression", rf_results)

#### 6. XGBoost model


In [None]:
# XGBoost through its scikit-learn wrapper with fixed hyperparameters,
# evaluated with the shared helper and logged to MLflow.
xgb_model = xgb.XGBRegressor(
    n_estimators=500, max_depth=6, learning_rate=0.05, random_state=42
)
xgb_model, results = evaluate_model(
    xgb_model,
    X_train,
    y_train,
    X_test,
    y_test,
)
log_to_mlflow(xgb_model, "XGBoost_Regression", results)

#### 7. XGBoost with early stopping and more tuning


In [None]:
import xgboost as xgb

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parameters
params = {
    "objective": "reg:squarederror",
    "max_depth": 6,
    "eta": 0.05,
    "seed": 42,
}

# Train with early stopping
# NOTE(review): early stopping is monitored on dtest, which is built from the
# test split, so the test metrics below are measured on data that also
# steered training.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dtest, "eval")],
    early_stopping_rounds=50,
    verbose_eval=False,
)

# Predictions
# These module-level prediction arrays are reused by the residual/outlier
# cells further down.
y_train_pred = xgb_model.predict(dtrain)
y_test_pred = xgb_model.predict(dtest)

# Metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(
    f"Train RMSE: {train_rmse:.2f}, MAE: {train_mae:.2f}, train_R²: {train_r2:.3f}"
)
print(
    f"Test RMSE: {test_rmse:.2f}, MAE: {test_mae:.2f}, test_R²: {test_r2:.3f}"
)

# NOTE(review): this run reuses the name "XGBoost_Regression" of the previous
# cell's run, so the two are only distinguishable by run_id in the comparison.
with mlflow.start_run(run_name="XGBoost_Regression"):
    # Log model
    mlflow.xgboost.log_model(xgb_model, artifact_path="xgb_model")

    # Log metrics
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("train_mae", train_mae)
    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("test_r2", test_r2)

#### 7. Compare models using MLflow


In [None]:
experiment_name = "house_price_prediction"

experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id


runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

In [None]:
# Metric columns (as named by mlflow.search_runs) to show in the comparison.
metrics_of_interest = [
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_r2",
    "metrics.test_r2",
]
# Select and sort in one chained expression so that comparison_df is a fresh
# DataFrame. Sorting a column selection of runs_df with inplace=True triggers
# pandas' SettingWithCopyWarning.
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_r2", ascending=False)
comparison_df

In [None]:
best_model = comparison_df.sort_values(
    "metrics.test_r2", ascending=False
).iloc[0]
print("Best model based on test R²:")
print(best_model)

#### 8. Hyperparameter tuning with Optuna


In [None]:
def objective_xgb(trial):
    """
    Optuna objective: test RMSE of an XGBoost booster with sampled settings.

    Trains on the notebook-level ``dtrain`` with early stopping monitored on
    ``dtest``, prints RMSE and R² for both splits (scored against the
    notebook-level ``y_train`` / ``y_test``) and returns the test RMSE.
    """
    tuned = {
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
    }
    booster_params = {
        "objective": "reg:squarederror",
        **tuned,
        "seed": 42,
        "tree_method": "hist",
    }

    booster = xgb.train(
        booster_params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtest, "eval")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # (truth, prediction) pairs per split
    scored = {
        "train": (y_train, booster.predict(dtrain)),
        "test": (y_test, booster.predict(dtest)),
    }
    rmse = {
        split: np.sqrt(mean_squared_error(truth, pred))
        for split, (truth, pred) in scored.items()
    }
    r2 = {
        split: r2_score(truth, pred)
        for split, (truth, pred) in scored.items()
    }

    print(
        f"Train RMSE: {rmse['train']:.2f}, Test RMSE: {rmse['test']:.2f}, Train R²: {r2['train']:.3f}, Test R²: {r2['test']:.3f}"
    )

    # Optuna minimizes a single value: the test RMSE
    return rmse["test"]


def objective_rf(trial):
    """
    Optuna objective: test RMSE of a random forest with sampled settings.

    Fits on the notebook-level ``X_train`` / ``y_train`` and scores on
    ``X_test`` / ``y_test``.
    """
    n_estimators = trial.suggest_int("n_estimators", 200, 1000)
    max_depth = trial.suggest_int("max_depth", 5, 30)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)
    max_features = trial.suggest_categorical(
        "max_features", ["sqrt", "log2", None]
    )

    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)

    squared_error = mean_squared_error(y_test, forest.predict(X_test))
    return np.sqrt(squared_error)

In [None]:
# Run Optuna for XGBoost: 30 trials minimizing the value returned by
# objective_xgb (test RMSE).
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=30)

print("Best XGBoost params:", study_xgb.best_params)
print("Best XGBoost Test RMSE:", study_xgb.best_value)

# Run Optuna for RandomForest
study_rf = optuna.create_study(direction="minimize")
study_rf.optimize(objective_rf, n_trials=30)

print("Best RF params:", study_rf.best_params)
print("Best RF Test RMSE:", study_rf.best_value)

#### 9. RF and Xgboost with best parameters


In [None]:
# study_rf.best_params only holds the tuned hyperparameters. Re-apply the
# fixed settings used inside objective_rf (random_state=42, n_jobs=-1) so the
# refit is reproducible and matches the configuration that was tuned.
best_rf = RandomForestRegressor(
    **study_rf.best_params, random_state=42, n_jobs=-1
)
best_rf, results_rf = evaluate_model(best_rf, X_train, y_train, X_test, y_test)
log_to_mlflow(best_rf, "RF_Optuna", results_rf)

In [None]:
def evaluate_xgb_dmatrix(
    params,
    dtrain,
    dtest,
    run_name="XGBoost_Regression",
    num_boost_round=500,
    early_stopping_rounds=50,
):
    """
    Train an XGBoost booster with ``xgb.train`` and log it to MLflow.

    Early stopping is monitored on ``dtest``. RMSE, MAE and R² are computed
    for both splits against the notebook-level ``y_train`` / ``y_test``,
    printed, and logged together with the model and ``params`` in a run named
    ``run_name``. Returns ``(booster, results)``.
    """
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtest, "eval")],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=False,
    )

    # (split name, truth, prediction)
    splits = (
        ("train", y_train, booster.predict(dtrain)),
        ("test", y_test, booster.predict(dtest)),
    )
    scorers = (
        ("rmse", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ("mae", mean_absolute_error),
        ("r2", r2_score),
    )
    # Key order: train/test RMSE, train/test MAE, train/test R²
    results = {
        f"{split}_{metric}": scorer(truth, pred)
        for metric, scorer in scorers
        for split, truth, pred in splits
    }

    print(
        f"Train RMSE: {results['train_rmse']:.2f}, MAE: {results['train_mae']:.2f}, train_R²: {results['train_r2']:.3f}"
    )
    print(
        f"Test RMSE: {results['test_rmse']:.2f}, MAE: {results['test_mae']:.2f}, test_R²: {results['test_r2']:.3f}"
    )

    with mlflow.start_run(run_name=run_name):
        mlflow.xgboost.log_model(booster, artifact_path="xgb_model")
        mlflow.log_metrics(results)
        mlflow.log_params(params)

    return booster, results

In [None]:
# Start from the tuned values and add back the fixed settings that
# objective_xgb used but Optuna does not store in best_params.
best_params = study_xgb.best_params
best_params.update(
    {"objective": "reg:squarederror", "seed": 42, "tree_method": "hist"}
)
best_xgb, results_xgb = evaluate_xgb_dmatrix(
    best_params, dtrain, dtest, run_name="XGB_Optuna"
)

#### 10. Let's see how outliers skew RMSE


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot target distribution
sns.boxplot(y=y_test)
plt.show()

# Optional: scatter of predictions vs true values
plt.scatter(y_test, y_test_pred, alpha=0.5)
plt.xlabel("True")
plt.ylabel("Predicted")
plt.show()

In [None]:
# Compute residuals
residuals = y_test - y_test_pred

# Summary stats
print("Residuals summary:")
print("Min:", np.min(residuals))
print("Max:", np.max(residuals))
print("Median:", np.median(residuals))
print("Mean:", np.mean(residuals))
print("Std:", np.std(residuals))

# Plot histogram
plt.hist(residuals, bins=50)
plt.title("Residuals Distribution")
plt.xlabel("Residual")
plt.ylabel("Count")
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error

train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE:  {test_mae:.2f}")

So outliers are skewing the RMSE statistic quite heavily. Hence, I will log-transform the target and do the same analysis.


In [None]:
y_train_log = np.log1p(y_train)  # log(1 + price)
y_test_log = np.log1p(y_test)

In [None]:
def evaluate_model_log(model, X_train, y_train_log, X_test, y_test_log):
    """
    Fit a model on a log1p-transformed target and score it on the original scale.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : feature matrices for the two splits.
    y_train_log, y_test_log : ``log1p``-transformed targets for the two splits.

    Returns
    -------
    tuple
        ``(model, results, y_test_pred)`` where ``results`` holds train/test
        RMSE, MAE and R² on the original scale and ``y_test_pred`` are the
        back-transformed test predictions.

    The original-scale targets are recovered from the log targets that were
    passed in, so the metrics always belong to the same split as the
    features instead of whatever ``y_train`` / ``y_test`` happen to be
    defined at notebook level.
    """
    # Undo log1p to get the targets on the original scale
    y_train_true = np.expm1(y_train_log)
    y_test_true = np.expm1(y_test_log)

    # Fit model
    model.fit(X_train, y_train_log)

    # Predict and back-transform
    y_train_pred = np.expm1(model.predict(X_train))
    y_test_pred = np.expm1(model.predict(X_test))

    # Metrics
    results = {
        "train_rmse": np.sqrt(mean_squared_error(y_train_true, y_train_pred)),
        "test_rmse": np.sqrt(mean_squared_error(y_test_true, y_test_pred)),
        "train_mae": mean_absolute_error(y_train_true, y_train_pred),
        "test_mae": mean_absolute_error(y_test_true, y_test_pred),
        "train_r2": r2_score(y_train_true, y_train_pred),
        "test_r2": r2_score(y_test_true, y_test_pred),
    }

    # Residuals
    residuals = y_test_true - y_test_pred

    # Plot residuals
    plt.figure(figsize=(8, 4))
    plt.hist(residuals, bins=50)
    plt.title("Residuals Distribution")
    plt.xlabel("Residual")
    plt.ylabel("Count")
    plt.show()

    return model, results, y_test_pred

In [None]:
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
rf, rf_results, rf_pred = evaluate_model_log(
    rf, X_train, y_train_log, X_test, y_test_log
)
log_to_mlflow(rf, "RF_LogTransform", rf_results)

In [None]:
def evaluate_xgb_dmatrix_log(
    params,
    dtrain,
    dtest,
    y_train,
    y_test,
    run_name="XGBoost_Regression_Log",
    num_boost_round=500,
    early_stopping_rounds=50,
):
    """
    Train an XGBoost booster on log-transformed labels and score it on the
    original scale.

    ``dtrain`` / ``dtest`` are expected to carry log1p labels; ``y_train`` /
    ``y_test`` are the untransformed targets used for the metrics. The
    function prints the metrics, shows a histogram of test residuals, logs
    model, metrics and ``params`` to an MLflow run named ``run_name`` and
    returns ``(booster, results)``.
    """
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtest, "eval")],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=False,
    )

    # Predict on the log scale, then undo log1p
    train_pred = np.expm1(booster.predict(dtrain))
    test_pred = np.expm1(booster.predict(dtest))

    def _score(truth, pred):
        """RMSE, MAE and R² for one split."""
        return {
            "rmse": np.sqrt(mean_squared_error(truth, pred)),
            "mae": mean_absolute_error(truth, pred),
            "r2": r2_score(truth, pred),
        }

    train_scores = _score(y_train, train_pred)
    test_scores = _score(y_test, test_pred)

    results = {}
    for metric in ("rmse", "mae", "r2"):
        results[f"train_{metric}"] = train_scores[metric]
        results[f"test_{metric}"] = test_scores[metric]

    print(
        f"Train RMSE: {train_scores['rmse']:.2f}, MAE: {train_scores['mae']:.2f}, R²: {train_scores['r2']:.3f}"
    )
    print(
        f"Test RMSE: {test_scores['rmse']:.2f}, MAE: {test_scores['mae']:.2f}, R²: {test_scores['r2']:.3f}"
    )

    # Histogram of test residuals on the original scale
    test_residuals = y_test - test_pred
    plt.figure(figsize=(8, 4))
    plt.hist(test_residuals, bins=50)
    plt.title("Residuals Distribution")
    plt.xlabel("Residual")
    plt.ylabel("Count")
    plt.show()

    with mlflow.start_run(run_name=run_name):
        mlflow.xgboost.log_model(booster, artifact_path="xgb_model")
        mlflow.log_metrics(results)
        mlflow.log_params(params)

    return booster, results

In [None]:
# Log-transform y for DMatrix
# (this rebinds the notebook-level dtrain/dtest to log1p-labelled matrices)
dtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
dtest = xgb.DMatrix(X_test, label=np.log1p(y_test))

params = {
    "objective": "reg:squarederror",
    "max_depth": 6,
    "eta": 0.05,
    "seed": 42,
    "tree_method": "hist",
}

# The untransformed targets are passed separately so metrics are reported on
# the original price scale.
xgb_model_log, results_log = evaluate_xgb_dmatrix_log(
    params, dtrain, dtest, y_train, y_test
)

In [None]:
# Predictions of the log-target XGBoost model trained in the previous cell,
# back-transformed to the original price scale. They are recomputed here so
# the plot does not depend on a `y_test_pred` left over from an earlier model.
y_test_pred = np.expm1(xgb_model_log.predict(dtest))

# Compute residuals
residuals = y_test - y_test_pred

# Define extreme outliers: e.g., top 5% of absolute residuals
threshold = np.percentile(np.abs(residuals), 95)
# Plain boolean array so it can index both the Series and the ndarray
outliers_mask = np.asarray(np.abs(residuals) >= threshold)

plt.figure(figsize=(8, 6))

# Plot non-outliers
plt.scatter(
    y_test[~outliers_mask],
    y_test_pred[~outliers_mask],
    alpha=0.5,
    label="Normal listings",
)

# Highlight extreme residuals
plt.scatter(
    y_test[outliers_mask],
    y_test_pred[outliers_mask],
    color="red",
    label="Extreme listings",
)

# Diagonal line (perfect prediction)
max_val = max(y_test.max(), y_test_pred.max())
plt.plot(
    [0, max_val],
    [0, max_val],
    color="black",
    linestyle="--",
    label="Perfect prediction",
)

plt.xlabel("Actual Price (€)")
plt.ylabel("Predicted Price (€)")
plt.title("Predicted vs Actual Prices with Extreme Listings Highlighted")
plt.legend()
plt.show()

This is a clear visualization to see how predictions behave across the entire range and highlight the extreme listings that inflate RMSE. Large RMSE is not a deal breaker since:

**Statistical justification**
Skewed distribution: My dataset has a few extremely expensive houses that are far from the mean. RMSE is sensitive to large errors because it squares residuals, so these few points dominate the metric.

MAE is more robust: By reporting MAE alongside RMSE, I show the typical prediction error for most listings, which is a fairer assessment of model performance.

Log-transform mitigates skew: Training on log1p(y) reduces the influence of outliers and stabilizes variance, producing a more reliable model for the bulk of the data.

**Practical/business justification**

The extreme listings (multi-million € homes) are rare. The model performs well on 99% of listings, which is what matters for most users or business decisions.

Trying to perfectly predict the top 1–5% of luxury listings would:

Require specialized models or additional data

Complicate the pipeline

Increase overfitting risk

Reporting MAE and residual distributions communicates clearly that errors on extreme listings exist, but are expected and do not invalidate the model.


#### 11. New approach and moving with optuna


In [None]:
from sklearn.metrics import mean_absolute_error
from optuna.integration import XGBoostPruningCallback


# XGBoost objective for Optuna (optimize MAE)
def objective_xgb(trial, X_train, y_train, X_test, y_test):
    """
    Optuna objective: test MAE (original scale) of an XGBoost booster trained
    on a log1p-transformed target.

    Parameters
    ----------
    trial : optuna trial used to sample hyperparameters and report progress.
    X_train, y_train : training features and untransformed target.
    X_test, y_test : evaluation features and untransformed target.

    Returns
    -------
    float
        Mean absolute error of the back-transformed test predictions.

    Raises
    ------
    optuna.TrialPruned
        Raised by the pruning callback during training when the pruner
        decides to stop the trial.
    """
    # Log-transform target
    y_train_log = np.log1p(y_train)
    y_test_log = np.log1p(y_test)

    # Create DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train_log)
    dtest = xgb.DMatrix(X_test, label=y_test_log)

    # Hyperparameters
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5.0),
        "seed": 42,
        "tree_method": "hist",
    }

    # The callback reports "eval-mae" to the trial after every boosting round
    # and raises optuna.TrialPruned itself. A second trial.report /
    # should_prune after training would only repeat an already-reported step
    # and could discard a trial that has already finished, so it is omitted.
    pruning_callback = XGBoostPruningCallback(trial, "eval-mae")

    # Train model
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtest, "eval")],
        early_stopping_rounds=100,
        verbose_eval=False,
        callbacks=[pruning_callback],
    )

    # Back-transform predictions
    y_train_pred = np.expm1(model.predict(dtrain))
    y_test_pred = np.expm1(model.predict(dtest))

    # Compute MAE on original scale
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print(f"Train MAE: {train_mae:.2f}, Test MAE: {test_mae:.2f}")

    return test_mae  # Optuna optimizes this


# Random Forest objective for Optuna (optimize MAE)
def objective_rf(trial, X_train, y_train, X_test, y_test):
    """
    Optuna objective: test MAE (original scale) of a random forest fitted on
    a log1p-transformed target with sampled hyperparameters.
    """
    log_target = np.log1p(y_train)

    forest = RandomForestRegressor(
        n_estimators=trial.suggest_int("n_estimators", 200, 1000),
        max_depth=trial.suggest_int("max_depth", 5, 30),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
        max_features=trial.suggest_categorical(
            "max_features", ["sqrt", "log2", None]
        ),
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, log_target)

    # Undo log1p before scoring so the MAE is on the original scale
    predicted = np.expm1(forest.predict(X_test))
    return mean_absolute_error(y_test, predicted)

In [None]:
# Run Optuna for XGBoost
# Seeded TPE sampler plus a median pruner with 10 warm-up steps.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)

study_xgb = optuna.create_study(
    direction="minimize", sampler=sampler, pruner=pruner
)
# The lambda binds the current train/test split to the objective.
study_xgb.optimize(
    lambda trial: objective_xgb(trial, X_train, y_train, X_test, y_test),
    n_trials=200,
)

print("Best XGBoost params:", study_xgb.best_params)
print("Best XGBoost Test MAE:", study_xgb.best_value)

# Run Optuna for Random Forest
# NOTE(review): the same sampler and pruner objects are reused for this study;
# objective_rf never reports intermediate values, so the pruner presumably has
# no effect here - confirm.
study_rf = optuna.create_study(
    direction="minimize", sampler=sampler, pruner=pruner
)
study_rf.optimize(
    lambda trial: objective_rf(trial, X_train, y_train, X_test, y_test),
    n_trials=100,
)

print("Best RF params:", study_rf.best_params)
print("Best RF Test MAE:", study_rf.best_value)

In [None]:
def evaluate_xgb_dmatrix(
    params,
    dtrain,
    dtest,
    y_train_orig,
    y_test_orig,
    run_name="XGBoost_Regression",
    num_boost_round=500,
    early_stopping_rounds=50,
):
    """
    Train an XGBoost booster on log-transformed labels, score it on the
    original scale and log everything to MLflow.

    Parameters:
    - params: dict of XGBoost parameters
    - dtrain, dtest: xgb.DMatrix objects (labels expected to be log1p values)
    - y_train_orig, y_test_orig: untransformed targets used for the metrics
    - run_name: name of the MLflow run
    - num_boost_round: upper bound on boosting iterations
    - early_stopping_rounds: patience for early stopping, monitored on dtest

    Returns ``(booster, results)`` where ``results`` holds train/test RMSE,
    MAE and R².
    """
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtest, "eval")],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=False,
    )

    # Back-transform predictions; keep (truth, prediction) per split
    by_split = {
        "train": (y_train_orig, np.expm1(booster.predict(dtrain))),
        "test": (y_test_orig, np.expm1(booster.predict(dtest))),
    }

    def _rmse(truth, pred):
        return np.sqrt(mean_squared_error(truth, pred))

    results = {}
    for metric, scorer in (
        ("rmse", _rmse),
        ("mae", mean_absolute_error),
        ("r2", r2_score),
    ):
        for split, (truth, pred) in by_split.items():
            results[f"{split}_{metric}"] = scorer(truth, pred)

    # Print summary
    for label, split in (("Train", "train"), ("Test", "test")):
        rmse_value = results[f"{split}_rmse"]
        mae_value = results[f"{split}_mae"]
        r2_value = results[f"{split}_r2"]
        print(
            f"{label} RMSE: {rmse_value:.2f}, MAE: {mae_value:.2f}, R²: {r2_value:.3f}"
        )

    with mlflow.start_run(run_name=run_name):
        mlflow.xgboost.log_model(booster, artifact_path="xgb_model")
        mlflow.log_metrics(results)
        mlflow.log_params(params)

    return booster, results

In [None]:
# Convert log1p targets
dtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
dtest = xgb.DMatrix(X_test, label=np.log1p(y_test))

# Start from the tuned values and add the fixed settings that are not part
# of the search space.
best_params = study_xgb.best_params
best_params.update(
    {"objective": "reg:squarederror", "seed": 42, "tree_method": "hist"}
)

# Metrics are computed against the untransformed targets passed below.
best_xgb, results_xgb = evaluate_xgb_dmatrix(
    best_params,
    dtrain,
    dtest,
    y_train_orig=y_train,
    y_test_orig=y_test,
    run_name="XGB_Optuna_LogTransformed",
)

In [None]:
# Prepare log-transformed targets
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Refit best RF on log targets and evaluate.
# study_rf.best_params only holds the tuned hyperparameters, so the fixed
# settings from objective_rf (random_state=42, n_jobs=-1) are re-applied to
# keep the refit reproducible and consistent with the tuned configuration.
best_rf = RandomForestRegressor(
    **study_rf.best_params, random_state=42, n_jobs=-1
)
best_rf, results_rf, y_test_pred = evaluate_model_log(
    best_rf, X_train, y_train_log, X_test, y_test_log
)

# Log to MLflow
log_to_mlflow(best_rf, "RF_LogTransform_Optuna", results_rf)

In [None]:
# Fetch every run of the experiment as a DataFrame.
experiment_name = "house_price_prediction"

experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id


runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

metrics_of_interest = [
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mae",
    "metrics.test_mae",
]
# Select and sort in one chained expression so that comparison_df is a fresh
# DataFrame. Sorting a column selection of runs_df with inplace=True triggers
# pandas' SettingWithCopyWarning.
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_mae", ascending=True)
comparison_df

In [None]:
best_model = comparison_df.sort_values(
    "metrics.test_mae", ascending=True
).iloc[0]
print("Best model based on test MAE:")
print(best_model)

#### 12. Extra feature engineering


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

df = df.copy()


# -----------------------
# Helper Functions
# -----------------------
def to_float(value):
    """
    Parse a number out of a loosely formatted value.

    Everything except digits and dots is stripped from ``str(value)`` and
    the remainder is converted with ``float``. Missing input, input without
    any digits, and leftovers that are not a valid number (for example "."
    or "1.2.3") all yield ``np.nan`` instead of raising.
    """
    if pd.isna(value):
        return np.nan
    cleaned = re.sub(r"[^\d\.]", "", str(value))
    try:
        return float(cleaned)
    except ValueError:
        # Empty string or a digit/dot sequence that is not a number
        return np.nan


def auto_log_transform(df, numeric_cols, threshold_skew=0.5):
    """
    Add ``log_<col>`` columns for strictly positive, skewed columns.

    For each name in ``numeric_cols`` whose values are all > 0 and whose
    absolute skewness exceeds ``threshold_skew``, a ``log1p`` copy is written
    to ``df`` in place. Returns the list of column names that were added.
    """
    created = []
    for name in numeric_cols:
        series = df[name]
        if not (series > 0).all():
            continue
        if not abs(series.skew()) > threshold_skew:
            continue
        new_name = f"log_{name}"
        df[new_name] = np.log1p(series)
        created.append(new_name)
    return created


def extract_floor(x):
    """
    Return the floor number contained in ``x`` as an int.

    Missing values, "N/A" and "Begane grond" map to 0, as does any value
    without digits. Otherwise the first run of digits is returned. The value
    is converted to ``str`` before matching so that numeric input (e.g. ``2``)
    is handled instead of raising a TypeError.
    """
    if pd.isna(x) or x in ["N/A", "Begane grond"]:
        return 0
    match = re.search(r"(\d+)", str(x))
    return int(match.group(1)) if match else 0


def simplify_roof(roof):
    """
    Collapse a roof description into a coarse category.

    Missing values and "N/A" give "Unknown"; otherwise the first matching
    keyword (checked in the order below) decides, with "Other" as fallback.
    """
    if pd.isna(roof) or roof == "N/A":
        return "Unknown"

    keyword_to_label = (
        ("Plat dak", "Flat"),
        ("Zadeldak", "Saddle"),
        ("Samengesteld dak", "Composite"),
        ("Mansarde", "Mansard"),
    )
    for keyword, label in keyword_to_label:
        if keyword in roof:
            return label
    return "Other"


def simplify_ownership(x):
    """
    Collapse an ownership description into a coarse category.

    Missing or blank input gives "Unknown"; "Volle eigendom" gives "Full";
    "Erfpacht" gives "Municipal" when "Gemeentelijk" is also present and
    "Leasehold" otherwise; anything else is "Other".
    """
    if pd.isna(x) or not x.strip():
        return "Unknown"
    if "Volle eigendom" in x:
        return "Full"
    if "Erfpacht" not in x:
        return "Other"
    return "Municipal" if "Gemeentelijk" in x else "Leasehold"


def extract_lease_years(x, current_year=2025):
    """
    Return the number of years until the leasehold end date found in ``x``.

    The end date is read from text of the form
    "einddatum erfpacht: DD-MM-YYYY"; the result is the end year minus
    ``current_year``, floored at 0. Missing or blank input, full ownership
    ("Volle eigendom") and text without an end date give ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    if "Volle eigendom" in x or x.strip() == "":
        return np.nan

    found = re.search(r"einddatum erfpacht: (\d{2})-(\d{2})-(\d{4})", x)
    if found is None:
        return np.nan

    # Only the year (third group) is needed for the difference
    end_year = int(found.group(3))
    return max(end_year - current_year, 0)


def simplify_location(x):
    """
    Collapse a location description into a coarse category.

    Missing input gives "Unknown"; otherwise the first matching keyword
    (checked in the order below) decides, with "Other" as fallback.
    """
    if pd.isna(x):
        return "Unknown"

    keyword_to_label = (
        ("centrum", "Central"),
        ("woonwijk", "Residential"),
        ("vrij uitzicht", "OpenView"),
        ("park", "Park"),
    )
    return next(
        (label for keyword, label in keyword_to_label if keyword in x),
        "Other",
    )


def drop_low_variance_dummies(df, threshold=0.95):
    """Remove columns dominated by a single value.

    A column is dropped when its most frequent value (NaN counted as a
    value) makes up at least ``threshold`` of the rows.

    Args:
        df: DataFrame of candidate columns.
        threshold: Share of rows above which a column counts as
            low-variance.

    Returns:
        Tuple ``(reduced_df, dropped_column_names)``.
    """
    low_var_cols = []
    for name in df.columns:
        shares = df[name].value_counts(normalize=True, dropna=False)
        # value_counts sorts descending, so the first entry is the mode share.
        if shares.iloc[0] >= threshold:
            low_var_cols.append(name)
    reduced = df.drop(columns=low_var_cols)
    return reduced, low_var_cols


# -----------------------
# 1. Numeric features
# -----------------------
# Columns that are parsed to float, median-imputed and then offered to
# auto_log_transform as candidates for a log1p transform.
numeric_features = [
    "size_num",
    "contribution_vve_num",
    "external_storage_num",
    "living_area",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "num_facilities",
    "inhabitants_in_neighborhood",
    "families_with_children_pct",
    "price_per_m2_neighborhood",
]

for col in numeric_features:
    parsed = df[col].apply(to_float)
    # Assign the filled result back instead of calling fillna(inplace=True)
    # on `df[col]`: the chained in-place form operates on a temporary and
    # does not update `df` when pandas Copy-on-Write is active.
    df[col] = parsed.fillna(parsed.median())

# Adds log_<col> columns to df for the skewed, strictly positive candidates.
log_cols = auto_log_transform(df, numeric_features)

# -----------------------
# 2. Binary / flag features
# -----------------------
# Flag columns used by the model as 0/1 integers.
binary_flags = [
    "has_mechanische_ventilatie",
    "has_tv_kabel",
    "has_lift",
    "has_natuurlijke_ventilatie",
    "has_n/a",
    "has_schuifpui",
    "has_glasvezelkabel",
    "has_frans_balkon",
    "has_buitenzonwering",
    "has_zonnepanelen",
    # Removed very low-variance flags here (optional)
]

# A missing flag is treated as "feature absent" (0) before casting to int.
for col in binary_flags:
    filled_flag = df[col].fillna(0)
    df[col] = filled_flag.astype(int)

# -----------------------
# 3. Direct numeric features
# -----------------------
# Columns passed to the model without a log transform. Several of them also
# appear in `numeric_features`, so the model can receive both the raw and the
# log_ version of the same quantity. `floor_level`, `lease_years_remaining`,
# `backyard_num` and `balcony_flag` do not exist yet at this point; they are
# created in section 6 before this list is used in section 7.
direct_numeric_features = [
    "bedrooms",
    "year_of_construction",
    "contribution_vve_num",
    "size_num",
    "external_storage_num",
    "living_area",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "num_facilities",
    "floor_level",
    "lease_years_remaining",
    "backyard_num",
    "balcony_flag",
]

# -----------------------
# 4. Energy label encoding
# -----------------------
# Placeholder values (0 and "N/A") are folded into the lowest label, "G".
placeholder_labels = {0: "G", "N/A": "G"}
df["energy_label"] = df["energy_label"].replace(placeholder_labels)

# Labels listed from worst to best; the ordinal code is the list position.
energy_order = ["G", "F", "E", "D", "C", "B", "A"]
energy_order += ["A+", "A++", "A+++", "A++++"]

encoder_energy = OrdinalEncoder(categories=[energy_order], dtype=int)
df["energy_label_encoded"] = encoder_energy.fit_transform(df[["energy_label"]])

# The raw label is no longer needed once the encoded column exists.
df.drop(columns=["energy_label"], inplace=True)

# -----------------------
# 5. Categorical features (OHE)
# -----------------------
def _one_hot(column, prefix):
    """One-hot encode df[column]; the first level is dropped as reference."""
    return pd.get_dummies(df[column], prefix=prefix, drop_first=True)


# District = first three characters of the cleaned postal code.
df["postal_district"] = df["postal_code_clean"].str[:3]
postal_ohe = _one_hot("postal_district", "district")

# Missing status becomes its own "N/A" level.
df["status"] = df["status"].fillna("N/A")
status_ohe = _one_hot("status", "status")

df["roof_type_simple"] = df["roof_type"].apply(simplify_roof)
roof_ohe = _one_hot("roof_type_simple", "roof")

df["ownership_simple"] = df["ownership_type"].apply(simplify_ownership)
ownership_ohe = _one_hot("ownership_simple", "ownership")

df["location_simple"] = df["location"].apply(simplify_location)
location_ohe = _one_hot("location_simple", "location")

# Missing garden becomes the literal level "None".
df["garden"] = df["garden"].fillna("None")
garden_ohe = _one_hot("garden", "garden")

# -----------------------
# 6. Numeric additions
# -----------------------
def _balcony_to_flag(value):
    """Return 0 for a missing or "N/A" balcony entry, otherwise 1."""
    is_absent = pd.isna(value) or value == "N/A"
    return 0 if is_absent else 1


# Floor number parsed from the "located_on" text (0 when unknown / ground).
df["floor_level"] = df["located_on"].apply(extract_floor)

# Rows without a leasehold end date get 0 remaining years.
lease_years = df["ownership_type"].apply(extract_lease_years)
df["lease_years_remaining"] = lease_years.fillna(0)

# Unparseable or missing backyard values count as 0.
backyard_values = df["backyard"].apply(to_float)
df["backyard_num"] = backyard_values.fillna(0)

df["balcony_flag"] = df["balcony"].apply(_balcony_to_flag)

# -----------------------
# 7. Combine all features
# -----------------------
# Non-dummy model inputs: log-transformed columns, flags, raw numerics and
# the ordinal energy label.
model_features = [
    *log_cols,
    *binary_flags,
    *direct_numeric_features,
    "energy_label_encoded",
]

# Combine all OHE features
ohe_blocks = [
    postal_ohe,
    status_ohe,
    roof_ohe,
    ownership_ohe,
    location_ohe,
    garden_ohe,
]
ohe_all = pd.concat(ohe_blocks, axis=1)

# Drop low-variance dummy columns
ohe_reduced, dropped_cols = drop_low_variance_dummies(ohe_all, threshold=0.95)
print(f"Dropped {len(dropped_cols)} low-variance columns:", dropped_cols)

# Final X
base_features = df[model_features]
X = pd.concat([base_features, ohe_reduced], axis=1)
y = df["price_num"]

print("Number of features for modeling:", X.shape[1])
print("Automatically log-transformed columns:", log_cols)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Coerce every non-dummy numeric feature to a numeric dtype; values that
# cannot be converted become NaN.
numeric_cols = log_cols + direct_numeric_features + ["energy_label_encoded"]
X_numeric = X[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Pairwise correlation between the numeric features
corr_matrix = X_numeric.corr()

# Annotated heatmap of the full matrix
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation matrix for numeric features")
plt.show()

# Walk the upper triangle only, so each pair is reported once and the
# diagonal is skipped.
cols = corr_matrix.columns
n_cols = len(cols)
high_corr = [
    (cols[i], cols[j], corr_matrix.iloc[i, j])
    for i in range(n_cols)
    for j in range(i + 1, n_cols)
    if abs(corr_matrix.iloc[i, j]) > 0.9
]

print("Highly correlated numeric pairs (|r|>0.9):")
for pair in high_corr:
    print(pair)

In [None]:
# Raw columns removed from the design matrix before modelling.
cols_to_drop = [
    "size_num",
    "living_area",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "num_facilities",
    "external_storage_num",
]
# Rebind X instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not fail with a KeyError.
X = X.drop(columns=cols_to_drop, errors="ignore")
# -----------------------
# 8. Train/Test split
# -----------------------
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

## 13. Baseline models after feature engineering


#### Baseline Random Forest


In [None]:
# Fixed-hyperparameter random forest used as the post-feature-engineering
# baseline; evaluate_model returns the model together with its results.
baseline_forest = RandomForestRegressor(
    n_estimators=200, max_depth=10, random_state=42
)
rf, rf_results = evaluate_model(
    baseline_forest, X_train, y_train, X_test, y_test
)

# Record the model and its results under a dedicated run name.
log_to_mlflow(rf, "Random_Forest_Regression_feature_eng", rf_results)

#### Baseline XGBoost with early stopping


In [None]:
import xgboost as xgb

# Wrap both splits in XGBoost's native DMatrix container
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster settings for the baseline
params = dict(
    objective="reg:squarederror",
    max_depth=6,
    eta=0.05,
    seed=42,
)

# Up to 500 rounds; training stops once the "eval" metric has not improved
# for 50 consecutive rounds.
# NOTE(review): the set monitored for early stopping is the test split, so the
# test metrics below are not a fully independent estimate.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dtest, "eval")],
    early_stopping_rounds=50,
    verbose_eval=False,
)

# Predictions on both splits
y_train_pred = xgb_model.predict(dtrain)
y_test_pred = xgb_model.predict(dtest)

# Train-side metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Test-side metrics
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

train_summary = (
    f"Train RMSE: {train_rmse:.2f}, MAE: {train_mae:.2f}, train_R²: {train_r2:.3f}"
)
test_summary = (
    f"Test RMSE: {test_rmse:.2f}, MAE: {test_mae:.2f}, test_R²: {test_r2:.3f}"
)
print(train_summary)
print(test_summary)

# Run name spelled "feature_eng" so it matches the other runs of this stage
# (it was previously misspelled "feeture_eng").
with mlflow.start_run(run_name="XGBoost_Regression_feature_eng"):
    # Log model
    mlflow.xgboost.log_model(xgb_model, artifact_path="xgb_model_feature_eng")

    # Log all six train/test metrics in one call
    mlflow.log_metrics(
        {
            "train_rmse": train_rmse,
            "test_rmse": test_rmse,
            "train_mae": train_mae,
            "test_mae": test_mae,
            "train_r2": train_r2,
            "test_r2": test_r2,
        }
    )

## 14. Optuna tuning after feature eng


In [None]:
# Run Optuna for XGBoost
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)

# NOTE(review): the objectives receive the test split and the printed best
# value is labelled "Test MAE", so the tuned score is presumably computed on
# the same data later used for reporting - confirm, and consider a separate
# validation split.
study_xgb = optuna.create_study(
    direction="minimize", sampler=sampler, pruner=pruner
)
study_xgb.optimize(
    lambda trial: objective_xgb(trial, X_train, y_train, X_test, y_test),
    n_trials=200,
)

print("Best XGBoost params:", study_xgb.best_params)
print("Best XGBoost Test MAE:", study_xgb.best_value)

# Run Optuna for Random Forest
# A fresh seeded sampler (and pruner) is created for this study. Reusing the
# XGBoost study's sampler instance would continue from its already advanced
# random state, making the RF search depend on how the XGBoost study ran.
study_rf = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
)
study_rf.optimize(
    lambda trial: objective_rf(trial, X_train, y_train, X_test, y_test),
    n_trials=100,
)

print("Best RF params:", study_rf.best_params)
print("Best RF Test MAE:", study_rf.best_value)

## 15. Best params run after feature eng


In [None]:
# Build DMatrix objects whose labels are the log1p-transformed prices
dtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
dtest = xgb.DMatrix(X_test, label=np.log1p(y_test))

# Tuned hyperparameters from the Optuna study, with the fixed training
# settings layered on top (the fixed settings win on a key clash).
best_params = {
    **study_xgb.best_params,
    "objective": "reg:squarederror",
    "seed": 42,
    "tree_method": "hist",
}

# The original-scale targets are passed alongside the log-label matrices.
best_xgb, results_xgb = evaluate_xgb_dmatrix(
    best_params,
    dtrain,
    dtest,
    y_train_orig=y_train,
    y_test_orig=y_test,
    run_name="XGB_Optuna_LogTransformed_feature_eng",
)

In [None]:
# Prepare log-transformed targets
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Refit best RF on log targets and evaluate.
# random_state=42 is supplied as a default so the refit is reproducible, like
# the other models in this notebook; a random_state stored in the tuned
# parameters (if any) still takes precedence.
rf_params = {"random_state": 42, **study_rf.best_params}
best_rf = RandomForestRegressor(**rf_params)
best_rf, results_rf, y_test_pred = evaluate_model_log(
    best_rf, X_train, y_train_log, X_test, y_test_log
)

# Log to MLflow
log_to_mlflow(best_rf, "RF_LogTransform_Optuna_feature_eng", results_rf)

In [None]:
experiment_name = "house_price_prediction"

experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # message that points at the tracking store being queried.
    raise ValueError(
        f"MLflow experiment {experiment_name!r} not found at "
        f"{mlflow.get_tracking_uri()}"
    )
experiment_id = experiment.experiment_id

# One row per run of the experiment
runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

metrics_of_interest = [
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mae",
    "metrics.test_mae",
]
# sort_values returns a new frame; sorting a column slice of runs_df in place
# would trigger pandas' SettingWithCopy warning.
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_mae", ascending=True)
comparison_df

I have chosen run_id 33688ff883c54d2fb4a14cbef2ae617a because its combination of statistics looks the best: its R2 is among the highest for both test and train, and its RMSE and MAE are among the lowest.


In [None]:
# Show which tracking store MLflow is currently pointed at.
print(mlflow.get_tracking_uri())

In [None]:
from mlflow.tracking import MlflowClient

# Run selected from the comparison of logged runs earlier in the notebook.
run_id = "33688ff883c54d2fb4a14cbef2ae617a"
client = MlflowClient()

# Print the paths of everything stored under the run's "xgb_model" artifact
# directory, to confirm the model artifact is where it is expected.
artifacts = client.list_artifacts(
    run_id, path="xgb_model"
)  # match the artifact_path you used
for a in artifacts:
    print(a.path)

In [None]:
run_id = "33688ff883c54d2fb4a14cbef2ae617a"
# "runs:/<run_id>/<artifact_path>" URI of the model stored under "xgb_model".
model_path = f"runs:/{run_id}/xgb_model"

# Load the model
loaded_model = mlflow.xgboost.load_model(model_path)

# Make predictions
# NOTE(review): `dtest` is a notebook global, so this uses whichever DMatrix
# the most recently executed cell left behind. Confirm that it holds the same
# feature columns this run was trained on, and whether the model predicts
# log1p(price) and therefore needs np.expm1 applied - TODO confirm.
y_pred = loaded_model.predict(dtest)  # dtest = xgb.DMatrix(X_test)

In [None]:
# Fetch the stored record for `run_id` and print what was logged for it.
run = mlflow.get_run(run_id)
print(run.data.metrics)  # Train/test RMSE, R2, etc.
print(run.data.params)  # Model hyperparameters

In [None]:
from mlflow.tracking import MlflowClient

# NOTE(review): `client` is created here but not used in this cell.
client = MlflowClient()

# Register model
# The URI points at the "xgb_model" artifact of the run held in `run_id`.
model_uri = f"runs:/{run_id}/xgb_model"
registered_model_name = "RealEstate_XGB"
model_version = mlflow.register_model(model_uri, registered_model_name)

print(
    f"Model registered as {registered_model_name}, version {model_version.version}"
)
# The selected best model is now stored in the registry under this name.

#### 16. Cross validation for this best model


In [None]:
# End the currently active MLflow run, if there is one, so the next
# `mlflow.start_run` begins from a clean state.
mlflow.end_run()

In [None]:
# Parameters
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

cv_metrics = []

X_np = X_train.values  # or X_train_scaled if scaling used
y_np = y_train.values

# Everything that is logged to MLflow happens inside this `with` block.
# Logging after the block has exited would make MLflow start a new, unnamed
# run for the aggregate metrics and leave that run active.
with mlflow.start_run(run_name="XGB_CV_last_model"):

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_np)):
        fold_no = fold + 1
        X_tr, X_val = X_np[train_idx], X_np[val_idx]
        y_tr, y_val = y_np[train_idx], y_np[val_idx]

        # The model is trained on log1p(price)
        dtrain = xgb.DMatrix(X_tr, label=np.log1p(y_tr))
        dval = xgb.DMatrix(X_val, label=np.log1p(y_val))

        # NOTE(review): early stopping monitors the same fold that is scored
        # below, which makes the fold metrics somewhat optimistic.
        xgb_model = xgb.train(
            best_params,
            dtrain,
            num_boost_round=500,
            evals=[(dval, "eval")],
            early_stopping_rounds=50,
            verbose_eval=False,
        )

        # Predict and back-transform to the original price scale
        y_val_pred = np.expm1(xgb_model.predict(dval))

        # Metrics
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        mae = mean_absolute_error(y_val, y_val_pred)
        r2 = r2_score(y_val, y_val_pred)
        cv_metrics.append({"rmse": rmse, "mae": mae, "r2": r2})

        # Log metrics per fold
        mlflow.log_metric(f"fold_{fold_no}_rmse", rmse)
        mlflow.log_metric(f"fold_{fold_no}_mae", mae)
        mlflow.log_metric(f"fold_{fold_no}_r2", r2)

        # Plot predicted vs actual
        fig, ax = plt.subplots(figsize=(6, 6))
        sns.scatterplot(x=y_val, y=y_val_pred, ax=ax)
        ax.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], "r--")
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predicted")
        ax.set_title(f"Fold {fold_no} Predicted vs Actual")

        # Store the figure as a run artifact BEFORE showing it: once
        # plt.show() has run, the figure may already be closed and saving it
        # afterwards can produce an empty image.
        mlflow.log_figure(fig, f"fold_{fold_no}_pred_vs_actual.png")
        plt.show()  # show inline
        plt.close(fig)

    # Aggregate metrics across folds
    mean_rmse = np.mean([m["rmse"] for m in cv_metrics])
    std_rmse = np.std([m["rmse"] for m in cv_metrics])
    mean_mae = np.mean([m["mae"] for m in cv_metrics])
    std_mae = np.std([m["mae"] for m in cv_metrics])
    mean_r2 = np.mean([m["r2"] for m in cv_metrics])
    std_r2 = np.std([m["r2"] for m in cv_metrics])

    # Log aggregated metrics to the same run as the per-fold metrics
    mlflow.log_metrics(
        {
            "CV_mean_rmse": mean_rmse,
            "CV_std_rmse": std_rmse,
            "CV_mean_mae": mean_mae,
            "CV_std_mae": std_mae,
            "CV_mean_r2": mean_r2,
            "CV_std_r2": std_r2,
        }
    )

print(f"CV RMSE: {mean_rmse:.2f} ± {std_rmse:.2f}")
print(f"CV MAE: {mean_mae:.2f} ± {std_mae:.2f}")
print(f"CV R²: {mean_r2:.3f} ± {std_r2:.3f}")

In [None]:
# Re-read the record of the run held in `run_id` and print its logged
# metrics and parameters.
run = mlflow.get_run(run_id)
print(run.data.metrics)  # Train/test RMSE, R2, etc.
print(run.data.params)

In [None]:
# One array per fold is collected here and stitched together afterwards
y_vals_all = []
y_preds_all = []

# Re-run the same K-fold training to gather out-of-fold predictions
for train_idx, val_idx in kf.split(X_np):
    X_tr, y_tr = X_np[train_idx], y_np[train_idx]
    X_val, y_val = X_np[val_idx], y_np[val_idx]

    # Labels are log1p(price), matching how the model is trained elsewhere
    dtrain = xgb.DMatrix(X_tr, label=np.log1p(y_tr))
    dval = xgb.DMatrix(X_val, label=np.log1p(y_val))

    xgb_model = xgb.train(
        best_params,
        dtrain,
        num_boost_round=500,
        evals=[(dval, "eval")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # Back-transform predictions to the original price scale
    y_val_pred = np.expm1(xgb_model.predict(dval))

    y_vals_all.append(y_val)
    y_preds_all.append(y_val_pred)

# Flatten the per-fold pieces into single arrays
y_vals_all = np.concatenate(y_vals_all)
y_preds_all = np.concatenate(y_preds_all)
residuals = y_vals_all - y_preds_all

# Predicted vs Actual, with the identity line as reference
actual_min = y_vals_all.min()
actual_max = y_vals_all.max()
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y_vals_all, y=y_preds_all)
plt.plot(
    [actual_min, actual_max],
    [actual_min, actual_max],
    "r--",
    label="Perfect Prediction",
)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("CV: Predicted vs Actual (all folds)")
plt.legend()
plt.show()

# Distribution of the out-of-fold residuals
plt.figure(figsize=(6, 4))
sns.histplot(residuals, kde=True, bins=20, color="skyblue")
plt.xlabel("Residual (Actual - Predicted)")
plt.title("CV Residuals Distribution")
plt.show()

#### Generating price range (for pipeline)


In [None]:
import pandas as pd
import numpy as np

# Number of bins
n_bins = 10

# Pair every out-of-fold prediction with its residual. A dedicated name is
# used so that the notebook-wide `df` (the feature frame used for modelling)
# is not overwritten by this small helper table.
residuals_df = pd.DataFrame({"pred": y_preds_all, "residual": residuals})

# Use qcut instead of cut to ensure roughly equal-sized bins
residuals_df["pred_bin"] = pd.qcut(
    residuals_df["pred"], q=n_bins, duplicates="drop"
)

# Compute 5th and 95th percentiles of the residuals per prediction bin.
# observed=True restricts the result to bins that actually contain data and
# avoids the pandas FutureWarning for grouping on a categorical column.
bin_ranges = (
    residuals_df.groupby("pred_bin", observed=True)["residual"]
    .agg(
        lower_bound=lambda x: np.percentile(x, 5),
        upper_bound=lambda x: np.percentile(x, 95),
    )
    .reset_index()
)


# Function to get price range
def get_price_range(pred_price, ranges=None):
    """Return a (lower, upper) price range around a predicted price.

    The range is the prediction shifted by the residual bounds of the bin
    that contains it. Bins are checked in table order and both edges are
    treated as inclusive, so a value sitting exactly on a shared edge uses
    the earlier bin. A prediction that falls outside every bin uses the
    bounds of the nearest bin instead of collapsing to a zero-width range.

    Args:
        pred_price: Predicted price on the original scale.
        ranges: Optional table with columns ``pred_bin`` (interval objects
            exposing ``left``/``right``), ``lower_bound`` and
            ``upper_bound``. Defaults to the module-level ``bin_ranges``.

    Returns:
        Tuple ``(pred_price + lower_bound, pred_price + upper_bound)``, or
        ``(pred_price, pred_price)`` when the table has no rows.
    """
    table = bin_ranges if ranges is None else ranges
    rows = list(table.itertuples(index=False))
    if not rows:
        return pred_price, pred_price

    for row in rows:
        if row.pred_bin.left <= pred_price <= row.pred_bin.right:
            return pred_price + row.lower_bound, pred_price + row.upper_bound

    def _gap(row):
        # Distance from the prediction to the closest edge of this bin.
        return max(
            row.pred_bin.left - pred_price, pred_price - row.pred_bin.right
        )

    nearest = min(rows, key=_gap)
    return pred_price + nearest.lower_bound, pred_price + nearest.upper_bound


# Example
# Look up the residual-based range for one hypothetical prediction and print
# it rounded to whole units.
example_pred = 500_000
lb, ub = get_price_range(example_pred)
print(f"Predicted price: {example_pred}, Range: {lb:.0f} - {ub:.0f}")

Still to do: figure out how to log these price ranges.
