#### 1. Imports and Set-up


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Notebook-wide display defaults: never truncate DataFrame columns, and use
# seaborn's grid style for every plot.
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

# MLflow store location, relative to the notebook's working directory.
# Creating the ".trash" sub-folder also creates the store directory itself.
tracking_uri = "../logs/mlruns"
trash_dir = os.path.join(tracking_uri, ".trash")
os.makedirs(trash_dir, exist_ok=True)

# Every run logged below goes to this store, under a single experiment.
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("house_price_prediction")

#### 2. Load and prep data


In [None]:
import sys
import os
from pathlib import Path
import yaml


# Make the directory one level above the working directory importable so the
# `src.*` packages below resolve (the notebook is expected to run from
# notebooks/).
project_root = os.path.abspath(os.path.join(".."))

if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df
from src.data_loading.preprocessing.imputation import impute_missing_values


# ROOT is the directory that contains "house_price_prediction_project":
# three levels above this file when run as a script, two levels above the
# working directory when run interactively (no __file__ in a notebook).
ROOT = (
    Path(__file__).resolve().parents[2]
    if "__file__" in globals()
    else Path.cwd().parents[1]
)
CONFIG_PATH = (
    ROOT
    / "house_price_prediction_project"
    / "config"
    / "preprocessing_config.yaml"
)

# YAML is read as UTF-8 explicitly rather than with the platform default
# encoding, so the config parses the same on every OS.
with open(CONFIG_PATH, encoding="utf-8") as f:
    CONFIG = yaml.safe_load(f)

df_raw = load_data_from_json("../data/parsed_json/*.json")
df_clean = preprocess_df(
    df_raw,
    drop_raw=CONFIG["preprocessing"]["drop_raw"],
    numeric_cols=CONFIG["preprocessing"]["numeric_cols"],
)
df_clean = impute_missing_values(
    df_clean, CONFIG["preprocessing"]["imputation"]
)

# Rows without a target cannot be used for training, and "living_area" is
# dropped as a feature. Done as one non-mutating chain so no in-place edit
# is performed on a filtered frame.
df_clean = df_clean.loc[df_clean["price_num"].notna()].drop(
    columns=["living_area"]
)
# df_clean = df_clean[:100]  # uncomment for a quick smoke run on a small sample
df = df_clean.copy()

In [None]:
df

In [None]:
from collections import Counter

# Location of the scraped-URL log (house_pages_scraped.txt in config/).
file_path = (
    ROOT / "house_price_prediction_project" / "config" / "house_pages_scraped.txt"
)

# One URL per line.
with open(file_path, "r") as f:
    urls = f.read().splitlines()

# A listing counts when its URL mentions both "koop" and "amsterdam",
# compared case-insensitively.
count_amsterdam = sum(
    1
    for lowered in map(str.lower, urls)
    if "koop" in lowered and "amsterdam" in lowered
)
print(f"Number of koop/amsterdam listings: {count_amsterdam}")


In [None]:
df.isna().sum()

In [None]:
df.columns

In [None]:
from src.features.data_prep_for_modelling.data_preparation import prepare_data

FEATURES_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"
)

# Splits for the linear baseline, using the "linear_regression" entry of the
# YAML config and the basic (non-extended) feature set.
# NOTE(review): scaling is presumably applied inside prepare_data according
# to that YAML entry - confirm against its implementation.
(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    X_val,
    y_val,
    scaler,
    _,
) = prepare_data(
    df,
    config_path=FEATURES_CONFIG_PATH,
    model_name="linear_regression",
    use_extended_features=False,
    cv=False,
)

#### 4. Linear Regression


In [None]:
from src.model.evaluate import ModelEvaluator
from src.model.mlflow_logger import MLFlowLogger

# Helpers reused by the following cells: `evaluator` runs the fit/score
# routine, `logger` hands the fitted model and its metrics to the project's
# MLflow logging wrapper.
evaluator = ModelEvaluator()
logger = MLFlowLogger()

# Baseline: plain linear regression on the scaled features, with no extra
# model or fit parameters.
lr_model = LinearRegression()
lr_eval_kwargs = {
    "model": lr_model,
    "X_train": X_train_scaled,
    "y_train": y_train,
    "X_test": X_test_scaled,
    "y_test": y_test,
    "model_params": {},
    "fit_params": {},
    "use_xgb_train": False,
}
(
    trained_lr,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    lr_results,
) = evaluator.evaluate(**lr_eval_kwargs)

# Record the fitted model together with its results.
logger.log_model(trained_lr, "LinearRegression", lr_results)

#### 5. Random Forest Regression


In [None]:
from src.features.feature_engineering.encoding import encode_energy_label

# Splits for the random-forest baseline (basic feature set, no CV folds).
# The unpack order follows every other prepare_data call in this notebook:
# (..., X_val, y_val, scaler, extras). This cell previously unpacked
# (..., scaler, X_val, y_val, _), which bound the validation features to
# `scaler` and shifted `X_val` / `y_val` by one position.
X_train, X_test, y_train, y_test, X_val, y_val, scaler, _ = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name="random_forest",
    use_extended_features=False,
    cv=False,
)

In [None]:
X_train.energy_label_encoded.unique()

In [None]:
# Untuned random forest (library defaults) as the tree-based baseline.
rf_model = RandomForestRegressor()

rf_eval_kwargs = {
    "model": rf_model,
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "model_params": {},
    "fit_params": {},
    "use_xgb_train": False,
}
(
    trained_rf,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    rf_results,
) = evaluator.evaluate(**rf_eval_kwargs)

logger.log_model(trained_rf, "RandomForestRegression", rf_results)

#### 6. XGBoost model


In [None]:
from src.model.utils import load_model_config_and_search_space

# Splits built from the "xgboost" entry of the feature config.
X_train, X_test, y_train, y_test, X_val, y_val, scaler, _ = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name="xgboost",
    use_extended_features=False,
    cv=False,
)

MODEL_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"
)

model_params, fit_params, _ = load_model_config_and_search_space(
    MODEL_CONFIG_PATH, model_name="xgboost"
)

# n_estimators is a constructor argument of XGBRegressor, so it is taken out
# of a copy of the fit parameters (defaulting to 100 when the config omits
# it); the original fit_params mapping is left untouched.
fit_params_safe = fit_params.copy()
n_estimators = fit_params_safe.pop("n_estimators", 100)

xgb_model = xgb.XGBRegressor(n_estimators=n_estimators, **model_params)

(
    trained_xgb,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    xgb_results,
) = evaluator.evaluate(
    xgb_model,
    X_train,
    y_train,
    X_test=X_test,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
    fit_params=fit_params_safe,
    use_xgb_train=False,
)
logger.log_model(trained_xgb, "XGBoostRegression", xgb_results)

#### 7. XGBoost with early stopping and more tuning


In [None]:
# Same pipeline as the plain XGBoost cell, but driven by the
# "xgboost_early_stopping" config entry and the evaluator's use_xgb_train
# path, with the validation split passed in.
early_stopping_name = "xgboost_early_stopping"

X_train, X_test, y_train, y_test, X_val, y_val, scaler, _ = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name=early_stopping_name,
    use_extended_features=False,
    cv=False,
)

xgb_model_params, xgb_fit_params, _ = load_model_config_and_search_space(
    MODEL_CONFIG_PATH, early_stopping_name
)

xgb_model = xgb.XGBRegressor(**xgb_model_params)

(
    trained_xgb,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    xgb_results,
) = evaluator.evaluate(
    xgb_model,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    fit_params=xgb_fit_params,
    use_xgb_train=True,
)

logger.log_model(
    trained_xgb,
    "xgb_with_early_stopping",
    xgb_results,
    use_xgb_train=True,
)

#### 7. Compare models using MLflow


In [None]:
# Resolve the experiment by name, then pull all of its runs into a DataFrame.
experiment_name = "house_price_prediction"
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id
runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

In [None]:
# Metric columns to compare across runs (MLflow prefixes logged metrics with
# "metrics."). The transformed-scale variants are kept as opt-in toggles.
metrics_of_interest = [
    # Original scale
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_mae",
    "metrics.test_mae",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mape",
    "metrics.test_mape",

    # Log / transformed scale
    # "metrics.train_rmse_trans",
    # "metrics.test_rmse_trans",
    # "metrics.train_mae_trans",
    # "metrics.test_mae_trans",
    # "metrics.train_r2_trans",
    # "metrics.test_r2_trans",
    # "metrics.train_mape_trans",
    # "metrics.test_mape_trans",
]

# Select and sort in one expression: sort_values returns a new frame, so no
# in-place mutation happens on a column subset of runs_df (the in-place form
# can raise SettingWithCopyWarning). Highest test R² first.
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_r2", ascending=False)
comparison_df

In [None]:
# The top row after sorting by test R² (descending) is the best run.
ranked_by_r2 = comparison_df.sort_values("metrics.test_r2", ascending=False)
best_model = ranked_by_r2.iloc[0]
print("Best model based on test R²:")
print(best_model)

#### 8. Hyperparameter tuning with Optuna


In [None]:
from functools import partial
from src.model.objectives_optuna import unified_objective

# A single YAML file serves as both the feature config and the model config.
FEATURES_AND_MODEL_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"
)

# Arguments common to both objectives; only the model name differs.
shared_objective_kwargs = {
    "df": df_clean,
    "features_config": FEATURES_AND_MODEL_CONFIG_PATH,
    "model_config": FEATURES_AND_MODEL_CONFIG_PATH,
    "use_extended_features": False,
}

# --- XGBoost: seeded TPE sampler plus a median pruner, 30 trials ---
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(
    direction="minimize", sampler=sampler, pruner=pruner
)
objective_xgb_partial = partial(
    unified_objective,
    model_name="xgboost_early_stopping",
    **shared_objective_kwargs,
)
study_xgb.optimize(objective_xgb_partial, n_trials=30)

print("Best XGBoost params:", study_xgb.best_params)
print("Best XGBoost Test RMSE:", study_xgb.best_value)

# --- Random forest: default sampler and pruner, 30 trials ---
# NOTE(review): unlike the XGBoost study this one is unseeded, so repeated
# executions can select different parameters.
study_rf = optuna.create_study(direction="minimize")
objective_rf_partial = partial(
    unified_objective,
    model_name="random_forest_optuna",
    **shared_objective_kwargs,
)
study_rf.optimize(objective_rf_partial, n_trials=30)

print("Best RF params:", study_rf.best_params)
print("Best RF Test RMSE:", study_rf.best_value)

#### 9. RF and Xgboost with best parameters


In [None]:
# Refit a random forest with the parameters selected by the Optuna study.
# NOTE(review): the X_*/y_* splits are whatever the most recently executed
# prepare_data cell produced - confirm they use the intended feature set
# for this model.
best_rf = RandomForestRegressor(**study_rf.best_params)

rf_optuna_kwargs = {
    "model": best_rf,
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "X_val": X_val,
    "y_val": y_val,
    "fit_params": {},
}
(
    trained_rf,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    results_rf,
) = evaluator.evaluate(**rf_optuna_kwargs)

logger.log_model(trained_rf, "RF_Optuna", results_rf)

In [None]:
# Rebuild the early-stopping splits, then train XGBoost with the parameters
# selected by the Optuna study.
X_train, X_test, y_train, y_test, X_val, y_val, _, _ = prepare_data(
    df_clean,
    config_path=FEATURES_AND_MODEL_CONFIG_PATH,
    model_name="xgboost_early_stopping",
    use_extended_features=False,
    cv=False,
)

# Only the fit parameters from the YAML are used below; the model itself is
# built from the tuned parameters.
xgb_model_params, xgb_fit_params, _ = load_model_config_and_search_space(
    MODEL_CONFIG_PATH, "xgboost_early_stopping"
)

xgb_model = xgb.XGBRegressor(**study_xgb.best_params)

# The per-split predictions are not needed in this cell, so they are discarded.
trained_xgb, _, _, _, xgb_results = evaluator.evaluate(
    model=xgb_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
    fit_params=xgb_fit_params,
    use_xgb_train=True,
)

logger.log_model(
    trained_xgb,
    "xgb_with_early_stopping_optuna",
    xgb_results,
    use_xgb_train=True,
)

#### 10. Let's see how outliers skew RMSE


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the test target, to expose extreme values.
fig_box, ax_box = plt.subplots()
sns.boxplot(y=y_test, ax=ax_box)
plt.show()

# Predicted against true values for the test split.
fig_scatter, ax_scatter = plt.subplots()
ax_scatter.scatter(y_test, y_test_pred, alpha=0.5)
ax_scatter.set_xlabel("True")
ax_scatter.set_ylabel("Predicted")
plt.show()

In [None]:
# Residual = actual - predicted on the test split.
residuals = y_test - y_test_pred

# Location and spread of the residuals.
print("Residuals summary:")
summary_stats = (
    ("Min", np.min),
    ("Max", np.max),
    ("Median", np.median),
    ("Mean", np.mean),
    ("Std", np.std),
)
for stat_label, stat_fn in summary_stats:
    print(f"{stat_label}:", stat_fn(residuals))

# Histogram of the residuals.
plt.hist(residuals, bins=50)
plt.title("Residuals Distribution")
plt.xlabel("Residual")
plt.ylabel("Count")
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error

# MAE on both splits, as a counterpart to RMSE that does not square the errors.
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

for mae_line in (f"Train MAE: {train_mae:.2f}", f"Test MAE:  {test_mae:.2f}"):
    print(mae_line)

So outliers are skewing the RMSE statistic quite heavily. Hence, I will log-transform the target and do the same analysis.


In [None]:
# From here on the evaluator is given a log1p target transform with expm1 as
# its inverse.
log_target_kwargs = {"target_transform": np.log1p, "inverse_transform": np.expm1}
evaluator = ModelEvaluator(**log_target_kwargs)

best_rf = RandomForestRegressor(**study_rf.best_params)

(
    trained_rf,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    results,
) = evaluator.evaluate(
    model=best_rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_xgb_train=False,
)
logger.log_model(trained_rf, "RF_LogTransform_Evaluator", results)


In [None]:
from xgboost import XGBRegressor

# XGBoost built from the parameters found by the earlier Optuna study.
best_xgb = XGBRegressor(**study_xgb.best_params)

# Evaluator configured with a log1p target transform and its expm1 inverse.
evaluator = ModelEvaluator(
    target_transform=np.log1p,
    inverse_transform=np.expm1,
)

# Train and score through the evaluator's use_xgb_train path, with the
# validation split supplied.
xgb_log_kwargs = {
    "model": best_xgb,
    "X_train": X_train,
    "y_train": y_train,
    "X_val": X_val,
    "y_val": y_val,
    "X_test": X_test,
    "y_test": y_test,
    "use_xgb_train": True,
}
(
    trained_xgb,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    results,
) = evaluator.evaluate(**xgb_log_kwargs)

# Record the trained model and its metrics.
logger.log_model(
    trained_xgb,
    "XGB_LogTransform_Evaluator",
    results,
    use_xgb_train=True,
)


In [None]:
# Residual = actual - predicted on the test split.
residuals = y_test - y_test_pred

# "Extreme" listings: absolute residual at or above the 95th percentile.
abs_residuals = np.abs(residuals)
threshold = np.percentile(abs_residuals, 95)
outliers_mask = abs_residuals >= threshold
regular_mask = ~outliers_mask

plt.figure(figsize=(8, 6))

# Regular listings in the default colour, extreme ones highlighted in red.
plt.scatter(
    y_test[regular_mask],
    y_test_pred[regular_mask],
    alpha=0.5,
    label="Normal listings",
)
plt.scatter(
    y_test[outliers_mask],
    y_test_pred[outliers_mask],
    color="red",
    label="Extreme listings",
)

# Reference diagonal from the origin to the largest actual/predicted value:
# points on it are perfect predictions.
max_val = max(y_test.max(), y_test_pred.max())
diagonal = [0, max_val]
plt.plot(
    diagonal,
    diagonal,
    color="black",
    linestyle="--",
    label="Perfect prediction",
)

plt.xlabel("Actual Price (€)")
plt.ylabel("Predicted Price (€)")
plt.title("Predicted vs Actual Prices with Extreme Listings Highlighted")
plt.legend()
plt.show()

This is a clear visualization to see how predictions behave across the entire range and highlight the extreme listings that inflate RMSE. Large RMSE is not a deal breaker since:

**Statistical justification**
Skewed distribution: My dataset has a few extremely expensive houses that are far from the mean. RMSE is sensitive to large errors because it squares residuals, so these few points dominate the metric.

MAE is more robust: By reporting MAE alongside RMSE, I show the typical prediction error for most listings, which is a fairer assessment of model performance.

Log-transform mitigates skew: Training on log1p(y) reduces the influence of outliers and stabilizes variance, producing a more reliable model for the bulk of the data.

**Practical/business justification**

The extreme listings (multi-million € homes) are rare. The model performs well on 99% of listings, which is what matters for most users or business decisions.

Trying to perfectly predict the top 1–5% of luxury listings would:

Require specialized models or additional data

Complicate the pipeline

Increase overfitting risk

Reporting MAE and residual distributions communicates clearly that errors on extreme listings exist, but are expected and do not invalidate the model.


#### 11. New approach and moving with optuna


In [None]:
# Second tuning round: same objectives, now invoked with use_log=True and
# n_splits=5 on the basic feature set.
log_objective_kwargs = {
    "df": df_clean,
    "features_config": FEATURES_AND_MODEL_CONFIG_PATH,
    "model_config": FEATURES_AND_MODEL_CONFIG_PATH,
    "use_log": True,
    "n_splits": 5,
    "use_extended_features": False,
}

# --- XGBoost: seeded TPE sampler plus a median pruner ---
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(
    direction="minimize", sampler=sampler, pruner=pruner
)
objective_xgb_partial = partial(
    unified_objective,
    model_name="xgboost_early_stopping",
    **log_objective_kwargs,
)
study_xgb.optimize(objective_xgb_partial, n_trials=30)

# --- Random forest: default sampler and pruner ---
study_rf = optuna.create_study(direction="minimize")
objective_rf_partial = partial(
    unified_objective,
    model_name="random_forest_optuna",
    **log_objective_kwargs,
)
study_rf.optimize(objective_rf_partial, n_trials=30)

In [None]:
# Evaluator with the log1p/expm1 target transform (the Optuna objectives in
# the previous cell were run with use_log=True).
evaluator = ModelEvaluator(
    target_transform=np.log1p,
    inverse_transform=np.expm1,
)

# Split arguments shared by both evaluations below.
split_kwargs = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "X_val": X_val,
    "y_val": y_val,
}

# --- Random Forest (sklearn API) ---
best_rf = RandomForestRegressor(**study_rf.best_params)
(
    trained_rf,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    results_rf,
) = evaluator.evaluate(model=best_rf, use_xgb_train=False, **split_kwargs)
logger.log_model(trained_rf, "RF_Optuna_Log", results_rf, use_xgb_train=False)

# --- XGBoost ---
# No model instance is passed: the tuned hyperparameters go in as
# model_params, and the boosting budget / early-stopping patience as
# fit_params (these two can be tuned if needed).
best_xgb_params = study_xgb.best_params
(
    trained_xgb,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    results_xgb,
) = evaluator.evaluate(
    model=None,
    use_xgb_train=True,
    model_params=best_xgb_params,
    fit_params={"num_boost_round": 1000, "early_stopping_rounds": 50},
    **split_kwargs,
)
logger.log_model(trained_xgb, "XGB_Optuna_Log", results_xgb, use_xgb_train=True)


In [None]:
experiment_name = "house_price_prediction"

experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id


runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

runs_df['start_time_dt'] = pd.to_datetime(runs_df['start_time'], unit='ms')

# Keep only runs started on or after this date (lower bound only).
mask = (runs_df['start_time_dt'] >= '2025-09-18')

metrics_of_interest = [
    # Original scale
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_mae",
    "metrics.test_mae",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mape",
    "metrics.test_mape",

    # Log / transformed scale
    # "metrics.train_rmse_trans",
    # "metrics.test_rmse_trans",
    # "metrics.train_mae_trans",
    # "metrics.test_mae_trans",
    # "metrics.train_r2_trans",
    # "metrics.test_r2_trans",
    # "metrics.train_mape_trans",
    # "metrics.test_mape_trans",
]

# Filter first, while the mask still shares runs_df's row order, then sort.
# Applying the mask after sorting makes pandas reindex the boolean key (with
# a UserWarning), and sorting a column subset in place can raise
# SettingWithCopyWarning. Lowest test MAE first.
comparison_df = runs_df.loc[
    mask, ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_mae", ascending=True)
comparison_df

In [None]:
# The top row after sorting by test MAE (ascending) is the best run.
ranked_by_mae = comparison_df.sort_values("metrics.test_mae", ascending=True)
best_model = ranked_by_mae.iloc[0]
print("Best model based on test MAE:")
print(best_model)

#### 12. Extra feature engineering


In [None]:
df_clean.columns

In [None]:
FEATURES_AND_MODEL_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"
)

# Splits for the final modelling round: same config entry as the
# early-stopping XGBoost runs, now with the extended feature set enabled.
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_val,
    y_val,
    scaler,
    feature_encoders,
) = prepare_data(
    df=df_clean,
    config_path=FEATURES_AND_MODEL_CONFIG_PATH,
    model_name="xgboost_early_stopping",
    use_extended_features=True,
    cv=False,
)

# Split sizes; the validation split may be absent.
val_shape = None if X_val is None else X_val.shape
print("Train shape:", X_train.shape)
print("Validation shape:", val_shape)
print("Test shape:", X_test.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric training features.
X_numeric = X_train.select_dtypes(include="number")
corr_matrix = X_numeric.corr()

# Annotated heatmap of the full matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation matrix for numeric features (Train set)")
plt.show()

# Walk the upper triangle only, so each pair appears once and the diagonal
# is skipped; keep pairs whose absolute correlation exceeds 0.9.
cols = corr_matrix.columns
n_cols = len(cols)
high_corr = [
    (cols[row], cols[col], corr_matrix.iloc[row, col])
    for row in range(n_cols)
    for col in range(row + 1, n_cols)
    if abs(corr_matrix.iloc[row, col]) > 0.9
]

print("Highly correlated numeric pairs (|r|>0.9):")
for pair in high_corr:
    print(pair)


In [None]:
# Features removed before the final models. The commented entries are
# candidates that are currently kept.
cols_to_drop = [
    "size_num",
    # "living_area",
    # "nr_rooms",
    # "bathrooms",
    # "toilets",
    "num_facilities",
    # "external_storage_num",
]

# DataFrame.drop returns a new frame, so the original splits stay untouched.
X_train_final = X_train.drop(columns=cols_to_drop)
X_test_final = X_test.drop(columns=cols_to_drop)
X_val_final = X_val.drop(columns=cols_to_drop)

print("Train shape:", X_train_final.shape)
print("validation shape:", X_val_final.shape)
print("Test shape:", X_test_final.shape)


## 13. Baseline models after feature engineering


#### Baseline Random Forest


In [None]:
# No separate validation set is needed here, so train and validation rows are
# stacked into one training set.
X_train_full = pd.concat([X_train_final, X_val_final], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

# Evaluator given log1p as the target transform and expm1 as its inverse.
evaluator = ModelEvaluator(
    target_transform=np.log1p,
    inverse_transform=np.expm1,
)

# Untuned random forest on the engineered feature set.
rf_model = RandomForestRegressor()
(
    trained_rf,
    y_train_pred,
    y_val_pred,
    y_test_pred,
    rf_results,
) = evaluator.evaluate(
    model=rf_model,
    X_train=X_train_full,
    y_train=y_train_full,
    X_test=X_test_final,
    y_test=y_test,
    use_xgb_train=False,
)
logger.log_model(
    trained_rf,
    "Random_Forest_Regression_feature_eng",
    rf_results,
    use_xgb_train=False,
)

#### Baseline XGboost with early stopping


In [None]:
MODEL_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"
)
model_params, fit_params, _ = load_model_config_and_search_space(
    MODEL_CONFIG_PATH, model_name="xgboost_early_stopping"
)

# n_estimators is a constructor argument, so it is taken out of a copy of
# the fit parameters (default 100 when the config does not set it).
fit_params_safe = fit_params.copy()
n_estimators = fit_params_safe.pop("n_estimators", 100)

xgb_model = xgb.XGBRegressor(
    n_estimators=n_estimators,
    **model_params
)

# Pass the stripped copy, as the plain XGBoost cell does. The unstripped
# `fit_params` was passed here before, which would forward n_estimators a
# second time whenever the config defines it.
trained_xgb, y_train_pred, y_val_pred, y_test_pred, xgb_results = evaluator.evaluate(
    xgb_model,
    X_train_final,
    y_train,
    X_test_final,
    y_test,
    X_val=X_val_final,
    y_val=y_val,
    fit_params=fit_params_safe,
    use_xgb_train=True,  # ensures early stopping is used
)

# Log model
logger.log_model(trained_xgb, "XGBoostRegressionFeatureEng", xgb_results, use_xgb_train=True)


## 14. Optuna tuning after feature eng


In [None]:
df_clean.columns

In [None]:
# XGBoost with log-transform

from functools import partial
from src.model.objectives_optuna import unified_objective

FEATURES_AND_MODEL_CONFIG_PATH = (
    ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"
)

# Third tuning round: use_log=True, n_splits=5, extended features enabled,
# and the "*_feature_eng" config entries for each model.
feature_eng_objective_kwargs = {
    "df": df_clean,
    "features_config": FEATURES_AND_MODEL_CONFIG_PATH,
    "model_config": FEATURES_AND_MODEL_CONFIG_PATH,
    "use_log": True,
    "n_splits": 5,
    "use_extended_features": True,
}

# --- XGBoost: seeded TPE sampler plus a median pruner ---
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(
    direction="minimize", sampler=sampler, pruner=pruner
)
objective_xgb_partial = partial(
    unified_objective,
    model_name="xgboost_early_stopping_optuna_feature_eng",
    **feature_eng_objective_kwargs,
)
study_xgb.optimize(objective_xgb_partial, n_trials=30)

# --- Random forest: default sampler and pruner ---
study_rf = optuna.create_study(direction="minimize")
objective_rf_partial = partial(
    unified_objective,
    model_name="random_forest_optuna_feature_eng",
    **feature_eng_objective_kwargs,
)
study_rf.optimize(objective_rf_partial, n_trials=30)

In [None]:
print(df_clean.columns.tolist())


## 15. Best params run after feature eng


In [None]:
# Evaluator with the log1p/expm1 target transform (the Optuna objectives for
# this round were run with use_log=True).
evaluator = ModelEvaluator(target_transform=np.log1p, inverse_transform=np.expm1)

# --- Random Forest ---
best_rf = RandomForestRegressor(**study_rf.best_params)
trained_rf, y_train_pred, y_val_pred, y_test_pred, results_rf = evaluator.evaluate(
    model=best_rf,
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
    X_val=X_val_final,
    y_val=y_val,
    use_xgb_train=False,
)
logger.log_model(trained_rf, "RF_LogTransform_Optuna_feature_eng", results_rf, use_xgb_train=False)

# --- XGBoost ---
# Same calling convention as the earlier "XGB_Optuna_Log" cell: no model
# instance, tuned hyperparameters passed as model_params. The dict was
# passed as `model=` here before.
# NOTE(review): confirm ModelEvaluator.evaluate treats the two forms
# identically before comparing against runs logged with the old call.
best_xgb_params = study_xgb.best_params
trained_xgb, y_train_pred, y_val_pred, y_test_pred, results_xgb = evaluator.evaluate(
    model=None,
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
    X_val=X_val_final,
    y_val=y_val,
    use_xgb_train=True,
    model_params=best_xgb_params,
    fit_params={"num_boost_round": 1000, "early_stopping_rounds": 50},
)
logger.log_model(trained_xgb, "XGB_Optuna_LogTransformed_feature_eng", results_xgb, use_xgb_train=True)


In [None]:
experiment_name = "house_price_prediction"

experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id


runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

metrics_of_interest = [
    "metrics.train_rmse",
    "metrics.test_rmse",
    "metrics.train_r2",
    "metrics.test_r2",
    "metrics.train_mae",
    "metrics.test_mae",
    "metrics.train_mape",
    "metrics.test_mape",
]

# Select and sort in one expression: sort_values returns a new frame, so no
# in-place mutation happens on a column subset of runs_df (the in-place form
# can raise SettingWithCopyWarning). Lowest test MAE first.
comparison_df = runs_df[
    ["run_id", "tags.mlflow.runName"] + metrics_of_interest
].sort_values("metrics.test_mae", ascending=True)
comparison_df

In [None]:
comparison_df.columns

I have chosen run_id 33688ff883c54d2fb4a14cbef2ae617a because its combination of statistics looks the best: it has one of the highest R2 values for both test and train, and its RMSE and MAE are among the lowest.


In [None]:
print(mlflow.get_tracking_uri())

In [None]:
from mlflow.tracking import MlflowClient

# Run selected from the comparison table above.
run_id = "33688ff883c54d2fb4a14cbef2ae617a"
client = MlflowClient()

# List what was stored under the "xgb_model" artifact path of that run
# (the path must match the one used when the model was logged).
artifacts = client.list_artifacts(run_id, path="xgb_model")
for artifact in artifacts:
    print(artifact.path)

In [None]:
# Imported explicitly so the xgboost flavor is available without relying on
# attribute access alone.
import mlflow.xgboost

run_id = "33688ff883c54d2fb4a14cbef2ae617a"
model_path = f"runs:/{run_id}/xgb_model"

# Load the model
loaded_model = mlflow.xgboost.load_model(model_path)

# `dtest` was referenced here without ever being defined (NameError).
# NOTE(review): built from X_test_final on the assumption that the selected
# run was trained on the feature-engineered columns - confirm, and switch to
# the matching test frame if it was not.
dtest = xgb.DMatrix(X_test_final)

# Make predictions
# NOTE(review): if the run was trained on a log1p target, these values are
# presumably on the log scale and need np.expm1 - confirm.
y_pred = loaded_model.predict(dtest)

In [None]:
# Show what MLflow stored for the selected run: first the metrics
# (train/test RMSE, R2, etc.), then the model hyperparameters.
run = mlflow.get_run(run_id)
for stored in (run.data.metrics, run.data.params):
    print(stored)

In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Register the selected run's "xgb_model" artifact under a fixed name; the
# returned object carries the version number, which is printed below.
registered_model_name = "RealEstate_XGB"
model_uri = f"runs:/{run_id}/xgb_model"
model_version = mlflow.register_model(model_uri, registered_model_name)

registration_message = (
    f"Model registered as {registered_model_name}, version {model_version.version}"
)
print(registration_message)

In [None]:
# Re-read the run after registration and print its metrics, then its params.
run = mlflow.get_run(run_id)
run_data = run.data
print(run_data.metrics)  # Train/test RMSE, R2, etc.
print(run_data.params)

In [None]:
from sklearn.model_selection import KFold

# kf, X_np, y_np and best_params were used below without being defined
# anywhere in this notebook, so they are built here from objects earlier
# cells create.
# NOTE(review): 5 shuffled folds with seed 42 mirror the n_splits=5 / seed=42
# used in the Optuna cells; train+validation rows are pooled and the test
# split is left out; parameters come from the latest XGBoost study. Confirm
# these match the originally intended setup.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X_np = pd.concat([X_train_final, X_val_final], axis=0).to_numpy()
y_np = pd.concat([y_train, y_val], axis=0).to_numpy()
best_params = study_xgb.best_params

# Out-of-fold actuals and predictions, accumulated across all folds.
y_vals_all = []
y_preds_all = []

# Fold-local names (X_fold_*, y_fold_*) are used so the notebook-level
# X_val / y_val splits are not overwritten by the loop.
for train_idx, val_idx in kf.split(X_np):
    X_fold_train, X_fold_val = X_np[train_idx], X_np[val_idx]
    y_fold_train, y_fold_val = y_np[train_idx], y_np[val_idx]

    # Train on log1p(price), consistent with the log-transform evaluator.
    dtrain = xgb.DMatrix(X_fold_train, label=np.log1p(y_fold_train))
    dval = xgb.DMatrix(X_fold_val, label=np.log1p(y_fold_val))

    xgb_model = xgb.train(
        best_params,
        dtrain,
        num_boost_round=500,
        evals=[(dval, "eval")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # Back to the original price scale before collecting.
    y_val_pred = np.expm1(xgb_model.predict(dval))

    y_vals_all.extend(y_fold_val)
    y_preds_all.extend(y_val_pred)

# Convert to arrays; residual = actual - predicted.
y_vals_all = np.array(y_vals_all)
y_preds_all = np.array(y_preds_all)
residuals = y_vals_all - y_preds_all

# Predicted vs Actual
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y_vals_all, y=y_preds_all)
plt.plot(
    [y_vals_all.min(), y_vals_all.max()],
    [y_vals_all.min(), y_vals_all.max()],
    "r--",
    label="Perfect Prediction",
)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("CV: Predicted vs Actual (all folds)")
plt.legend()
plt.show()

# Residual plot
plt.figure(figsize=(6, 4))
sns.histplot(residuals, kde=True, bins=20, color="skyblue")
plt.xlabel("Residual (Actual - Predicted)")
plt.title("CV Residuals Distribution")
plt.show()

#### Generating price range (for pipeline)


In [None]:
import pandas as pd
import numpy as np

# Number of quantile bins over the predicted price.
n_bins = 10

# Out-of-fold predictions paired with their residuals. Held in its own
# frame instead of reusing the name `df`, which holds the cleaned dataset
# used by earlier cells.
residuals_df = pd.DataFrame({"pred": y_preds_all, "residual": residuals})

# qcut (not cut) gives bins with roughly equal counts; duplicate edges are
# dropped, so fewer than n_bins bins may result.
residuals_df["pred_bin"] = pd.qcut(
    residuals_df["pred"], q=n_bins, duplicates="drop"
)

# 5th and 95th residual percentile per bin. observed=True is stated
# explicitly because the default for categorical groupers is changing in
# pandas; every qcut bin is populated, so the result is the same.
bin_ranges = (
    residuals_df.groupby("pred_bin", observed=True)["residual"]
    .agg(
        lower_bound=lambda x: np.percentile(x, 5),
        upper_bound=lambda x: np.percentile(x, 95),
    )
    .reset_index()
)


# Function to get price range
def get_price_range(pred_price, ranges=None):
    """Return a (low, high) price range around a predicted price.

    The range is the prediction shifted by the lower and upper residual
    bounds of the bin whose interval contains it. Both interval edges are
    treated as inclusive, and the first matching bin wins.

    Args:
        pred_price: Predicted price on the original scale.
        ranges: Table with columns ``pred_bin`` (objects exposing ``.left``
            and ``.right``), ``lower_bound`` and ``upper_bound``, ordered by
            ascending bin. Defaults to the module-level ``bin_ranges``.

    Returns:
        Tuple ``(pred_price + lower_bound, pred_price + upper_bound)``.
        A prediction below the first bin uses the first bin's bounds, and
        any other unmatched prediction uses the last bin's bounds, so an
        out-of-range prediction no longer gets a zero-width range. An empty
        table yields ``(pred_price, pred_price)``.
    """
    table = bin_ranges if ranges is None else ranges
    if len(table) == 0:
        return pred_price, pred_price

    # itertuples avoids building a Series per row, unlike iterrows.
    for row in table.itertuples(index=False):
        if row.pred_bin.left <= pred_price <= row.pred_bin.right:
            return pred_price + row.lower_bound, pred_price + row.upper_bound

    # Outside every bin: fall back to the nearest edge bin.
    first_row, last_row = table.iloc[0], table.iloc[-1]
    edge = first_row if pred_price < first_row["pred_bin"].left else last_row
    return pred_price + edge["lower_bound"], pred_price + edge["upper_bound"]


# Worked example: range for a single 500k prediction.
example_pred = 500_000
price_range = get_price_range(example_pred)
lb, ub = price_range
print(f"Predicted price: {example_pred}, Range: {lb:.0f} - {ub:.0f}")

Remaining work: figure out how to log these price ranges so that the pipeline can reuse them.
