In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Notebook-wide display options: show every DataFrame column and use seaborn's whitegrid style.
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

# MLflow store location, given relative to the notebook's working directory.
tracking_uri = "../logs/mlruns"
# Create the store directory together with its ".trash" subfolder before MLflow is pointed at it.
os.makedirs(os.path.join(tracking_uri, ".trash"), exist_ok=True)

# All runs logged from this notebook go to the "house_price_prediction" experiment.
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("house_price_prediction")


In [None]:
import sys
import os
from pathlib import Path
import yaml


# Put the parent of the working directory (the project root when the notebook
# is started from notebooks/) at the front of sys.path so `src.*` can be imported.
project_root = os.path.abspath("..")

if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df
from src.data_loading.preprocessing.imputation import impute_missing_values


# Resolve ROOT: when executed as a script use parents[2] of this file's resolved
# path, otherwise (interactive kernel) use parents[1] of the working directory.
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[2]
else:
    ROOT = Path.cwd().parents[1]

CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "preprocessing_config.yaml"
)

# Parse the preprocessing settings once; they are read as CONFIG["preprocessing"][...].
with CONFIG_PATH.open() as config_file:
    CONFIG = yaml.safe_load(config_file)

# Load the raw listings, then clean and impute them as described by the
# "preprocessing" section of the config.
df_raw = load_data_from_json("../data/parsed_json/*.json")
preprocessing_cfg = CONFIG["preprocessing"]
df_clean = preprocess_df(
    df_raw,
    drop_raw=preprocessing_cfg["drop_raw"],
    numeric_cols=preprocessing_cfg["numeric_cols"],
)
df_clean = impute_missing_values(df_clean, preprocessing_cfg["imputation"])

# Drop price_num NaNs for the training of the model, and remove living_area.
# Filter and drop are chained so a new frame is built; the previous in-place
# drop on a boolean-filtered frame can raise pandas' SettingWithCopyWarning.
df_clean = df_clean.loc[df_clean["price_num"].notna()].drop(columns=["living_area"])

# df_clean = df_clean[:100]  # uncomment to work on the first 100 rows only
df = df_clean.copy()

In [None]:
from pathlib import Path

# --- Project root ---
# Derived from the working directory with the same rule the preprocessing-config
# code uses (Path.cwd().parents[1]) instead of a machine-specific absolute path.
# Set the HOUSE_PRICE_PROJECT_ROOT environment variable to point somewhere else.
ROOT = Path(os.environ.get("HOUSE_PRICE_PROJECT_ROOT", Path.cwd().parents[1]))

CONFIG_PATH = ROOT / "house_price_prediction_project" / "config" / "model_config.yaml"

from src.features.data_prep_for_modelling.data_preparation import load_geo_config
from src.features.feature_engineering.location_feature_enrichment import load_cache

# 1) Smoke-test load_geo_config: it returns the cache file, an amenities frame
#    (may be None) and the amenity -> radius mapping.
geo_cache_file, amenities_df, amenity_radius_map = load_geo_config(CONFIG_PATH)
print("Geo cache file:", geo_cache_file)
print("Amenities df:", amenities_df.shape if amenities_df is not None else None)
print("Amenity radius map:", amenity_radius_map)

# 2) Smoke-test load_cache: the result is used as a mapping (len / items).
lat_lon_cache = load_cache(geo_cache_file)
print("Number of addresses in cache:", len(lat_lon_cache))
print("Sample from cache:", list(lat_lon_cache.items())[:5])


In [None]:
# Display the geo-cache file value returned by load_geo_config.
geo_cache_file

In [None]:
from src.features.data_prep_for_modelling.data_preparation import prepare_data

# Location of model_config.yaml inside the project's config directory.
FEATURES_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)


In [None]:
from src.model.evaluate import ModelEvaluator
from src.model.mlflow_logger import MLFlowLogger

# Helper objects for the modelling cells: an evaluator built with its default
# arguments and the project's MLflow logger (used later via logger.log_model).
evaluator = ModelEvaluator()
logger = MLFlowLogger()


In [None]:
# Read the CSV and display it; the frame is not stored in a variable.
# NOTE(review): the file name suggests listings with encoded lat/lon columns - confirm.
pd.read_csv("../data/df_with_lat_lon_encoded.csv")

In [None]:
# Show the column labels of the cleaned frame.
df_clean.columns

In [None]:
from src.features.data_prep_for_modelling.data_preparation import prepare_data_from_config
from src.features.feature_engineering.location_feature_enrichment import load_cache

# model_config.yaml is used for both the feature settings and the model settings.
FEATURES_AND_MODEL_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)
config_path = FEATURES_AND_MODEL_CONFIG_PATH  # e.g., ROOT / "config/model_config.yaml"

# Build the train / validation / test splits (plus scaler and metadata) for the
# selected model entry.
X_train, X_test, y_train, y_test, X_val, y_val, scaler, meta = prepare_data_from_config(
    df=df_clean,
    config_path=config_path,
    model_name="xgboost_early_stopping_optuna_feature_eng_geoloc_exp",
)

print("Train shape:", X_train.shape)
print("Validation shape:", None if X_val is None else X_val.shape)
print("Test shape:", X_test.shape)


In [None]:
# Work on copies so column drops further down leave the prepared splits untouched.
X_train_final, X_test_final, X_val_final = (
    split.copy() for split in (X_train, X_test, X_val)
)

for split_label, split_frame in (
    ("Train", X_train_final),
    ("validation", X_val_final),
    ("Test", X_test_final),
):
    print(f"{split_label} shape:", split_frame.shape)

In [None]:
# XGBoost with log-transform

from functools import partial
from src.model.objectives_optuna import unified_objective

FEATURES_AND_MODEL_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)

# Seeded TPE sampler and a median pruner with 10 warm-up steps; the study minimises.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    pruner=pruner,
)

# Everything the objective needs besides the trial object supplied by Optuna.
objective_settings = dict(
    model_name="xgboost_early_stopping_optuna_feature_eng_geoloc_exp",
    df=df_clean,
    features_config=FEATURES_AND_MODEL_CONFIG_PATH,
    model_config=FEATURES_AND_MODEL_CONFIG_PATH,
    use_log=True,
    n_splits=5,
    use_extended_features=True,
    use_geo_amenities=True,
)
objective_xgb_partial = partial(unified_objective, **objective_settings)

study_xgb.optimize(objective_xgb_partial, n_trials=100)


In [None]:
# Columns removed from every split before fitting; names missing from a split are skipped.
cols_to_drop = ["postal_code_clean", "lat", "lon"]  # add any other object columns if needed

X_train_final = X_train_final.drop(columns=[c for c in cols_to_drop if c in X_train_final.columns])
X_val_final   = X_val_final.drop(columns=[c for c in cols_to_drop if c in X_val_final.columns])
X_test_final  = X_test_final.drop(columns=[c for c in cols_to_drop if c in X_test_final.columns])

# --- XGBoost ---
# Rebind `evaluator` to one that is given log1p / expm1 as target and inverse transforms.
evaluator = ModelEvaluator(target_transform=np.log1p, inverse_transform=np.expm1)
# Parameters of the best trial of the Optuna study.
best_xgb_params = study_xgb.best_params

# Fit with the tuned parameters and collect predictions and metrics for all three splits.
trained_xgb, y_train_pred, y_val_pred, y_test_pred, results_xgb = evaluator.evaluate(
    model=None,  # not used in XGBoost.train
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
    X_val=X_val_final,
    y_val=y_val,
    use_xgb_train=True,
    model_params=best_xgb_params,  # <--- crucial
    fit_params={"num_boost_round": 1000, "early_stopping_rounds": 50},
)
# Hand the fitted model and its metrics to the project's MLflow logger.
logger.log_model(trained_xgb, "XGB_Optuna_LogTransformed_feature_eng", results_xgb, use_xgb_train=True)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from src.model.cv_helpers import prepare_base_data, prepare_fold_features
from src.model.evaluate import ModelEvaluator

def final_cv_evaluation(
    df,
    features_config,
    model_name,
    best_params,
    use_log=True,
    n_splits=5,
    use_geo_amenities=True,
    enable_cache_save=False,
    fit_params=None,
):
    """Cross-validate a model with fixed parameters and collect out-of-fold predictions.

    Args:
        df: Frame handed to ``prepare_base_data`` to build the feature matrix and target.
        features_config: Features config forwarded to ``prepare_base_data`` and, when
            ``use_geo_amenities`` is true, to ``prepare_fold_features``.
        model_name: Model entry name. The evaluator's ``use_xgb_train`` flag is set
            when the lower-cased name contains ``"xgb"``.
        best_params: Parameters passed to the evaluator as ``model_params``.
        use_log: If true the evaluator is built with ``np.log1p`` / ``np.expm1`` as
            target / inverse transforms; otherwise both are ``None``.
        n_splits: Number of shuffled ``KFold`` splits (``random_state=42``).
        use_geo_amenities: If false, ``prepare_fold_features`` receives
            ``features_config=None``.
        enable_cache_save: Forwarded to ``prepare_fold_features``.
        fit_params: Fit parameters for the evaluator. A falsy value falls back to
            ``{"num_boost_round": 1000, "early_stopping_rounds": 50}``.

    Returns:
        dict with three entries: ``"fold_metrics"`` (DataFrame, one row per fold),
        ``"agg_metrics"`` (mean and std of every metric across folds) and
        ``"oof_preds"`` (float Series indexed like the full feature matrix).

    Note:
        Each fold's held-out rows are given to the evaluator both as validation set
        and as test set, so the fold metrics are measured on the validation rows.
    """
    # Prepare data (same as in unified_objective)
    X_full, y_full = prepare_base_data(
        df, features_config, model_name, extended_fe=True
    )

    evaluator = ModelEvaluator(
        target_transform=np.log1p if use_log else None,
        inverse_transform=np.expm1 if use_log else None,
    )

    # Loop-invariant settings, computed once.
    fold_features_config = features_config if use_geo_amenities else None
    use_xgb_train = "xgb" in model_name.lower()
    effective_fit_params = fit_params or {
        "num_boost_round": 1000,
        "early_stopping_rounds": 50,
    }

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_metrics = []
    oof_preds = pd.Series(index=X_full.index, dtype=float)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_full), 1):
        X_train, X_val = X_full.iloc[train_idx].copy(), X_full.iloc[val_idx].copy()
        y_train, y_val = y_full.iloc[train_idx].copy(), y_full.iloc[val_idx].copy()

        # Fold-wise feature engineering (same path as unified_objective); the
        # returned metadata and encoders are not needed here.
        X_train, X_val, _, _ = prepare_fold_features(
            X_train,
            X_val,
            features_config=fold_features_config,
            use_extended_features=True,
            enable_cache_save=enable_cache_save,
        )

        # The held-out part doubles as "test" set for this fold.
        _, _, y_val_pred, _, results = evaluator.evaluate(
            model=None,
            X_train=X_train,
            y_train=y_train,
            X_test=X_val,
            y_test=y_val,
            X_val=X_val,
            y_val=y_val,
            use_xgb_train=use_xgb_train,
            model_params=best_params,
            fit_params=effective_fit_params,
        )

        # Missing keys become None rather than raising.
        fold_metrics.append({
            "fold": fold,
            "val_rmse": results.get("val_rmse"),
            "val_mae": results.get("val_mae"),
            "val_mape": results.get("val_mape"),
            "val_r2": results.get("val_r2"),
        })

        # Store positionally: converting to a flat ndarray keeps the assignment
        # independent of any index the predictions may carry.
        # NOTE(review): assumes the evaluator already inverse-transforms its
        # predictions - confirm against ModelEvaluator.
        oof_preds.iloc[val_idx] = np.ravel(y_val_pred)

    # Aggregate
    df_metrics = pd.DataFrame(fold_metrics).set_index("fold")
    agg = df_metrics.agg(["mean", "std"]).T

    return {
        "fold_metrics": df_metrics,
        "agg_metrics": agg,
        "oof_preds": oof_preds,
    }


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import KFold
import numpy as np

from src.model.cv_helpers import prepare_base_data, prepare_fold_features

# Number of folds (should match your CV setup)
N_SPLITS = 5
# How many features appear in the printed ranking and in the bar chart. The
# headings below are built from these values so text and data cannot disagree.
N_TOP_PRINT = 10
N_TOP_PLOT = 20

# Prepare base data (features + target)
X_full, y_full = prepare_base_data(df_clean, FEATURES_AND_MODEL_CONFIG_PATH, "xgboost_early_stopping_optuna_feature_eng_geoloc_exp")

importance_types = ["weight", "gain", "cover"]
fold_importances = {imp_type: [] for imp_type in importance_types}

# Columns removed before fitting; missing ones are ignored by drop(errors="ignore").
cols_to_drop = ["size_num", "lat", "lon"]  # or just one of them

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full), 1):
    X_train, X_val = X_full.iloc[train_idx].copy(), X_full.iloc[val_idx].copy()
    y_train, y_val = y_full.iloc[train_idx].copy(), y_full.iloc[val_idx].copy()

    # Prepare fold-wise features (ensure extended feature engineering matches training)
    X_train_fold, X_val_fold, _, _ = prepare_fold_features(
        X_train,
        X_val,
        features_config=FEATURES_AND_MODEL_CONFIG_PATH,
        use_extended_features=True,
        enable_cache_save=False,
    )
    X_train_fold = X_train_fold.drop(columns=cols_to_drop, errors="ignore")
    X_val_fold = X_val_fold.drop(columns=cols_to_drop, errors="ignore")

    # Train XGBoost on this fold using best params; the target is log1p-transformed
    # and the validation fold drives early stopping.
    dtrain = xgb.DMatrix(X_train_fold, label=np.log1p(y_train))
    dval = xgb.DMatrix(X_val_fold, label=np.log1p(y_val))

    model_fold = xgb.train(
        params=best_xgb_params,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dval, "validation")],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Collect importance for each type as a one-column frame indexed by feature name
    for imp_type in importance_types:
        imp_dict = model_fold.get_score(importance_type=imp_type)
        df_imp = pd.DataFrame.from_dict(imp_dict, orient="index", columns=[imp_type])
        df_imp.index.name = "feature"
        fold_importances[imp_type].append(df_imp)

# --- Aggregate across folds ---
agg_importances = {}
for imp_type, dfs in fold_importances.items():
    # One column per fold; a feature missing from a fold's scores counts as 0
    df_all = pd.concat(dfs, axis=1).fillna(0)
    df_all["mean"] = df_all.mean(axis=1)
    df_all = df_all.sort_values(by="mean", ascending=False)
    agg_importances[imp_type] = df_all
    print(f"\nTop {N_TOP_PRINT} features by mean {imp_type} across folds:")
    print(df_all["mean"].head(N_TOP_PRINT))

    # Plot the top features; the inverted y-axis puts the largest bar first
    df_all["mean"].head(N_TOP_PLOT).plot.barh(
        figsize=(10, 6),
        title=f"Top {N_TOP_PLOT} features by mean {imp_type} across folds",
    )
    plt.gca().invert_yaxis()
    plt.show()


In [None]:
import shap

# One frame per fold holding the mean absolute SHAP value of every feature.
# Relies on kf, X_full, y_full and best_xgb_params defined by earlier cells.
fold_shap_values = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full), 1):
    X_train, X_val = X_full.iloc[train_idx].copy(), X_full.iloc[val_idx].copy()
    y_train, y_val = y_full.iloc[train_idx].copy(), y_full.iloc[val_idx].copy()
    
    # Prepare fold-wise features
    X_train_fold, X_val_fold, _, _ = prepare_fold_features(X_train, X_val, features_config=FEATURES_AND_MODEL_CONFIG_PATH, use_extended_features=True, enable_cache_save=False)
    # Drop these columns when present (errors="ignore" skips missing ones).
    cols_to_drop = ["size_num", "lat", "lon"] 
    X_train_fold = X_train_fold.drop(columns=cols_to_drop, errors="ignore")
    X_val_fold = X_val_fold.drop(columns=cols_to_drop, errors="ignore")
    
    # Train XGBoost Booster on the log1p-transformed target
    dtrain = xgb.DMatrix(X_train_fold, label=np.log1p(y_train))
    dval = xgb.DMatrix(X_val_fold, label=np.log1p(y_val))
    
    # The validation fold is the early-stopping evaluation set.
    model_fold = xgb.train(
        params=best_xgb_params,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dval, "validation")],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # SHAP for Booster, computed on the validation fold
    explainer = shap.TreeExplainer(model_fold)
    shap_values = explainer.shap_values(dval)  # DMatrix
    
    # Store mean absolute SHAP values per feature for this fold
    # (average over axis 0; assumes shap_values is (rows, features) - TODO confirm)
    fold_shap_values.append(pd.DataFrame({
        "feature": X_val_fold.columns,
        "mean_abs_shap": np.abs(shap_values).mean(axis=0)
    }))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid", font_scale=1.1)

# Combine all folds
df_shap_all = pd.concat(fold_shap_values)

# Group by feature and compute mean across folds (largest first)
agg_shap = df_shap_all.groupby("feature")["mean_abs_shap"].mean().sort_values(ascending=False)

print("\nTop 10 features by mean absolute SHAP value across folds:")
print(agg_shap.head(10))

# Prepare data. Keep the descending order: bar 0 is the largest value and the
# inverted y-axis below draws bar 0 at the top. (Sorting ascending here, as the
# cell did before, put the largest bar at the bottom.)
top_features = agg_shap.head(10)
features = top_features.index
shap_values = top_features.values
models = np.arange(len(features))  # y positions of the bars
bar_width = 0.6
color = 'skyblue'

# Create figure
fig, ax = plt.subplots(figsize=(10, 6))

# Horizontal bar plot
ax.barh(models, shap_values, height=bar_width, color=color)

# Y-axis labels
ax.set_yticks(models)
ax.set_yticklabels(features)
ax.invert_yaxis()  # highest SHAP on top

# Labels and title
ax.set_xlabel("Mean Absolute SHAP Value")
ax.set_title("Top 10 Features by SHAP Value")

# Add value labels just right of each bar (offset = 1% of the longest bar)
label_offset = 0.01 * max(shap_values)
for i, val in enumerate(shap_values):
    ax.text(val + label_offset, i, f"{val:.3f}", va='center', fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# Number of features shown; used for the slice and for both headings so they agree
# (the printed heading used to say "Top 20" while ten rows were shown).
TOP_N_SHAP = 10

# Combine all folds
df_shap_all = pd.concat(fold_shap_values)

# Group by feature and compute mean across folds
agg_shap = df_shap_all.groupby("feature")["mean_abs_shap"].mean().sort_values(ascending=False)

print(f"\nTop {TOP_N_SHAP} features by mean absolute SHAP value across folds:")
print(agg_shap.head(TOP_N_SHAP))

# Plot: ascending order makes barh draw the largest bar at the top
agg_shap.head(TOP_N_SHAP).sort_values().plot.barh(
    figsize=(10, 6), title=f"Top {TOP_N_SHAP} features by SHAP value"
)
plt.show()


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import shap

def shap_to_euros(model, X, target_transform_inverse=np.expm1):
    """Rank features by mean absolute SHAP contribution on the original price scale.

    Args:
        model: Trained XGBoost model; must offer ``predict`` on an ``xgb.DMatrix``
            and be accepted by ``shap.TreeExplainer``.
        X: Feature DataFrame. Its columns label the result.
        target_transform_inverse: Vectorised callable mapping model-scale values
            back to prices. Defaults to ``np.expm1`` (inverse of ``log1p``).
            This argument used to be ignored in favour of a hard-coded ``np.expm1``.

    Returns:
        pandas Series indexed by feature name, holding the mean absolute
        contribution per feature, sorted from largest to smallest.

    Raises:
        ValueError: If the SHAP matrix has a different number of columns than ``X``.
    """
    # Predict on the model's (transformed) scale
    dmatrix = xgb.DMatrix(X)
    pred_log = np.asarray(model.predict(dmatrix))

    # Get full SHAP values aligned with all columns
    explainer = shap.TreeExplainer(model)
    shap_values_array = explainer.shap_values(X)  # (n_samples, n_features)

    # Sanity check
    if shap_values_array.shape[1] != X.shape[1]:
        raise ValueError(
            f"Mismatch: SHAP shape {shap_values_array.shape[1]} vs X {X.shape[1]}"
        )

    # Convert SHAP deltas to the price scale: inverse(pred + shap) - inverse(pred),
    # with the prediction broadcast across the feature axis.
    # NOTE(review): the prediction already contains each feature's SHAP value, so
    # this adds it a second time; inverse(pred) - inverse(pred - shap) may be the
    # intended quantity - confirm before relying on the absolute amounts.
    pred_column = pred_log[:, None]
    price_contrib = (
        target_transform_inverse(pred_column + shap_values_array)
        - target_transform_inverse(pred_column)
    )
    df_shap_euros = pd.DataFrame(price_contrib, columns=X.columns)

    mean_abs_euros = df_shap_euros.abs().mean().sort_values(ascending=False)
    return mean_abs_euros

# Usage:
# model_fold and X_val_fold are whatever the most recently run fold loop left
# behind, i.e. the model and validation features of its last fold only.
mean_abs_shap_euros = shap_to_euros(model_fold, X_val_fold)
print(mean_abs_shap_euros.head(10))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from textwrap import fill

sns.set(style="whitegrid", font_scale=1.1)

# Top 10 SHAP-Euro contributions.
# NOTE(review): these figures are hard-coded, presumably copied from a
# shap_to_euros run - refresh them (or read mean_abs_shap_euros) after retraining.
# Sorted largest-first: bar 0 is the biggest value and the inverted y-axis below
# draws bar 0 at the top. (An ascending sort put the biggest bar at the bottom.)
top_features = pd.Series({
    "log_size_num": 349033.09,
    "price_per_m2_neighborhood": 77572.78,
    "outdoor_area_ratio": 19810.93,
    "luxury_x_price_m2": 12859.01,
    "size_per_luxury": 12562.23,
    "dist_to_center_bin_encoded": 12214.45,
    "energy_label_encoded": 11375.25,
    "ownership_type_Other": 10560.33,
    "luxury_x_size": 8401.32,
    "has_mechanische_ventilatie": 7442.82
}).sort_values(ascending=False)

features = top_features.index
shap_values = top_features.values
models = np.arange(len(features))  # y positions of the bars
bar_width = 0.6
color = 'skyblue'

# Friendly feature names
feature_rename = {
    "log_size_num": "Property Size (log m²)",
    "price_per_m2_neighborhood": "Price per m² by Neighborhood",
    "outdoor_area_ratio": "Outdoor Area Ratio",
    "luxury_x_price_m2": "Luxury × Price per m²",
    "size_per_luxury": "Size per Luxury Feature",
    "dist_to_center_bin_encoded": "Distance to Center (binned)",
    "energy_label_encoded": "Energy Label",
    "ownership_type_Other": "Other Ownership Type",
    "luxury_x_size": "Luxury × Size",
    "has_mechanische_ventilatie": "Mechanical Ventilation"
}

# Notes dictionary
notes_dict = {
    "log_size_num": "Largest driver of price: bigger properties dominate overall.",
    "price_per_m2_neighborhood": "Captures local market variation and neighborhood pricing.",
    "outdoor_area_ratio": "More outdoor space moderately increases value.",
    "luxury_x_price_m2": "Luxury features amplify local per-m² price.",
    "size_per_luxury": "Reflects amenity density: smaller values indicate more luxury features per m².",
    "dist_to_center_bin_encoded": "Central locations increase price non-linearly.",
    "energy_label_encoded": "Better energy label adds some value.",
    "ownership_type_Other": "Non-standard ownership has moderate effect.",
    "luxury_x_size": "Luxury impact scales with property size.",
    "has_mechanische_ventilatie": "Mechanical ventilation adds minor value."
}

# Fall back to the raw column name when no friendly name is defined
friendly_names = [feature_rename.get(f, f) for f in features]

fig, ax = plt.subplots(figsize=(14, 6))

# Horizontal bar plot
bars = ax.barh(models, shap_values, height=bar_width, color=color)
ax.set_yticks(models)
ax.set_yticklabels(friendly_names)
ax.invert_yaxis()  # highest SHAP on top
ax.set_xlabel("Mean Absolute SHAP Contribution (€)")
ax.set_title("Top 10 Features by SHAP Contribution in Euros")
ax.grid(True, alpha=0.3)

# Minimum offset to avoid overlap with values (20% of the longest bar)
min_offset = max(shap_values) * 0.2

for i, bar in enumerate(bars):
    val = bar.get_width()
    # Value label
    ax.text(val + min_offset*0.2, i, f"€{val:,.0f}", va='center', fontsize=10)

    # Note, wrapped and placed to the right of bar
    note = notes_dict.get(features[i], "")
    if note:
        wrapped_note = fill(note, width=40)  # wrap at ~40 chars
        note_x = val + min_offset
        ax.text(note_x, i, wrapped_note, va='center', ha='left', fontsize=9,
                color='black', fontstyle='italic',
                bbox=dict(facecolor='lightgray', alpha=0.3, boxstyle='round,pad=0.2'))

# Extend x-limits to fit notes
ax.set_xlim(0, max(shap_values) + 3*min_offset)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from src.features.feature_engineering.feature_engineering import add_luxury_features, add_luxury_interactions, LUXURY_AMENITIES, LUXURY_AMENITIES_WEIGHTS

sns.set(style="whitegrid", font_scale=1.1)

# ------------------- Compute luxury scores -------------------
# Both helpers are applied to the cleaned listings; the result is kept in df_lux.
df_lux = add_luxury_features(df_clean)
df_lux = add_luxury_interactions(df_lux)

# ------------------- Top Luxury Features -------------------
# Keep only weighted amenities that exist as columns, then average column * weight.
luxury_weights = {k: v for k, v in LUXURY_AMENITIES_WEIGHTS.items() if k in df_lux.columns}
avg_contribution = {feature: (df_lux[feature] * weight).mean() for feature, weight in luxury_weights.items()}

# Ascending order: barh draws the first row at the bottom, so the largest bar ends up on top.
df_bar = pd.DataFrame({
    "Feature": list(avg_contribution.keys()),
    "Value": list(avg_contribution.values())
}).sort_values("Value", ascending=True)

feature_rename = {
    "has_lift": "Lift",
    "has_sauna": "Sauna",
    "has_domotica": "Smart Home Features",
    "has_airconditioning": "Air Conditioning",
    "has_zwembad": "Pool"
}

# NOTE(review): Series.map yields NaN for any feature missing from feature_rename;
# verify every key of LUXURY_AMENITIES_WEIGHTS is covered.
df_bar["FeatureFriendly"] = df_bar["Feature"].map(feature_rename)

# ------------------- Luxury Interaction Features -------------------
interaction_features = ["luxury_x_price_m2", "luxury_x_size", "luxury_x_inhabitants"]
# NOTE(review): placeholder numbers, not derived from any model in this notebook.
importances = [0.35, 0.25, 0.15]  # replace with actual model importances

df_interaction = pd.DataFrame({
    "Feature": interaction_features,
    "Value": importances
}).sort_values("Value", ascending=True)

feature_rename_interactions = {
    "luxury_x_price_m2": "Luxury × Neighborhood Price per m²",
    "luxury_x_size": "Luxury × Size",
    "luxury_x_inhabitants": "Luxury × Population Density"
}

df_interaction["FeatureFriendly"] = df_interaction["Feature"].map(feature_rename_interactions)

# ------------------- Plot side by side -------------------
fig, axes = plt.subplots(1, 2, figsize=(18, 6), sharey=False)
bar_width = 0.6

# Palettes: one shade per bar
palette_luxury = sns.light_palette("skyblue", n_colors=len(df_bar))
palette_interaction = sns.light_palette("orange", n_colors=len(df_interaction))

# --- Left: Top Luxury Features ---
bars1 = axes[0].barh(df_bar["FeatureFriendly"], df_bar["Value"], color=palette_luxury, height=bar_width)
axes[0].set_xlabel("Average Contribution")
axes[0].set_title("Top Luxury Features")
axes[0].grid(True, alpha=0.3)

# Add value labels (placed 2% beyond the end of each bar)
for i, bar in enumerate(bars1):
    val = bar.get_width()
    axes[0].text(val + val*0.02, i, f"{val:.2f}", va='center', fontsize=10)

# Left subplot: explanatory caption anchored in axes coordinates (bottom-right corner)
axes[0].text(0.95, 0.05, 
             "Shows which individual luxury amenities contribute most to the\nluxury score, averaged across listings.",
             transform=axes[0].transAxes, fontsize=10, fontstyle='italic', color='black',
             ha='right', va='bottom',
             bbox=dict(facecolor='lightgray', alpha=0.3, boxstyle='round,pad=0.5'))

# --- Right: Luxury Interaction Features ---
bars2 = axes[1].barh(df_interaction["FeatureFriendly"], df_interaction["Value"], color=palette_interaction, height=bar_width)
axes[1].set_xlabel("Relative Importance")
axes[1].set_title("Luxury Interaction Features")
axes[1].grid(True, alpha=0.3)

# Add value labels (placed 2% beyond the end of each bar)
for i, bar in enumerate(bars2):
    val = bar.get_width()
    axes[1].text(val + val*0.02, i, f"{val:.2f}", va='center', fontsize=10)

# Right subplot: explanatory caption anchored in axes coordinates (bottom-right corner)
axes[1].text(0.95, 0.05, 
             "Shows how luxury features impact price in context:\nneighborhood price per m², home size, and neighborhood population density.",
             transform=axes[1].transAxes, fontsize=10, fontstyle='italic', color='black',
             ha='right', va='bottom',
             bbox=dict(facecolor='lightgray', alpha=0.3, boxstyle='round,pad=0.5'))

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import optuna

sns.set(style="whitegrid", font_scale=1.1)

# Only completed trials are plotted: a trial that failed or is still running has
# value None, which would break the numeric comparison used for the running best.
completed_trials = [
    t for t in study_xgb.trials if t.state == optuna.trial.TrialState.COMPLETE
]
if not completed_trials:
    raise RuntimeError("study_xgb has no completed trials to plot")

# Sort trials by number
trials = sorted(completed_trials, key=lambda t: t.number)
trial_numbers = [t.number for t in trials]
objective_values = [t.value for t in trials]

# Best-so-far values (minimizing objective): running minimum, and the position
# of the first trial that reaches the overall minimum.
best_so_far = np.minimum.accumulate(objective_values).tolist()
best_trial_index = int(np.argmin(objective_values))

# Best trial info
best_trial = trials[best_trial_index]
best_params = best_trial.params
best_value = best_trial.value

# Round numeric parameters to 3 decimals
param_text = "\n".join([f"{k}: {round(v,3) if isinstance(v,(int,float)) else v}"
                        for k, v in best_params.items()])

# Plot
fig, ax = plt.subplots(figsize=(14, 7))

# Scatter trial values
ax.scatter(trial_numbers, objective_values, color='skyblue', label='Trial Values', edgecolor='k')

# Best-so-far line
ax.plot(trial_numbers, best_so_far, color='red', linewidth=2, label='Best-So-Far')

# Highlight best trial
ax.scatter(best_trial.number, best_value, color='green', s=150, marker='*', label='Best Trial', edgecolor='k')

# Annotate best trial with a box (text position is given in data coordinates)
ax.annotate(f'Best Trial #{best_trial.number}\nValue: {best_value:.4f}\n{param_text}',
            xy=(best_trial.number, best_value),
            xytext=(best_trial.number + 0.5, best_value + 0.5),
            arrowprops=dict(facecolor='black', arrowstyle='->'),
            bbox=dict(boxstyle="round,pad=0.3", fc="lightyellow", alpha=0.4),
            fontsize=10)

# Labels and title
ax.set_xlabel('Trial Number')
ax.set_ylabel('Objective Value')
ax.set_title('Optuna XGBoost Optimization Convergence')

# Legend
ax.legend(loc='upper right')

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter

sns.set(style="whitegrid", font_scale=1.1)

# Compute residuals (actual minus predicted) on the test split
residuals = y_test - y_test_pred

# Define extreme outliers: top 5% of absolute residuals
threshold = np.percentile(np.abs(residuals), 95)
outliers_mask = np.abs(residuals) >= threshold

plt.figure(figsize=(10, 7))

# Plot normal listings
plt.scatter(
    y_test[~outliers_mask],
    y_test_pred[~outliers_mask],
    alpha=0.5,
    color='skyblue',
    label="Normal listings",
    edgecolor='k'
)

# Plot extreme residuals
plt.scatter(
    y_test[outliers_mask],
    y_test_pred[outliers_mask],
    color="red",
    label="Extreme listings",
    edgecolor='k'
)

# Diagonal line (perfect prediction)
max_val = max(y_test.max(), y_test_pred.max())
plt.plot([0, max_val], [0, max_val], color="black", linestyle="--", label="Perfect prediction")

# Annotate a few largest outliers by absolute residual
# NOTE(review): size is read from X_test (not X_test_final) and selected with the
# same mask - assumes X_test still has 'size_num' and shares y_test's index; confirm.
num_annotations = 5
outliers_df = pd.DataFrame({
    'y_true': y_test[outliers_mask],
    'y_pred': y_test_pred[outliers_mask],
    'residual': residuals[outliers_mask],
    'size': X_test.loc[outliers_mask, 'size_num']  # replace 'size' with your feature name
})
outliers_df['abs_residual'] = np.abs(outliers_df['residual'])
top_outliers = outliers_df.nlargest(num_annotations, 'abs_residual').copy()

# Initialize annotation positions slightly offset (all labels start at the same offset)
top_outliers['x_offset'] = 10
top_outliers['y_offset'] = 10

# Simple repel for overlapping annotations: whenever two label offsets are closer
# than min_distance, the later label is pushed upwards by the shortfall.
min_distance = 15  # in points
for i in range(len(top_outliers)):
    for j in range(i):
        dx = top_outliers.iloc[i]['x_offset'] - top_outliers.iloc[j]['x_offset']
        dy = top_outliers.iloc[i]['y_offset'] - top_outliers.iloc[j]['y_offset']
        distance = np.hypot(dx, dy)
        if distance < min_distance:
            top_outliers.at[top_outliers.index[i], 'y_offset'] += min_distance - distance

# Add annotations with repelling offsets
for idx, row in top_outliers.iterrows():
    plt.annotate(
        f"Size: {row['size']}\nPrice: €{row['y_true']:,.0f}",
        xy=(row['y_true'], row['y_pred']),
        xytext=(row['x_offset'], row['y_offset']),
        textcoords='offset points',
        fontsize=9,
        bbox=dict(boxstyle="round,pad=0.3", fc="lightyellow", alpha=0.5),
        arrowprops=dict(arrowstyle="->", color='gray', lw=1)
    )

# Format axes in euros with thousands separator
formatter = FuncFormatter(lambda x, _: f"€{int(x):,}")
plt.gca().xaxis.set_major_formatter(formatter)
plt.gca().yaxis.set_major_formatter(formatter)

# Labels, title, legend
plt.xlabel("Actual Price (€)")
plt.ylabel("Predicted Price (€)")
plt.title("Predicted vs Actual Prices with Extreme Listings Highlighted")
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid", font_scale=1.1)

# --- Define results for MAPE only ---
# Hand-entered figures: outer keys become DataFrame columns (one per model),
# inner keys become the row index (one per metric).
results_mape = {
    "Last Week Model": {
        'Test MAPE (%)': 10.0  # example value
    },
    "This Week Model": {
        'Test MAPE (%)': 9.05
    }
}

df_mape = pd.DataFrame(results_mape)

# Bar parameters
bar_width = 0.35
colors = ['skyblue', 'orange']

metrics = df_mape.index
y_pos = np.arange(len(metrics))

plt.figure(figsize=(8, 2.5))

# Horizontal bars: two bars per metric, shifted half a bar width up / down
plt.barh(y_pos - bar_width/2, df_mape['Last Week Model'], height=bar_width, color=colors[0], label='Last Week Model')
plt.barh(y_pos + bar_width/2, df_mape['This Week Model'], height=bar_width, color=colors[1], label='This Week Model')

plt.yticks(y_pos, metrics)
plt.gca().invert_yaxis()

# Data labels: same vertical shift as the bar they belong to (column 0 up, column 1 down)
for i, metric in enumerate(metrics):
    for j, col in enumerate(df_mape.columns):
        val = df_mape.loc[metric, col]
        offset = -bar_width/2 if j == 0 else bar_width/2
        plt.text(val + 0.1, i + offset, f"{val:.2f}%", va='center', fontsize=10)

plt.xlabel("MAPE (%)")
plt.title("XGBoost Test MAPE: Last Week vs This Week Model")
plt.legend(loc='center left', bbox_to_anchor=(1,0.5))
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid", font_scale=1.1)

# --- Define results for RMSE and MAE ---
# Hand-entered figures: outer keys become DataFrame columns (one per model),
# inner keys become the row index (one per metric).
results = {
    "Last Week Model": {
        'Test RMSE (€)': 211172,
        'Test MAE (€)': 72964
    },
    "This Week Model": {
        'Test RMSE (€)': 205449,
        'Test MAE (€)': 75442
    }
}

df_results = pd.DataFrame(results)

# Bar parameters
bar_width = 0.35
colors = ['skyblue', 'orange']

metrics = df_results.index
y_pos = np.arange(len(metrics))

plt.figure(figsize=(10, 5))

# Horizontal bars: two bars per metric
# NOTE(review): legend labels ("Phase 3A" / "Current Phase") and the title
# ("Phase 3A vs Phase 3B") do not match the column names ("Last Week" / "This Week");
# confirm which naming is intended.
plt.barh(y_pos - bar_width/2, df_results['Last Week Model'], height=bar_width, color=colors[0], label='Phase 3A Model')
plt.barh(y_pos + bar_width/2, df_results['This Week Model'], height=bar_width, color=colors[1], label='Current Phase Model')

plt.yticks(y_pos, metrics)
plt.gca().invert_yaxis()

# Data labels: same vertical shift as the bar they belong to (column 0 up, column 1 down)
for i, metric in enumerate(metrics):
    for j, col in enumerate(df_results.columns):
        val = df_results.loc[metric, col]
        offset = -bar_width/2 if j == 0 else bar_width/2
        plt.text(val + 5000, i + offset, f"{val:,.0f}", va='center', fontsize=10)

plt.xlabel("Error (€)")
plt.title("XGBoost Test RMSE & MAE: Phase 3A vs Phase 3B Model Error Metrics")
plt.legend(loc='center left', bbox_to_anchor=(1,0.5))
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid", font_scale=1.1)

# --- Define metrics ---
# Hand-entered figures: outer keys become DataFrame columns (one per model),
# inner keys become the row index (one per metric and split).
results = {
    "Last Week Model": {
        'Train RMSE (€)': 87196,
        'Val RMSE (€)': 182978,
        'Test RMSE (€)': 211172,
        'Train MAE (€)': 45665,
        'Val MAE (€)': 81436,
        'Test MAE (€)': 72964
    },
    "This Week Model": {
        'Train RMSE (€)': 112722,
        'Val RMSE (€)': 152651,
        'Test RMSE (€)': 205449,
        'Train MAE (€)': 54341,
        'Val MAE (€)': 75441,
        'Test MAE (€)': 75442
    }
}

# NOTE(review): rebinding `df` replaces the copy of df_clean stored under the same
# name by the data-loading cell; consider a dedicated name for this metrics table.
df = pd.DataFrame(results)

metrics = df.index
y_pos = np.arange(len(metrics))
bar_width = 0.25
colors = ['skyblue', 'orange']

plt.figure(figsize=(12, 6))

# Horizontal bars: last week vs this week
plt.barh(y_pos - bar_width/2, df['Last Week Model'], height=bar_width, color=colors[0], label='Last Week Model')
plt.barh(y_pos + bar_width/2, df['This Week Model'], height=bar_width, color=colors[1], label='This Week Model')

plt.yticks(y_pos, metrics)
plt.gca().invert_yaxis()

# Data labels: same vertical shift as the bar they belong to (column 0 up, column 1 down)
for i, metric in enumerate(metrics):
    for j, col in enumerate(df.columns):
        val = df.loc[metric, col]
        offset = -bar_width/2 if j == 0 else bar_width/2
        plt.text(val + 5000, i + offset, f"{val:,.0f}", va='center', fontsize=10)

plt.xlabel("Error (€)")
plt.title("XGBoost Train / Val / Test RMSE & MAE: Last Week vs This Week Model")
plt.legend(loc='center left', bbox_to_anchor=(1,0.5))
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid", font_scale=1.1)

# --- Define MAPE metrics ---
# Train / validation / test MAPE (in %) for the two model iterations.
# The numbers are hard-coded in this cell; nothing here recomputes them.
results_mape = {
    "Last Week Model": {
        'Train MAPE (%)': 6.5,
        'Val MAPE (%)': 10.52,
        'Test MAPE (%)': 8.5
    },
    "This Week Model": {
        'Train MAPE (%)': 7.57,
        'Val MAPE (%)': 10.34,
        'Test MAPE (%)': 9.05
    }
}

# One row per data split, one column per model.
df_mape = pd.DataFrame(results_mape)

metrics = df_mape.index
y_pos = np.arange(len(metrics))
bar_width = 0.25
colors = ['skyblue', 'orange']
label_pad = 0.1    # horizontal gap (in percentage points) between bar end and label
x_headroom = 1.2   # x-axis runs to 1.2x the longest bar so labels fit inside the frame

fig, ax = plt.subplots(figsize=(10, 4))

# For each model draw its bars half a bar width above / below the split's
# tick position, then write the value just past the end of every bar.
for model, shift, color in zip(df_mape.columns, (-bar_width / 2, bar_width / 2), colors):
    bar_y = y_pos + shift
    ax.barh(bar_y, df_mape[model], height=bar_width, color=color, label=model)
    for y, val in zip(bar_y, df_mape[model]):
        ax.text(val + label_pad, y, f"{val:.2f}%", va='center', fontsize=10)

ax.set_yticks(y_pos)
ax.set_yticklabels(metrics)
ax.invert_yaxis()  # first split at the top

# Text is positioned in data coordinates and is not clipped to the axes, so
# without extra room the labels of the longest bars (the middle row here)
# spill over the right spine towards the legend that is anchored there.
ax.set_xlim(0, df_mape.max().max() * x_headroom)

ax.set_xlabel("MAPE (%)")
ax.set_title("XGBoost Train / Val / Test MAPE: Last Week vs This Week Model")
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import FuncFormatter

sns.set(style="whitegrid", font_scale=1.1)

# RMSE (in €) per data split for the two model iterations. The values are
# hard-coded in this cell; nothing here recomputes them.
results_rmse = {
    "Last Week Model": {
        "Train RMSE (€)": 87196,
        "Validation RMSE (€)": 182977.84,
        "Test RMSE (€)": 211172,
    },
    "This Week Model": {
        "Train RMSE (€)": 112722.15,
        "Validation RMSE (€)": 152651.16,
        "Test RMSE (€)": 205448.59,
    },
}
df_rmse = pd.DataFrame(results_rmse)  # rows: split metric, columns: model

bar_width = 0.35
colors = ["skyblue", "orange"]
metrics = df_rmse.index
y_pos = np.arange(len(metrics))

# Legend text shown for each model column.
legend_text = {
    "Last Week Model": "Last Phase(3A): XGB + Basic FE +\nOptuna",
    "This Week Model": "This Phase (3B): XGB + Log(target) +\nExtended FE + Optuna",
}

plt.figure(figsize=(10, 5))

# Draw each model's bars half a bar width to either side of the tick position.
half_bar = bar_width / 2
for model_name, shift, bar_color in zip(legend_text, (-half_bar, half_bar), colors):
    plt.barh(
        y_pos + shift,
        df_rmse[model_name],
        height=bar_width,
        color=bar_color,
        label=legend_text[model_name],
    )

# Name each bar group after its metric and flip the y-axis so the first
# metric is drawn at the top.
plt.yticks(y_pos, metrics)
current_axes = plt.gca()
current_axes.invert_yaxis()

# --- Tick formatter: abbreviate thousands with a "k" suffix ---
def k_formatter(x, pos):
    """Format an axis tick value, abbreviating thousands with a "k" suffix.

    Parameters
    ----------
    x : float
        Tick value to format.
    pos : int or None
        Tick position; part of the ``FuncFormatter`` callback signature and
        not used here.

    Returns
    -------
    str
        ``x / 1000`` without decimals followed by ``"k"`` when
        ``abs(x) >= 1000`` (e.g. ``150000 -> "150k"``, ``-50000 -> "-50k"``),
        otherwise ``x`` without decimals (e.g. ``500 -> "500"``).
    """
    # Compare on magnitude so negative values are abbreviated the same way
    # as positive ones instead of being printed in full.
    if abs(x) >= 1000:
        return f"{x/1000:.0f}k"
    return f"{x:.0f}"
# Render x tick values through k_formatter (thousands shown with a "k" suffix).
plt.gca().xaxis.set_major_formatter(FuncFormatter(k_formatter))

# --- Add grid for clarity ---
plt.grid(axis="x", linestyle="--", alpha=0.6)

# --- Data labels ---
# Every label starts a fixed distance past the end of its own bar. A negative
# horizontal offset would make the text begin inside the bar and straddle its
# edge, so the same positive padding is used for both models.
label_pad = 8000  # horizontal gap (in €) between bar end and label
for j, col in enumerate(df_rmse.columns):
    # Vertical centre of this model's bar relative to the metric's tick.
    bar_y_shift = -bar_width/2 + j*bar_width
    for i, val in enumerate(df_rmse[col]):
        plt.text(val + label_pad, i + bar_y_shift,
                 f"{val:,.0f}", va='center', fontsize=10)

# Text is placed in data coordinates and is not clipped to the axes; extend
# the x-axis past the longest bar so its label stays inside the frame.
plt.xlim(0, df_rmse.max().max() * 1.25)

plt.xlabel("Error (€)")
plt.title("RMSE Comparison: Phase 3A vs 3B Model (Train / Validation / Test)")
# Legend sits outside the axes, anchored to the middle of the right edge.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import FuncFormatter

sns.set(style="whitegrid", font_scale=1.1)

# --- Results ---
# MAE (in €) and MAPE (in %) for the two model iterations, plotted below as
# test-set figures. The values are hard-coded in this cell.
results = {
    "Metric": ["MAE (€)", "MAPE (%)"],
    "Last Week": [72964, 8.5],
    "This Week": [75441.86, 9.05]
}
df = pd.DataFrame(results).set_index("Metric")

# --- Bar setup ---
bar_width = 0.35
colors = ["skyblue", "orange"]
y_pos = np.arange(1)  # one bar group per subplot
x_headroom = 1.5      # each x-axis runs to 1.5x its longest bar (room for labels / legend)

# --- Formatter for thousands ---
thousands_formatter = FuncFormatter(lambda x, pos: f'{int(x/1000)}k')

# --- Create subplots ---
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# -------------------- MAE (€) --------------------
ax1 = axes[0]
mae = df.loc["MAE (€)"]
for j, (col, legend_label) in enumerate(zip(["Last Week", "This Week"], ["Phase 3A", "Current (3B)"])):
    # Bars sit half a bar width above / below the single tick position.
    bar_y = y_pos - bar_width/2 + j*bar_width
    ax1.barh(bar_y, mae[col], height=bar_width, color=colors[j], label=legend_label)
    # Label starts just past the end of its bar (same padding for both bars).
    ax1.text(mae[col] + 2000, bar_y[0], f"{mae[col]:,.0f}€", va='center', fontsize=10)
ax1.set_yticks(y_pos)
ax1.set_yticklabels(["MAE (€)"])
ax1.invert_yaxis()
ax1.set_xlabel("Error (€)")
ax1.set_title("Test MAE Comparison")
ax1.xaxis.set_major_formatter(thousands_formatter)
# Same headroom as the MAPE panel: keeps the value labels inside the frame
# and leaves free space to the right of the bars for the in-axes legend.
ax1.set_xlim(0, mae.max() * x_headroom)

# -------------------- MAPE (%) --------------------
ax2 = axes[1]
mape = df.loc["MAPE (%)"]
for j, col in enumerate(["Last Week", "This Week"]):
    bar_y = y_pos - bar_width/2 + j*bar_width
    # No legend label here: the legend is drawn once, on the MAE panel.
    ax2.barh(bar_y, mape[col], height=bar_width, color=colors[j])
    ax2.text(mape[col] + 0.3, bar_y[0], f"{mape[col]:.2f}%", va='center', fontsize=10)
ax2.set_yticks(y_pos)
ax2.set_yticklabels(["MAPE (%)"])
ax2.invert_yaxis()
ax2.set_xlabel("MAPE (%)")
ax2.set_title("Test MAPE Comparison")
ax2.set_xlim(0, mape.max() * x_headroom)

# -------------------- Layout --------------------
ax1.legend(loc='lower right', frameon=True)
fig.suptitle("Model Comparison: Phase 3A vs Current Model Test Set Performance (MAE & MAPE)", fontsize=13)
# Keep the top 8% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.92])
plt.show()
