In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Show every column when a DataFrame is displayed and use seaborn's "whitegrid" style.
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

# Local MLflow store, given relative to the current working directory.
tracking_uri = "../logs/mlruns"
# Create the store's ".trash" sub-folder up front (this also creates the parent folders).
# NOTE(review): presumably MLflow's file store expects this folder to exist — confirm.
os.makedirs(os.path.join(tracking_uri, ".trash"), exist_ok=True)

# Point MLflow at the local store and select the experiment used by later runs.
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("house_price_prediction")

import sys
import os
from pathlib import Path
import yaml


# Make the parent of the current working directory importable so that the
# `src.*` packages imported below resolve (the notebook runs from notebooks/).
project_root = os.path.abspath("..")

if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df
from src.data_loading.preprocessing.imputation import impute_missing_values


# ROOT is the directory that contains the "house_price_prediction_project"
# folder: two levels above the directory holding this file when `__file__` is
# defined (script case), otherwise two levels above the current working
# directory (notebook case).
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[2]
else:
    ROOT = Path.cwd().parents[1]

CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "preprocessing_config.yaml"
)

# Parse the preprocessing settings once; later statements index into CONFIG.
with open(CONFIG_PATH) as config_file:
    CONFIG = yaml.safe_load(config_file)

# Load the parsed listings, then clean and impute them using the settings under
# CONFIG["preprocessing"].
df_raw = load_data_from_json("../data/parsed_json/*.json")
df_clean = preprocess_df(
    df_raw,
    drop_raw=CONFIG["preprocessing"]["drop_raw"],
    numeric_cols=CONFIG["preprocessing"]["numeric_cols"],
)
df_clean = impute_missing_values(
    df_clean, CONFIG["preprocessing"]["imputation"]
)

# Keep only rows with a known target (price_num) and drop living_area.
# Written as one chained expression that returns a new frame: an in-place drop
# on a boolean-filtered frame can raise pandas' SettingWithCopyWarning.
df_clean = (
    df_clean
    .loc[df_clean["price_num"].notna()]
    .drop(columns=["living_area"])
)

# df_clean = df_clean[:100]
# Independent copy of the cleaned frame, bound to `df`.
df = df_clean.copy()

In [None]:
from src.features.data_prep_for_modelling.data_preparation import prepare_data

# Path to model_config.yaml inside the project's config folder.
FEATURES_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)

In [None]:
# Inspect the column names of the cleaned frame.
df_clean.columns

In [None]:
from src.model.evaluate import ModelEvaluator
from src.model.mlflow_logger import MLFlowLogger

# Helpers used further down for evaluation (`evaluator.evaluate`) and for
# logging (`logger.log_model`).
evaluator = ModelEvaluator()
logger = MLFlowLogger()

FEATURES_AND_MODEL_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)

# --- Prepare data for final modeling ---
# prepare_data returns the splits in this order, followed by `scaler` and
# `feature_encoders`.
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_val,
    y_val,
    scaler,
    feature_encoders,
) = prepare_data(
    df=df_clean,
    config_path=FEATURES_AND_MODEL_CONFIG_PATH,
    model_name="xgboost_early_stopping",
    use_extended_features=True,
    cv=False,
)

# Work on copies so the frames returned by prepare_data are left as they are.
X_train_final, X_test_final, X_val_final = (
    split.copy() for split in (X_train, X_test, X_val)
)

for label, split in (
    ("Train shape:", X_train_final),
    ("validation shape:", X_val_final),
    ("Test shape:", X_test_final),
):
    print(label, split.shape)



In [None]:
from functools import partial
from src.model.objectives_optuna import unified_objective

FEATURES_AND_MODEL_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)

# Seeded TPE sampler plus a median pruner; the study minimizes the objective value.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_xgb = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    pruner=pruner,
)

# Everything except the trial is fixed here, so Optuna can call the objective
# with the trial as its only argument. The same YAML file is passed for both
# the feature and the model configuration.
objective_kwargs = dict(
    model_name="xgboost_early_stopping_optuna_feature_eng",
    df=df_clean,
    features_config=FEATURES_AND_MODEL_CONFIG_PATH,
    model_config=FEATURES_AND_MODEL_CONFIG_PATH,
    use_log=True,
    n_splits=5,
    use_extended_features=True,
)
objective_xgb_partial = partial(unified_objective, **objective_kwargs)
study_xgb.optimize(objective_xgb_partial, n_trials=30)

In [None]:
# Evaluator configured with log1p as the target transform and expm1 as its inverse.
evaluator = ModelEvaluator(
    target_transform=np.log1p,
    inverse_transform=np.expm1,
)

# # --- Random Forest ---
# best_rf = RandomForestRegressor(**study_rf.best_params)
# trained_rf, y_train_pred, y_val_pred, y_test_pred, results_rf = evaluator.evaluate(
#     model=best_rf,
#     X_train=X_train_final,
#     y_train=y_train,
#     X_test=X_test_final,
#     y_test=y_test,
#     X_val=X_val_final,
#     y_val=y_val,
#     use_xgb_train=False,
# )
# logger.log_model(trained_rf, "RF_LogTransform_Optuna_feature_eng", results_rf, use_xgb_train=False)

# --- XGBoost ---
# Evaluate with the best parameters found by the Optuna study, then log the result.
best_xgb_params = study_xgb.best_params

xgb_eval_kwargs = dict(
    model=None,  # not used in XGBoost.train
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
    X_val=X_val_final,
    y_val=y_val,
    use_xgb_train=True,
    model_params=best_xgb_params,  # the tuned parameters must be passed explicitly
    fit_params={"num_boost_round": 1000, "early_stopping_rounds": 50},
)
trained_xgb, y_train_pred, y_val_pred, y_test_pred, results_xgb = evaluator.evaluate(
    **xgb_eval_kwargs
)

logger.log_model(
    trained_xgb,
    "XGB_Optuna_LogTransformed_feature_eng",
    results_xgb,
    use_xgb_train=True,
)


In [None]:
# Inspect the address column of the cleaned frame.
df_clean.address

## Extra features: 

#### Distance to Center (Dam)

Geocode each listing's address to obtain its latitude and longitude.

In [None]:
# from geopy.geocoders import Nominatim
# import time

# df = df_clean.copy()
# addresses = df["address"].unique()

# geolocator = Nominatim(user_agent="house_price_project")
# lat_lon_cache = {}

# for addr in addresses:
#     try:
#         location = geolocator.geocode(addr)
#         if location:
#             lat_lon_cache[addr] = (location.latitude, location.longitude)
#         else:
#             lat_lon_cache[addr] = (None, None)
#     except Exception as e:
#         print(f"Error for {addr}: {e}")
#         lat_lon_cache[addr] = (None, None)
#     time.sleep(1)  # Respect Nominatim rate limit

# Map back to original dataframe
# df["lat"] = df["address"].map(lambda x: lat_lon_cache[x][0])
# df["lon"] = df["address"].map(lambda x: lat_lon_cache[x][1])

# pd.DataFrame(lat_lon_cache.items(), columns=["address", "lat_lon"]).to_csv("geocode_cache.csv", index=False)


In [None]:
# failed_addresses = [addr for addr, (lat, lon) in lat_lon_cache.items() if lat is None or lon is None]

# for addr in failed_addresses:
#     try:
#         location = geolocator.geocode(addr)
#         if location:
#             lat_lon_cache[addr] = (location.latitude, location.longitude)
#         else:
#             lat_lon_cache[addr] = (None, None)
#     except Exception as e:
#         print(f"Retry error for {addr}: {e}")
#         lat_lon_cache[addr] = (None, None)
#     time.sleep(2)


In [None]:
# # List of addresses still missing lat/lon
# failed_addresses = [addr for addr, (lat, lon) in lat_lon_cache.items() if lat is None or lon is None]

# # Subset the dataframe for these addresses
# df_failed = df[df['address'].isin(failed_addresses)]

# # Check their postal codes
# df_failed[['address', 'postal_code_clean']]


Distance to Amsterdam city center (Dam Square)

In [None]:
import pandas as pd
import numpy as np

# --- City center coordinates (Dam Square, Amsterdam) ---
# Stored as (latitude, longitude) in degrees; used below as the second point
# passed to haversine().
city_center = (52.3730, 4.8923)

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance, in meters, between two points.

    All four arguments are latitudes/longitudes in degrees. Only NumPy
    functions and arithmetic are applied, so scalars and NumPy arrays both
    work. The Earth is modelled as a sphere of radius 6,371,000 m.
    """
    earth_radius_m = 6371000  # meters
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    # Haversine of the central angle between the two points.
    hav = (
        np.sin(half_dlat) ** 2
        + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2
    )
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_m * central_angle

# Adds lat/lon fallbacks, a missing-coordinate flag, the distance to the city
# center and one-hot distance bins to `df`.
# NOTE(review): this cell reads df['lat'], df['lon'] and `lat_lon_cache`. In this
# notebook they are only created by the geocoding cells, which are commented
# out, so the cell fails on a fresh kernel until that input is restored.

# --- Step 1. Postal code centroids ---
geocoded_rows = df[df['lat'].notna() & df['lon'].notna()]
postal_code_coords = (
    geocoded_rows
    .groupby('postal_code_clean')[['lat', 'lon']]
    .mean()
    .to_dict(orient='index')
)

# --- Step 2. Neighborhood centroids (if available) ---
has_neighborhood = 'neighborhood' in df.columns
if has_neighborhood:
    neighborhood_coords = (
        geocoded_rows
        .groupby('neighborhood')[['lat', 'lon']]
        .mean()
        .to_dict(orient='index')
    )
else:
    neighborhood_coords = {}

# --- Step 3. Fill missing with postal → neighborhood → fallback ---
# One row per address (its first occurrence), looked up by label instead of
# re-scanning the whole frame for every missing address.
first_row_by_address = df.drop_duplicates(subset='address').set_index('address')
for addr in df.loc[df['lat'].isna(), 'address'].unique():
    postal = first_row_by_address.at[addr, 'postal_code_clean']
    neigh = first_row_by_address.at[addr, 'neighborhood'] if has_neighborhood else None

    if postal in postal_code_coords:
        lat_lon_cache[addr] = (postal_code_coords[postal]['lat'], postal_code_coords[postal]['lon'])
    elif neigh and neigh in neighborhood_coords:
        lat_lon_cache[addr] = (neighborhood_coords[neigh]['lat'], neighborhood_coords[neigh]['lon'])
    else:
        lat_lon_cache[addr] = (None, None)

# --- Step 4. Map back to DataFrame ---
df['lat'] = df['address'].map(lambda x: lat_lon_cache[x][0])
df['lon'] = df['address'].map(lambda x: lat_lon_cache[x][1])

# --- Step 5. Final fallback: mark missing and fill with -1 ---
df['lat_lon_missing'] = df['lat'].isna().astype(int)
df['lat'] = df['lat'].fillna(-1)
df['lon'] = df['lon'].fillna(-1)

# --- Step 6. Compute distance to city center ---
# Rows carrying the -1 sentinel keep -1 as their distance.
df['dist_to_center_m'] = df.apply(
    lambda row: haversine(row['lat'], row['lon'], city_center[0], city_center[1])
    if row['lat'] != -1 and row['lon'] != -1 else -1,
    axis=1
)

# --- Step 7. Create distance bins (categorical feature) ---
bins = [-1, 0, 2000, 5000, 10000, 20000, np.inf]  # -1 kept separate
labels = ["missing", "0–2km", "2–5km", "5–10km", "10–20km", "20km+"]
df['dist_to_center_bin'] = pd.cut(df['dist_to_center_m'], bins=bins, labels=labels, include_lowest=True)
# Rows without a distance must be labelled "missing" BEFORE one-hot encoding;
# doing it afterwards (as the cell did originally) never reaches the dummies.
df.loc[df["dist_to_center_m"].isna(), "dist_to_center_bin"] = "missing"

# --- Step 8: one-hot encode the bins ---
dist_bin_dummies = pd.get_dummies(
    df["dist_to_center_bin"], prefix="distbin"
)

# --- Step 9: merge dummies back into df ---
# Drop dummy columns left over from an earlier run so re-running the cell does
# not produce duplicate column names.
df = pd.concat(
    [df.drop(columns=dist_bin_dummies.columns, errors="ignore"), dist_bin_dummies],
    axis=1,
)

# --- Step 10: the raw categorical bin is no longer needed once encoded ---
df = df.drop(columns=["dist_to_center_bin"])
print(df[["dist_to_center_m"] + list(dist_bin_dummies.columns)].head())

In [None]:
# Save the current frame to CSV; the row index is not written.
df.to_csv('../data/df_with_lat_lon_encoded.csv', index=False)

In [None]:
# df = pd.read_csv('../data/df_with_lat_lon_encoded.csv')

#### Distance to amenities

In [None]:
import overpy
import pandas as pd

api = overpy.Overpass()

# Amsterdam bounding box
bbox = (52.3100, 4.7680, 52.4100, 4.9500)  # Approximate Amsterdam area

# Amenity queries: each amenity maps to one or more OSM "key=value" tags
amenity_queries = {
    "school": ["amenity=school"],
    "park": ["leisure=park"],
    "hospital": ["amenity=hospital"],
    "supermarket": ["shop=supermarket"],
    "bus_stop": ["highway=bus_stop"],
    "tram_stop": ["railway=tram_stop"],
    "metro_stop": ["railway=subway_entrance", "station=subway"],
    "light_rail_stop": ["railway=light_rail"]
}

all_amenities = []

# Flatten to (amenity, tag) pairs so each tag is fetched with its own query.
tag_pairs = [
    (amenity_name, osm_tag)
    for amenity_name, osm_tags in amenity_queries.items()
    for osm_tag in osm_tags
]

for amenity_name, osm_tag in tag_pairs:
    key, value = osm_tag.split("=")
    query = f"""
        node[{key}={value}]({bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]});
        out;
        """
    print(f"Fetching {amenity_name} ({osm_tag})...")
    try:
        result = api.query(query)
        for osm_node in result.nodes:
            record = {
                "amenity_type": amenity_name,
                "lat": float(osm_node.lat),
                "lon": float(osm_node.lon)
            }
            all_amenities.append(record)
    except Exception as e:
        # Best effort: report the failed tag and continue with the next one.
        print(f"Error fetching {amenity_name} ({osm_tag}): {e}")

# Convert to DataFrame and save
amenities_df = pd.DataFrame(all_amenities)
print(amenities_df.head())
amenities_df.to_csv("../data/amsterdam_amenities.csv", index=False)


In [None]:
from sklearn.neighbors import BallTree

# --- 1. Convert degrees to radians ---
def deg2rad(df, lat_col='lat', lon_col='lon'):
    """Return the two coordinate columns of ``df`` as an ``(n, 2)`` array in radians.

    The columns appear in the order ``[lat_col, lon_col]``.
    """
    lat_lon_deg = df.loc[:, [lat_col, lon_col]].to_numpy()
    return np.radians(lat_lon_deg)

# --- 2. Prepare listing coordinates ---
# NOTE(review): rows whose lat/lon were filled with the -1 sentinel earlier are
# queried at the point (-1, -1) — confirm that counts for them are acceptable.
listing_coords_rad = deg2rad(df)  # shape (n_listings, 2)
earth_radius_km = 6371.0

# --- 3. Define radius per amenity (in km) ---
amenity_radius_map = {
    "school": [0.5, 1.0],
    "park": [0.5, 1.0],
    "supermarket": [0.5, 1.0],
    "hospital": [1.0, 2.0],
    "bus_stop": [0.5, 1.0],
    "tram_stop": [0.5, 1.0],
    "metro_stop": [0.5, 1.0],
    "light_rail_stop": [0.5, 1.0]
}

# --- 4. Loop over amenities ---
amenity_types = amenities_df['amenity_type'].unique()

for amenity in amenity_types:
    if amenity not in amenity_radius_map:
        continue
    subset = amenities_df[amenities_df['amenity_type'] == amenity]
    amenity_coords_rad = deg2rad(subset)

    # Build BallTree on the amenity coordinates (haversine metric, radians)
    tree = BallTree(amenity_coords_rad, metric='haversine')

    # Count amenities within each radius; the radius is converted from km to
    # radians by dividing by the Earth radius.
    for r_km in amenity_radius_map[amenity]:
        r_rad = r_km / earth_radius_km
        counts = tree.query_radius(listing_coords_rad, r=r_rad, count_only=True)
        col_name = f'count_{amenity}_within_{int(r_km*1000)}m'
        df[col_name] = counts

# --- 5. Bin counts and ordinal encode ---
# Select only the raw count columns. Matching every 'count_*' column (as the
# cell did originally) also picks up the derived *_bin / *_bin_encoded columns
# when the cell is re-run, coercing them to numbers and binning them again.
derived_suffixes = ('_bin', '_bin_encoded')
amenity_count_cols = [
    col for col in df.columns
    if col.startswith('count_') and not col.endswith(derived_suffixes)
]

# Ensure numeric
for col in amenity_count_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Define bins and ordinal mapping (label -> position in `labels`)
bins = [-1, 0, 2, 5, 10, np.inf]
labels = ['0', '1-2', '3-5', '6-10', '10+']
ordinal_mapping = {label: i for i, label in enumerate(labels)}

for col in amenity_count_cols:
    bin_col = f'{col}_bin'
    ord_col = f'{col}_bin_encoded'
    df[bin_col] = pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)
    df[ord_col] = df[bin_col].map(ordinal_mapping)

# --- 6. Verify ---
print(df[[col for col in df.columns if 'count_' in col]].head())
print(df[[col for col in df.columns if 'bin_encoded' in col]].head())


In [None]:
# Display the working frame.
df

In [None]:
# "Unnamed: 0" typically appears only when the frame was read back from a CSV
# that had been written with its index. Ignore its absence so the cell does not
# raise KeyError on a frame built in this session.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Keep only the encoded amenity features: collect every count_* column that is
# not a *_bin_encoded column and drop it.
raw_and_bin_cols = []
for col in df.columns:
    if col.startswith('count_') and not col.endswith('_bin_encoded'):
        raw_and_bin_cols.append(col)
df.drop(columns=raw_and_bin_cols, inplace=True)
df

In [None]:
# Inspect the remaining column names.
df.columns

In [None]:
# --- 1. Get prepared data ---
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_val,
    y_val,
    scaler,
    feature_encoders,
) = prepare_data(
    df=df_clean,
    config_path=FEATURES_AND_MODEL_CONFIG_PATH,
    model_name="xgboost_early_stopping",
    use_extended_features=True,
    cv=False,
)

# --- 2. Re-attach the target to each split (the validation split may be absent) ---
train_df = X_train.copy()
train_df['target'] = y_train

if X_val is None:
    val_df = None
else:
    val_df = X_val.copy()
    val_df['target'] = y_val

test_df = X_test.copy()
test_df['target'] = y_test

# --- 3. Merge new features (amenity counts, distance bins, etc.) ---
# The merge is on the row index, so the splits must still carry df's index.
distance_bin_cols = [
    f'distbin_{bin_label}'
    for bin_label in ('0–2km', '2–5km', '5–10km', '10–20km', '20km+')
]
amenity_bin_cols = [
    f'count_{amenity_name}_within_{radius_m}m_bin_encoded'
    for amenity_name in ('school', 'park', 'supermarket', 'bus_stop', 'tram_stop', 'metro_stop')
    for radius_m in (500, 1000)
]
new_feature_cols = distance_bin_cols + amenity_bin_cols

extra_features = df[new_feature_cols]
train_df = train_df.merge(extra_features, left_index=True, right_index=True, how='left')
if val_df is not None:
    val_df = val_df.merge(extra_features, left_index=True, right_index=True, how='left')
test_df = test_df.merge(extra_features, left_index=True, right_index=True, how='left')

# --- Ensure new features are numeric ---
# The dtype of the training split decides how a column is converted in every split.
present_splits = [split for split in (train_df, val_df, test_df) if split is not None]
for col in new_feature_cols:
    is_categorical = str(train_df[col].dtype) == 'category'
    for split in present_splits:
        if is_categorical:
            # convert category to int codes
            split[col] = split[col].cat.codes
        else:
            # coerce anything else to numeric; unparseable or missing values become 0
            split[col] = pd.to_numeric(split[col], errors='coerce').fillna(0)

# --- Split again into X/y ---
X_train_final = train_df.drop(columns=['target'])
y_train_final = train_df['target']

X_val_final, y_val_final = (
    (val_df.drop(columns=['target']), val_df['target'])
    if val_df is not None
    else (None, None)
)

X_test_final = test_df.drop(columns=['target'])
y_test_final = test_df['target']


In [None]:
# Evaluator configured with log1p as the target transform and expm1 as its inverse.
evaluator = ModelEvaluator(target_transform=np.log1p, inverse_transform=np.expm1)

# Re-use the best parameters found by the Optuna study.
best_xgb_params = study_xgb.best_params

xgb_eval_kwargs = dict(
    model=None,  # not used in XGBoost.train
    X_train=X_train_final,
    y_train=y_train_final,
    X_val=X_val_final,
    y_val=y_val_final,
    X_test=X_test_final,
    y_test=y_test_final,
    use_xgb_train=True,
    model_params=best_xgb_params,  # the tuned parameters must be passed explicitly
    fit_params={"num_boost_round": 1000, "early_stopping_rounds": 50},
)
trained_xgb, y_train_pred, y_val_pred, y_test_pred, results_xgb = evaluator.evaluate(
    **xgb_eval_kwargs
)

logger.log_model(
    trained_xgb,
    "XGB_Optuna_LogTransformed_with_new_features",
    results_xgb,
    use_xgb_train=True,
)
