In [28]:
# Data handling
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Train/test split
from sklearn.model_selection import train_test_split

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb


# Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [2]:
file_path = Path(r"C:\Users\vande\becode\immo-eliza-ml\cleaned_properties.csv")
df = pd.read_csv(file_path)

# Features & target
X = df.drop(columns=["price"])
y = df["price"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [3]:
# Example: price_per_m2, age_of_property, rooms_per_m2
X_train_fe = X_train.copy()
X_test_fe  = X_test.copy()

# price_per_m2
if "total_area_sqm" in X_train.columns:
    X_train["price_per_m2"] = y_train / X_train["total_area_sqm"]
    X_test["price_per_m2"] = y_test / X_test["total_area_sqm"]

# age_of_property
if "construction_year" in X_train.columns:
    current_year = pd.Timestamp.now().year
    X_train["age_of_property"] = current_year - X_train["construction_year"]
    X_test["age_of_property"] = current_year - X_test["construction_year"]

# rooms_per_m2
if "nbr_bedrooms" in X_train.columns and "total_area_sqm" in X_train.columns:
    X_train["rooms_per_m2"] = X_train["nbr_bedrooms"] / X_train["total_area_sqm"]
    X_test["rooms_per_m2"] = X_test["nbr_bedrooms"] / X_test["total_area_sqm"]

# Flags
for col, flag in [("fl_garden", "has_garden"), 
                  ("fl_garage", "has_garage"), 
                  ("fl_elevator", "has_elevator")]:
    if col in X_train.columns:
        X_train[flag] = X_train[col]
        X_test[flag] = X_test[col]

In [4]:
X_train_fe.dtypes


id                                  int64
property_type                      object
subproperty_type                   object
region                             object
province                           object
locality                           object
zip_code                            int64
latitude                          float64
longitude                         float64
construction_year                 float64
total_area_sqm                    float64
surface_land_sqm                  float64
nbr_frontages                     float64
nbr_bedrooms                      float64
equipped_kitchen                   object
fl_furnished                        int64
fl_open_fire                        int64
fl_terrace                          int64
terrace_sqm                       float64
fl_garden                           int64
garden_sqm                        float64
fl_swimming_pool                    int64
fl_floodzone                        int64
state_building                    

In [5]:
#3) Log-transform skewed numeric features
# -----------------------------
skewed_cols = ["price", "total_area_sqm", "surface_land_sqm"]
for col in skewed_cols:
    if col in X_train.columns:
        X_train[col + "_log"] = np.log1p(X_train[col])
        X_test[col + "_log"] = np.log1p(X_test[col])


In [6]:
# 4) One-hot encode categorical variables
# -----------------------------
categorical_cols = X_train.select_dtypes(include="object").columns.tolist()
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [7]:
# 5) Remove features with high multicollinearity
# -----------------------------
corr_matrix = X_train_encoded.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.85)]

X_train_selected = X_train_encoded.drop(columns=to_drop)
X_test_selected = X_test_encoded.drop(columns=to_drop)

print("Dropped features due to high correlation:", to_drop)

Dropped features due to high correlation: ['age_of_property', 'has_garden', 'locality_Brussels', 'locality_Nivelles']


In [8]:
numeric_cols = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train_fe.select_dtypes(exclude=np.number).columns.tolist()


In [9]:
X_train_encoded = pd.get_dummies(X_train_fe, columns=categorical_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test_fe, columns=categorical_cols, drop_first=True)

# Align test set with training set
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)


In [10]:
# Example: numeric & categorical columns
numeric_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(exclude=np.number).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_features)
    ]
)

# Fit only on training data
X_train_processed = preprocessor.fit_transform(X_train_fe)
X_test_processed  = preprocessor.transform(X_test_fe)

# Save preprocessor
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(preprocessor, "models/preprocessor_v2.joblib")


['models/preprocessor_v2.joblib']

In [14]:
linreg_v2 = LinearRegression()
linreg_v2.fit(X_train_processed, y_train)
y_pred_lin = linreg_v2.predict(X_test_processed)

r2_lin = r2_score(y_test, y_pred_lin)
mae_lin = mean_absolute_error(y_test, y_pred_lin)
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))

print("Linear Regression v2")
print("R²:", r2_lin, "MAE:", mae_lin, "RMSE:", rmse_lin)

# Save model
joblib.dump(linreg_v2, "models/linear_reg_v2.joblib")


Linear Regression v2
R²: 0.3786624681244213 MAE: 154718.7996167557 RMSE: 352397.6300533496


['models/linear_reg_v2.joblib']

In [None]:
# Take first 5000 rows for training/testing quickly
X_train_sample = X_train_processed[:5000]
y_train_sample = y_train[:5000]
X_test_sample  = X_test_processed[:1000]
y_test_sample  = y_test[:1000]

rf = RandomForestRegressor(
    n_estimators=20,   # fewer trees
    max_depth=10,      # shallower trees
    random_state=42,
    n_jobs=-1          # use all cores
)

rf.fit(X_train_sample, y_train_sample)

y_pred_rf = rf.predict(X_test_sample)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

r2_rf = r2_score(y_test_sample, y_pred_rf)
mae_rf = mean_absolute_error(y_test_sample, y_pred_rf)

# Option 1: for older sklearn versions
rmse_rf = np.sqrt(mean_squared_error(y_test_sample, y_pred_rf))

# Option 2: if your sklearn >=0.24, you could do:
# rmse_rf = mean_squared_error(y_test_sample, y_pred_rf, squared=False)

print(f"R2: {r2_rf:.4f}, MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}")



R2: 0.7515, MAE: 111725.90, RMSE: 210200.99


In [30]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_processed, y_train)
y_pred_rf = rf.predict(X_test_processed)

r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("Random Forest v1")
print("R²:", r2_rf, "MAE:", mae_rf, "RMSE:", rmse_rf)

joblib.dump(rf, "models/rf_v1.joblib")


Random Forest v1
R²: 0.7685385723927256 MAE: 78082.63112236791 RMSE: 215084.19520276738


['models/rf_v1.joblib']

In [29]:
xgb_model = xgb.XGBRegressor(random_state=42, use_label_encoder=False, eval_metric="rmse")
xgb_model.fit(X_train_processed, y_train)
y_pred_xgb = xgb_model.predict(X_test_processed)

r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print("XGBoost v1")
print("R²:", r2_xgb, "MAE:", mae_xgb, "RMSE:", rmse_xgb)

joblib.dump(xgb_model, "models/xgb_v1.joblib")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost v1
R²: 0.7497943760196182 MAE: 86395.88540179384 RMSE: 223623.6432660779


['models/xgb_v1.joblib']

In [31]:
import pandas as pd

results = pd.DataFrame({
    "Model": ["Linear Regression v2", "Random Forest v1", "XGBoost v1"],
    "R2": [r2_lin, r2_rf, r2_xgb],
    "MAE": [mae_lin, mae_rf, mae_xgb],
    "RMSE": [rmse_lin, rmse_rf, rmse_xgb]
})

results


Unnamed: 0,Model,R2,MAE,RMSE
0,Linear Regression v2,0.378662,154718.799617,352397.630053
1,Random Forest v1,0.768539,78082.631122,215084.195203
2,XGBoost v1,0.749794,86395.885402,223623.643266
