#### 1. Imports and Set-up


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)
# Use seaborn's "whitegrid" style for the plots in this notebook.
sns.set_style("whitegrid")

# Point MLflow at the tracking location (a relative path) and select the
# experiment that the runs started below are recorded under.
mlflow.set_tracking_uri("../logs/mlruns")
mlflow.set_experiment("house_price_prediction")

In [None]:
import sys
import os

# Make the project root importable so the `src.*` imports below resolve.
# The root is taken to be one level above the current working directory
# (the notebook is expected to run from notebooks/); adjust if needed.
project_root = os.path.abspath("..")

root_already_on_path = project_root in sys.path
if not root_already_on_path:
    # Put it first so project modules win over same-named installed packages.
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df


# Load the raw listings (the glob pattern is handed to the project loader)
# and run the project's preprocessing on them.
df_raw = load_data_from_json("../data/parsed_json/*.json")
df_clean = preprocess_df(df_raw)

# Work on a copy so the edits made below never touch df_clean.
# (A bare `df_clean` expression used to sit here; it displayed nothing because
# only the last expression of a notebook cell is rendered.)
df = df_clean.copy()

In [None]:
# Preview the first rows of the working DataFrame.
df.head()

In [None]:
# List the column names available in the working DataFrame.
df.columns

#### 2. Load and prep data


In [None]:
def clean_year(year):
    """Convert a raw construction-year value into an integer year.

    Accepted inputs:

    * ``"Voor 1906"`` -> ``1905`` (one year before the stated bound)
    * ``"Na 2020"``   -> ``2021`` (one year after the stated bound)
    * ``"1987"``      -> ``1987`` (surrounding whitespace is ignored)
    * ``1987`` / ``1987.0`` -> ``1987``

    Parameters
    ----------
    year : object
        Raw cell value; may be a string, a number, ``None`` or NaN.

    Returns
    -------
    int or None
        The parsed year, or ``None`` when the value is missing, not finite
        (NaN / infinity) or cannot be interpreted as a year.
    """
    import math  # local import keeps this notebook cell self-contained

    if isinstance(year, str):
        text = year.strip()
        # isdecimal() only accepts characters that int() can actually parse.
        if text.isdecimal():
            return int(text)

        if text.startswith("Voor"):  # e.g. "Voor 1906"
            offset = -1
        elif text.startswith("Na"):  # e.g. "Na 2020"
            offset = 1
        else:
            return None  # unrecognised string

        # The bound is the last whitespace-separated token; a non-numeric
        # tail (e.g. a bare "Voor") is treated as missing instead of raising.
        bound = text.split()[-1]
        return int(bound) + offset if bound.isdecimal() else None

    if isinstance(year, (int, float)):
        # NaN and infinities are floats too, but int() raises on them.
        if isinstance(year, float) and not math.isfinite(year):
            return None
        return int(year)

    return None


# Normalise every raw value with clean_year, then fill the remaining gaps
# with the median of the parsed years.
# NOTE(review): the median is taken over the whole dataset, i.e. before the
# train/test split further down.
parsed_years = df["year_of_construction"].apply(clean_year)
df["year_of_construction"] = parsed_years.fillna(parsed_years.median())

In [None]:
numeric_cols = ["bedrooms", "nr_rooms", "bathrooms", "toilets"]

# Force the count columns to numeric dtype; values that cannot be parsed
# become NaN (errors="coerce") instead of raising.
for count_col in numeric_cols:
    df[count_col] = pd.to_numeric(df[count_col], errors="coerce")

In [None]:
# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=["price_num"])

In [None]:
# Feature set used for the linear model, and the column to predict.
target = "price_num"
linear_features = [
    "size_num",
    "bedrooms",
    "year_of_construction",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "contribution_vve_num",
    "external_storage_num",
    "inhabitants_in_neighborhood",
    "families_with_children_pct",
    "price_per_m2_neighborhood",
]

# Turn "N/A" placeholders into real missing values, then zero-fill them.
features_with_nan = df[linear_features].replace("N/A", np.nan)
X = features_with_nan.fillna(0)
y = df[target]

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics on the training split only and reuse them
# for the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### 3. Code for evaluating and logging models

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, metrics=None, fit_params=None):
    """Train ``model`` and score it on both the training and the test split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y, **kwargs)`` and ``predict(X)``.
    X_train, y_train : array-like
        Data the model is fitted on (and also scored on).
    X_test, y_test : array-like
        Held-out data the fitted model is scored on.
    metrics : dict[str, callable], optional
        Maps a metric name to ``func(y_true, y_pred)``. When omitted, RMSE
        (``"rmse"``) and R² (``"r2"``) are computed.
    fit_params : dict, optional
        Extra keyword arguments forwarded to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, results)`` where ``model`` is the same, now fitted, object
        and ``results`` maps ``"train_<name>"`` / ``"test_<name>"`` to the
        metric value for each metric.
    """
    extra_fit_kwargs = {} if fit_params is None else fit_params
    model.fit(X_train, y_train, **extra_fit_kwargs)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    scorers = metrics
    if scorers is None:
        scorers = {
            "rmse": lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)),
            "r2": r2_score,
        }

    results = {}
    for metric_name, scorer in scorers.items():
        # Train score first, then test score, for every metric.
        results.update(
            {
                f"train_{metric_name}": scorer(y_train, train_pred),
                f"test_{metric_name}": scorer(y_test, test_pred),
            }
        )

    return model, results

def log_to_mlflow(model, model_name, results):
    """Record a fitted model, its metrics and its parameters in one MLflow run.

    Parameters
    ----------
    model : object
        Fitted model, logged through ``mlflow.sklearn.log_model``.
    model_name : str
        Used as the run name and as the prefix of the logged model artifact
        (``"<model_name>_model"``).
    results : dict
        Metric name -> value mapping passed to ``mlflow.log_metrics``.

    The model's ``get_params()`` output is logged as run parameters when the
    model offers that method. A one-line summary is printed as well.
    """
    artifact_name = f"{model_name}_model"
    exposes_params = hasattr(model, "get_params")

    with mlflow.start_run(run_name=model_name):
        mlflow.sklearn.log_model(model, artifact_name)
        mlflow.log_metrics(results)
        if exposes_params:
            mlflow.log_params(model.get_params())
        print(f"{model_name} -> {results}")


#### 4. Linear Regression


In [None]:
# Fit a plain linear regression on the standardised features, score it on
# both splits and record the run in MLflow.
lr, lr_results = evaluate_model(
    model=LinearRegression(),
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)
log_to_mlflow(model=lr, model_name="Linear_Regression", results=lr_results)

#### 5. Random Forest Regression


In [None]:
# Full feature set for the random forest: the numeric columns, the energy
# label and the per-facility indicator columns.
all_features = [
    "size_num",
    "bedrooms",
    "energy_label",
    "year_of_construction",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "contribution_vve_num",
    "external_storage_num",
    "has_mechanische_ventilatie",
    "has_tv_kabel",
    "has_lift",
    "has_natuurlijke_ventilatie",
    "has_n/a",
    "has_schuifpui",
    "has_glasvezelkabel",
    "has_frans_balkon",
    "has_buitenzonwering",
    "has_zonnepanelen",
    "has_airconditioning",
    "has_balansventilatie",
    "has_dakraam",
    "has_alarminstallatie",
    "has_domotica",
    "has_rookkanaal",
    "has_elektra",
    "has_sauna",
    "has_zonnecollectoren",
    "has_cctv",
    "has_rolluiken",
    "has_stromend_water",
    "has_satellietschotel",
    "num_facilities",
    "inhabitants_in_neighborhood",
    "families_with_children_pct",
    "price_per_m2_neighborhood",
]

# Label order handed to the encoder: a label's position in this list
# ("G" = 0 ... "A++++" = 10) becomes its encoded value.
energy_order = ["G", "F", "E", "D", "C", "B", "A", "A+", "A++", "A+++", "A++++"]

# Turn "N/A" placeholders into real missing values, then zero-fill them.
X = df[all_features].replace("N/A", np.nan).fillna(0)

# Missing energy labels were zero-filled above; map them to the "G" class so
# every value is one of the categories the encoder knows.
X["energy_label"] = X["energy_label"].replace({0: "G"})

encoder = OrdinalEncoder(categories=[energy_order])
encoded_labels = encoder.fit_transform(X[["energy_label"]]).ravel()

# Swap the text label for its ordinal code (appended as the last column).
X = X.drop(columns=["energy_label"])
X["energy_label_encoded"] = encoded_labels

y = df[target]

# Same 80/20 split settings and seed as used for the linear model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# No scaling step here: this feature set is passed to the model unscaled.

In [None]:
# Sanity check: which ordinal codes actually occur after encoding?
X["energy_label_encoded"].unique()

In [None]:
# Fit a 200-tree forest (depth capped at 10, fixed seed) on the unscaled
# features, score it on both splits and record the run in MLflow.
rf, rf_results = evaluate_model(
    model=RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
log_to_mlflow(model=rf, model_name="Random_Forest_Regression", results=rf_results)

#### 6. Compare models using MLflow


In [None]:
experiment_name = "house_price_prediction"

# get_experiment_by_name returns None when no experiment has that name
# (e.g. wrong tracking URI, or no run logged yet); fail with a clear message
# instead of an AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(
        f"MLflow experiment {experiment_name!r} was not found; "
        "check the tracking URI and run the training cells first."
    )
experiment_id = experiment.experiment_id

# One row per logged run, with metrics/params/tags as columns.
runs_df = mlflow.search_runs(experiment_ids=[experiment_id])


In [None]:
metrics_of_interest = ["metrics.train_rmse", "metrics.test_rmse", "metrics.train_r2", "metrics.test_r2"]
comparison_columns = ["run_id", "tags.mlflow.runName"] + metrics_of_interest

# Select the columns and sort in one expression: sorting the column subset
# in place (inplace=True) triggers pandas' SettingWithCopyWarning, whereas
# sort_values() without inplace returns a fresh, independent DataFrame.
comparison_df = runs_df[comparison_columns].sort_values("metrics.test_r2", ascending=False)
comparison_df


In [None]:
# Sort here as well so this cell gives the right answer regardless of the
# row order comparison_df currently has; the top row is the best run.
best_model = comparison_df.sort_values("metrics.test_r2", ascending=False).iloc[0]
print("Best model based on test R²:")  # fixed mis-encoded "R²" in the message
print(best_model)