#### 1. Imports and Set-up


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Display every column when a DataFrame is rendered, and use seaborn's
# "whitegrid" style for plots.
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

# Point MLflow at the tracking location (a path relative to the working
# directory) and select the experiment that the runs below are logged under.
mlflow.set_tracking_uri("../logs/mlruns")
mlflow.set_experiment("house_price_prediction")

In [None]:
import os
import sys

# Project root = parent of the working directory (the notebook is expected to
# run from notebooks/). It is put on sys.path so the `src` package imports
# below resolve.
project_root = os.path.abspath("..")

if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df


# Load the parsed listing files and run them through the project's
# preprocessing step.
df_raw = load_data_from_json("../data/parsed_json/*.json")
df_clean = preprocess_df(df_raw)

# Work on a copy so df_clean is not modified by the cells below.
df = df_clean.copy()

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# List the column names available in the working frame.
df.columns

#### 2. Load and prep data


In [None]:
def clean_year(year):
    """Normalise a raw construction-year value to an integer year.

    Accepts plain years as well as the range labels used in the listings:

    * ``"Voor 1906"`` -> ``1905`` (one year before the stated bound)
    * ``"Na 2020"`` -> ``2021`` (one year after the stated bound)
    * ``"1987"``, ``1987`` or ``1987.0`` -> ``1987``

    Args:
        year: Raw value; a string, an int/float, or any other object.

    Returns:
        The year as ``int``, or ``None`` when the value cannot be
        interpreted (unrecognised text, a range label without a numeric
        bound, NaN/inf, ``None`` or any other type).
    """
    if isinstance(year, str):
        text = year.strip()
        offset = 0
        # Range labels: keep only the trailing bound and shift it by one year
        # so the result falls inside the described range.
        if text.startswith("Voor"):
            text, offset = text.split()[-1], -1
        elif text.startswith("Na"):
            text, offset = text.split()[-1], 1
        # isdecimal() guarantees int() succeeds; anything else is invalid.
        return int(text) + offset if text.isdecimal() else None

    if isinstance(year, (int, float)):
        try:
            return int(year)
        except (ValueError, OverflowError):
            # NaN and +/-inf have no integer representation.
            return None

    return None


# Parse the raw construction-year values, then fill whatever could not be
# parsed with the median of the parsed years.
parsed_years = df["year_of_construction"].apply(clean_year)
df["year_of_construction"] = parsed_years.fillna(parsed_years.median())

In [None]:
# Coerce the room-count columns to numeric; values that cannot be parsed
# become NaN.
numeric_cols = ["bedrooms", "nr_rooms", "bathrooms", "toilets"]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [None]:
# Keep only the rows that have a price, since price_num is the target.
df = df.dropna(subset=["price_num"])

In [None]:
target = "price_num"

# Columns fed to the linear model.
linear_features = [
    "size_num",
    "bedrooms",
    "year_of_construction",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "contribution_vve_num",
    "external_storage_num",
    "inhabitants_in_neighborhood",
    "families_with_children_pct",
    "price_per_m2_neighborhood",
]

# "N/A" placeholders are turned into NaN, and every missing value is then
# imputed with 0.
features_filled = df[linear_features].replace("N/A", np.nan).fillna(0)
X, y = features_filled, df[target]

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and reused for the test
# split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### 3. Linear Regression


In [None]:
with mlflow.start_run(run_name="Linear_Regression"):
    # Fit the linear model on the scaled training features.
    lr = LinearRegression().fit(X_train_scaled, y_train)

    # Predictions for both splits.
    y_train_pred = lr.predict(X_train_scaled)
    y_test_pred = lr.predict(X_test_scaled)

    # RMSE is the square root of the mean squared error.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(
        f"Linear Regression train RMSE: {train_rmse:.2f}, R2: {train_r2:.3f}"
    )
    print(f"Linear Regression test RMSE: {test_rmse:.2f}, R2: {test_r2:.3f}")

    # Store the fitted model and its metrics on the active MLflow run.
    mlflow.sklearn.log_model(lr, "linear_regression_model")
    for metric_name, metric_value in (
        ("train_rmse", train_rmse),
        ("train_r2", train_r2),
        ("test_rmse", test_rmse),
        ("test_r2", test_r2),
    ):
        mlflow.log_metric(metric_name, metric_value)

#### 4. Random Forest Regression


In [None]:
# Columns fed to the random forest: the numeric columns, the energy label,
# the has_* indicator columns and the neighbourhood columns.
all_features = [
    "size_num",
    "bedrooms",
    "energy_label",
    "year_of_construction",
    "nr_rooms",
    "bathrooms",
    "toilets",
    "contribution_vve_num",
    "external_storage_num",
    "has_mechanische_ventilatie",
    "has_tv_kabel",
    "has_lift",
    "has_natuurlijke_ventilatie",
    "has_n/a",
    "has_schuifpui",
    "has_glasvezelkabel",
    "has_frans_balkon",
    "has_buitenzonwering",
    "has_zonnepanelen",
    "has_airconditioning",
    "has_balansventilatie",
    "has_dakraam",
    "has_alarminstallatie",
    "has_domotica",
    "has_rookkanaal",
    "has_elektra",
    "has_sauna",
    "has_zonnecollectoren",
    "has_cctv",
    "has_rolluiken",
    "has_stromend_water",
    "has_satellietschotel",
    "num_facilities",
    "inhabitants_in_neighborhood",
    "families_with_children_pct",
    "price_per_m2_neighborhood",
]

# Category order handed to the encoder: "G" maps to 0.0, "A++++" to 10.0.
energy_order = [
    "G", "F", "E", "D", "C", "B",
    "A", "A+", "A++", "A+++", "A++++",
]

# "N/A" placeholders become NaN and every missing value is imputed with 0.
X = df[all_features].replace("N/A", np.nan).fillna(0)

# A missing energy label is 0 after the fill above; map it to "G" so the
# column only holds labels the encoder knows.
X["energy_label"] = X["energy_label"].replace({0: "G"})

# Swap the text label for its ordinal code.
encoder = OrdinalEncoder(categories=[energy_order])
label_column = X[["energy_label"]]
X["energy_label_encoded"] = encoder.fit_transform(label_column)
X = X.drop(columns=["energy_label"])

y = df[target]

# Same 80/20 split parameters and seed as used for the linear model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# No scaling step here: the forest is fitted on the unscaled split.

In [None]:
# Show which ordinal codes actually occur in the encoded energy label.
X["energy_label_encoded"].unique()

In [None]:
with mlflow.start_run(run_name="Random_Forest_Regression"):

    # 200 trees capped at depth 10; the fixed seed keeps runs reproducible.
    rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    rf.fit(X_train, y_train)

    y_train_pred = rf.predict(X_train)
    y_test_pred = rf.predict(X_test)

    # RMSE is the square root of the mean squared error.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_r2 = r2_score(y_train, y_train_pred)

    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)

    # Label the output with the model that is actually being evaluated here
    # (these lines previously said "Linear Regression").
    print(f"Random Forest train RMSE: {train_rmse:.2f}, R2: {train_r2:.3f}")
    print(f"Random Forest test RMSE: {test_rmse:.2f}, R2: {test_r2:.3f}")

    # Log to MLflow
    mlflow.sklearn.log_model(rf, "random_forest_regression_model")
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("test_r2", test_r2)

#### 5. MLflow UI Reminder
