## Dutch Housing Market Price Prediction - Data Preprocessing 

Imports

In [21]:
# Import required libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

Data Load

In [22]:
# Output locations (relative to the notebook) for processed splits and results.
preprocessed_dir = '../preprocessed_data'
results_dir = '../results'

# exist_ok=True keeps this cell re-runnable when the folders already exist.
os.makedirs(preprocessed_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# Read the raw listings and report the frame's dimensions.
df_preprocessing = pd.read_csv('../data/raw_data.csv')
n_rows, n_cols = df_preprocessing.shape
print(f"Loaded dataset with {n_rows} houses and {n_cols} features")

# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(42)

Loaded dataset with 5555 houses and 16 features


Cleaning

In [23]:
def clean_price(price):
    """Convert a Dutch-formatted price to a float.

    In Dutch notation '.' groups thousands and ',' marks decimals, so
    '€ 525.000' -> 525000.0 and '€ 1.250.000,50' -> 1250000.5.

    Args:
        price: Raw cell value. Strings are parsed. Ints and floats
            (including NaN) are returned as float unchanged, so applying
            the cleaner to an already-numeric column is a no-op instead of
            turning every value into NaN.

    Returns:
        The price as a float, or np.nan when the value is missing or is
        not a number (e.g. free text in the price field).
    """
    # bool is a subclass of int but is never a meaningful price.
    if isinstance(price, bool):
        return np.nan
    if isinstance(price, (int, float, np.integer, np.floating)):
        return float(price)
    if not isinstance(price, str):
        return np.nan

    # Drop the currency sign and all whitespace, including non-breaking spaces.
    text = ''.join(price.replace('€', '').split())
    # '.' is always a thousands separator here; ',' (if present) is the decimal mark.
    text = text.replace('.', '').replace(',', '.')
    try:
        return float(text)
    except ValueError:
        # Not a number after normalisation: treat as missing.
        return np.nan

def clean_size(size):
    """Convert a size with an optional 'm²' unit to a float.

    Examples: '251 m²' -> 251.0, '1.250 m²' -> 1250.0, '85,5 m²' -> 85.5.

    Args:
        size: Raw cell value. Strings are parsed. Ints and floats
            (including NaN) are returned as float unchanged, so applying
            the cleaner to an already-numeric column is a no-op instead of
            turning every value into NaN.

    Returns:
        The size as a float, or np.nan when the value is missing or is
        not a number.
    """
    # bool is a subclass of int but is never a meaningful size.
    if isinstance(size, bool):
        return np.nan
    if isinstance(size, (int, float, np.integer, np.floating)):
        return float(size)
    if not isinstance(size, str):
        return np.nan

    # Drop the unit and all whitespace, including non-breaking spaces.
    text = ''.join(size.replace('m²', '').split())
    if ',' in text:
        # Dutch decimal comma: any '.' is then a thousands separator.
        text = text.replace('.', '').replace(',', '.')
    else:
        groups = text.split('.')
        # '1.250' or '12.500.000': a 1-3 digit lead followed by groups of
        # exactly three digits is read as Dutch thousands grouping. Anything
        # else (e.g. '85.5') is left alone and parsed as a plain decimal.
        looks_grouped = (
            len(groups) > 1
            and groups[0].isdecimal()
            and len(groups[0]) <= 3
            and not groups[0].startswith('0')
            and all(g.isdecimal() and len(g) == 3 for g in groups[1:])
        )
        if looks_grouped:
            text = ''.join(groups)
    try:
        return float(text)
    except ValueError:
        # Not a number after normalisation: treat as missing.
        return np.nan

# Apply the cleaners to the raw frame and drop listings without a usable price.
print("Starting feature cleaning...")
initial_count = len(df_preprocessing)  # row count before anything is dropped

# Parse the text-formatted numeric columns; unparseable entries become NaN.
df_preprocessing['Price'] = df_preprocessing['Price'].apply(clean_price)
for size_column in ('Living space size (m2)', 'Lot size (m2)'):
    df_preprocessing[size_column] = df_preprocessing[size_column].apply(clean_size)
df_preprocessing['Build year'] = pd.to_numeric(
    df_preprocessing['Build year'], errors='coerce'
)

# Keep only the part of the house type before the first comma;
# non-string entries (e.g. NaN) pass through untouched.
df_preprocessing['House type'] = df_preprocessing['House type'].apply(
    lambda value: value if not isinstance(value, str) else value.split(',', 1)[0]
)

# Price is the prediction target, so rows without one cannot be used.
df_preprocessing = df_preprocessing.dropna(subset=['Price'])
rows_removed = initial_count - len(df_preprocessing)

print(f"Cleaned data: removed {rows_removed} rows with invalid prices")

Starting feature cleaning...
Cleaned data: removed 13 rows with invalid prices


Feature selection, imputation and encoding

In [None]:
# Select relevant features for house price prediction
selected_features = [
    'Price',                    # Target variable
    'Living space size (m2)',   # Key numeric features
    'Lot size (m2)',
    'Build year',
    'House type',               # Important categorical features
    'City',
    'Energy label'
]

# Work on an explicit copy so the column assignments below never touch
# df_preprocessing.
df_selected = df_preprocessing[selected_features].copy()

# Column groups used for imputation here and for scaling further down.
numeric_features = ['Living space size (m2)', 'Lot size (m2)', 'Build year']
categorical_features = ['House type', 'City', 'Energy label']

# NOTE(review): the medians, modes and label encoders below are computed on the
# full dataset, i.e. before the train/validation/test split, so they also see
# validation and test rows. Consider fitting them on the training split only.

# Fill missing numeric values with the column median.
for feature in numeric_features:
    # Count before filling; counting afterwards would always report 0.
    n_missing = int(df_selected[feature].isna().sum())
    median_value = df_selected[feature].median()
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary under pandas
    # Copy-on-Write and would leave df_selected unchanged.
    df_selected[feature] = df_selected[feature].fillna(median_value)
    print(f"Filled {n_missing} missing values in {feature}")

# Fill missing categorical values with the most frequent value.
for feature in categorical_features:
    n_missing = int(df_selected[feature].isna().sum())
    mode_value = df_selected[feature].mode()[0]
    df_selected[feature] = df_selected[feature].fillna(mode_value)
    print(f"Filled {n_missing} missing values in {feature}")

# Encode categorical variables
print("\nEncoding categorical variables...")
# One-hot encode house types (since types aren't ordinal); drop_first removes
# one redundant indicator column.
df_selected = pd.get_dummies(df_selected, columns=['House type'], drop_first=True)

# Label encode city and energy label; the fitted encoders are kept so the
# integer codes can be mapped back to the original labels later.
label_encoders = {}
for feature in ['City', 'Energy label']:
    label_encoders[feature] = LabelEncoder()
    df_selected[feature] = label_encoders[feature].fit_transform(df_selected[feature])
    print(f"Encoded {feature} into {df_selected[feature].nunique()} unique values")

Train/validation/test split and scaling

In [None]:
# The price column is the target; every remaining column is a model input.
y = df_selected['Price']
X = df_selected.drop(columns='Price')

# Two-stage split: hold out 20% for testing, then take 25% of the remaining
# 80% (= 20% overall) for validation, which leaves 60% for training.
print("\nSplitting data into train, validation, and test sets...")
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to all three splits.
print("Scaling numeric features...")
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])
for split in (X_train, X_val, X_test):
    split[numeric_features] = scaler.transform(split[numeric_features])

# Sanity check on the training split after scaling.
print("\nVerifying scaling results (mean should be ~0, std should be ~1):")
for feature in numeric_features:
    column = X_train[feature]
    mean, std = column.mean(), column.std()
    print(f"{feature}: mean={mean:.3f}, std={std:.3f}")

Save Preprocessed Data

In [20]:
# Save processed data for later use.
# Each split is written to <preprocessed_dir>/<name>.csv. preprocessed_dir is
# defined (and the folder created) in the data-load cell, so the output
# location is configured in one place only.
splits_to_save = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    # index=False: the row index is not written to the CSV.
    split_data.to_csv(os.path.join(preprocessed_dir, f"{split_name}.csv"), index=False)

print("Preprocessing complete!")
print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Preprocessing complete!
Train set size: 3324
Validation set size: 1109
Test set size: 1109
