In [4]:
# === STEP 1: IMPORT LIBRARIES ===
# These are essential libraries for data manipulation, plotting and modeling
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Environment check: report which interpreter and which pandas release
# this notebook is actually running on.
import sys

import pandas as pd

print(sys.executable)
print(pd.__version__)

# === STEP 2: LOAD CLEANED DATA ===
# This dataset was cleaned and saved by clean.py. No further cleaning needed here.
# The path is relative, so it is resolved against the current working directory.
file_path = 'data/cleaned/immoweb-dataset_cleaned_mvg.csv'
if not os.path.isfile(file_path):
    # Fail early and report the directory that was actually searched; a bare
    # pandas traceback does not show where the relative path was resolved.
    raise FileNotFoundError(
        f"Cleaned dataset not found at '{file_path}' "
        f"(current working directory: {os.getcwd()}). "
        "Start the notebook from the directory that contains the 'data/' folder."
    )

# Read the CSV once. The overview code below uses `data`, while the modelling
# and plotting code uses `df`; both names are bound to the same DataFrame.
data = pd.read_csv(file_path)
df = data
print(f"✅ Data loaded. Rows: {len(df)}, Columns: {df.shape[1]}")

# Data Overview
# Prints the shape, dtypes, missing-value counts, duplicate count, the
# DataFrame.info() report and the first rows of `data`.
n_rows, n_cols = data.shape
print("Data Overview:")
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")
print("\nData types:")
print(data.dtypes)

# Check for missing values (count of nulls per column)
missing_values = data.isnull().sum()
print("\nMissing Values:")
print(missing_values)

# Check for duplicates (number of fully duplicated rows)
duplicates = data.duplicated().sum()
print(f"\nDuplicates: {duplicates}")

# Display basic info about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
print("\nBasic Information:")
data.info()

# Show the first few rows of the dataset to inspect the data
print("\nFirst few rows of the dataset:")
print(data.head())

# Count the number of "Unknown" values in the 'region' column.
# The column may be absent (e.g. when the region has been one-hot encoded into
# region_* columns), so only count when it is actually present.
if 'region' in data.columns:
    unknown_count = int((data['region'] == 'Unknown').sum())
    print(f"Number of 'Unknown' values in the 'Region' column: {unknown_count}")
else:
    unknown_count = 0
    print("No 'region' column found; skipping the 'Unknown' region count.")


# === STEP 3: TRAIN-TEST SPLIT ===
# We split the data into training and test sets so that the model can learn on one part
# and be evaluated on unseen data.
target_col = "price"

# Columns that must not be used as features because they encode the target.
# NOTE(review): price_per_m2 is presumably derived from price (price / surface);
# keeping it would leak the target into the features -- confirm against clean.py.
leakage_cols = ["price_per_m2"]

X = df.drop(columns=target_col).drop(columns=leakage_cols, errors="ignore")
y = df[target_col]

# RandomForestRegressor needs numeric input. If any text/categorical columns
# are still present (e.g. a raw 'region' column), one-hot encode them; when
# every column is already numeric this step does nothing.
categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)
if categorical_cols:
    X = pd.get_dummies(X, columns=categorical_cols)

# 80/20 split with a fixed seed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"✅ Training set: {X_train.shape}, Test set: {X_test.shape}")

# === STEP 4: RANDOM FOREST MODEL ===
# Fit a 100-tree random forest regressor on the training split.
# The fixed random_state keeps the fitted model reproducible between runs.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)
print("🌲 Random Forest model trained.")

# === STEP 5: FEATURE IMPORTANCE ===
# Horizontal bar chart of the ten features with the highest importance
# according to the fitted forest.
feature_scores = pd.Series(model.feature_importances_, index=X.columns)
importances = feature_scores.sort_values(ascending=False)
top_ten = importances.head(10)

plt.figure(figsize=(10, 6))
top_ten.plot.barh(color='teal')
plt.title("Top 10 Feature Importances")
# barh draws the first row at the bottom; flip the axis so the most
# important feature ends up at the top of the chart.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# === STEP 6: MODEL PERFORMANCE ===
# Score the model on the held-out test split: MAE, RMSE (square root of the
# mean squared error) and R².
y_pred = model.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(
    "📊 Model Performance:",
    f"MAE : {mae:,.0f} €",
    f"RMSE: {rmse:,.0f} €",
    f"R²  : {r2:.3f}",
    sep="\n",
)

# === STEP 7: PREDICTED VS ACTUAL ===
# Scatter of test-set predictions against the true prices. The dashed red
# diagonal marks perfect predictions and spans the full price range of `y`.
price_span = [y.min(), y.max()]

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot(price_span, price_span, 'r--')
plt.title("Predicted vs Actual Prices")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.tight_layout()
plt.show()

# === STEP 8: MISSING VALUES PLOT ===
# Bar chart of the percentage of missing values in each column of `df`.
missing = df.isna().mean().mul(100)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=missing.index, y=missing.values, palette="Blues_r")
# Rotate the column names so long labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, ha='right')
plt.ylabel('Missing Values (%)')
plt.title('Missing Values per Column')
plt.tight_layout()
plt.show()

# === STEP 9: PROPERTY TYPE DISTRIBUTION ===
# Pie chart of how many listings fall into each value of the 'type' column.
if 'type' in df.columns:
    counts = df['type'].value_counts().sort_index()
    # Take the slice labels from the data itself. A hard-coded label list
    # silently mislabels slices if the sort order differs from the assumed
    # one, and plt.pie raises when the number of labels does not match the
    # number of categories.
    # NOTE(review): if 'type' is stored as numeric codes, map the codes to
    # names here once the encoding used by clean.py is confirmed.
    labels = [str(value).title() for value in counts.index]
    plt.figure()
    plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.title('Property Type Distribution')
    plt.tight_layout()
    plt.show()

# === STEP 10: CORRELATION HEATMAP ===
# Annotated heatmap of the pairwise correlations between price and a
# hand-picked set of features, on a fixed -1..1 colour scale.
top_corr_cols = [
    'price',
    'habitableSurface',
    'bedroomCount',
    'bathroomCount',
    'buildingCondition',
    'epcScore',
]
plt.figure(figsize=(8, 6))
corr_matrix = df.loc[:, top_corr_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Key Features')
plt.tight_layout()
plt.show()

# === STEP 11: SCATTER PLOTS VS PRICE ===
# One figure per feature: scatter of the feature against price with a red
# fitted regression line on top.
top_vars = ['habitableSurface', 'bedroomCount', 'bathroomCount']
point_style = {'s': 10}
line_style = {'color': 'red'}
for var in top_vars:
    plt.figure(figsize=(8, 5))
    sns.regplot(data=df, x=var, y='price', scatter_kws=point_style, line_kws=line_style)
    plt.title(f'Price vs {var}')
    plt.xlabel(var)
    plt.ylabel('Price')
    plt.tight_layout()
    plt.show()

# === STEP 12: HISTOGRAM OF SURFACE AREAS ===
# Histogram (40 bins, with KDE overlay) of habitable surface, restricted to
# properties below 800 m² so that extreme values do not stretch the axis.
surface = df['habitableSurface']
surface_below_800 = surface[surface < 800]

plt.figure(figsize=(10, 5))
sns.histplot(surface_below_800, bins=40, kde=True, color='green')
plt.title('Distribution of Habitable Surface (<800 m²)')
plt.xlabel('Habitable Surface (m²)')
plt.ylabel('Number of Properties')
plt.tight_layout()
plt.show()

# === STEP 13: PRICE PER M² BY REGION AND TYPE ===
# Grouped bar chart of the median price_per_m2 for every (region, type) pair,
# computed on listings priced strictly between 100,000 and 1,000,000.
if 'region_Flanders' in df.columns or 'region' in df.columns:
    df_copy = df.copy()
    if 'region' not in df_copy.columns:
        def region_flag(column):
            """Return a boolean array that is True where dummy `column` equals 1.

            A dummy column that is missing from the frame counts as never set.
            """
            if column in df_copy.columns:
                return (df_copy[column] == 1).to_numpy(dtype=bool)
            return np.zeros(len(df_copy), dtype=bool)

        # Rebuild a single 'region' label from the one-hot columns.
        # np.select takes the first matching condition, so Flanders is checked
        # before Wallonia; rows matching neither are labelled Brussels.
        df_copy['region'] = np.select(
            [region_flag('region_Flanders'), region_flag('region_Wallonia')],
            ['Flanders', 'Wallonia'],
            default='Brussels',
        )

    # Keep only listings inside the open price interval (both bounds excluded).
    in_price_range = df_copy['price'].between(100000, 1000000, inclusive='neither')
    df_filtered = df_copy[in_price_range]
    median_prices = df_filtered.groupby(['region', 'type'])['price_per_m2'].median().reset_index()

    plot = sns.catplot(x="region", y="price_per_m2", hue="type", kind="bar", data=median_prices, palette="Set2")
    plt.title("Median Price per m² by Region and Property Type")

    # Only the legend title is customised. The entries keep the actual 'type'
    # values: renaming them by position assumes a fixed hue order and raises
    # IndexError when fewer than two types survive the price filter.
    legend = plot.legend
    if legend is not None:
        legend.set_title("Property Type")
    plt.tight_layout()
    plt.show()


/Users/Marc/Documents/GitHub/immo-eliza-lions/.venv/bin/python
2.3.0


FileNotFoundError: [Errno 2] No such file or directory: 'data/cleaned/immoweb-dataset_cleaned_mvg.csv'