In [4]:
# === IMPORT LIBRARIES ===
# These are essential libraries for data manipulation, plotting and modeling
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
#from geopy.geocoders import Nominatim
#from geopy.extra.rate_limiter import RateLimiter
#from tqdm import tqdm
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#First look after first cleaning
import sys
print(sys.executable)

import pandas as pd
print(pd.__version__)

# === LOAD CLEANED DATA ===
# This dataset was cleaned and saved by clean.py. No further cleaning needed here.
file_path = 'data/cleaned/immoweb-dataset_cleaned.csv'
data = pd.read_csv(file_path)
print(f"✅ Data loaded. Rows: {len(data)}, Columns: {data.shape[1]}")

# Folder to save figures/plots
os.makedirs("figures", exist_ok=True)

# === Get to know and visualize our data ===
# Data Overview
print("Data Overview:")
print(f"Rows: {data.shape[0]}")
print(f"Columns: {data.shape[1]}")
print("\nData types:")
print(data.dtypes)

# Check for missing values
missing_values = data.isnull().sum()
print("\nMissing Values:")
print(missing_values)

# Check for duplicates
duplicates = data.duplicated().sum()
print(f"\nDuplicates: {duplicates}")

# Display basic info about the dataset
print("\nBasic Information:")
print(data.info())

# Show the first few rows of the dataset to inspect the data
print("\nFirst few rows of the dataset:")
print(data.head())

# Count the number of "Unknown" values in the 'Region' column
unknown_count = data[data['region'] == 'Unknown'].shape[0]
print(f"Number of 'Unknown' values in the 'Region' column: {unknown_count}")

# === EXPLORATORY ANALYSIS: NEW VISUALIZATIONS ===

# 1. Distribution of Property Prices (below €1.5M)
# This helps identify the shape of the price distribution and presence of outliers
plt.figure(figsize=(10, 6))
sns.histplot(data[data['price'] < 1_500_000]['price'], bins=50, kde=True, color='skyblue')
plt.title("Distribution of Property Prices (< €1.5M)")
plt.xlabel("Price (€)")
plt.ylabel("Number of Properties")
plt.tight_layout()
plt.savefig("figures/01_price_distribution_mvg.png")
plt.show()

# 2. Boxplot of Price per m² by Region
# Boxplots are great to compare central tendency and spread across categories
plt.figure(figsize=(8, 6))
sns.boxplot(data=data, x='region', y='price_per_m2', palette='pastel')
plt.title("Price per m² by Region")
plt.xlabel("Region")
plt.ylabel("Price per m² (€)")
plt.tight_layout()
plt.savefig("figures/02_price_per_m2_by_region_mvg.png")
plt.show()

# 3. Heatmap of Correlations Between Numeric Features
# Helps reveal which variables have a strong relationship, useful for feature selection
plt.figure(figsize=(10, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Heatmap of Numeric Variables")
plt.tight_layout()
plt.savefig("figures/03_correlation_heatmap_mvg.png")
plt.show()

# 4. Price vs. Surface Area by Region
# A colored scatterplot shows if the size–price relationship differs per region
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='habitableSurface', y='price', hue='region', alpha=0.6)
plt.title("Price vs. Habitable Surface Area by Region")
plt.xlabel("Habitable Surface (m²)")
plt.ylabel("Price (€)")
plt.tight_layout()
plt.savefig("figures/04_price_per_m2_by_subtype_mvg.png")
plt.show()

# 5. Average Price per m² by Property Subtype
# Shows which property subtypes tend to be more expensive per m²
avg_price_m2 = data.groupby("subtype")["price_per_m2"].mean().sort_values(ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x=avg_price_m2.index, y=avg_price_m2.values, palette="viridis")
plt.xticks(rotation=90)
plt.title("Average Price per m² by Property Subtype")
plt.ylabel("Price per m² (€)")
plt.xlabel("Property Subtype")
plt.tight_layout()
plt.savefig("figures/05_Averagepricem2_distribution_by_subtype_mvg.png")
plt.show()


# 6. Distribution of Houses and Apartments
# Purpose: Understand the relative number of listings per property type
# 'countplot' shows how many entries exist for each category in 'type'
# Insight: Helps assess class imbalance, useful for modeling or pricing trends.
plt.figure(figsize=(6, 4))
sns.countplot(data=data, x='type', palette='Set2')  # Set2 gives pastel colors
plt.title('Distribution: House vs Apartment')
plt.xticks(ticks=[0, 1], labels=['House', 'Apartment'])  # Map encoded type values to labels
plt.xlabel('Property Type')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig("figures/06_Distribution_House_vs_Apartment_mvg.png")
plt.show()


# 7. Surface Area Distribution by Property Type
# Purpose: Compare typical surface areas for houses vs apartments
# 'histplot' visualizes distributions with optional KDE (smooth curve)
# Insight: Confirms that houses generally have larger surfaces; KDE helps spot multimodal patterns.
fig, axs = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

sns.histplot(data[data['type'] == 0]['habitableSurface'], bins=40, kde=True, ax=axs[0], color='salmon')
axs[0].set_title('Surface Area (House)')
axs[0].set_xlabel('m²')

sns.histplot(data[data['type'] == 1]['habitableSurface'], bins=40, kde=True, ax=axs[1], color='skyblue')
axs[1].set_title('Surface Area (Apartment)')
axs[1].set_xlabel('m²')

plt.tight_layout()
plt.savefig("figures/07_Compare_surface_house_vs_apartment_mvg.png")
plt.show()


# 8. Correlation Heatmap (with Price)
# Purpose: Understand how numeric variables relate to price
# 'heatmap' visualizes correlations between -1 and 1; stronger colors = stronger relationships
# Insight: Reveals strongest predictors of price; useful for feature selection.
# 'annot=True' shows numeric values; 'coolwarm' color scale distinguishes + vs - correlation
selected_cols = ['price', 'bedroomCount', 'bathroomCount', 'habitableSurface', 'price_per_m2', 'buildingCondition', 'epcScore']
corr = data[selected_cols].corr()
# CORRELATION FOR HOUSES
house_data = data[data['type'] == 0]  # Filter for houses
corr_house = house_data[selected_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_house, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation with Price – Houses')
plt.tight_layout()
plt.savefig("figures/81_corr_house_price_vertical_mvg.png")
plt.show()
# CORRELATION FOR APARTMENTS
apartment_data = data[data['type'] == 1]  # Filter for apartments
corr_apartment = apartment_data[selected_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_apartment, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation with Price – Apartments')
plt.tight_layout()
plt.savefig("figures/82_corr_apartment_price_vertical_mvg.png")
plt.show()


# 9. Bedrooms vs Province by Property Type
# Purpose: Show how number of bedrooms varies by province and type
# 'boxplot' shows distribution (median, quartiles, outliers) for each group
# Insight: Detects regional trends in bedroom size; useful for regional modeling or segmentation
plt.figure(figsize=(14, 6))
sns.boxplot(data=data, x='province', y='bedroomCount', hue='type', palette='Set2')
plt.title('Bedrooms vs Province by Property Type')
plt.xticks(rotation=45)  # Rotate province names for readability
plt.xlabel('Province')
plt.ylabel('Number of Bedrooms')
plt.legend(title='Property Type', labels=['House', 'Apartment'])
plt.tight_layout()
plt.show()


# 10. Mean EPC Score by Province and Property Type
# Purpose: Assess energy efficiency patterns across regions and property types
# Higher EPC scores (closer to 9) mean more energy-efficient properties
# Insight: Useful for evaluating where more energy-efficient housing exists.
# May reflect newer building stock, renovations, or stricter local regulations.
epc_grouped = data.groupby(['province', 'type'])['epcScore'].mean().reset_index()
epc_grouped['type'] = epc_grouped['type'].map({0: 'House', 1: 'Apartment'})

plt.figure(figsize=(14, 6))
sns.barplot(data=epc_grouped, x='province', y='epcScore', hue='type', palette='pastel')
plt.title('Mean EPC Score by Province and Property Type')
plt.xticks(rotation=45)
plt.ylabel('Mean EPC Score (9=A++, ..., 1=G)')
plt.tight_layout()
plt.savefig("figures/10_Mean_EPC_by_Province_Property_type_mvg.png")
plt.show()

# 11. Price vs Building Condition
# You can clearly see that properties in better condition (JUST RENOVATED or AS NEW) command higher median prices and narrower spreads.
# The spread of prices increases as condition worsens (e.g. TO RESTORE).
# Outliers are visible as dots outside the whiskers.

# Map numerical buildingCondition values back to readable labels for better plot readability
# These labels reflect the original condition categories in the raw dataset
condition_labels = {
    6: "JUST RENOVATED", 
    5: "AS NEW", 
    4: "GOOD",
    3: "TO BE DONE UP", 
    2: "TO RENOVATE", 
    1: "TO RESTORE"
}
data['buildingConditionLabel'] = data['buildingCondition'].map(condition_labels)

# 📦 Create a boxplot showing the distribution of price for each building condition
plt.figure(figsize=(12, 6))  # Sets the size of the plot in inches (width, height)

sns.boxplot(
    x='buildingConditionLabel',  # Categorical variable on the x-axis
    y='price',                   # Numerical variable on the y-axis
    data=data,                   # Data source (our cleaned dataframe)
    palette='viridis'           # Color palette for consistent styling
)

plt.xticks(rotation=45)  # Rotates x-axis labels for better readability
plt.title("Comparison of House and Apartment Prices by Building Condition in Belgium")
plt.xlabel("State of the Building")
plt.ylabel("Price (€)")
plt.tight_layout()  # Automatically adjusts spacing to prevent label cut-off
plt.savefig("figures/11_House_Apart_ Prices_byBuildingCondition_Belgium_mvg.png")
plt.show()


# 12. Heatmap Price and Price per sqm for House/apartment
# Ensure 'type' column exists and is numeric:
# 0 = House, 1 = Apartment (already mapped in cleaning step)
# Step 1: Split the dataset
df_house = data[data['type'] == 0].copy()       # 0 = House
df_apartment = data[data['type'] == 1].copy()   # 1 = Apartment

# 🧮 Step 2: Compute correlation matrices
corr_house = df_house.corr(numeric_only=True)
corr_apartment = df_apartment.corr(numeric_only=True)

# 🎯 Step 3: Extract correlation rows for 'price' and 'price_per_m2' only
# This selects how strongly each numeric feature correlates with our targets
price_corr_house = corr_house.loc[["price"]].T
price_corr_apartment = corr_apartment.loc[["price"]].T
sqm_corr_house = corr_house.loc[["price_per_m2"]].T
sqm_corr_apartment = corr_apartment.loc[["price_per_m2"]].T

# 🎨 Step 4: Plot each one as a small vertical heatmap

# 1. Price correlations (Houses)
plt.figure(figsize=(6, 8))
sns.heatmap(price_corr_house, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5)
plt.title("🏠 Correlation with Price (Houses)")
plt.tight_layout()
plt.savefig("figures/12_1_corr_house_price_vertical_mvg.png")
plt.show()

# 2. Price correlations (Apartments)
plt.figure(figsize=(6, 8))
sns.heatmap(price_corr_apartment, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5)
plt.title("🏢 Correlation with Price (Apartments)")
plt.tight_layout()
plt.savefig("figures/12_2_corr_apartment_price_vertical_mvg.png")
plt.show()

# 3. Price per m² correlations (Houses)
plt.figure(figsize=(6, 8))
sns.heatmap(sqm_corr_house, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5)
plt.title("🏠 Correlation with Price/m² (Houses)")
plt.tight_layout()
plt.savefig("figures/12_3_corr_house_priceperm2_vertical_mvg.png")
plt.show()

# 4. Price per m² correlations (Apartments)
plt.figure(figsize=(6, 8))
sns.heatmap(sqm_corr_apartment, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5)
plt.title("🏢 Correlation with Price/m² (Apartments)")
plt.tight_layout()
plt.savefig("figures/12_4_corr_apartment_priceperm2_vertical_mvg.png")
plt.show()

# Plot horizontal and sorted ? 
import seaborn as sns
import matplotlib.pyplot as plt

# 🏗️ STEP 1: Split the dataset into houses and apartments
# This allows us to compare the correlation of features separately for each type
df_house = data[data['type'] == 0].copy()       # 0 = House
df_apartment = data[data['type'] == 1].copy()   # 1 = Apartment

# 🧮 STEP 2: Compute correlation matrices (only for numeric columns)
corr_house = df_house.corr(numeric_only=True)
corr_apartment = df_apartment.corr(numeric_only=True)

# 🎯 STEP 3: Extract and sort correlation values with 'price' and 'price_per_m2'
# We drop the self-correlation (with itself = 1.0) for a more meaningful ranking

# --- House correlations ---
price_corr_house = corr_house["price"].drop("price").sort_values(ascending=False)
sqm_corr_house = corr_house["price_per_m2"].drop("price_per_m2").sort_values(ascending=False)

# --- Apartment correlations ---
price_corr_apartment = corr_apartment["price"].drop("price").sort_values(ascending=False)
sqm_corr_apartment = corr_apartment["price_per_m2"].drop("price_per_m2").sort_values(ascending=False)

# 🎨 STEP 4: Plot all 4 as horizontal bar-style heatmaps

# 🏠 1. Price correlations (Houses)
plt.figure(figsize=(10, 4))
sns.heatmap(price_corr_house.to_frame().T, annot=True, cmap='coolwarm',
            vmin=-1, vmax=1, cbar=True, linewidths=0.5)
plt.title("🏠 Correlation with Price (Houses) — Sorted")
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig("figures/12_5_corr_house_price_sorted_mvg.png")
plt.show()

# 🏢 2. Price correlations (Apartments)
plt.figure(figsize=(10, 4))
sns.heatmap(price_corr_apartment.to_frame().T, annot=True, cmap='coolwarm',
            vmin=-1, vmax=1, cbar=True, linewidths=0.5)
plt.title("🏢 Correlation with Price (Apartments) — Sorted")
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig("figures/12_6_corr_apartment_price_sorted_mvg.png")
plt.show()

# 🏠 3. Price/m² correlations (Houses)
plt.figure(figsize=(10, 4))
sns.heatmap(sqm_corr_house.to_frame().T, annot=True, cmap='coolwarm',
            vmin=-1, vmax=1, cbar=True, linewidths=0.5)
plt.title("🏠 Correlation with Price per m² (Houses) — Sorted")
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig("figures/12_7_corr_house_priceperm2_sorted_mvg.png")
plt.show()

# 🏢 4. Price/m² correlations (Apartments)
plt.figure(figsize=(10, 4))
sns.heatmap(sqm_corr_apartment.to_frame().T, annot=True, cmap='coolwarm',
            vmin=-1, vmax=1, cbar=True, linewidths=0.5)
plt.title("🏢 Correlation with Price per m² (Apartments) — Sorted")
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig("figures/12_8_corr_apartment_priceperm2_sorted_mvg.png")
plt.show()



"**********************"
'''numeric_cols = data.select_dtypes("number").columns
inlier_mask = pd.Series(True, index=data.index)
for col in numeric_cols:
    if data[col].dtype == "float64" or data[col].dtype == "int64": # Check for any numeric dtype
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        iqr = q3 - q1
        upper_fence = q3 + 1.5 * iqr
        lower_fence = q1 - 1.5 * iqr
        col_inlier_mask = (data[col] >= lower_fence) & (data[col] <= upper_fence)

        inlier_mask = inlier_mask & col_inlier_mask
    else:
        print(f"Skipping non-numeric column: {col}")
df_cleaned = data[inlier_mask].copy()
print(df_cleaned.shape)
sns.kdeplot(data=df_cleaned, x="price", hue="province",  multiple="stack")
plt.title(f"Distribution of price by Province")
plt.show()
sns.kdeplot(data=df_cleaned, x="habitableSurface", hue="province",  multiple="stack")
plt.title(f"Distribution of price by Province")
plt.show()
sns.kdeplot(data=df_cleaned, x="price", hue="bedroomCount",  multiple="stack")
plt.title(f"Distribution of price by Province")
plt.show()'''

c:\Users\preet\Anaconda\anaconda3\python.exe
2.2.3


FileNotFoundError: [Errno 2] No such file or directory: 'data/cleaned/immoweb-dataset_cleaned.csv'