In [None]:
import pandas as pd
import json
import glob

In [None]:
import re

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import os
import sys

# Make the project root importable. The root is taken to be the parent of the
# current working directory (i.e. the notebook is run from notebooks/).
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Project-local helpers; these imports only resolve after the sys.path tweak above.
from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df

# Load the parsed listings, run the project's preprocessing, and continue on an
# independent copy so the exploration below never alters df_clean.
df_raw = load_data_from_json("../data/parsed_json/*.json")
df_clean = preprocess_df(df_raw)
df = df_clean.copy()

In [None]:
# Show the working DataFrame as this cell's output.
df

In [None]:
# Print the column list with dtypes and non-null counts for the working frame.
df.info()

In [None]:
# Turn the literal "N/A" placeholder into pandas' missing-value marker so that
# isna()/dropna() in later cells treat those cells as missing.
df = df.replace(to_replace="N/A", value=pd.NA)

In [None]:
# Raw column -> numeric companion column. Values that cannot be parsed as a
# number become NaN (errors="coerce") instead of raising.
numeric_targets = {
    "bedrooms": "bedrooms_num",
    "nr_rooms": "nr_rooms_num",
    "bathrooms": "bathrooms_num",
    "year_of_construction": "year_of_construction_num",
}
for raw_column, numeric_column in numeric_targets.items():
    df[numeric_column] = pd.to_numeric(df[raw_column], errors="coerce")

# Summary statistics for the numeric features. "price_num" and "size_num" are
# not created in this cell; they must already exist on df.
summary_columns = ["price_num", "size_num", *numeric_targets.values()]
df[summary_columns].describe()

In [None]:
# Univariate views of the main numeric features. Each figure is configured
# through the Axes object seaborn returns and shown before the next is drawn.

# Price distribution
price_ax = sns.histplot(df["price_num"].dropna(), bins=15)
price_ax.set_title("Price Distribution")
price_ax.set_xlabel("Price (€)")
plt.show()

# Size distribution
size_ax = sns.histplot(df["size_num"].dropna(), bins=15)
size_ax.set_title("Size Distribution")
size_ax.set_xlabel("Size (m²)")
plt.show()

# Number of listings per bedroom count
bedrooms_ax = sns.countplot(data=df, x="bedrooms_num")
bedrooms_ax.set_title("Bedrooms Count")
plt.show()

# Year of construction histogram
year_ax = sns.histplot(df["year_of_construction_num"].dropna(), bins=10)
year_ax.set_title("Year of Construction")
plt.show()

In [None]:
# Frequency table for the neighborhood column, then for the energy label column.
for categorical_column in ("neighborhood", "energy_label"):
    print(df[categorical_column].value_counts())

In [None]:
# Preview the first few non-missing facilities entries (they might be
# comma-separated strings).
facilities_preview = df["facilities"].dropna().head()
print(facilities_preview)

# Number of missing values in each column
missing_counts = df.isna().sum()
print(missing_counts)

#### 0. Missing values


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Boolean mask that is True wherever a cell is missing; reused for the heatmap
# and for the per-column totals below.
missing_mask = df.isna()

heatmap_ax = sns.heatmap(missing_mask, cbar=False)
heatmap_ax.set_title("Missing Values Heatmap")
plt.show()

# Per-column missing counts, most incomplete columns first.
print(missing_mask.sum().sort_values(ascending=False))

#### 1. Correlation Analysis of Numeric Features


In [None]:
# Correlation of every numeric column with the price column, ordered from the
# strongest positive relationship downwards.
price_correlations = df.corr(numeric_only=True)["price_num"]
price_correlations.sort_values(ascending=False)

In [None]:
# Correlation matrix restricted to the numeric columns of df.
corr = df.corr(numeric_only=True)
# List the columns that ended up in the matrix.
print(corr.columns)

In [None]:
# Scatter of price against living area, one point per listing.
fig, ax = plt.subplots()
ax.scatter(df["size_num"], df["price_num"])
ax.set_xlabel("Living Area (m²)")
ax.set_ylabel("Price (€)")
ax.set_title("Price vs Living Area")
plt.show()

In [None]:
# Numeric features to include in the pairwise correlation matrix.
numeric_cols = [
    "price_num",
    "size_num",
    "bedrooms_num",
    "nr_rooms_num",
    "bathrooms_num",
    "year_of_construction_num",
]

# Pairwise correlations between the selected features only.
corr = df.loc[:, numeric_cols].corr()

# Annotated heatmap, drawn on an explicitly created 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix of Numeric Features")
plt.show()

#### 2. Price per m² Calculation & Analysis

Derive the price per m² from price and size, then check its distribution and how it relates to the neighborhood average.


In [None]:
# Price per square metre. A size that is zero or negative is treated as
# missing before dividing: a zero size would produce inf, which dropna() does
# not remove and which gives the histogram a non-finite bin range.
valid_size = df["size_num"].where(df["size_num"] > 0)
df["price_per_m2"] = df["price_num"] / valid_size

sns.histplot(df["price_per_m2"].dropna(), bins=20)
plt.title("Price per m² Distribution")
plt.xlabel("Price per m² (€)")
plt.show()

In [None]:
# List the column names currently on df.
df.columns

In [None]:
# Compare each listing's price per m² with its neighborhood-level figure.
# NOTE(review): "price_per_m2_neighborhood" is not created in the cells shown
# here; presumably it comes from preprocessing. Confirm.
scatter_ax = sns.scatterplot(data=df, x="price_per_m2_neighborhood", y="price_per_m2")
scatter_ax.set_title("Listing Price/m² vs Neighborhood Avg Price/m²")
scatter_ax.set_xlabel("Neighborhood Avg Price/m² (€)")
scatter_ax.set_ylabel("Listing Price/m² (€)")
plt.show()

#### 3. Outlier Detection in Price or Price per m²

Visualize and identify listings with suspiciously high or low prices


In [None]:
# IQR rule: a price is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
price = df["price_num"]
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Comparisons against NaN are False, so listings without a price are never
# flagged as outliers.
is_outlier = price.lt(lower_fence) | price.gt(upper_fence)
outliers = df[is_outlier]

print(f"Found {len(outliers)} price outliers")
print(outliers[["address", "price_num"]])

#### 4. Feature Engineering: Extract Useful Info from Text

Example: Count number of facilities


In [None]:
def _count_facilities(value) -> int:
    """Return the number of non-blank, comma-separated items in `value`.

    Missing values count as 0. Blank items (an empty string, a trailing comma,
    or consecutive commas) are skipped so they do not inflate the count;
    a plain `len(value.split(","))` would report 1 facility for "".
    """
    if pd.isna(value):
        return 0
    return sum(1 for item in value.split(",") if item.strip())


df["num_facilities"] = df["facilities"].apply(_count_facilities)
sns.histplot(df["num_facilities"], bins=10)
plt.title("Distribution of Number of Facilities")
plt.show()

#### 5. Explore Missing Data Patterns


In [None]:
import missingno as msno

# Draw missingno's matrix view of df, then its bar view, showing each figure
# before the next one is created.
for draw_missing in (msno.matrix, msno.bar):
    draw_missing(df)
    plt.show()

#### 6. Categorical Analysis

Distribution of energy labels and balcony availability.


In [None]:
# Number of listings per energy label.
energy_ax = sns.countplot(data=df, x="energy_label")
energy_ax.set_title("Energy Label Distribution")
plt.show()

# Number of listings per balcony value; tick labels are rotated 45 degrees.
balcony_ax = sns.countplot(data=df, x="balcony")
balcony_ax.set_title("Balcony Availability")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Show df again, now including the columns added by the cells above.
df