*How many qualitative and quantitative variables are there? What are appropriate visuals for quantitative vs. qualitative data? What are appropriate measures of correlation when dealing with qualitative and quantitative variables?

What is the correlation between the variables and the price? Why do you think some variables are more correlated than others?

How are the variables themselves correlated with each other? Can you find groups of variables that are correlated together?*

In [None]:
import pandas as pd
import numpy as np

def log_transform_column(df: pd.DataFrame, column_name: str, new_col_name: "str | None" = None) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a ``log(1 + x)`` version of one column added.

    The transformation uses ``np.log1p``. Log transformation is primarily used
    for features with a high positive skew (like price or area) to make the
    distribution more normal.

    Parameters:
    -----------
    df : pd.DataFrame
        The input DataFrame. It is never modified.
    column_name : str
        The name of the column to be transformed (e.g., 'price').
    new_col_name : str, optional
        The name of the new log-transformed column.
        Defaults to f'log_{column_name}'.

    Returns:
    --------
    pd.DataFrame
        A copy of ``df`` with the new column added. If ``column_name`` is
        missing or is not numeric, a message is printed and the original
        ``df`` object is returned unchanged.

    Notes:
    ------
    Missing values are not imputed: NaN in the source column stays NaN in the
    new column. Negative inputs only trigger a printed warning; np.log1p
    yields -inf for -1 and NaN for anything below -1.
    """
    if new_col_name is None:
        new_col_name = f'log_{column_name}'

    # 1. Check if the column exists
    if column_name not in df.columns:
        print(f"Error: Column '{column_name}' not found in DataFrame.")
        return df

    values = df[column_name]

    # 2. Reject non-numeric columns explicitly instead of relying on a
    #    catch-all exception handler around the transformation.
    if not pd.api.types.is_numeric_dtype(values):
        print(f"An error occurred during log transformation: "
              f"column '{column_name}' is not numeric (dtype: {values.dtype}).")
        return df

    # 3. log1p is defined for x > -1, so only negative values are suspicious
    if (values < 0).any():
        print(f"Warning: Column '{column_name}' contains negative values. "
              "Applying log transformation to negative numbers is problematic.")

    # 4. Work on a copy so the caller's DataFrame is left untouched
    df_copy = df.copy()
    df_copy[new_col_name] = np.log1p(values)

    print(f"✅ Successfully created new column '{new_col_name}' using np.log1p.")
    return df_copy

In [None]:
#Import the clean data file
import pandas as pd
import numpy as np
import seaborn as sns

# Path of the CSV file to load.
file_path = "cleaned_properties.csv"

# Read only the first line to decide which delimiter the file uses:
# a semicolon anywhere in that line selects ';', otherwise ',' is used.
with open(file_path, mode='r', encoding='utf-8') as f:
    first_line = f.readline()

if ';' in first_line:
    sep = ';'
else:
    sep = ','

df = pd.read_csv(file_path, sep=sep, low_memory=False)

In [None]:
# Cap extreme values at the 1st/99th percentiles, then log-transform the skewed columns

# Snapshot of the data before any capping is applied
df_before = df.copy()

cap_vars = [
    'price', 'surface_land_sqm', 'total_area_sqm', 'garden_sqm',
    'terrace_sqm', 'nbr_bedrooms', 'nbr_frontages',
]
lower_cap = 0.01
upper_cap = 0.99
for var in cap_vars:
    column = df[var]
    lower = column.quantile(lower_cap)
    upper = column.quantile(upper_cap)
    # Values above the upper quantile are pulled down to it first, then values
    # below the lower quantile are raised to it. NaN fails both comparisons
    # and is therefore left as NaN.
    capped_above = np.where(column > upper, upper, column)
    df[var] = np.where(column < lower, lower, capped_above)

# Log transformations: each variable gets a companion '<name>_log' column
log_vars = ['price', 'surface_land_sqm', 'total_area_sqm', 'garden_sqm', 'terrace_sqm']
log_cols = [f'{v}_log' for v in log_vars]
for var, log_col in zip(log_vars, log_cols):
    df = log_transform_column(df, var, log_col)

# Skewness of the transformed columns (shown as the cell output)
df[log_cols].skew()


In [None]:
# Corr between price and continuous (numeric) variables (Correlation Matrix) - Pearson's corr
import matplotlib.pyplot as plt

# Numeric (and log-transformed) columns that enter the correlation heatmap
num_cols = [
    'price_log',
    'total_area_sqm_log',
    'surface_land_sqm_log',
    'terrace_sqm_log',
    'garden_sqm_log',
    'nbr_bedrooms',
    'construction_year',
    'primary_energy_consumption_sqm',
    'cadastral_income',
]

# Pearson is the default method of DataFrame.corr; it is spelled out here for clarity
corr = df[num_cols].corr(method='pearson')

# Diverging palette centred on 0 so the sign of each correlation is visible
sns.heatmap(data=corr, annot=True, center=0, cmap='coolwarm')
plt.title("Correlation Matrix (numeric variables)")
plt.show()

In [None]:
# Median price per region, ordered from the highest to the lowest
# (only 'region' is grouped here; provinces are not broken out)

median_price_by_region = df.groupby('region')['price'].median()
median_price_by_region.sort_values(ascending=False)

In [None]:
# Pairwise Pearson correlations (r and p-value) between the numeric / log-transformed variables
#
# Interpretation
# Pearson r (absolute value):
#   0.0–0.3 -> weak correlation
#   0.3–0.6 -> moderate correlation
#   0.6–0.9 -> strong correlation
#   0.9+    -> very strong correlation
# p-value:
#   Small p (<0.05) -> correlation is statistically significant
#   Large p (≥0.05) -> not significant; could be noise

from scipy.stats import pearsonr

num_cols = ['price_log', 'total_area_sqm_log', 'surface_land_sqm_log', 'terrace_sqm_log','garden_sqm_log', 'nbr_bedrooms', 'construction_year', 'primary_energy_consumption_sqm', 'cadastral_income','latitude', 'longitude', 'nbr_frontages']

rows = []

for i, col1 in enumerate(num_cols):
    # Starting at position i lists every unordered pair once
    # (the pair of a column with itself is kept, with r = 1).
    for col2 in num_cols[i:]:
        # Pairwise deletion: use only rows where both variables are observed.
        # Replacing missing values with 0 would add artificial points (for
        # example a construction year or latitude of 0) and distort r and p.
        # DataFrame.corr also excludes missing values pairwise.
        observed = df[col1].notna() & df[col2].notna()
        n_obs = int(observed.sum())
        if n_obs < 2:
            # pearsonr needs at least two observations
            r, p = float('nan'), float('nan')
        else:
            r, p = pearsonr(df.loc[observed, col1], df.loc[observed, col2])
        rows.append({'Variable 1': col1,'Variable 2': col2,'Pearson r': r,'p-value': p,'n': n_obs})

# Convert to DataFrame ('n' is the number of rows each pair was computed on)
corr_table = pd.DataFrame(rows)

# Sort by correlation with price
price_corr = corr_table[corr_table['Variable 1']=='price_log'].sort_values(by='Pearson r', ascending=False)
price_corr

In [None]:
# Correlations among the numeric variables themselves - used to detect possible
# multicollinearity in regressions later on

corr_matrix = df[num_cols].corr(method='pearson')

# Keep only the strict upper triangle (k=1 leaves out the diagonal) so that every
# pair of variables appears exactly once and self-correlations are excluded.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

corr_pairs = (
    corr_matrix.where(upper_triangle)
    .stack()
    # Drop the masked cells explicitly rather than relying on stack() to
    # discard NaN implicitly.
    .dropna()
    .reset_index()
)
corr_pairs.columns = ['Variable 1', 'Variable 2', 'Pearson r']
corr_pairs = corr_pairs.sort_values(by='Pearson r', ascending=False)
corr_pairs



In [None]:
# Check the number of unique values in each categorical variable.
# fl_swimming_pool caused an error in the ANOVA because it has a single unique value.

# for var in cat_vars:
#     print(var, df[var].nunique())

In [None]:
# Categorical vs price analysis - one-way ANOVA of price_log on each categorical variable
# F-statistic and p-value indicate whether the categorical variables have significant effects on price
# F-statistic: measures how large the price differences are between categories relative to
#   within-category variation (higher F -> stronger signal -> the variable likely affects price).
# p-value: probability of seeing differences at least this large if all category means were equal
# Notes: locality is left out due to granularity; zip_code better used only for clustering and mapping


import statsmodels.api as sm
from statsmodels.formula.api import ols

# Example for variable property_type
model = ols('price_log ~ C(property_type)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Iterating over all categorical variables
cat_vars = ['property_type', 'subproperty_type', 'region', 'province','epc', 'heating_type', 'state_building','fl_swimming_pool','fl_floodzone',
    'fl_open_fire', 'fl_terrace', 'fl_garden', 'fl_double_glazing'] #'fl_furnished',
anova_results = []

# Only rows with an observed price_log can contribute to the fit
has_price = df['price_log'].notna()

for var in cat_vars:
    # A factor needs at least two observed levels for the F-test; skip it
    # (with a message) instead of letting the whole loop fail.
    n_levels = df.loc[has_price, var].nunique()
    if n_levels < 2:
        print(f"Skipping '{var}': ANOVA needs at least 2 categories (found {n_levels}).")
        continue

    model = ols(f'price_log ~ C({var})', data=df).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    # The first row of the table is the factor itself
    F = anova_table['F'].iloc[0]
    p = anova_table['PR(>F)'].iloc[0]
    anova_results.append({'Variable': var, 'F-statistic': F, 'p-value': p})

# Explicit columns keep the sort below valid even if every variable was skipped
anova_df = pd.DataFrame(anova_results, columns=['Variable', 'F-statistic', 'p-value'])
anova_df = anova_df.sort_values(by='F-statistic', ascending=False)
display(anova_df)

# Interpretation of Results:
# NOTE(review): several entries below are tagged "[NOT stat significant]" yet described as
# important/strong predictors - re-check each label against the p-values in anova_df.
# - Prices differ massively across Belgium’s regions - top predictor
# - Swimming pool in place - top predictor
# - Houses vs apartments have big price differences - strong predictor
# - Provincial differences in prices are large - strong predictor
# - State of building (good/renovated/to renovate) heavily impacts price - strong predictor
# - Subproperty type - important predictor
# - [NOT stat significant] Floodzone - important predictor
# - [NOT stat significant] Whether a terrace exists significantly impacts price - strong predictor
# - Energy efficiency (epc) has medium-strong influence - medium-strong predictor
# - [NOT stat significant] Garden - medium predictor
# - [NOT stat significant] Heating type - weak-medium predictor
# - [NOT stat significant] Double glazing - weak predictor
# - [NOT stat significant] Open fire - very weak predictor


In [None]:
# Visualization of the ANOVA results

# Alias must be `plt`: every call below uses that name
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
# One horizontal bar per categorical variable, in the row order of anova_df
sns.barplot(data=anova_df,x="F-statistic",y="Variable",palette="viridis")

plt.title("ANOVA F-statistics by Categorical Variable")
# The F-statistic is a test statistic, not an effect size, so the axis is labelled accordingly
plt.xlabel("F-statistic")
plt.ylabel("Categorical Variables")
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

def interactive_joint_and_marginal_distributions_plot(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    title: str,
    hue: str = "province",
    marginal_type: str = "histogram" # Options: 'histogram', 'violin', 'box', 'rug'
):
    """
    Creates an INTERACTIVE joint plot with marginal distributions using Plotly Express.

    Rows with a missing value in ``x_col`` or ``y_col`` are dropped before
    plotting. The figure is displayed with ``fig.show()``; nothing is returned.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        x_col (str): The column for the x-axis.
        y_col (str): The column for the y-axis.
        title (str): The plot title.
        hue (str): The column to use for coloring points and marginals.
            Pass None to draw everything in a single color.
        marginal_type (str): The type of plot to use for the marginal distributions.
    """

    plot_df = df.dropna(subset=[x_col, y_col]).copy()

    # Extra columns for the hover tooltip: the original price and area plus the
    # colour column. Only columns that actually exist are requested, so the
    # function also works for frames without them and when hue is None.
    hover_cols = []
    for col in ('price', 'total_area_sqm', hue):
        if col is not None and col in plot_df.columns and col not in hover_cols:
            hover_cols.append(col)

    # Use Plotly Express scatter plot with marginal distributions
    fig = px.scatter(
        plot_df,
        x=x_col,
        y=y_col,
        color=hue,
        title=title,
        opacity=0.6,
        # Add marginal plots on top and right
        marginal_x=marginal_type,
        marginal_y=marginal_type,
        # Optimize for large datasets
        render_mode='webgl',
        hover_data=hover_cols
    )

    fig.update_layout(
        title_font_size=20,
        legend_title_text=hue,
        height=800,
    )

    # Label only the main scatter panel (row 1, col 1). Without the row/col
    # selector the titles would be applied to every axis in the figure,
    # including those of the marginal subplots.
    fig.update_xaxes(title_text=f'{x_col} (Marginal {marginal_type} on top)', row=1, col=1)
    fig.update_yaxes(title_text=f'{y_col} (Marginal {marginal_type} on right)', row=1, col=1)

    fig.show()

In [None]:
# Joint plot of total area against log-transformed price, coloured by
# province, with histograms as the marginal distributions.
interactive_joint_and_marginal_distributions_plot(
    df,
    "total_area_sqm",
    "price_log",
    title="Interactive Joint Plot: Area vs. LOG-TRANSFORMED Price",
    marginal_type="histogram",
    hue="province",
)