In [5]:
import pandas as pd

# Read the untouched export so the price column can be inspected as-is.
df_raw = pd.read_csv("../../data/raw/immoweb_data.csv")

# dtype plus a sample of the first rows: is price already numeric?
print("Data type of price column:", df_raw['price'].dtype)
print("\nFirst 20 raw price values:", df_raw['price'].head(20), sep="\n")

# Distinct textual renderings among the first 30 rows (reveals formatting quirks).
print(
    "\nUnique first 30 entries (to see formatting):",
    df_raw['price'].astype(str).head(30).unique(),
    sep="\n",
)

# Summary statistics: keep only digits, commas and dots, then coerce;
# anything that still does not parse becomes NaN and is ignored by describe().
raw_prices_numeric = pd.to_numeric(
    df_raw['price'].astype(str).str.replace(r'[^\d,\.]', '', regex=True),
    errors='coerce',
)
print("\nRaw price summary (numeric):", raw_prices_numeric.describe(), sep="\n")


Data type of price column: float64

First 20 raw price values:
0      225000.0
1      449000.0
2      335000.0
3      501000.0
4      982700.0
5      548514.0
6      325000.0
7      424000.0
8      185000.0
9     3500000.0
10     385000.0
11     957000.0
12     294000.0
13     347000.0
14     110000.0
15     920000.0
16     258000.0
17     499000.0
18     270000.0
19    1495000.0
Name: price, dtype: float64

Unique first 30 entries (to see formatting):
['225000.0' '449000.0' '335000.0' '501000.0' '982700.0' '548514.0'
 '325000.0' '424000.0' '185000.0' '3500000.0' '385000.0' '957000.0'
 '294000.0' '347000.0' '110000.0' '920000.0' '258000.0' '499000.0'
 '270000.0' '1495000.0' '549000.0' '450000.0' '175000.0' '272000.0'
 '298500.0' '319000.0' '130000.0' '249000.0' '310000.0' '165000.0']

Raw price summary (numeric):
count    7.551100e+04
mean     4.227709e+05
std      4.383586e+05
min      7.600000e+04
25%      2.420000e+05
50%      3.290000e+05
75%      4.550000e+05
max      2.250000e+07

In [None]:
import pandas as pd
import numpy as np

# HELPER FUNCTION

def clean_numeric_column(df, column, pattern='[^\\d.,]', as_int=False, is_price=False):
    """
    Convert ``df[column]`` to a numeric column, in place, and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the column to clean.
    pattern : str
        Regex of characters to delete from textual values before parsing
        (default: everything except digits, dots and commas).
    as_int : bool
        If True the result is stored as nullable ``Int64`` (missing -> <NA>).
        Float values are rounded to the nearest integer first, because
        pandas refuses to cast floats with a fractional part to ``Int64``.
    is_price : bool
        If True, textual values are read as European-formatted numbers:
        dots followed by exactly three digits are dropped as thousands
        separators and commas become the decimal point.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` replaced.

    Notes
    -----
    Columns that already have a numeric dtype skip all text handling, so a
    value such as 225000.0 is never re-interpreted through the European
    rules (this is what previously produced 10x prices).
    """
    if pd.api.types.is_numeric_dtype(df[column]):
        # Already numeric: nothing to parse.
        cleaned = df[column]
    else:
        # Work on a local Series instead of a scratch column in ``df`` so an
        # existing column named ``<column>_clean`` is never overwritten/dropped.
        text = df[column].astype(str).str.replace(pattern, '', regex=True)

        if is_price:
            # Drop dots only where they act as thousands separators ...
            text = text.str.replace(r'(?<=\d)\.(?=\d{3}\b)', '', regex=True)
            # ... then turn the decimal comma into a decimal point.
            text = text.str.replace(',', '.', regex=False)

        # Anything that still does not parse (including '') becomes NaN.
        cleaned = pd.to_numeric(text, errors='coerce')

    if as_int:
        if pd.api.types.is_float_dtype(cleaned):
            # astype("Int64") raises on e.g. 85.4; round so the cast is safe.
            cleaned = cleaned.round()
        cleaned = cleaned.astype("Int64")

    df[column] = cleaned
    return df

# MAIN CLASS

class DataProcessing:
    """
    Load a raw property CSV and clean it step by step on ``self.df``.

    Typical use: construct with the path of the raw file, call
    ``process_data()``, then ``save_to_csv()``.
    """

    def __init__(self, file_path='../data/raw/immoweb_data.csv'):
        """
        Read ``file_path`` into ``self.df`` and print a short overview.

        The delimiter is chosen from the header line: ';' if the line
        contains one, ',' otherwise. The ``id`` column is read as text.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        sep = ';' if ';' in first_line else ','
        self.df = pd.read_csv(file_path, sep=sep, dtype={"id": str}, low_memory=False)

        print("Detected separator:", repr(sep))
        print("\nBefore any cleaning:")
        print(self.df.dtypes, "\n")
        print(self.df.head(5))
        print("\nNumber of rows raw data loaded:", len(self.df))

        # NOTE(review): this list is not referenced by any method visible in
        # this class; kept as an attribute for callers that may rely on it.
        self.numeric_int_columns = [
            'nbr_bedrooms',
            'nbr_frontages',
            'construction_year',
            'total_area_sqm',
            'terrace_sqm',
            'garden_sqm',
            'surface_land_sqm',
            'cadastral_income'
        ]

        # Columns that clean_missing() leaves untouched.
        self.categorical_columns = [
            'equipped_kitchen',
            'state_building',
            'heating_type'
        ]

        # region -> raw EPC label -> coarse category, used by recode_epc().
        self.epc_mapping = {
            "Flanders": {"A+": "excellent", "A": "excellent", "B": "good",
                         "C": "poor", "D": "poor", "E": "bad", "F": "bad"},
            "Brussels-Capital": {"A": "excellent", "B": "good", "C": "good",
                                  "D": "poor", "E": "poor", "F": "bad", "G": "bad"},
            "Wallonia": {"A++": "excellent", "A+": "excellent", "A": "good",
                         "B": "good", "C": "poor", "D": "poor", "E": "poor",
                         "F": "bad", "G": "bad"}
        }

    def process_data(self):
        """
        Run every cleaning step in order.

        Text is stripped *before* missing-value handling and EPC recoding so
        that padded values such as " A" or "Flanders " are matched by the
        ``MISSING`` regex and by the EPC lookup table.
        """
        self.clean_price()
        self.clean_areas()
        self.clean_other_numeric_columns()
        self.remove_duplicates()
        self.remove_empty_rows()
        self.strip_text_columns()
        self.clean_missing()
        self.clean_epc()
        self.recode_epc()

    def clean_price(self):
        """Convert ``price`` to nullable Int64 using the price-aware parser."""
        if 'price' in self.df.columns:
            self.df = clean_numeric_column(self.df, 'price', as_int=True, is_price=True)
            print("Cleaning price fields...")

    def clean_areas(self):
        """Strip 'm²'/'m2' units from the area columns and convert them to Int64."""
        for col in ['total_area_sqm', 'terrace_sqm', 'garden_sqm']:
            if col in self.df.columns:
                # Only textual columns can carry a unit suffix. Numeric columns
                # are passed through as-is: stringifying a float can yield
                # scientific notation ('1e+16'), which the digit-only cleanup
                # in clean_numeric_column would turn into a wrong number.
                if not pd.api.types.is_numeric_dtype(self.df[col]):
                    self.df[col] = self.df[col].astype(str).str.replace(r'\s*m[²2]', '', regex=True)
                self.df = clean_numeric_column(self.df, col, as_int=True)
        print("Cleaning area fields...")

    def clean_other_numeric_columns(self):
        """Convert bedroom, frontage and construction-year columns to Int64."""
        for col in ['nbr_bedrooms', 'nbr_frontages', 'construction_year']:
            if col in self.df.columns:
                self.df = clean_numeric_column(self.df, col, as_int=True)
        print("Cleaning other numeric fields...")

    def remove_duplicates(self):
        """Drop rows that are identical in every column except ``id`` (first kept)."""
        cols_to_check = [col for col in self.df.columns if col != 'id']
        # keep=False flags every member of a duplicate group, so the reported
        # count includes the row that is ultimately kept.
        duplicates_mask = self.df.duplicated(subset=cols_to_check, keep=False)
        num_duplicates = duplicates_mask.sum()
        if num_duplicates > 0:
            print(f"\nFound {num_duplicates} duplicate row(s)")
        else:
            print("\nNo duplicate rows found.")
        self.df = self.df.drop_duplicates(subset=cols_to_check, keep='first')
        print(f"Number of rows left after removing duplicates = {len(self.df)}")

    def remove_empty_rows(self):
        """
        Drop rows without an ``id`` and rows whose non-id columns are all empty.

        A cell counts as empty when it is missing, or (for non-numeric
        columns) when its text is blank after stripping whitespace.
        """
        critical_cols = [col for col in self.df.columns if col != 'id']
        empty_mask = pd.Series(True, index=self.df.index)
        for col in critical_cols:
            col_empty = self.df[col].isna()
            if not pd.api.types.is_numeric_dtype(self.df[col]):
                col_empty = col_empty | self.df[col].astype(str).str.strip().eq('')
            empty_mask &= col_empty

        if 'id' in self.df.columns:
            missing_id_mask = self.df['id'].isna() | self.df['id'].astype(str).str.strip().eq('')
        else:
            missing_id_mask = pd.Series(False, index=self.df.index)

        # Apply both conditions in one pass: both masks share the current
        # index, so no re-alignment is needed and no in-place drop happens on
        # a filtered slice.
        self.df = self.df.loc[~(missing_id_mask | empty_mask)].copy()
        print(f"Number of rows left after removing empty rows = {len(self.df)}")

    def strip_text_columns(self):
        """
        Strip surrounding whitespace from every object-dtype column.

        Missing values stay missing; a blanket ``astype(str)`` would turn
        them into the literal string 'nan'.
        """
        text_cols = self.df.select_dtypes(include='object').columns
        for col in text_cols:
            series = self.df[col]
            self.df[col] = series.where(series.isna(), series.astype(str).str.strip())

    def clean_missing(self):
        """
        Replace 'MISSING' markers (any case) and blank strings with NaN.

        ``id`` and the columns in ``self.categorical_columns`` are skipped.
        """
        for col in self.df.columns:
            if col == 'id' or col in self.categorical_columns:
                continue
            self.df[col] = self.df[col].replace(r'(?i)^MISSING$', np.nan, regex=True)
            if self.df[col].dtype == 'object':
                self.df[col] = self.df[col].replace(r'^\s*$', np.nan, regex=True)

    def clean_epc(self):
        """Mark blank or missing ``epc`` values with the label "MISSING"."""
        if 'epc' in self.df.columns:
            self.df['epc'] = self.df['epc'].replace(r'^\s*$', np.nan, regex=True).fillna("MISSING")

    def _recode_epc_value(self, region, epc):
        """Map one (region, epc) pair to its category, or "MISSING" if unknown."""
        if pd.isna(region) or pd.isna(epc) or epc == "MISSING":
            return "MISSING"
        rules = self.epc_mapping.get(region)
        return rules.get(epc, "MISSING") if rules else "MISSING"

    def recode_epc(self):
        """
        Replace raw EPC labels with the region-specific category from
        ``self.epc_mapping``; unknown regions or labels become "MISSING".

        Does nothing unless both ``epc`` and ``region`` columns exist.
        """
        if 'epc' not in self.df.columns or 'region' not in self.df.columns:
            return
        # Iterate the two columns directly instead of DataFrame.apply(axis=1):
        # no per-row Series is built, and an empty frame is handled as well.
        self.df['epc'] = [
            self._recode_epc_value(region, epc)
            for region, epc in zip(self.df['region'], self.df['epc'])
        ]

    def save_to_csv(self, output_path='../../data/processed/cleaned_properties.csv'):
        """Write ``self.df`` to ``output_path`` without the index column."""
        self.df.to_csv(output_path, index=False)
        print("Saving cleaned output as CSV ...")


In [7]:
# Load the raw export, run every cleaning step, preview the result and persist it.
dp = DataProcessing(file_path='../../data/raw/immoweb_data.csv')  # adjust path
dp.process_data()

print("\nFirst 5 rows after cleaning:", dp.df.head(5), sep="\n")

dp.save_to_csv(output_path='../../data/processed/cleaned_properties.csv')  # adjust path

Detected separator: ','

Before any cleaning:
id                                 object
price                             float64
property_type                      object
subproperty_type                   object
region                             object
province                           object
locality                           object
zip_code                            int64
latitude                          float64
longitude                         float64
construction_year                 float64
total_area_sqm                    float64
surface_land_sqm                  float64
nbr_frontages                     float64
nbr_bedrooms                      float64
equipped_kitchen                   object
fl_furnished                        int64
fl_open_fire                        int64
fl_terrace                          int64
terrace_sqm                       float64
fl_garden                           int64
garden_sqm                        float64
fl_swimming_pool              