Run all of the cells below, in order, to clean the raw data file and save the result as cleaned_properties.csv.

In [1]:
import pandas as pd
import numpy as np

# MAIN CLASS
class DataProcessing:
    """Load a raw property CSV into ``self.df`` and clean it step by step.

    ``process_data`` runs the individual cleaning methods in a fixed order;
    ``save_to_csv`` writes the current state of ``self.df`` to disk.
    Every cleaning step prints a short progress message.
    """

    def __init__(self, file_path):
        """Read ``file_path`` into ``self.df`` and set up the cleaning config.

        Args:
            file_path: Path of the CSV file to load (opened as UTF-8 to
                detect the separator, then parsed by ``pd.read_csv``).
        """
        # Pick the delimiter from the header line: ';' if it occurs, else ','.
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        sep = ';' if ';' in first_line else ','
        # The 'id' column is read as text rather than inferred.
        self.df = pd.read_csv(file_path, sep=sep, dtype={"id": str}, low_memory=False)

        print("Detected separator:", repr(sep))
        print("\nBefore any cleaning:")
        print(self.df.dtypes, "\n")
        print(self.df.head(5))
        print("\nNumber of rows raw data loaded:", len(self.df))

        # NOTE: not referenced by any method of this class.
        self.numeric_columns = [
            'price',
            'nbr_bedrooms',
            'nbr_frontages',
            'construction_year',
            'total_area_sqm',
            'terrace_sqm',
            'garden_sqm',
            'surface_land_sqm',
            'cadastral_income',
            'primary_energy_consumption_sqm'
        ]

        # Columns that clean_missing() leaves untouched, so a literal
        # "MISSING" value in them is kept as-is.
        self.categorical_columns = [
            'equipped_kitchen',
            'state_building',
            'heating_type'
        ]

        # Per-region translation of raw EPC labels into four quality classes.
        self.epc_mapping = {
            "Flanders": {"A+": "excellent", "A": "excellent", "B": "good",
                         "C": "poor", "D": "poor", "E": "bad", "F": "bad"},
            "Brussels-Capital": {"A": "excellent", "B": "good", "C": "good",
                                  "D": "poor", "E": "poor", "F": "bad", "G": "bad"},
            "Wallonia": {"A++": "excellent", "A+": "excellent", "A": "good",
                         "B": "good", "C": "poor", "D": "poor", "E": "poor",
                         "F": "bad", "G": "bad"}
        }

    @staticmethod
    def _is_text(series):
        """Return True when ``series`` has object or string dtype."""
        return (pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series))

    @staticmethod
    def _is_blank(series):
        """Return a boolean mask of entries that are NaN or whitespace-only.

        Numeric columns can only be blank by being NaN, so the string
        conversion is skipped for them.
        """
        if pd.api.types.is_numeric_dtype(series):
            return series.isna()
        return series.isna() | series.astype(str).str.strip().eq('')

    def process_data(self):
        """Run every cleaning step in order and print the resulting dtypes/head."""
        self.remove_empty_rows()
        self.remove_duplicates()
        self.clean_missing()
        self.strip_text_columns()
        self.clean_epc()
        self.recode_epc()

        # Print dtypes after cleaning
        print("\nAfter cleaning:")
        print(self.df.dtypes, "\n")
        print(self.df.head(5))

    def remove_empty_rows(self):
        """Drop rows with a missing/blank id and rows whose other columns are all blank.

        If the frame has no 'id' column, only the all-blank check is applied.
        """
        data_cols = [col for col in self.df.columns if col != 'id']
        all_blank = pd.Series(True, index=self.df.index)
        for col in data_cols:
            all_blank &= self._is_blank(self.df[col])

        drop_mask = all_blank
        if 'id' in self.df.columns:
            drop_mask = drop_mask | self._is_blank(self.df['id'])

        # Both conditions are evaluated on the same index and applied in one
        # step; .copy() makes the result an independent frame so the column
        # assignments in later steps do not act on a slice.
        self.df = self.df.loc[~drop_mask].copy()
        print(f"Number of rows left after removing empty rows = {len(self.df)}")

    def remove_duplicates(self):
        """Drop rows that repeat an earlier row in every column except 'id'."""
        cols_to_check = [col for col in self.df.columns if col != 'id']
        # keep=False flags every member of a duplicate group, so the reported
        # number is larger than the number of rows actually dropped below.
        duplicates_mask = self.df.duplicated(subset=cols_to_check, keep=False)
        num_duplicates = duplicates_mask.sum()
        if num_duplicates > 0:
            print(f"\nFound {num_duplicates} duplicate row(s)")
        else:
            print("\nNo duplicate rows found.")
        self.df = self.df.drop_duplicates(subset=cols_to_check, keep='first')
        print(f"Number of rows left after removing duplicates = {len(self.df)}")

    def clean_missing(self):
        """Turn "MISSING" (any case) and blank strings into NaN.

        The 'id' column and the columns in ``self.categorical_columns`` are
        skipped. Only text columns are scanned: the regex patterns can only
        match string values.
        """
        for col in self.df.columns:
            if col == 'id' or col in self.categorical_columns:
                continue
            if not self._is_text(self.df[col]):
                continue
            cleaned = self.df[col].replace(r'(?i)^MISSING$', np.nan, regex=True)
            self.df[col] = cleaned.replace(r'^\s*$', np.nan, regex=True)

    def strip_text_columns(self):
        """Strip leading/trailing whitespace from every text column.

        Non-missing values are converted to ``str`` and stripped; missing
        values are left as they are instead of becoming the string 'nan'.
        """
        for col in self.df.columns:
            column = self.df[col]
            if not self._is_text(column):
                continue
            stripped = column.astype(str).str.strip()
            # Keep the stripped text only where the original value was present.
            self.df[col] = stripped.where(column.notna(), column)

    def clean_epc(self):
        """Replace blank or missing 'epc' values with the label "MISSING"."""
        if 'epc' in self.df.columns:
            self.df['epc'] = self.df['epc'].replace(r'^\s*$', np.nan, regex=True).fillna("MISSING")

    def recode_epc(self):
        """Recode 'epc' through the mapping of the row's 'region'.

        Rows whose region has no mapping, or whose epc label is not listed
        for that region (including NaN and "MISSING"), get "MISSING".
        Does nothing when either column is absent.
        """
        if 'epc' not in self.df.columns or 'region' not in self.df.columns:
            return
        no_rules = {}
        # Plain dict lookups per (region, epc) pair; also works on an empty
        # frame, where a row-wise DataFrame.apply would not return a Series.
        self.df['epc'] = [
            self.epc_mapping.get(region, no_rules).get(epc, "MISSING")
            for region, epc in zip(self.df['region'], self.df['epc'])
        ]

    def save_to_csv(self, output_path):
        """Write ``self.df`` to ``output_path`` as CSV without the index."""
        # Announce first so the message is visible even if the write fails.
        print("Saving cleaned output as CSV ...")
        self.df.to_csv(output_path, index=False)


In [2]:
# Input and output locations for this run.
raw_csv_path = '../data/raw/immoweb_data.csv'  # adjust path
cleaned_csv_path = '../data/processed/cleaned_properties.csv'  # adjust path

# Load the raw file and run the cleaning steps.
dp = DataProcessing(file_path=raw_csv_path)
dp.process_data()

# Preview the cleaned frame, then write it to disk.
print("\nFirst 5 rows after cleaning:", dp.df.head(5), sep="\n")
dp.save_to_csv(cleaned_csv_path)

Detected separator: ','

Before any cleaning:
id                                 object
price                             float64
property_type                      object
subproperty_type                   object
region                             object
province                           object
locality                           object
zip_code                            int64
latitude                          float64
longitude                         float64
construction_year                 float64
total_area_sqm                    float64
surface_land_sqm                  float64
nbr_frontages                     float64
nbr_bedrooms                      float64
equipped_kitchen                   object
fl_furnished                        int64
fl_open_fire                        int64
fl_terrace                          int64
terrace_sqm                       float64
fl_garden                           int64
garden_sqm                        float64
fl_swimming_pool              