REVIEWING ALL CODE

In [7]:
import pandas as pd
import numpy as np

# HELPER FUNCTION

def clean_numeric_column(df, column, pattern='[^\\d.,]', as_int=False, is_price=False):
    """
    Convert ``df[column]`` to a numeric dtype in place and return ``df``.

    Text values are cleaned before conversion:
    - every character matching ``pattern`` is removed (by default anything
      that is not a digit, '.' or ',')
    - if ``is_price`` is True the text is read as a European-formatted number:
      a '.' between two digits is dropped as a thousands separator and ','
      becomes the decimal point
    - whatever still cannot be parsed becomes NaN

    Columns that already hold a numeric dtype skip the text cleaning.
    Round-tripping them through ``str`` would turn 250000.0 into "250000.0",
    and the thousands-separator rule would then read it as 2500000.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the column to convert.
    pattern : str
        Regular expression of characters to strip from text values.
    as_int : bool
        If True, the result is rounded to whole numbers and stored as
        nullable ``Int64`` (missing values become <NA>).
    is_price : bool
        If True, text values are parsed as European-formatted numbers.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` replaced by its numeric form.
    """
    values = df[column]

    # Booleans count as "numeric" for pandas; keep sending them through the
    # text path so they end up as NaN, as they always did.
    already_numeric = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )

    if already_numeric:
        numeric = pd.to_numeric(values, errors='coerce')
    else:
        # Work on a local Series instead of a scratch "<column>_clean" column,
        # so an existing column with that name is never overwritten or dropped.
        text = values.astype(str).str.replace(pattern, '', regex=True)

        if is_price:
            text = text.str.replace(r'(?<=\d)\.(?=\d)', '', regex=True)
            text = text.str.replace(',', '.', regex=False)

        numeric = pd.to_numeric(text, errors='coerce')

    if as_int:
        # Int64 refuses non-integral floats (e.g. 85.5), so round first.
        if pd.api.types.is_float_dtype(numeric):
            numeric = numeric.round()
        numeric = numeric.astype("Int64")

    df[column] = numeric
    return df


# MAIN CLASS

class DataProcessing:
    """
    Load a raw property CSV into ``self.df`` and clean it step by step.

    ``process_data`` runs the whole pipeline; each step is also available as
    its own method and prints a short progress message. ``save_to_csv`` writes
    the current state of ``self.df`` to disk.
    """

    def __init__(self, file_path='../data/raw/immoweb_data.csv'):
        """
        Read ``file_path`` into ``self.df`` and print a preview of the raw data.

        The separator is chosen from the first line of the file: ';' if that
        line contains one, otherwise ','. The ``id`` column is read as text.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        sep = ';' if ';' in first_line else ','

        self.df = pd.read_csv(file_path, sep=sep, dtype={"id": str}, low_memory=False)

        print("Detected separator:", repr(sep))
        print("\nBefore any cleaning:")
        print(self.df.dtypes, "\n")
        print(self.df.head(5))
        print("\nNumber of rows raw data loaded:", len(self.df))

        # NOTE(review): this list is not referenced by any method of this
        # class; the cleaning steps below use their own column lists.
        self.numeric_int_columns = [
            'nbr_bedrooms',
            'nbr_frontages',
            'construction_year',
            'total_area_sqm',
            'terrace_sqm',
            'garden_sqm',
            'surface_land_sqm',
            'cadastral_income'
        ]

        # Categorical fields where "MISSING" must stay unchanged
        self.categorical_columns = [
            'equipped_kitchen',
            'state_building',
            'heating_type'
        ]

    @staticmethod
    def _is_text_column(series):
        """Return True if ``series`` has an object or string dtype."""
        return (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )

    def process_data(self):
        """
        Run every cleaning step on ``self.df``.

        Values are normalised first (numeric parsing, whitespace stripping,
        missing markers to NaN) so that the duplicate and empty-row checks
        compare cleaned values rather than raw text.
        """
        self.clean_price()
        self.clean_areas()
        self.clean_other_numeric_columns()
        self.strip_text_columns()
        self.clean_missing()
        self.remove_duplicates()
        self.remove_empty_rows()

    def clean_price(self):
        """Convert the ``price`` column, if present, to nullable integers."""
        if 'price' in self.df.columns:
            # European number parsing only makes sense for text. On a column
            # that is already numeric it would read the decimal point of
            # "250000.0" as a thousands separator.
            price_is_text = not pd.api.types.is_numeric_dtype(self.df['price'])
            self.df = clean_numeric_column(
                self.df, 'price', as_int=True, is_price=price_is_text
            )
            print("Cleaning price fields...")

    def clean_areas(self):
        """Strip square-metre units from the area columns and make them integers."""
        # Matches "m2" and "m" + superscript two (U+00B2), optionally preceded
        # by a stray U+00C2 left behind by mis-decoded UTF-8. The "2" of "m2"
        # has to go before the generic cleaner runs, or it would be kept as a
        # digit of the value.
        unit_pattern = '\\s*m(?:\u00c2?\u00b2|2)'

        for col in ['total_area_sqm', 'terrace_sqm', 'garden_sqm']:
            if col in self.df.columns:
                # Numeric columns carry no unit text, so leave them untouched.
                if not pd.api.types.is_numeric_dtype(self.df[col]):
                    self.df[col] = (
                        self.df[col].astype(str).str.replace(unit_pattern, '', regex=True)
                    )
                self.df = clean_numeric_column(self.df, col, as_int=True)

        print("Cleaning area fields...")

    def clean_other_numeric_columns(self):
        """Convert the remaining count/year columns to nullable integers."""
        for col in ['nbr_bedrooms', 'nbr_frontages', 'construction_year']:
            if col in self.df.columns:
                self.df = clean_numeric_column(self.df, col, as_int=True)

        print("Cleaning other numeric fields...")

    def remove_duplicates(self):
        """Drop rows that repeat an earlier row in every column except ``id``."""
        cols_to_check = [col for col in self.df.columns if col != 'id']

        # keep='first' marks exactly the rows that are removed below, so the
        # reported count equals the number of rows dropped.
        duplicates_mask = self.df.duplicated(subset=cols_to_check, keep='first')
        num_duplicates = int(duplicates_mask.sum())

        if num_duplicates > 0:
            print(f"\nFound {num_duplicates} duplicate row(s)")
        else:
            print("\nNo duplicate rows found.")

        self.df = self.df.loc[~duplicates_mask].copy()

        print("Removing duplicates...")
        print(f"Number of rows left after removing duplicates = {len(self.df)}")

    def remove_empty_rows(self):
        """
        Drop rows without an ``id`` and rows whose non-id fields are all empty.

        A field counts as empty when it is NaN or, for non-numeric columns,
        blank after stripping whitespace.
        """
        # ``display`` only exists as a global inside IPython/Jupyter; fall
        # back to ``print`` so the method also works in a plain interpreter.
        try:
            from IPython.display import display as show_frame
        except ImportError:
            show_frame = print

        if 'id' in self.df.columns:
            ids = self.df['id']
            missing_id_mask = ids.isna() | ids.astype(str).str.strip().eq('')
        else:
            missing_id_mask = pd.Series(False, index=self.df.index)

        num_missing = int(missing_id_mask.sum())
        print(f"\nFound {num_missing} row(s) with missing id")

        self.df = self.df.loc[~missing_id_mask].copy()

        # Build the emptiness mask only after the id filter, so it always
        # shares the index of the frame it is applied to.
        critical_cols = [col for col in self.df.columns if col != 'id']
        if critical_cols:
            empty_mask = pd.Series(True, index=self.df.index)
            for col in critical_cols:
                column = self.df[col]
                col_empty = column.isna()
                if not pd.api.types.is_numeric_dtype(column):
                    col_empty = col_empty | column.astype(str).str.strip().eq('')
                empty_mask &= col_empty
        else:
            # With no non-id columns there is nothing to judge emptiness on.
            empty_mask = pd.Series(False, index=self.df.index)

        num_empty_rows = int(empty_mask.sum())

        print(f"Found {num_empty_rows} row(s) where all non-id fields are empty")

        if num_empty_rows > 0:
            print("Preview of up to 10 rows to be removed (by id):")
            show_frame(self.df.loc[empty_mask].head(10))

        self.df = self.df.loc[~empty_mask].copy()

        print("Removing empty rows...")
        print(f"Number of rows left after removing empty rows = {len(self.df)}")

    def strip_text_columns(self):
        """Strip leading/trailing whitespace from string values in text columns."""
        for col in self.df.columns:
            if self._is_text_column(self.df[col]):
                # Only touch real strings: casting the whole column to str
                # would turn NaN/None into the literal text "nan"/"None".
                self.df[col] = self.df[col].map(
                    lambda value: value.strip() if isinstance(value, str) else value
                )

        print("\nStripping leading/trailing spaces from all text columns...")

    def clean_missing(self):
        """
        Replace "MISSING" (any case) and blank strings with NaN.

        The ``id`` column and the columns in ``self.categorical_columns`` are
        left untouched, so "MISSING" stays a regular category there.
        """
        for col in self.df.columns:
            if col == 'id':
                continue

            # Do NOT touch "MISSING" in specific categorical columns
            if col in self.categorical_columns:
                continue

            # Only text columns can hold the marker or blank strings.
            if not self._is_text_column(self.df[col]):
                continue

            # One pass covers both cases: blank/whitespace-only cells and the
            # "MISSING" marker, with or without surrounding whitespace.
            self.df[col] = self.df[col].replace(
                r'(?i)^\s*(?:MISSING)?\s*$', np.nan, regex=True
            )

        print("\nCleaning missing values: converting 'MISSING' and empty strings to NaN (except categorical columns)...")

    def save_to_csv(self, output_path='../Kristin/cleaned_properties.csv'):
        """Write ``self.df`` to ``output_path`` as CSV without the index."""
        self.df.to_csv(output_path, index=False)
        print("\nSaving cleaned output as CSV ...")


In [8]:
# Run the full cleaning pipeline on the raw export and save the result.
raw_csv_path = '../data/raw/immoweb_data.csv'  # adjust path
cleaned_csv_path = '../Kristin/cleaned_properties.csv'  # adjust path

dp = DataProcessing(file_path=raw_csv_path)
dp.process_data()

# Quick look at the cleaned frame before writing it out.
print("\nFirst 5 rows after cleaning:")
print(dp.df.head(5))

dp.save_to_csv(cleaned_csv_path)

Detected separator: ','

Before any cleaning:
id                                 object
price                             float64
property_type                      object
subproperty_type                   object
region                             object
province                           object
locality                           object
zip_code                            int64
latitude                          float64
longitude                         float64
construction_year                 float64
total_area_sqm                    float64
surface_land_sqm                  float64
nbr_frontages                     float64
nbr_bedrooms                      float64
equipped_kitchen                   object
fl_furnished                        int64
fl_open_fire                        int64
fl_terrace                          int64
terrace_sqm                       float64
fl_garden                           int64
garden_sqm                        float64
fl_swimming_pool              