In this notebook I re-process the raw data so that every missing value (an empty field or the literal string "MISSING") is stored as -1 instead of NaN.

In [None]:
import pandas as pd
import numpy as np

# HELPER FUNCTION
def clean_numeric_column(df, column, pattern='[^\\d.,]', as_int=False, is_price=False, fill_value=0):
    """Convert ``df[column]`` to a numeric column, in place, and return ``df``.

    Text values are cleaned first: every character matching ``pattern``
    (by default anything that is not a digit, dot or comma, e.g. currency
    symbols and units) is removed. Values that still cannot be parsed become
    NaN. The ``id`` column should not go through this function, as it is
    expected to be a combination of letters and numbers.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column to clean.
        pattern: Regular expression of characters to strip from text values.
        as_int: If True, fill NaN with ``fill_value`` and cast to int.
        is_price: If True, normalise European number format in text values
            ("1.234,56" -> "1234.56") before parsing.
        fill_value: Replacement for NaN when ``as_int`` is True (default 0).

    Returns:
        The same DataFrame object, with ``column`` replaced by numeric values.
    """
    values = df[column]

    # A column pandas already parsed as numeric must not be round-tripped
    # through text: str(250000.0) is "250000.0", and the European price
    # handling below would then corrupt it.
    already_numeric = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )

    if not already_numeric:
        # work on a local Series so no temporary column is added to df
        values = values.astype(str).str.replace(pattern, '', regex=True)
        if is_price:
            # remove thousands-separator dots: only a dot followed by exactly
            # three digits qualifies, so a decimal point ("250000.0") is kept
            values = values.str.replace(r'(?<=\d)\.(?=\d{3}(?!\d))', '', regex=True)
            # convert decimal comma to dot
            values = values.str.replace(',', '.', regex=False)

    # anything still unparsable (including the empty string) becomes NaN
    df[column] = pd.to_numeric(values, errors='coerce')

    if as_int:
        df[column] = df[column].fillna(fill_value).astype(int)
    return df

# MAIN CLASS
class DataProcessing:
    """Load the raw property CSV and clean it, storing missing values as -1.

    The working DataFrame lives in ``self.df`` and is modified by each
    cleaning step. The ``id`` column is read as text and is never numerically
    cleaned or filled.
    """

    def __init__(self, file_path='../data/raw/immoweb_data.csv'):
        """Read ``file_path`` into ``self.df`` and print a short summary.

        The separator is auto-detected from the first line of the file:
        ';' if that line contains one, otherwise ','.
        """
        # path updated with raw data file <---
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        sep = ';' if ';' in first_line else ','  # choose ';' if present, else ','
        # load full csv file; id is forced to text so it is not parsed as a number
        self.df = pd.read_csv(file_path, sep=sep, dtype={"id": str}, low_memory=False)
        print("Detected separator:", repr(sep))
        print("\nBefore any cleaning:")
        print(self.df.dtypes,"\n")
        print(self.df.head(5))
        print("\nNumber of rows raw data loaded:", len(self.df))

    def process_data(self):
        """Run every cleaning step on ``self.df``.

        Order matters:
        * empty rows are removed first, because the numeric and yes/no steps
          fill NaN with 0 and would otherwise hide them;
        * text is stripped before de-duplication and before the missing-value
          pass, so padded values such as ' MISSING ' are recognised.
        """
        self.remove_empty_rows()
        self.clean_price()
        self.clean_areas()
        self.convert_yes_no_columns()
        self.clean_other_numeric_columns()
        self.strip_text_columns()
        self.remove_duplicates()
        self.clean_missing()

    def clean_price(self):
        """Convert the price column to integers (European number format)."""
        if 'price' in self.df.columns:
            self.df = clean_numeric_column(self.df, 'price', as_int=True, is_price=True)
            print("Cleaning price fields...")

    def clean_areas(self):
        """Convert the area columns to integers, dropping 'm2' / 'm²' units."""
        for col in ['total_area_sqm', 'terrace_sqm', 'garden_sqm']:
            if col in self.df.columns:
                # Remove units like 'm2', 'm²', 'M2' (case-insensitive). The
                # whole unit must go: stripping only the letter would leave
                # its '2' glued onto the number ("100 M2" -> "1002").
                self.df[col] = self.df[col].astype(str).str.replace(r'(?i)\s*m[²2]', '', regex=True)
                self.df = clean_numeric_column(self.df, col, as_int=True)
        print("Cleaning area fields...")

    def convert_yes_no_columns(self):
        """Map yes/no style flags to 1/0; anything unrecognised becomes 0."""
        yes_no_map = {'yes': 1, 'y': 1, 'no': 0, 'n': 0}
        for col in ['fl_furnished', 'open_fire', 'fl_swimming_pool']:
            if col in self.df.columns:
                self.df[col] = (
                    self.df[col]
                    .astype(str)
                    .str.strip()
                    .str.lower()
                    .map(yes_no_map)
                    .fillna(0)
                    .astype(int)
                )
        print("Converting Yes/No columns to 1/0 integers...")

    def clean_other_numeric_columns(self):
        """Convert the remaining count/year columns to integers."""
        for col in ['nbr_bedrooms', 'nbr_frontages', 'construction_year']:
            if col in self.df.columns:
                self.df = clean_numeric_column(self.df, col, as_int=True)
        print("Cleaning other numeric fields...")

    def remove_duplicates(self):
        """Drop rows that duplicate an earlier row in every column except id."""
        cols_to_check = [col for col in self.df.columns if col != 'id']
        # keep=False marks every member of a duplicate group, so the count
        # reported here includes the row that will be kept
        duplicates_mask = self.df.duplicated(subset=cols_to_check, keep=False)
        num_duplicates = duplicates_mask.sum()
        if num_duplicates > 0:
            print(f"\nFound {num_duplicates} duplicate row(s)")
        else:
            print("\nNo duplicate rows found.")
        self.df.drop_duplicates(subset=cols_to_check, keep='first', inplace=True)
        print("Removing duplicates...")
        print(f"Number of rows left after removing duplicates = {len(self.df)}")

    def remove_empty_rows(self):
        """Drop rows whose id is missing, then rows where every other field is empty.

        A field counts as empty when it is NaN or, for non-numeric columns,
        when it is blank after stripping whitespace.
        """
        non_id_cols = [col for col in self.df.columns if col != 'id']

        # --- rows without an id ---
        if 'id' in self.df.columns:
            ids = self.df['id']
            missing_id_mask = ids.isna() | ids.astype(str).str.strip().eq('')
        else:
            # nothing to check; avoid a KeyError on files without an id column
            missing_id_mask = pd.Series(False, index=self.df.index)
        num_missing = missing_id_mask.sum()
        print(f"\nFound {num_missing} row(s) with missing id")
        # .copy() so later column assignments act on an independent frame
        self.df = self.df.loc[~missing_id_mask].copy()

        # --- rows where all non-id fields are empty ---
        # The mask is built AFTER the id filter so its index always matches
        # the frame it is applied to.
        if non_id_cols:
            empty_mask = pd.Series(True, index=self.df.index)
            for col in non_id_cols:
                values = self.df[col]
                col_empty = values.isna()
                if not pd.api.types.is_numeric_dtype(values):
                    col_empty = col_empty | values.astype(str).str.strip().eq('')
                empty_mask &= col_empty
        else:
            # with no other columns, "all fields empty" would be vacuously
            # true for every row; treat that as "nothing to drop"
            empty_mask = pd.Series(False, index=self.df.index)

        num_empty_rows = int(empty_mask.sum())
        print(f"Found {num_empty_rows} row(s) where all non-id fields are empty")
        if num_empty_rows > 0:
            print("Preview of up to 10 rows to be removed (by id):")
            # `display` is injected by IPython/Jupyter; fall back to print
            # when this code runs as a plain script
            try:
                show = display
            except NameError:
                show = print
            show(self.df.loc[empty_mask].head(10))

        self.df = self.df.loc[~empty_mask].copy()
        print("Removing empty rows...")
        print(f"Number of rows left after removing empty rows = {len(self.df)}")

    def strip_text_columns(self):
        """Strip leading/trailing whitespace from string values in text columns.

        Only real strings are touched: NaN and other non-string values are
        left as they are, so a missing value never becomes the text 'nan'.
        """
        for col in self.df.columns:
            values = self.df[col]
            is_text = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
            if is_text:
                self.df[col] = values.map(lambda v: v.strip() if isinstance(v, str) else v)
        print("\nStripping leading/trailing spaces from all text columns...")

    def clean_missing(self):
        """Replace missing values with -1 in every column except 'id'.

        Missing means NaN, a blank string, or the text 'MISSING' in any
        letter case, optionally padded with whitespace. Numeric columns keep
        their float dtype when they hold fractional values; they are cast to
        int only when every value is a whole number.

        NOTE: columns that an earlier step already converted with a 0 fill
        contain no NaN by the time this runs, so they are left unchanged here.
        """
        for col in self.df.columns:
            if col == 'id':
                continue
            values = self.df[col]

            if pd.api.types.is_bool_dtype(values):
                # boolean columns cannot hold a missing marker
                continue

            if pd.api.types.is_numeric_dtype(values):
                filled = values.fillna(-1)
                # cast to int only when no fractional information is lost
                if pd.api.types.is_float_dtype(filled) and (filled % 1 == 0).all():
                    filled = filled.astype(int)
                self.df[col] = filled
                continue

            # text / mixed column: work on plain Python objects so that -1 can
            # sit next to strings regardless of the pandas string dtype in use
            as_objects = values.astype(object)
            is_marker = as_objects.map(
                lambda v: isinstance(v, str) and v.strip().casefold() in ('', 'missing')
            ).astype(bool)
            self.df[col] = as_objects.mask(as_objects.isna() | is_marker, -1)
        print("\nCleaning missing values: all 'MISSING' and empty strings replaced with -1...")

    def save_to_csv(self, output_path='../Kristin/cleaned_properties_-1.csv'):
        """Write ``self.df`` to ``output_path`` as CSV, without the index."""
        # update file path <---
        self.df.to_csv(output_path, index=False)
        print("\nSaving cleaned output as csv ...")

In [None]:
# Driver cell: load the raw file, run the full cleaning pipeline, preview the
# result and write the cleaned CSV. Adjust both paths to your environment.
RAW_CSV_PATH = '../data/raw/immoweb_data.csv'
CLEANED_CSV_PATH = '../Kristin/cleaned_properties_-1.csv'

dp = DataProcessing(file_path=RAW_CSV_PATH)
dp.process_data()

print("\nFirst 5 rows after cleaning:")
preview = dp.df.head(5)
print(preview)

dp.save_to_csv(output_path=CLEANED_CSV_PATH)