In [1]:
from data_utils import DataUtils
from feature_utils import FeatureUtils
from model_utils import ModelUtils
from sklearn.model_selection import train_test_split

In [2]:
# Import data: read the source CSV into `df`, the frame that the following
# cells repeatedly reassign while cleaning and encoding it.
file_path = "./data/real_estate.csv"
df = DataUtils.import_csv(file_path)

In [3]:
"""Data Cleaning"""

# Rows with no 'price' have no target value, so they are dropped.
df = DataUtils.handle_missing_values(df, column='price', strategy='drop')

# Missing 'facade_number' values are filled with the median of the row's
# 'subtype_of_property' group.
df = DataUtils.fill_missing_by_group(
    df,
    'facade_number',
    group_column='subtype_of_property',
    agg_func='median',
)

# Missing 'building_condition' values are filled with the column's mode.
df = DataUtils.fill_missing_with_mode(df, 'building_condition', strategy='fill')

# Outlier removal, applied one column at a time in the order listed.
# Author's note: price uses the IQR rule (no explicit bounds passed), while
# 'living_area' (<=12), 'facade_number' (>=6) and 'bedroom_nr' (>=24) are cut
# because such values do not look realistic next to the other data points.
# NOTE(review): whether the bounds are inclusive is decided inside
# DataUtils.drop_outliers — confirm it matches the <= / >= stated above.
outlier_rules = [
    ('price', {}),
    ('living_area', {'lower_bound': 12}),
    ('facade_number', {'upper_bound': 6}),
    ('bedroom_nr', {'upper_bound': 24}),
]
for outlier_column, outlier_bounds in outlier_rules:
    df = DataUtils.drop_outliers(df, outlier_column, **outlier_bounds)

# Strip the ' unit' substring from 'subtype_of_property' values.
df = DataUtils.remove_substring(df, 'subtype_of_property', ' unit')

# Fix the misspelled commune name.
df = DataUtils.correct_value(
    df,
    column='commune',
    old_value='Petit-Rulx-lez-Nivelles',
    new_value='Petit-Rœulx-lez-Nivelles',
)

In [4]:
"""Data Encoding & Feature Engineering"""

'Data Encoding & Feature Engineering'

In [5]:
# Change 'terrace' to binary -> due to many instances when surface not given.
# NOTE(review): presumably values above `threshold` become 1 and everything else 0,
# stored in 'terrace_encoded' (that column appears in the recorded df.tail output)
# — confirm against FeatureUtils.encode_binary.
df = FeatureUtils.encode_binary(df, 'terrace', threshold=0)

In [6]:
# Reduce 'equipped_kitchen' to four categories.
# The four base categories map to themselves; the 'usa ...' variants and the
# literal '0' are folded into the matching base category.
# NOTE(review): the identity entries are presumably needed so that map_manual
# does not lose values missing from the mapping — confirm in FeatureUtils.
kitchen_base_categories = ('installed', 'semi equipped', 'hyper equipped', 'not installed')
kitchen_category_map = {category: category for category in kitchen_base_categories}
kitchen_category_map.update({
    'usa installed': 'installed',
    'usa hyper equipped': 'hyper equipped',
    'usa semi equipped': 'semi equipped',
    'usa uninstalled': 'not installed',
    '0': 'not installed',
})
df = FeatureUtils.map_manual(df, 'equipped_kitchen', kitchen_category_map)

In [7]:
# Sanity check after the category reduction: list the distinct values left in
# 'equipped_kitchen'. The recorded output shows only the four reduced categories.
kitchens = df['equipped_kitchen'].unique()
kitchens

array(['installed', 'not installed', 'hyper equipped', 'semi equipped'],
      dtype=object)

In [8]:
# Manual ordinal encoding of the reduced 'equipped_kitchen' categories.
# Levels are listed from the highest code to the lowest; codes count down to 0,
# giving hyper equipped=3, installed=2, semi equipped=1, not installed=0.
kitchen_levels_high_to_low = ['hyper equipped', 'installed', 'semi equipped', 'not installed']
kitchen_top_code = len(kitchen_levels_high_to_low) - 1
kitchen_codes = {
    level: kitchen_top_code - position
    for position, level in enumerate(kitchen_levels_high_to_low)
}
df = FeatureUtils.encode_manual(df, 'equipped_kitchen', kitchen_codes)

In [10]:
# Manual ordinal encoding of 'building_condition'.
# Conditions are listed from the highest code to the lowest; codes count down
# to 0, giving as new=5 ... to restore=0.
condition_levels_high_to_low = [
    'as new',
    'just renovated',
    'good',
    'to be done up',
    'to renovate',
    'to restore',
]
condition_top_code = len(condition_levels_high_to_low) - 1
condition_codes = {
    level: condition_top_code - position
    for position, level in enumerate(condition_levels_high_to_low)
}
df = FeatureUtils.encode_manual(df, 'building_condition', condition_codes)

In [12]:
# Reduce categories for 'subtype_of_property' by mapping every subtype onto one
# of five groups: 'apartment', 'house', 'mixed use building', 'luxury', 'other'.
# Values that already equal a group name are mapped to themselves.
# NOTE(review): 'duplex' and 'triplex' are grouped under 'house' and 'farmhouse'
# under 'mixed use building' — confirm these groupings are intended.
# NOTE(review): how map_manual treats values absent from this dict is not
# visible here — verify no subtype is left unmapped.
df = FeatureUtils.map_manual(df, 'subtype_of_property', {
    'kot': 'apartment',
    'chalet': 'house',
    'flat studio': 'apartment',
    'service flat': 'apartment',
    'bungalow': 'house',
    'town house': 'house',
    'ground floor': 'apartment',
    'apartment': 'apartment',
    'house': 'house',
    'mixed use building': 'mixed use building',
    'triplex': 'house',
    'farmhouse': 'mixed use building',
    'loft': 'luxury',
    'duplex': 'house',
    'apartment block': 'other',
    'country cottage': 'house',
    'penthouse': 'luxury',
    'mansion': 'luxury',
    'other property': 'other',
    'villa': 'luxury',
    'exceptional property': 'luxury',
    'manor house': 'luxury',
    'castle': 'luxury'
})

# Manual ordinal encoding of the reduced 'subtype_of_property' groups.
# Groups are listed from the highest code to the lowest; codes count down to 0,
# giving luxury=4, other=3, house=2, mixed use building=1, apartment=0.
subtype_groups_high_to_low = ['luxury', 'other', 'house', 'mixed use building', 'apartment']
subtype_top_code = len(subtype_groups_high_to_low) - 1
subtype_codes = {
    group: subtype_top_code - position
    for position, group in enumerate(subtype_groups_high_to_low)
}
df = FeatureUtils.encode_manual(df, 'subtype_of_property', subtype_codes)

In [15]:
# Use Standard Ordinal Encoding on communes.
# The recorded df.tail output shows the result in 'commune_encoded' as a float
# (e.g. 148.0 for Haren).
# NOTE(review): how encode_ordinal orders the communes is not visible here;
# 'commune' is a nominal column, so confirm the implied ordering is acceptable
# for the downstream model.
df = FeatureUtils.encode_ordinal(df, column='commune')

In [16]:
# Show the last five rows to inspect the newly added '*_encoded' columns
# next to the original ones.
df.tail(5)

Unnamed: 0,zip_code,commune,province,type_of_property,subtype_of_property,price,building_condition,facade_number,living_area,equipped_kitchen,...,furnished,open_fire,terrace,garden,plot_surface,terrace_encoded,equipped_kitchen_encoded,building_condition_encoded,subtype_of_property_encoded,commune_encoded
26140,1130,Haren,Bruxelles,1,house,481000.0,good,2.0,149,hyper equipped,...,0,0,0,46,0,0,3,3,2,148.0
26141,1130,Haren,Bruxelles,1,house,481000.0,good,2.0,149,hyper equipped,...,0,0,0,46,0,0,3,3,2,148.0
26142,1130,Haren,Bruxelles,1,house,496000.0,good,2.0,157,hyper equipped,...,0,0,0,46,0,0,3,3,2,148.0
26143,1130,Haren,Bruxelles,1,house,506000.0,good,2.0,156,hyper equipped,...,0,0,0,103,0,0,3,3,2,148.0
26144,1130,Haren,Bruxelles,1,house,521000.0,good,2.0,156,hyper equipped,...,0,0,0,103,0,0,3,3,2,148.0


In [None]:
# Export the encoded frame to CSV so it can be checked by hand
# (originally flagged as a TODO).
encoded_csv_path = './data/encoded_real_estate_data.csv'
DataUtils.export_csv(df, encoded_csv_path)

# Select the target and the feature columns for the ML model ahead of
# splitting the data.
target = 'price'
features = [
    'commune_encoded',
    'living_area',
    'building_condition_encoded',
    'terrace_encoded',
    'equipped_kitchen_encoded',
    'subtype_of_property_encoded',
    'garden',
]