In [1117]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler


def display_variables(df):
    """Display a per-column quality summary of ``df``.

    For every column the summary holds:

    * ``all``     - number of entries in the column
    * ``unique``  - number of distinct values (NaN counts as one value)
    * ``empty``   - number of null entries
    * ``missing`` - number of string entries containing the text 'MISSING'

    The summary frame (one row per column of ``df``) is handed to the
    notebook's ``display`` function, and the result of that call is returned.
    """
    all_counts = {}
    for column in df.columns:
        series = df[column]
        # Count the marker over the string entries only, so a text column that
        # also holds NaN is still inspected instead of silently reporting 0.
        is_text = series.map(lambda value: isinstance(value, str)).astype(bool)
        if is_text.any():
            missing_count = series[is_text].str.contains('MISSING').sum()
        else:
            missing_count = 0
        all_counts[column] = {
            'all': len(series),
            'unique': len(series.unique()),
            'empty': series.isnull().sum(),
            'missing': missing_count,
        }
    bad_values = pd.DataFrame(all_counts).T
    return display(bad_values)

def duplicates(df):
    """Drop fully duplicated rows from ``df`` and report how many were removed.

    Returns ``df`` itself when it holds no duplicated rows, otherwise the
    de-duplicated frame produced by ``drop_duplicates``.
    """
    # Guard clause: nothing to do when every row is unique.
    if not df.duplicated().any():
        print('No duplicates found')
        return df

    rows_before = df.shape[0]
    deduplicated = df.drop_duplicates()
    removed = rows_before - deduplicated.shape[0]
    print(f'{removed} duplicates removed')
    return deduplicated

def remove_missing(df):
    """Remove every row holding the literal value 'MISSING' in any column.

    Columns are processed one after another, so the count printed for a column
    only covers rows that survived the columns before it. A total is printed
    at the end and the filtered frame is returned.
    """
    initial_rows = len(df)
    for column in df.columns:
        before = len(df)
        df = df.loc[df[column].ne('MISSING')]
        dropped = before - len(df)
        if dropped > 0:
            print('For column ', column, ': ', dropped, ' rows containing "MISSING" were removed')
    print('TOTAL rows containing "MISSING" removed: ', initial_rows - len(df), '\n')
    return df

def remove_empty(df, skip_columns=('surface_land_sqm',)):
    """Remove every row that has a null value in any checked column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    skip_columns : str or iterable of str, optional
        Column name(s) whose null values are tolerated. Defaults to
        ``('surface_land_sqm',)``, the column the original implementation
        always exempted.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. Columns are processed in order, so each printed
        per-column count only covers rows that survived the earlier columns.
    """
    # A bare string would otherwise be split into single characters by set().
    if isinstance(skip_columns, str):
        skip_columns = (skip_columns,)
    skipped = set(skip_columns)

    rows_orig = df.shape[0]
    for column in df.columns:
        if column in skipped:
            continue
        rows = df.shape[0]
        df = df[df[column].notnull()]
        removed = rows - df.shape[0]
        if removed > 0:
            print('For column ', column, ': ', removed, ' rows containing empty values were removed')
    print('TOTAL rows containing empty values removed: ', rows_orig - df.shape[0], '\n')
    return df

# Encode categorical variables as 1-based ordinal integers
def encode_categorical(df):
    """Return a copy of ``df`` with known categorical columns ordinal-encoded.

    Each column listed in ``ordinals`` that is present in ``df`` is replaced by
    the 1-based position of its value in the corresponding category list.
    Columns not listed are passed through unchanged. The input frame is not
    modified, so filtered views can be passed in safely.

    NOTE(review): 'province' has no natural order, so the ranks assigned here
    are arbitrary - confirm an ordinal encoding is intended for it.
    NOTE(review): the encoder is created without ``handle_unknown``, so a value
    outside the listed categories (e.g. 'MISSING') is presumably rejected by
    scikit-learn - verify rows with such values are removed or imputed first.
    """
    ordinals = {'state_building': [['AS_NEW', 'JUST_RENOVATED', 'GOOD', 'TO_BE_DONE_UP', 'TO_RENOVATE', 'TO_RESTORE']],
                'province': [['West Flanders', 'East Flanders', 'Walloon Brabant', 'Brussels', 'Hainaut', 'Antwerp', 'Liège', 'Namur', 'Flemish Brabant', 'Limburg', 'Luxembourg']],
                'equipped_kitchen': [['NOT_INSTALLED', 'USA_UNINSTALLED', 'INSTALLED', 'USA_INSTALLED', 'SEMI_EQUIPPED', 'USA_SEMI_EQUIPPED', 'HYPER_EQUIPPED', 'USA_HYPER_EQUIPPED']],
                'epc': [['A++', 'A+', 'A', 'B', 'C', 'D', 'E', 'F', 'G']],
                'heating_type': [['SOLAR', 'ELECTRIC', 'GAS', 'PELLET', 'WOOD', 'FUELOIL', 'CARBON']]}
    # Work on a copy: the caller's frame is often a filtered slice, and writing
    # into it both mutates the caller's data and triggers SettingWithCopyWarning.
    encoded = df.copy()
    for column, categories in ordinals.items():
        if column in encoded.columns:
            encoder = OrdinalEncoder(categories=categories, dtype=int)
            codes = encoder.fit_transform(encoded[[column]].to_numpy())
            # fit_transform returns an (n, 1) array; flatten it for the column
            # assignment and shift the 0-based codes to start at 1.
            encoded[column] = codes.ravel() + 1
    return encoded

# Min-max scale the float feature columns
def normalize(df):
    """Return a copy of ``df`` with its float feature columns min-max scaled.

    Every float column except ``price`` (the target, when present) is rescaled
    with a freshly fitted ``MinMaxScaler``. Non-float columns and ``price``
    are left untouched, and the input frame is not modified.

    A frame without a ``price`` column (e.g. unseen data to predict on) is
    accepted. A frame with no rows or no float feature columns is returned as
    an unchanged copy.

    NOTE(review): the scaler is fitted on whatever frame is passed in, so
    training data and new data are scaled with different min/max values -
    confirm whether the training scaler should be reused for inference.
    """
    normalized = df.copy()
    # errors='ignore' keeps this usable on inference frames that have no target.
    floats = (
        normalized
        .drop(columns='price', errors='ignore')
        .select_dtypes(include='float')
        .columns
    )
    # Nothing to fit on: the scaler cannot be fitted without features or samples.
    if len(floats) == 0 or normalized.empty:
        return normalized
    scaler = MinMaxScaler()
    normalized[floats] = scaler.fit_transform(normalized[floats])
    return normalized



# Load the scraped listings
df = pd.read_csv("data/properties.csv")

# Inspect the raw features: row count, unique values, empty and 'MISSING' counts
display_variables(df)

# Remove duplicated rows, if any
df = duplicates(df)

# Restrict the data to 'standard' properties:
# below 1,000,000 in price, fewer than 6 bedrooms, and not a castle
df = df[
    (df['price'] < 1000000)
    & (df['nbr_bedrooms'] < 6)
    & (df['subproperty_type'] != 'CASTLE')
]

df = (
    df
    # Features that are not used by the model
    .drop(columns=['id', 'region', 'locality', 'zip_code', 'latitude', 'longitude', 'subproperty_type'])
    # Features dropped to prevent overfitting / correlation
    .drop(columns=['fl_terrace', 'nbr_bedrooms', 'equipped_kitchen', 'epc', 'fl_double_glazing', 'construction_year', 'heating_type', 'fl_furnished'])
)

# Clean the data: first rows marked 'MISSING', then rows with empty values
df_no_missing = df.pipe(remove_missing)
df_no_empty = df_no_missing.pipe(remove_empty)

# Inspect the features again after cleaning
display_variables(df_no_empty)

# Encode the categorical columns, then scale the float columns
df_encoded = df_no_empty.pipe(encode_categorical)
df_clean = df_encoded.pipe(normalize)


Unnamed: 0,all,unique,empty,missing
id,75511,48817,0,0
price,75511,8274,0,0
property_type,75511,2,0,0
subproperty_type,75511,23,0,0
region,75511,4,0,3
province,75511,12,0,3
locality,75511,44,0,3
zip_code,75511,1076,0,0
latitude,75511,39491,14098,0
longitude,75511,39543,14098,0


No duplicates found
For column  province :  2  rows containing "MISSING" were removed
For column  state_building :  25385  rows containing "MISSING" were removed
TOTAL rows containing "MISSING" removed:  25387 

For column  total_area_sqm :  4184  rows containing empty values were removed
For column  nbr_frontages :  11886  rows containing empty values were removed
For column  terrace_sqm :  6616  rows containing empty values were removed
For column  garden_sqm :  831  rows containing empty values were removed
For column  primary_energy_consumption_sqm :  5352  rows containing empty values were removed
For column  cadastral_income :  6140  rows containing empty values were removed
TOTAL rows containing empty values removed:  35009 



Unnamed: 0,all,unique,empty,missing
price,9893,727,0,0
property_type,9893,2,0,0
province,9893,11,0,0
total_area_sqm,9893,471,0,0
surface_land_sqm,9893,1736,2837,0
nbr_frontages,9893,9,0,0
fl_open_fire,9893,2,0,0
terrace_sqm,9893,151,0,0
fl_garden,9893,2,0,0
garden_sqm,9893,851,0,0


In [1118]:


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, Lasso
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def apply_regressor(df, house_apt, enc_name):
    """Train one regressor on ``df``, print its evaluation, and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the target column ``price``; every other column is used
        as a feature.
    house_apt : str
        One of 'apartment', 'house' or 'full'. Only used for validation and in
        the printed report header.
    enc_name : str
        Regressor to train: 'linear', 'lasso', 'randomforest' or 'xgboost'.

    Returns
    -------
    The fitted regressor, so it can be reused or saved, or ``None`` when
    ``house_apt`` is not an accepted value (a hint is printed in that case).

    Raises
    ------
    KeyError
        If ``enc_name`` is not a known regressor. This is raised before any
        splitting or training takes place.
    """
    # Factories instead of instances: only the requested estimator gets built.
    regressors = {
        'linear': lambda: LinearRegression(),
        'lasso': lambda: Lasso(),
        'randomforest': lambda: RandomForestRegressor(n_estimators=12, random_state=42),
        'xgboost': lambda: XGBRegressor(learning_rate=0.01, n_estimators=1050, max_depth=5,
                                        subsample=0.6, colsample_bytree=0.5, random_state=42),
    }
    # Validate both selectors up front, before touching the data.
    if house_apt not in ['apartment', 'house', 'full']:
        print('Please input "house", "apartment" of "full" as a second argument to call this function')
        return None
    if enc_name not in regressors:
        raise KeyError(f'Unknown regressor {enc_name!r}; expected one of {sorted(regressors)}')

    # Split features and target, then hold out 20% for testing.
    X, y = np.array(df.drop(columns='price')), np.array(df['price'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

    # Build and train the selected model.
    enc = regressors[enc_name]()
    enc.fit(X_train, y_train)

    # Predict on both partitions so train and test scores can be compared.
    y_train_pred = enc.predict(X_train)
    y_test_pred = enc.predict(X_test)

    # Compute each MSE once; the RMSE is derived from it.
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    print(f'{enc_name.upper()} REGRESSION EVALUATION FOR {house_apt.upper()} DATASET\n')
    print(f'Train MSE: {train_mse}, Test MSE: {test_mse}')
    print(f'Train RMSE: {train_mse ** 0.5}, Test RMSE: {test_mse ** 0.5}')
    print(f'Train MAE: {mean_absolute_error(y_train, y_train_pred)}, Test MAE: {mean_absolute_error(y_test, y_test_pred)}')
    print(f'Train R²: {r2_score(y_train, y_train_pred)}, Test R²: {r2_score(y_test, y_test_pred)} \n')
    return enc


# Create separate datasets for houses and apartments, dropping the property
# type itself plus the one column each subset does not use
df_house = (
    df_clean
    .loc[df_clean['property_type'].eq('HOUSE')]
    .drop(columns=['property_type', 'garden_sqm'])
)
df_apartment = (
    df_clean
    .loc[df_clean['property_type'].eq('APARTMENT')]
    .drop(columns=['property_type', 'surface_land_sqm'])
)

# Train and evaluate one model per property type
apply_regressor(df_house, 'house', 'xgboost')
apply_regressor(df_apartment, 'apartment', 'linear')

XGBOOST REGRESSION EVALUATION FOR HOUSE DATASET

Train MSE: 4523951504.390675, Test MSE: 6995821981.57919
Train RMSE: 67260.32637737253, Test RMSE: 83641.03049089717
Train MAE: 48488.97636386539, Test MAE: 58505.657317855876
Train R²: 0.8591169862500305, Test R²: 0.7767348334168116 

LINEAR REGRESSION EVALUATION FOR APARTMENT DATASET

Train MSE: 16575174496.117405, Test MSE: 15819416119.008022
Train RMSE: 128744.6095808186, Test RMSE: 125775.26036151951
Train MAE: 85540.53021250505, Test MAE: 86558.76612726368
Train R²: 0.38385822100079436, Test R²: 0.3860787145714297 



In [1119]:
import joblib
import pandas as pd

def save_model(enc, path='models/model_immo_eliza.pkl'):
    """Persist a fitted model to disk with joblib.

    Parameters
    ----------
    enc : object
        The model to store.
    path : str or os.PathLike, optional
        Destination file. Defaults to 'models/model_immo_eliza.pkl', the
        location the original implementation always wrote to.

    The destination directory is created when it does not exist yet, so the
    call does not fail on a fresh checkout.
    """
    import os  # local import keeps this helper self-contained

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    joblib.dump(enc, path)

def fill_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    * float columns: nulls are replaced by the column mean
    * integer columns: nulls are replaced by the column mode
    * all other columns: the marker 'MISSING' is treated as a null, and nulls
      are replaced by the mode of the remaining values

    A column that holds no usable value at all (all null / all 'MISSING') is
    left as it is. The input frame is not modified.
    """
    filled = df.copy()
    for column in filled.columns:
        series = filled[column]
        if pd.api.types.is_float_dtype(series):
            mean_value = series.mean()
            # The mean is NaN for an all-null column; nothing sensible to fill with.
            if pd.notna(mean_value):
                filled[column] = series.fillna(mean_value)
        elif pd.api.types.is_integer_dtype(series):
            modes = series.mode()
            if not modes.empty:
                filled[column] = series.fillna(modes.iloc[0])
        else:
            # Blank out the marker BEFORE taking the mode; otherwise a column
            # where 'MISSING' is the most frequent value is "filled" with itself.
            is_marker = series.eq('MISSING').fillna(False).astype(bool)
            known = series.mask(is_marker)
            modes = known.mode()
            if not modes.empty:
                filled[column] = known.fillna(modes.iloc[0])
    return filled

# Load the new, unseen data
new_data = pd.read_csv('new_data.csv')

# Select the features the models were trained on
columns = ['property_type', 'province', 'total_area_sqm', 'surface_land_sqm', 'nbr_frontages', 'fl_open_fire', 'terrace_sqm', 'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'state_building', 'primary_energy_consumption_sqm', 'cadastral_income']
# Take them from the freshly loaded frame, not from the training frame `df`
df_new = new_data[columns].copy()

# Impute missing values
df_filled = fill_missing_values(df_new)

# Encode and normalize
df_encoded = encode_categorical(df_filled)
df_normalized = normalize(df_encoded)

# Create separate datasets: APARTMENT / HOUSE
df_new_house = df_normalized[df_normalized['property_type'] == 'HOUSE'].drop(columns=['property_type', 'garden_sqm'])
df_new_apartment = df_normalized[df_normalized['property_type'] == 'APARTMENT'].drop(columns=['property_type', 'surface_land_sqm'])

# Load the trained models
# NOTE(review): no code visible in this notebook writes these two files
# (save_model defaults to 'models/model_immo_eliza.pkl') - confirm where they
# are produced. joblib files are pickles: only load files you created yourself.
loaded_model_house = joblib.load('models/model_file_house.pkl')
loaded_model_apartment = joblib.load('models/model_file_apartment.pkl')

# Predict the price per property type and collect everything in one DataFrame
predictions_house = loaded_model_house.predict(df_new_house)
df_new_house['price'] = predictions_house
predictions_apartment = loaded_model_apartment.predict(df_new_apartment)
df_new_apartment['price'] = predictions_apartment
df_combined = pd.concat([df_new_house, df_new_apartment], ignore_index=True)

display(df_combined)



FileNotFoundError: [Errno 2] No such file or directory: 'new_data.csv'