# Imports
The imports used in this notebook are listed below.

In [36]:
import sys
sys.path.append('../../code')
from functions import *

from sklearn.ensemble import RandomForestRegressor

# Price prediction
In this chapter, we predict apartment prices based on data collected over the last year.

## Data loading and preprocessing
We start by defining which datasets will be used.

In [37]:
# Monthly apartment price datasets; the file names cover 2023_08 through 2024_04.
price_datasets_array: list = [
    f'../../data/apartments_pl_{year_month}.csv'
    for year_month in (
        '2023_08', '2023_09', '2023_10', '2023_11', '2023_12',
        '2024_01', '2024_02', '2024_03', '2024_04',
    )
]

Next, we define the table structure.

In [38]:
# Columns passed to normalize_data as numerical features.
numerical_columns = [
    'squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear',
    'latitude', 'longitude', 'centreDistance', 'poiCount',
    'schoolDistance', 'clinicDistance', 'postOfficeDistance',
    'kindergartenDistance', 'restaurantDistance', 'collegeDistance',
    'pharmacyDistance',
]

# Columns passed to normalize_data as categorical features.
categorical_columns = [
    'city',
    'type',
    'ownership',
    'buildingMaterial',
    'condition',
]

# Columns passed to normalize_data as boolean features.
boolean_columns = [
    'hasParkingSpace',
    'hasBalcony',
    'hasElevator',
    'hasSecurity',
    'hasStorageRoom',
]

# No extra columns are configured for removal.
drop_columns = []

# Name of the target column.
output_column = 'price'

We then use the prepared functions from `/code/functions.py`; see the [preprocessing notebook](Preprocessing.ipynb) for more details.

In [39]:
# Load the price datasets, drop the 'id' column, and run the preprocessing
# helpers imported from functions.py. Missing values are filled using the
# 'median' fill method.
data = remove_exceptions(
    normalize_data(
        df=load_data(price_datasets_array).drop(columns='id'),
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        boolean_columns=boolean_columns,
        fill_method='median',
    )
)

# Split on the configured target column (output_column) rather than repeating
# the literal 'price', so the configuration cell stays the single source of truth.
X_train, X_test, y_train, y_test = split_train_test(data, output_column)

## Model training and evaluation

In [40]:
# Baseline run with default hyperparameters. The seed is pinned so the metrics
# are reproducible on re-run and comparable with the tuned runs, which all use
# random_state=42.
rf_model = RandomForestRegressor(random_state=42)

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

evaluate_model(y_test, y_pred)

MSE: 3649908620.5637393
MAE: 32089.91089336799
R-squared: 0.952755915360589


In [41]:
# 500 trees, depth capped at 50, 'sqrt' feature subsampling per split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    random_state=42,
    max_features='sqrt',
    max_depth=50,
    n_estimators=500,
).fit(X_train, y_train)

# Score the held-out set and report the metrics.
y_pred = rf_model.predict(X_test)
evaluate_model(y_test, y_pred)

MSE: 3738515321.293168
MAE: 33389.19193328033
R-squared: 0.9496735602441211


In [42]:
# 1000 trees, depth capped at 100, 'log2' feature subsampling per split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    random_state=42,
    max_features='log2',
    max_depth=100,
    n_estimators=1000,
).fit(X_train, y_train)

# Score the held-out set and report the metrics.
y_pred = rf_model.predict(X_test)
evaluate_model(y_test, y_pred)

MSE: 3841832830.2231593
MAE: 33885.85196065256
R-squared: 0.9479044235676543


In [43]:
# Small forest: 50 trees, depth capped at 32, 'log2' feature subsampling per split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    random_state=42,
    max_features='log2',
    max_depth=32,
    n_estimators=50,
).fit(X_train, y_train)

# Score the held-out set and report the metrics.
y_pred = rf_model.predict(X_test)
evaluate_model(y_test, y_pred)

MSE: 4020698662.565246
MAE: 35000.632723075025
R-squared: 0.9455605278330115


In [44]:
# Shallow forest: 50 trees, depth capped at 5, 'sqrt' feature subsampling per split.
# NOTE(review): the R-squared recorded for this run (about -1.02) does not line
# up with its MSE when compared with the other runs, assuming they share the
# same test split. This suggests evaluate_model may compute R-squared against
# the predictions instead of y_test (swapped arguments?) - TODO confirm in
# functions.py.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    random_state=42,
    max_features='sqrt',
    max_depth=5,
    n_estimators=50,
).fit(X_train, y_train)

# Score the held-out set and report the metrics.
y_pred = rf_model.predict(X_test)
evaluate_model(y_test, y_pred)

MSE: 37162949951.71342
MAE: 149187.6006599503
R-squared: -1.0162722255429317


In [45]:
# 500 trees, depth capped at 32, 'sqrt' feature subsampling per split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    random_state=42,
    max_features='sqrt',
    max_depth=32,
    n_estimators=500,
).fit(X_train, y_train)

# Score the held-out set and report the metrics.
y_pred = rf_model.predict(X_test)
evaluate_model(y_test, y_pred)

MSE: 3760493456.7774806
MAE: 33620.15323906389
R-squared: 0.949331663562574


# Rent prediction

In [46]:
# Monthly apartment rent datasets; the file names cover 2023_11 through 2024_04.
price_data_array_rent: list = [
    f'../../data/apartments_rent_pl_{year_month}.csv'
    for year_month in (
        '2023_11', '2023_12',
        '2024_01', '2024_02', '2024_03', '2024_04',
    )
]

In [47]:
# Table structure for the rent datasets; the values match the column lists
# defined for the price datasets earlier in this notebook.
numerical_columns = [
    'squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear',
    'latitude', 'longitude', 'centreDistance', 'poiCount',
    'schoolDistance', 'clinicDistance', 'postOfficeDistance',
    'kindergartenDistance', 'restaurantDistance', 'collegeDistance',
    'pharmacyDistance',
]

# Columns passed to normalize_data as categorical features.
categorical_columns = [
    'city',
    'type',
    'ownership',
    'buildingMaterial',
    'condition',
]

# Columns passed to normalize_data as boolean features.
boolean_columns = [
    'hasParkingSpace',
    'hasBalcony',
    'hasElevator',
    'hasSecurity',
    'hasStorageRoom',
]

# No extra columns are configured for removal.
drop_columns = []

# Name of the target column.
output_column = 'price'

In [48]:
# Load the rent datasets, drop the 'id' column, and run the preprocessing
# helpers imported from functions.py. Missing values are filled using the
# 'median' fill method.
data = remove_exceptions(
    normalize_data(
        df=load_data(price_data_array_rent).drop(columns='id'),
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        boolean_columns=boolean_columns,
        fill_method='median',
    )
)

# Split on the configured target column (output_column) rather than repeating
# the literal 'price', so the configuration cell stays the single source of truth.
X_train, X_test, y_train, y_test = split_train_test(data, output_column)

## Model training and evaluation

In [49]:
# Baseline run on the rent data with default hyperparameters. The seed is
# pinned so the metrics are reproducible on re-run and comparable with the
# tuned runs, which all use random_state=42.
rf_model = RandomForestRegressor(random_state=42)

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

evaluate_model(y_test, y_pred)

MSE: 135932.46691682245
MAE: 221.5173518748868
R-squared: 0.9175921157426181


In [50]:
# Rent model: 500 trees, depth capped at 50, 'sqrt' feature subsampling per split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    random_state=42,
    max_features='sqrt',
    max_depth=50,
    n_estimators=500,
).fit(X_train, y_train)

# Score the held-out set and report the metrics.
y_pred = rf_model.predict(X_test)
evaluate_model(y_test, y_pred)

MSE: 129959.66007977576
MAE: 219.13451196167915
R-squared: 0.9176432129762908


In [51]:
# Rent model: 1000 trees, depth capped at 100, 'log2' feature subsampling per split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    random_state=42,
    max_features='log2',
    max_depth=100,
    n_estimators=1000,
).fit(X_train, y_train)

# Score the held-out set and report the metrics.
y_pred = rf_model.predict(X_test)
evaluate_model(y_test, y_pred)

MSE: 131385.13844605247
MAE: 220.67748635187095
R-squared: 0.9160201467172926
