# List of project's dependencies
Below are listed imports that are needed for program to work properly commented with required install (see also requirements.txt)

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetRegressor

import sys
sys.path.append('../../code')
from functions import *

Price prediction

Data loading and preprocessing

In [7]:
#apartment rental data
price_datasets_array : list = [
    '../../data/apartments_pl_2023_08.csv',
    '../../data/apartments_pl_2023_09.csv', 
    '../../data/apartments_pl_2023_10.csv',
    '../../data/apartments_pl_2023_11.csv', 
    '../../data/apartments_pl_2023_12.csv',
    '../../data/apartments_pl_2024_01.csv', 
    '../../data/apartments_pl_2024_02.csv',
    '../../data/apartments_pl_2024_03.csv', 
    '../../data/apartments_pl_2024_04.csv' 
]

Table structure

In [8]:

numerical_columns = ['squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear', 'latitude', 'longitude', 'centreDistance', 'poiCount', 'schoolDistance', 'clinicDistance', 'postOfficeDistance', 'kindergartenDistance', 'restaurantDistance', 'collegeDistance', 'pharmacyDistance']
categorical_columns = ['city', 'type', 'ownership', 'buildingMaterial', 'condition']
boolean_columns = ['hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom']
drop_columns = []
output_column = 'price'

And use prepared functions (`/code/functions.py`) see more at (preprocessing notebook)[[Preprocessing.ipynb]]

In [15]:
data = normalize_data(
    df = load_data(price_datasets_array).drop('id', axis=1),
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns, 
    boolean_columns=boolean_columns,
    fill_method='median'
)

data = remove_exceptions(data)
y = data['price']
X = data.drop(columns=['price'])
#X_train, X_test, y_train, y_test = split_train_test(data, 'price')
#X_train, X_validate, y_train, y_validate = split_train_test(data, 'price')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
print(X_train)
print(X_test)
print(X_validate)

X_train = np.array(X_train, dtype=np.float32)
X_test  = np.array(X_test, dtype=np.float32)
X_validate = np.array(X_validate, dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32).reshape(-1, 1)
y_test  = np.array(y_test, dtype=np.float32).reshape(-1, 1)
y_validate  = np.array(y_validate, dtype=np.float32).reshape(-1, 1)


       squareMeters  rooms     floor  floorCount  buildYear  latitude  \
5514          0.288    0.4  0.035714    0.535714   0.988506       1.0   
7802          0.176    0.2  0.000000    0.000000   0.781609       0.4   
407           0.168    0.2  0.107143    0.107143   0.402299       0.8   
10571         0.272    0.4  0.071429    0.107143   0.844828       0.4   
7369          0.192    0.2  0.071429    0.107143   0.620690       0.4   
...             ...    ...       ...         ...        ...       ...   
2313          0.760    0.6  0.071429    0.071429   0.908046       0.2   
4818          0.288    0.4  0.071429    0.321429   0.747126       1.0   
4221          0.056    0.2  0.071429    0.142857   0.988506       0.2   
8253          0.272    0.4  0.035714    0.035714   0.931034       0.2   
5428          0.344    0.4  0.000000    0.000000   0.287356       1.0   

       longitude  centreDistance  poiCount  schoolDistance  ...  \
5514    0.444444          0.3125  0.056604            0.

Model training and evaluation

In [None]:
tab_regressor = TabNetRegressor()

tab_regressor.fit(X_train, y_train)
y_pred = tab_regressor.predict(X_test)

evaluate_model(y_test, y_pred)



epoch 0  | loss: 582944382758.5132|  0:00:06s
epoch 1  | loss: 582168495094.9382|  0:00:13s
epoch 2  | loss: 581097934376.7786|  0:00:19s
epoch 3  | loss: 579111597953.1323|  0:00:25s
epoch 4  | loss: 576545803236.814|  0:00:31s
epoch 5  | loss: 572857098747.4689|  0:00:37s
epoch 6  | loss: 568577221550.443|  0:00:44s
epoch 7  | loss: 563588894973.7345|  0:00:50s
epoch 8  | loss: 558430884383.7168|  0:00:56s
epoch 9  | loss: 552953568482.5485|  0:01:03s
epoch 10 | loss: 546570020773.38055|  0:01:10s
epoch 11 | loss: 540222232521.6283|  0:01:16s
epoch 12 | loss: 533133034341.9469|  0:01:23s
epoch 13 | loss: 525541199690.76105|  0:01:29s
epoch 14 | loss: 518116379475.82294|  0:01:35s
epoch 15 | loss: 509759490609.8407|  0:01:42s
epoch 16 | loss: 501198877578.1947|  0:01:48s
epoch 17 | loss: 492581076267.04425|  0:01:55s
epoch 18 | loss: 483441764696.354|  0:02:01s
epoch 19 | loss: 473762097496.354|  0:02:08s
epoch 20 | loss: 462640315836.03546|  0:02:14s
epoch 21 | loss: 451563934094.725

In [None]:
tab_regressor = TabNetRegressor(
    n_d=64,
    n_a=64,
    n_steps = 5,
    gamma=1.5,
    lambda_sparse=0,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    scheduler_params={"step_size": 10, "gamma": 0.9}
)

tab_regressor.fit(X_train, y_train)
y_pred = tab_regressor.predict(X_test)

evaluate_model(y_test, y_pred)



epoch 0  | loss: 581306381484.1769|  0:00:09s
epoch 1  | loss: 570349415904.2828|  0:00:19s
epoch 2  | loss: 548704139137.1327|  0:00:28s
epoch 3  | loss: 516544085871.00885|  0:00:37s
epoch 4  | loss: 480645580491.89374|  0:00:47s
epoch 5  | loss: 445703544034.54865|  0:00:56s
epoch 6  | loss: 404160450233.7699|  0:01:06s
epoch 7  | loss: 362517886704.14136|  0:01:16s
epoch 8  | loss: 318642293297.8407|  0:01:25s
epoch 9  | loss: 272455596195.11517|  0:01:35s
epoch 10 | loss: 230054138834.69028|  0:01:44s
epoch 11 | loss: 188021400349.45132|  0:01:54s
epoch 12 | loss: 147962155551.71677|  0:02:03s
epoch 13 | loss: 114622049905.27434|  0:02:13s
epoch 14 | loss: 88272655532.17699|  0:02:22s
epoch 15 | loss: 65394381814.93805|  0:02:32s
epoch 16 | loss: 48122866334.58407|  0:02:42s
epoch 17 | loss: 35889165049.20354|  0:02:51s
epoch 18 | loss: 26331727147.04424|  0:03:01s
epoch 19 | loss: 20918860337.8407|  0:03:10s
epoch 20 | loss: 17207990498.54867|  0:03:20s
epoch 21 | loss: 142906491

In [None]:
tab_regressor.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_validate, y_validate), (X_train, y_train)],
    max_epochs=150,
    patience=20,
    batch_size=128, 
    virtual_batch_size=16,
    num_workers=0,
    drop_last=False
)

y_pred = tab_regressor.predict(X_test)

evaluate_model(y_test, y_pred)

epoch 0  | loss: 479992070059.7028| val_0_mse: 314018267136.0| val_1_mse: 314277167104.0|  0:01:30s
epoch 1  | loss: 196609376753.91553| val_0_mse: 97419165696.0| val_1_mse: 97442349056.0|  0:02:59s
epoch 2  | loss: 50244103894.99299| val_0_mse: 28977188864.0| val_1_mse: 28946909184.0|  0:04:28s
epoch 3  | loss: 23407468201.75345| val_0_mse: 23636119552.0| val_1_mse: 23931154432.0|  0:05:57s
epoch 4  | loss: 22142042570.0147| val_0_mse: 18283622400.0| val_1_mse: 18410532864.0|  0:07:25s
epoch 5  | loss: 20606856975.19104| val_0_mse: 18564988928.0| val_1_mse: 18553438208.0|  0:08:54s
epoch 6  | loss: 19068331179.053| val_0_mse: 15517053952.0| val_1_mse: 15432919040.0|  0:10:23s
epoch 7  | loss: 18105095143.06209| val_0_mse: 20012877824.0| val_1_mse: 19971977216.0|  0:11:52s
epoch 8  | loss: 16936351711.54558| val_0_mse: 16000439296.0| val_1_mse: 15924458496.0|  0:13:21s
epoch 9  | loss: 16208254293.47383| val_0_mse: 14334036992.0| val_1_mse: 14206529536.0|  0:14:50s
epoch 10 | loss: 157



MSE: 9808369664.0
MAE: 72202.1640625
R-squared: 0.8359438267960689
