# List of project's dependencies
Below are listed imports that are needed for program to work properly commented with required install (see also requirements.txt)

In [46]:
#List of imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

# Loading data
To properly load data you need to download dataset from [link](https://www.kaggle.com/datasets/krzysztofjamroz/apartment-prices-in-poland/data)
and paste it to `./data` directory.

In [47]:
#apartment rental data
price_data_array : list = [
    './data/apartments_pl_2023_08.csv',
    './data/apartments_pl_2023_09.csv', 
    './data/apartments_pl_2023_10.csv',
    './data/apartments_pl_2023_11.csv', 
    './data/apartments_pl_2023_12.csv',
    './data/apartments_pl_2024_01.csv', 
    './data/apartments_pl_2024_02.csv',
    './data/apartments_pl_2024_03.csv', 
    './data/apartments_pl_2024_04.csv' 
]

price_data_array_rent : list = [
    './data/apartments_rent_pl_2023_11.csv', 
    './data/apartments_rent_pl_2023_12.csv',
    './data/apartments_rent_pl_2024_01.csv', 
    './data/apartments_rent_pl_2024_02.csv',
    './data/apartments_rent_pl_2024_03.csv', 
    './data/apartments_rent_pl_2024_04.csv' 
]

data = pd.concat([pd.read_csv(data_set)for data_set in price_data_array])
data.head()

Unnamed: 0,id,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
0,f8524536d4b09a0c8ccc0197ec9d7bde,szczecin,blockOfFlats,63.0,3.0,4.0,10.0,1980.0,53.378933,14.625296,...,0.413,condominium,concreteSlab,,yes,yes,yes,no,yes,415000
1,accbe77d4b360fea9735f138a50608dd,szczecin,blockOfFlats,36.0,2.0,8.0,10.0,,53.442692,14.55969,...,0.205,cooperative,concreteSlab,,no,yes,yes,no,yes,395995
2,8373aa373dbc3fe7ca3b7434166b8766,szczecin,tenement,73.02,3.0,2.0,3.0,,53.452222,14.553333,...,0.28,condominium,brick,,no,no,no,no,no,565000
3,0a68cd14c44ec5140143ece75d739535,szczecin,tenement,87.6,3.0,2.0,3.0,,53.4351,14.5329,...,0.087,condominium,brick,,yes,yes,no,no,yes,640000
4,f66320e153c2441edc0fe293b54c8aeb,szczecin,blockOfFlats,66.0,3.0,1.0,3.0,,53.410278,14.503611,...,0.514,condominium,,,no,no,no,no,no,759000


# Preparing data
Functions to prepare data

In [48]:
def fill_na(df, column_list, method='median'):
    for column in column_list:
        fill_value = None
        match method:
            case 'median':
                fill_value = df[column].median()
            case 'mean':
                fill_value = df[column].mean()
            case 'first_value':
                fill_value = df[column][1]
            case 'false':
                fill_value = False
            case _:
                fill_value = 0
                
        df[column] = df[column].fillna(fill_value)


def normalize_numerical_columns(df, column_list):
    df[column_list] = (df[column_list] - df[column_list].min()) / (df[column_list].max() - df[column_list].min())

In [49]:
data=data.drop('id', axis=1)

numerical_columns = ['squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear', 'latitude', 'longitude', 'centreDistance', 'poiCount', 'schoolDistance', 'clinicDistance', 'postOfficeDistance', 'kindergartenDistance', 'restaurantDistance', 'collegeDistance', 'pharmacyDistance']
categorical_columns = ['city', 'type', 'ownership', 'buildingMaterial', 'condition']
boolean_columns = ['hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom']
drop_columns = []
output_column = 'price'

fill_na(data, numerical_columns, 'mean')
fill_na(data, boolean_columns, 'false')

data = pd.get_dummies(data, columns=categorical_columns)
data = pd.get_dummies(data, columns=boolean_columns, drop_first=True).astype(int)

normalize_numerical_columns(data, numerical_columns)

data

Unnamed: 0,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,poiCount,schoolDistance,...,buildingMaterial_brick,buildingMaterial_concreteSlab,condition_low,condition_premium,hasParkingSpace_yes,hasBalcony_yes,hasElevator_no,hasElevator_yes,hasSecurity_yes,hasStorageRoom_yes
0,0.304,0.4,0.107143,0.321429,0.747126,0.8,0.000000,0.3750,0.042453,0.0,...,0,1,0,0,1,1,0,1,0,1
1,0.088,0.2,0.250000,0.321429,0.775862,0.8,0.000000,0.1250,0.075472,0.0,...,0,1,0,0,0,1,0,1,0,1
2,0.384,0.4,0.035714,0.071429,0.775862,0.8,0.000000,0.1875,0.042453,0.0,...,1,0,0,0,0,0,1,0,0,0
3,0.496,0.4,0.035714,0.071429,0.775862,0.8,0.000000,0.1250,0.150943,0.0,...,1,0,0,0,1,1,1,0,0,1
4,0.328,0.4,0.000000,0.071429,0.775862,0.8,0.000000,0.2500,0.004717,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19254,0.648,0.8,0.035714,0.035714,0.775862,0.8,0.333333,0.0000,0.202830,0.0,...,1,0,0,0,0,1,1,0,0,0
19255,0.584,0.4,0.071429,0.071429,0.431034,0.8,0.444444,0.0000,0.231132,0.0,...,1,0,0,0,0,0,1,0,0,1
19256,0.664,0.8,0.035714,0.107143,0.224138,0.8,0.444444,0.0625,0.132075,0.0,...,1,0,0,0,0,0,1,0,0,1
19257,0.200,0.2,0.000000,0.000000,0.775862,0.8,0.444444,0.0000,0.198113,0.0,...,1,0,0,0,1,0,1,0,0,0


### Drop least relevant data

In [50]:
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[(data['price'] >= lower_bound)&(data['price'] <= upper_bound)]

### Divide data into tensors

In [51]:
X = data.drop('price', axis=1)
Y = data['price']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train = torch.tensor(np.array(X_train))
X_test  = torch.tensor(np.array(X_test))
Y_train = torch.tensor(np.array(Y_train))
Y_test  = torch.tensor(np.array(Y_test))