# List of project's dependencies
Below are listed imports that are needed for program to work properly commented with required install (see also requirements.txt)

In [1]:
#List of imports
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

# Loading data
To properly load data you need to download dataset from [link](https://www.kaggle.com/datasets/krzysztofjamroz/apartment-prices-in-poland/data)
and paste it to `./data` directory.


In [2]:
#apartment rental data
price_data_array : list = [
    './data/apartments_pl_2023_08.csv',
    './data/apartments_pl_2023_09.csv', 
    './data/apartments_pl_2023_10.csv',
    './data/apartments_pl_2023_11.csv', 
    './data/apartments_pl_2023_12.csv',
    './data/apartments_pl_2024_01.csv', 
    './data/apartments_pl_2024_02.csv',
    './data/apartments_pl_2024_03.csv', 
    './data/apartments_pl_2024_04.csv' 
]

price_data_array_rent : list = [
    './data/apartments_rent_pl_2023_11.csv', 
    './data/apartments_rent_pl_2023_12.csv',
    './data/apartments_rent_pl_2024_01.csv', 
    './data/apartments_rent_pl_2024_02.csv',
    './data/apartments_rent_pl_2024_03.csv', 
    './data/apartments_rent_pl_2024_04.csv' 
]


def load_data(source_list: list):
    return pd.concat([pd.read_csv(data_set)for data_set in source_list])


data = load_data(price_data_array)

FileNotFoundError: [Errno 2] No such file or directory: './data/apartments_pl_2023_08.csv'

# Data statistics visualization

In [None]:
data.head()

In [None]:
data.describe()

Unnamed: 0,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,poiCount,schoolDistance,clinicDistance,postOfficeDistance,kindergartenDistance,restaurantDistance,collegeDistance,pharmacyDistance,price
count,153836.0,153836.0,126215.0,151874.0,128199.0,153836.0,153836.0,153836.0,153836.0,153688.0,153232.0,153628.0,153666.0,153432.0,149541.0,153611.0,153836.0
mean,59.106762,2.692712,3.317126,5.270599,1985.703336,52.033875,19.463143,4.327628,20.705901,0.41322,0.969583,0.517386,0.370083,0.347417,1.442457,0.360438,773112.0
std,21.576561,0.920367,2.515351,3.290654,34.070959,1.341967,1.78695,2.846466,24.42579,0.467913,0.890867,0.502589,0.450806,0.470033,1.104005,0.463813,402775.1
min,25.0,1.0,1.0,1.0,1850.0,49.978999,14.4471,0.01,0.0,0.002,0.001,0.001,0.001,0.001,0.004,0.001,150000.0
25%,44.39,2.0,2.0,3.0,1966.0,51.110218,18.51888,1.98,7.0,0.175,0.355,0.239,0.157,0.114,0.58,0.143,510000.0
50%,55.0,3.0,3.0,4.0,1994.0,52.1951,19.897,3.94,14.0,0.291,0.675,0.393,0.263,0.23,1.119,0.24,690000.0
75%,69.0,3.0,4.0,6.0,2016.0,52.41737,20.990211,6.13,24.0,0.468,1.236,0.623,0.418,0.41,2.054,0.407,919000.0
max,150.0,6.0,29.0,29.0,2024.0,54.60646,23.207128,16.94,212.0,4.946,4.999,4.97,4.961,4.985,5.0,4.992,3250000.0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 153836 entries, 0 to 19258
Data columns (total 28 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    153836 non-null  object 
 1   city                  153836 non-null  object 
 2   type                  120096 non-null  object 
 3   squareMeters          153836 non-null  float64
 4   rooms                 153836 non-null  float64
 5   floor                 126215 non-null  float64
 6   floorCount            151874 non-null  float64
 7   buildYear             128199 non-null  float64
 8   latitude              153836 non-null  float64
 9   longitude             153836 non-null  float64
 10  centreDistance        153836 non-null  float64
 11  poiCount              153836 non-null  float64
 12  schoolDistance        153688 non-null  float64
 13  clinicDistance        153232 non-null  float64
 14  postOfficeDistance    153628 non-null  float64
 15  kinder

# Preparing data

Functions to prepare data

In [None]:
def fill_na(df, column_list, method='median'):
    for column in column_list:
        fill_value: float
        match method:
            case 'median':
                fill_value = df[column].median()
            case 'mean':
                fill_value = df[column].mean()
            case 'first_value':
                fill_value = df[column][0]
            case _:
                fill_value = 0
                
        df[column] =  df[column].fillna(fill_value)


def normalize_numerical_columns(df, column_list):
    df[column_list] = (df[column_list] - df[column_list].min()) / (df[column_list].max() - df[column_list].min())


def normalize_data(df, numerical_columns=[], categorical_columns=[], boolean_columns=[], drop_columns=[], fill_method='mean'):
    df = pd.get_dummies(df, columns=categorical_columns)
    df = pd.get_dummies(df, columns=boolean_columns, drop_first=True).astype(int)

    fill_na(df, numerical_columns, fill_method)
    normalize_numerical_columns(df, numerical_columns)

In [None]:
data=data.drop('id', axis=1)

numerical_columns = ['squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear', 'latitude', 'longitude', 'centreDistance', 'poiCount', 'schoolDistance', 'clinicDistance', 'postOfficeDistance', 'kindergartenDistance', 'restaurantDistance', 'collegeDistance', 'pharmacyDistance']
categorical_columns = ['city', 'type', 'ownership', 'buildingMaterial', 'condition']
boolean_columns = ['hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom']
drop_columns = []
output_column = 'price'

normalize_data(
    data, 
    numerical_columns=numerical_columns, 
    categorical_columns=categorical_columns, 
    boolean_columns=boolean_columns,
    fill_method='median'
)

data

Unnamed: 0,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,poiCount,schoolDistance,...,ownership_udział,buildingMaterial_brick,buildingMaterial_concreteSlab,condition_low,condition_premium,hasParkingSpace_yes,hasBalcony_yes,hasElevator_yes,hasSecurity_yes,hasStorageRoom_yes
0,0.30400,0.4,0.107143,0.321429,0.747126,0.734730,0.020342,0.385115,0.042453,0.023463,...,False,False,True,False,False,True,True,True,False,True
1,0.08800,0.2,0.250000,0.321429,0.779904,0.748508,0.012853,0.126403,0.075472,0.054814,...,False,False,True,False,False,False,True,True,False,True
2,0.38416,0.4,0.035714,0.071429,0.779904,0.750568,0.012127,0.190786,0.042453,0.055218,...,False,True,False,False,False,False,False,False,False,False
3,0.50080,0.4,0.035714,0.071429,0.779904,0.746868,0.009794,0.133491,0.150943,0.034992,...,False,True,False,False,False,True,True,False,False,True
4,0.32800,0.4,0.000000,0.071429,0.779904,0.741504,0.006451,0.239811,0.004717,0.043689,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19254,0.64800,0.8,0.035714,0.035714,0.779904,0.679100,0.404918,0.025399,0.202830,0.058859,...,False,True,False,False,False,False,True,False,False,False
19255,0.58400,0.4,0.082754,0.071429,0.431034,0.679780,0.406547,0.034259,0.231132,0.022451,...,False,True,False,False,False,False,False,False,False,True
19256,0.67168,0.8,0.035714,0.107143,0.224138,0.681313,0.405655,0.063201,0.132075,0.056432,...,False,True,False,False,False,False,False,False,False,True
19257,0.20096,0.2,0.000000,0.000000,0.779904,0.680861,0.406025,0.051388,0.198113,0.034790,...,False,True,False,False,False,True,False,False,False,False


In [None]:
data.describe()

Unnamed: 0,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,poiCount,schoolDistance,clinicDistance,postOfficeDistance,kindergartenDistance,restaurantDistance,collegeDistance,pharmacyDistance,price
count,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0,153836.0
mean,0.272854,0.338542,0.082754,0.152521,0.779904,0.444061,0.572606,0.255028,0.097669,0.083176,0.193794,0.103921,0.074412,0.069506,0.287922,0.072017,773112.0
std,0.172612,0.184073,0.08137,0.116772,0.178751,0.290001,0.203989,0.168131,0.115216,0.094597,0.177894,0.101076,0.090838,0.094184,0.217871,0.092862,402775.1
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150000.0
25%,0.15512,0.2,0.035714,0.071429,0.689655,0.244458,0.464814,0.116361,0.033019,0.035194,0.070828,0.047897,0.031452,0.022673,0.117094,0.028652,510000.0
50%,0.24,0.4,0.071429,0.107143,0.779904,0.478902,0.622133,0.232132,0.066038,0.058455,0.135454,0.07909,0.053024,0.046148,0.228383,0.048087,690000.0
75%,0.352,0.4,0.107143,0.178571,0.931034,0.526935,0.746928,0.361488,0.113208,0.094256,0.246299,0.125176,0.083871,0.081862,0.404924,0.081146,919000.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3250000.0


# Prepare for fitting
### Remove 

In [None]:
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[(data['price'] >= lower_bound)&(data['price'] <= upper_bound)]

### Split into X, Y sets

In [None]:
X = data.drop('price', axis=1)
Y = data['price']

### Split into training and test sets

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Training and evaluation 
### Evaluation function

In [None]:
def evaluate_model(true_data, predicted_data):
    mse = mean_squared_error(predicted_data, true_data)
    mae = mean_absolute_error(predicted_data, true_data)
    r2 = r2_score(predicted_data, true_data)

    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'R-squared: {r2}')

### Linear Regression

In [None]:
lm = LinearRegression(fit_intercept = True)
lm.fit(X_train, Y_train)
Y_pred = lm.predict(X_test)

evaluate_model(Y_test, Y_pred)

MSE: 19868346492.39705
MAE: 105345.1257760247
R-squared: 0.6824880753393883


### Random forest regressor

In [None]:
rf_model = RandomForestRegressor(
    n_estimators=500, 
    max_depth=50, 
    max_features='sqrt', 
    random_state=42
)

rf_model.fit(X_train, Y_train)
Y_pred = rf_model.predict(X_test)

evaluate_model(Y_test, Y_pred)

In [None]:
gb_model = GradientBoostingRegressor(n_estimators=500,max_depth=50)
gb_model.fit(X_train, Y_train)
Y_pred = gb_model.predict(X_test)

evaluate_model(Y_test, Y_pred)

MSE: 5024743176.337333
MAE: 29378.873907089815
R-squared: 0.9383900991059032
