In [1]:
import sys
from pathlib import Path

# Make modules that live one directory above the working directory importable.
# The path is resolved to an absolute location so later imports keep working
# even if the working directory changes, and it is appended only once so
# re-running this cell does not pile up duplicate sys.path entries.
project_root = str(Path('..').resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
from build_features import build_data_sets
from build_model import check_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is silenced.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below — consider restricting the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Read the cleaned listings table; the first CSV column becomes the index.
df = pd.read_csv(
    '../data/processed/otodom_cleaned.csv',
    index_col=0,
)

# Build the train/test feature matrices and targets with the project helper.
# NOTE(review): split ratio, seeding and any feature engineering happen inside
# build_data_sets and are not visible from this notebook.
X_train, X_test, y_train, y_test = build_data_sets(
    df=df,
)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the saved output shows a minimum floor of -0.5 and a minimum
# building_age of -2 — confirm these values are expected and not data errors.
df.describe()

Unnamed: 0,price,price_m2,area_m2,rooms,floor,parking,build_yr,elevator,max_floor,balcony,backyard,terrace,building_age
count,1753.0,1753.0,1753.0,1751.0,1737.0,1753.0,1517.0,1753.0,1704.0,1753.0,1753.0,1753.0,1517.0
mean,843683.1,12696.556189,64.43522,2.816105,2.37795,0.524815,1996.787739,0.438677,5.071009,0.54421,0.115231,0.146606,25.212261
std,680896.2,5613.12549,30.839518,0.938931,2.231378,0.499526,28.309187,0.496367,4.232692,0.498184,0.319392,0.353813,28.309187
min,48000.0,3200.0,15.0,1.0,-0.5,0.0,1920.0,0.0,1.0,0.0,0.0,0.0,-2.0
25%,469000.0,9259.0,45.6,2.0,1.0,0.0,1976.0,0.0,3.0,0.0,0.0,0.0,1.0
50%,620000.0,11200.0,58.79,3.0,2.0,1.0,2009.0,0.0,4.0,1.0,0.0,0.0,13.0
75%,950000.0,14300.0,75.61,3.0,3.0,1.0,2021.0,1.0,5.0,1.0,0.0,0.0,46.0
max,5999000.0,82569.0,500.0,7.0,10.0,1.0,2024.0,1.0,34.0,1.0,1.0,1.0,102.0


In [5]:
# Base estimator for the search; random_state is pinned to 42.
gb_model = GradientBoostingRegressor(random_state=42)

# Search space: 2 losses x 3 learning rates x 2 ensemble sizes
# x 2 split thresholds x 3 tree depths = 72 candidate settings.
params = [
    dict(
        loss=["squared_error", "absolute_error"],
        learning_rate=[0.01, 0.1, 1],
        n_estimators=[100, 150],
        min_samples_split=[4, 6],
        max_depth=[3, 5, 7],
    )
]

# Exhaustive 10-fold cross-validated search, scored by negated mean absolute
# error (higher is better), with progress messages enabled.
grid_result = GridSearchCV(
    estimator=gb_model,
    param_grid=params,
    scoring="neg_mean_absolute_error",
    cv=10,
    verbose=1,
)
grid_result.fit(X_train, y_train)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


In [6]:
# Show the estimator (with its hyper-parameters) that achieved the best
# cross-validated score in the grid search.
print(grid_result.best_estimator_)

GradientBoostingRegressor(loss='absolute_error', max_depth=7,
                          min_samples_split=4, n_estimators=150,
                          random_state=42)


In [7]:
# First hand-tuned candidate: deep trees (max_depth=9).
# NOTE(review): max_depth and min_samples_split differ from the grid-search
# winner printed above (max_depth=7, min_samples_split=4) — confirm this
# deviation is intentional.
model1 = GradientBoostingRegressor(
    loss='absolute_error',
    max_depth=9,
    min_samples_split=6,
    n_estimators=150,
    random_state=42,
)

# Project helper; judging by the saved output it reports MAE, R2 and RMSE for
# the train and test sets (presumably fitting the model first — TODO confirm).
check_model(model1, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 51176 |  143253
r2	 0.95 	|  0.72
rmse	 151095 |  329343


In [8]:
# Second hand-tuned candidate: shallower trees (max_depth=4) and a larger
# min_samples_split than model1.
model2 = GradientBoostingRegressor(
    loss='absolute_error',
    max_depth=4,
    min_samples_split=7,
    n_estimators=150,
    random_state=42,
)

# Project helper; judging by the saved output it reports MAE, R2 and RMSE for
# the train and test sets (presumably fitting the model first — TODO confirm).
check_model(model2, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 123664 |  165291
r2	 0.8 	|  0.63
rmse	 308041 |  377374


# Model evaluation

In [9]:
# Predict test-set prices with model2.
# NOTE(review): relies on model2 having been fitted by check_model in the
# cell that created it — confirm when re-running on a fresh kernel.
pred = model2.predict(X_test)

# Pair each true price with its prediction (rounded to one decimal place).
eval_dct = {"true": y_test, 'pred': np.round(pred, 1)}
eval_df = pd.DataFrame(eval_dct)

# Absolute prediction error per listing.
eval_df['abs_err'] = (eval_df['true'] - eval_df['pred']).abs()

eval_df.describe()

Unnamed: 0,true,pred,abs_err
count,351.0,351.0,351.0
mean,840265.5,805097.2,165291.4
std,620365.5,465342.5,339733.6
min,260000.0,312654.0,43.0
25%,479000.0,492699.9,20211.95
50%,620000.0,639283.0,60175.1
75%,965000.0,996369.3,153106.5
max,5400000.0,3249827.0,3821088.0


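In this dataset roughly 75% of all apartments are priced below 1,000,000 PLN (the 75th percentile of `price` is 950,000 PLN), so the model's errors are examined separately below and above that threshold.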

In [10]:
# Partition the evaluation frame at a true price of 1,000,000:
# strictly below goes to low_price, at or above goes to high_price.
low_price = eval_df.loc[eval_df['true'].lt(1_000_000)]
high_price = eval_df.loc[eval_df['true'].ge(1_000_000)]

In [11]:
# Summary of true price, prediction and absolute error for the listings whose
# true price is below 1,000,000.
low_price.describe()

Unnamed: 0,true,pred,abs_err
count,272.0,272.0,272.0
mean,594375.088235,623895.3,81784.688603
std,180406.570007,230126.4,112877.405369
min,260000.0,312654.0,43.0
25%,453483.75,473468.0,15161.975
50%,556670.5,553617.2,49694.2
75%,721500.0,725475.6,91008.975
max,999350.0,1547788.0,681701.9


In [12]:
# Summary of true price, prediction and absolute error for the listings whose
# true price is 1,000,000 or more.
high_price.describe()

Unnamed: 0,true,pred,abs_err
count,79.0,79.0,79.0
mean,1686876.0,1428982.0,452808.2
std,822829.0,528160.4,604644.1
min,1000000.0,766642.0,6114.6
25%,1150000.0,1099036.0,124842.7
50%,1361713.0,1306700.0,228946.2
75%,1925440.0,1530768.0,550561.1
max,5400000.0,3249827.0,3821088.0
