# Importing libraries and data

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Cufflinks wrapper on plotly
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot
cufflinks.go_offline()

# Set global theme

import plotly.figure_factory as ff

import plotly.graph_objects as go

## Importing data

After removing duplicate entries within the same quarter, we are left with roughly 86k entries (see `df.shape` below), each unique in terms of its collection period and offer id. Our features range from property characteristics such as size, price and number of rooms to location features such as distance to the subway, drive time to the center and the number of nearby restaurants.

In [4]:
df = pd.read_csv("price_analysis_data.csv")

In [5]:
df.shape

(86090, 72)

In [6]:
df.columns

Index(['index', 'Id', 'offer_date', 'Area', 'Price', 'latitude', 'longitude',
       'build_year', 'building_floors_num', 'rooms_num', 'lon_mod', 'lat_mod',
       'geoId', 'City', 'subdistrict', 'market', 'address',
       'Building_material', 'Building_ownership', 'Building_type',
       'Construction_status', 'floor_no', 'Heating', 'Windows_type', 'url',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_cable_television',
       'Media_types_electricity', 'Medi

In [7]:
df.head()

Unnamed: 0,index,Id,offer_date,Area,Price,latitude,longitude,build_year,building_floors_num,rooms_num,...,time_transit,restaurant_price_level,restaurant_mean_rating,restaurant_mean_popularity,restaurant_count,restaurant_ratings_count,distance_to_subway,nearest_subway,east_bank,offer_date_q
0,20204,61017653,2020-12-28,198.0,1782000.0,52.28693,21.02651,2001.0,4.0,6,...,25.216667,1.666667,4.34,201.7,10,2017,2.339944,Trocka,1,2020Q4
1,832,61234785,2020-12-28,75.0,793500.0,52.224724,21.093448,2008.0,16.0,3,...,35.983333,1.473684,4.355172,238.827586,29,6926,4.231919,Stadion Narodowy,1,2020Q4
2,7117,60981957,2020-12-28,47.6,648900.0,52.19703,20.98003,1970.0,3.0,3,...,34.566667,1.727273,4.21875,237.588235,17,4039,2.199701,Racławicka,0,2020Q4
3,1749,60822327,2020-12-28,68.9,630000.0,52.22494,21.09275,2011.0,8.0,2,...,35.983333,1.473684,4.355172,238.827586,29,6926,4.179323,Stadion Narodowy,1,2020Q4
4,1392,61247094,2020-12-28,26.0,329900.0,52.226714,21.0914,1999.0,5.0,1,...,35.983333,1.473684,4.355172,238.827586,29,6926,3.989925,Stadion Narodowy,1,2020Q4


# EDA - BI style analysis

## Mean and quantiles

In [8]:
# Quarterly unit-price statistics: mean, 20th/80th percentiles and sample size.
price_by_quarter = df.groupby("offer_date_q")["unit_price"]
df_temp = pd.DataFrame(
    {
        "unit_price": price_by_quarter.mean(),
        "top_quantile": price_by_quarter.quantile(0.8),
        "bottom_quantile": price_by_quarter.quantile(0.2),
        "sample": price_by_quarter.count(),
    }
).reset_index()

In [9]:
# Index each series to its first row (the earliest quarter, since the group-by
# sorts its keys; the chart title labels it 2020Q2) so the base period = 100.
# Rounding is applied after scaling to 100, which avoids float artefacts such
# as 101.23000000000002 and matches the per-segment index computed further below.
for col in ("unit_price", "bottom_quantile", "top_quantile"):
    df_temp[col + "_static"] = (df_temp[col] / df_temp[col].iloc[0] * 100).round(2)

In [10]:

# Absolute price level per quarter: the mean plus the 20%/80% quantile lines.
series_specs = [
    ("unit_price", "blue", "mean"),
    ("top_quantile", "green", "top quintile"),
    ("bottom_quantile", "red", "bottom quintile"),
]
data = [
    go.Scatter(
        x=df_temp.offer_date_q,
        y=df_temp[column],
        mode='lines',
        marker=dict(color=colour, size=5),
        name=label,
    )
    for column, colour, label in series_specs
]
trace0, trace1, trace2 = data

figure = go.Figure(
    data=data,
    layout=go.Layout(
        title="Price per m2 trend [PLN/m2]",
        yaxis=dict(title="Price"),
        xaxis=dict(title="Quarter", type="category"),
    ),
)
iplot(figure)



In [11]:

# Same three series as the previous chart, but using the *_static columns
# (each series indexed to its first quarter).
series_specs = [
    ("unit_price_static", "blue", "mean"),
    ("top_quantile_static", "green", "top quintile"),
    ("bottom_quantile_static", "red", "bottom quintile"),
]
data = [
    go.Scatter(
        x=df_temp.offer_date_q,
        y=df_temp[column],
        mode='lines',
        marker=dict(color=colour, size=5),
        name=label,
    )
    for column, colour, label in series_specs
]
trace0, trace1, trace2 = data

figure = go.Figure(
    data=data,
    layout=go.Layout(
        title="Price per m2 trend [2020Q2=100]",
        yaxis=dict(title="Price"),
        xaxis=dict(title="Quarter", type="category"),
    ),
)
iplot(figure)




## Sizes

In [12]:
df_temp = df[["offer_date_q","unit_price","Area"]]

In [13]:
# Segmenting data into size buckets. With pd.cut's default right-closed bins,
# areas outside (0, 200] match no bin and get NaN as their group.
bins = [0, 38, 60, 90, 200]
labels = ["small", "medium", "large", "very large"]
# `assign` returns a new frame, so the new column is not written into a column
# selection of `df` (which is what raises pandas' SettingWithCopyWarning).
df_temp = df_temp.assign(Area_group=pd.cut(df.Area, bins=bins, labels=labels, retbins=False))

In [14]:
df_graph=df_temp[["offer_date_q","unit_price",'Area_group']].groupby(["offer_date_q",'Area_group'], as_index=False).mean()

In [15]:
# Attach each size group's 2020Q2 mean price as a baseline column and express
# every quarter relative to it (2020Q2 = 100).
# The baseline frame is built with non-mutating calls: renaming/dropping in
# place on the result of `query` can raise pandas' SettingWithCopyWarning.
df_graph_Q2 = (
    df_graph.query("offer_date_q == '2020Q2'")
    .rename(columns={"unit_price": "q2_unit_price"})
    .drop(columns=["offer_date_q"])
)
df_graph = df_graph.merge(df_graph_Q2, on=["Area_group"])
df_graph["unit_price_static"] = round(df_graph.unit_price / df_graph.q2_unit_price * 100, 2)

In [16]:
# One line per size group showing the mean price per m2 in each quarter.
data = []
for area_group in df_graph.Area_group.unique():
    df_temp = df_graph.query("Area_group == '{}'".format(area_group))

    trace = go.Scatter(
        x=df_temp.offer_date_q,
        y=df_temp.unit_price,
        mode='lines',
        marker=dict(size=5),
        name=area_group,
    )
    data.append(trace)

figure = go.Figure(
    data=data,
    layout=go.Layout(
        title="Price per m2 segmented by property size",
        yaxis=dict(title="Price"),
        # The x values are quarters (offer_date_q), so the axis is labelled
        # "Quarter" like every other chart here (it previously said "Month").
        xaxis=dict(title="Quarter", type="category"),
    ),
)
iplot(figure)




In [17]:
# Indexed price trend (2020Q2 = 100), one line per size group.
data = []
for area_group in df_graph.Area_group.unique():
    df_temp = df_graph[df_graph.Area_group == area_group]
    trace = go.Scatter(
        x=df_temp.offer_date_q,
        y=df_temp.unit_price_static,
        mode='lines',
        marker=dict(size=5),
        name=area_group,
    )
    data.append(trace)

layout = go.Layout(
    title="Price per m2 trend [2020Q2=100]",
    yaxis=dict(title="Price"),
    xaxis=dict(title="Quarter", type="category"),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure)




## Location

In [18]:
df_temp = df[["offer_date_q","unit_price","distance_driving"]]

In [19]:
# Segmenting data into location buckets by distance_driving. With pd.cut's
# default right-closed bins, values outside (0, 25] match no bin and get NaN.
bins = [0, 3, 5, 10, 25]
labels = ["strict center", "near center", "moderate distance", "outskirts"]
# The column keeps the name 'Area_group' because the cells below group and plot
# by that name. `assign` returns a new frame, so nothing is written into a
# column selection of `df` (which raises pandas' SettingWithCopyWarning).
df_temp = df_temp.assign(
    Area_group=pd.cut(df.distance_driving, bins=bins, labels=labels, retbins=False)
)

In [20]:
df_temp.groupby('Area_group').count()

Unnamed: 0_level_0,offer_date_q,unit_price,distance_driving
Area_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
strict center,8883,8883,8883
near center,16454,16454,16454
moderate distance,37493,37493,37493
outskirts,23260,23260,23260


In [21]:
df_graph=df_temp[["offer_date_q","unit_price",'Area_group']].groupby(["offer_date_q",'Area_group'], as_index=False).mean()

In [22]:
# Attach each location group's 2020Q2 mean price as a baseline column and
# express every quarter relative to it (2020Q2 = 100).
# The baseline frame is built with non-mutating calls: renaming/dropping in
# place on the result of `query` can raise pandas' SettingWithCopyWarning.
df_graph_Q2 = (
    df_graph.query("offer_date_q == '2020Q2'")
    .rename(columns={"unit_price": "q2_unit_price"})
    .drop(columns=["offer_date_q"])
)
df_graph = df_graph.merge(df_graph_Q2, on=["Area_group"])
df_graph["unit_price_static"] = round(df_graph.unit_price / df_graph.q2_unit_price * 100, 2)

In [23]:
# Mean price per m2 in each quarter, one line per distance-to-center group.
data = []
for area_group in df_graph.Area_group.unique():
    df_temp = df_graph[df_graph.Area_group == area_group]
    trace = go.Scatter(
        x=df_temp.offer_date_q,
        y=df_temp.unit_price,
        mode='lines',
        marker=dict(size=5),
        name=area_group,
    )
    data.append(trace)

layout = go.Layout(
    title="Price per m2 segmented by distance to center",
    yaxis=dict(title="Price"),
    xaxis=dict(title="Quarter", type="category"),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure)




In [24]:
# Indexed price trend (2020Q2 = 100), one line per distance-to-center group.
data = []
for area_group in df_graph.Area_group.unique():
    df_temp = df_graph[df_graph.Area_group == area_group]
    trace = go.Scatter(
        x=df_temp.offer_date_q,
        y=df_temp.unit_price_static,
        mode='lines',
        marker=dict(size=5),
        name=area_group,
    )
    data.append(trace)

layout = go.Layout(
    title="Price per m2 trend [2020Q2=100]",
    yaxis=dict(title="Price"),
    xaxis=dict(title="Quarter", type="category"),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure)




# Transforming data for models

## Converting categorical features to one-hot columns

In order to analyze feature importance or use our data in models, we need to transform all categorical features to one-hot columns. 

In [25]:
unique_features=["district","Building_material","Building_type","Construction_status","Heating",
"Windows_type","market","Building_ownership",'offer_date_q']

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
df_cat=df[unique_features]

In [28]:
# One-hot encode the categorical columns. A single `fit_transform` both learns
# the categories and encodes the data, so the separate `fit` call is dropped.
enc = OneHotEncoder()
one_hot_val = enc.fit_transform(df_cat).toarray().astype(int)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever this installation provides.
if hasattr(enc, "get_feature_names_out"):
    one_hot_columns = enc.get_feature_names_out(unique_features)
else:
    one_hot_columns = enc.get_feature_names(unique_features)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [29]:
df_one_hot=pd.DataFrame(one_hot_val, columns=one_hot_columns)

In [30]:
df_one_hot.columns

Index(['district_Bemowo', 'district_Bialoleka', 'district_Bielany',
       'district_Downtown', 'district_Mokotow', 'district_Ochota',
       'district_Other', 'district_Praga', 'district_Southern Praga',
       'district_Subburbs', 'district_Targowek', 'district_Ursynow',
       'district_Wawer', 'district_Wilanow', 'district_Wlochy',
       'district_Wola', 'district_Zoliborz', 'Building_material_breezeblock',
       'Building_material_brick', 'Building_material_cellular_concrete',
       'Building_material_concrete', 'Building_material_concrete_plate',
       'Building_material_hydroton', 'Building_material_not_specified',
       'Building_material_other', 'Building_material_reinforced_concrete',
       'Building_material_silikat', 'Building_material_wood',
       'Building_type_apartment', 'Building_type_block', 'Building_type_house',
       'Building_type_infill', 'Building_type_loft',
       'Building_type_not_specified', 'Building_type_ribbon',
       'Building_type_tenement', '

## Dropping one column from each feature to avoid collinearity, as we will be working with linear regression models

First let's drop all "not_specified" columns as they do not carry any information 

In [31]:
# Positions of every one-hot column whose name contains "not_specified".
# `np.flatnonzero` always returns an integer array, so the result is a valid
# positional indexer even when nothing matches (np.asarray([]) would be a
# float array, which cannot be used to index the columns).
not_specified_idx = np.flatnonzero(
    ["not_specified" in column for column in df_one_hot.columns]
)

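We also need to drop one column for each of the district, market and offer_date features; the dropped level serves as the baseline for our linear regression model. (`nearest_subway` is listed as well, but it is not among the one-hot encoded features above, so its entry in the drop list has no effect.)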

In [32]:
# Columns to remove from the one-hot frame: every "not_specified" column plus
# one baseline level for each listed feature.
drop_collinear_cols = [
    *df_one_hot.columns[not_specified_idx],
    'district_Mokotow',
    'market_secondary',
    'nearest_subway_Swietokrzyska',
    'offer_date_q_2020Q2',
]

In [33]:
# Subtract the components of list2 from list1, keeping the order of the remaining components
def list_diff(list1, list2):
    """Return the items of ``list1`` that are not in ``list2``, in their original order."""
    return [item for item in list1 if item not in list2]

In [34]:
df_cat_columns=list_diff(df_one_hot.columns,drop_collinear_cols)

In [35]:
df_cat=df_one_hot[df_cat_columns]

In [36]:
columns_base=list_diff(df.columns, unique_features)

In [37]:
# Dropping columns not needed in our model
drop_columns_base=[
    'Price','latitude','longitude','lon_mod','lat_mod','grid_price','sample_size','City','subdistrict', 'geoId', 'address', 'floor_no',
     'nearest_subway' ]

In [38]:
columns_base=list_diff(columns_base, drop_columns_base)

In [39]:
df_base=df[columns_base]


In [40]:
df_base.columns

Index(['index', 'Id', 'offer_date', 'Area', 'build_year',
       'building_floors_num', 'rooms_num', 'url', 'Equipment_types_dishwasher',
       'Equipment_types_fridge', 'Equipment_types_furniture',
       'Equipment_types_oven', 'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_cable_television',
       'Media_types_electricity', 'Media_types_internet', 'Media_types_phone',
       'Media_types_sewage', 'Media_types_water', 'Security_types_alarm',
       'Security_types_anti_burglary_door', 'Security_types_closed_area',
       'Security_types_entryphone', 'Security_types_monitoring',
      

In [41]:
# Make sure rooms_num is numeric. `assign` builds a new frame instead of
# writing into `df_base`, which is a column selection of `df`; writing into it
# directly raises pandas' SettingWithCopyWarning.
df_base = df_base.assign(rooms_num=pd.to_numeric(df_base["rooms_num"]))

## Combining data for modeling

In [42]:
ml_data=pd.concat([df_base,df_cat],axis=1)

In [43]:
ml_data.shape

(86090, 104)

In [44]:
ml_data.head()

Unnamed: 0,index,Id,offer_date,Area,build_year,building_floors_num,rooms_num,url,Equipment_types_dishwasher,Equipment_types_fridge,...,Windows_type_aluminium,Windows_type_plastic,Windows_type_wooden,market_primary,Building_ownership_co_operative_ownership,Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe,Building_ownership_full_ownership,Building_ownership_share,offer_date_q_2020Q3,offer_date_q_2020Q4
0,20204,61017653,2020-12-28,198.0,2001.0,4.0,6,https://www.otodom.pl/pl/oferta/200-m2-2-pozio...,0,0,...,0,0,0,0,0,0,1,0,0,1
1,832,61234785,2020-12-28,75.0,2008.0,16.0,3,https://www.otodom.pl/pl/oferta/goclaw-bora-ko...,0,0,...,0,1,0,0,0,0,1,0,0,1
2,7117,60981957,2020-12-28,47.6,1970.0,3.0,3,https://www.otodom.pl/pl/oferta/3-pokoje-ochot...,0,0,...,0,0,0,0,0,0,1,0,0,1
3,1749,60822327,2020-12-28,68.9,2011.0,8.0,2,https://www.otodom.pl/pl/oferta/swietnie-skomu...,0,0,...,0,0,0,0,0,1,0,0,0,1
4,1392,61247094,2020-12-28,26.0,1999.0,5.0,1,https://www.otodom.pl/pl/oferta/kawalerka-na-p...,0,0,...,0,0,1,0,0,1,0,0,0,1


In [45]:
ml_data.columns[:]

Index(['index', 'Id', 'offer_date', 'Area', 'build_year',
       'building_floors_num', 'rooms_num', 'url', 'Equipment_types_dishwasher',
       'Equipment_types_fridge',
       ...
       'Windows_type_aluminium', 'Windows_type_plastic', 'Windows_type_wooden',
       'market_primary', 'Building_ownership_co_operative_ownership',
       'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe',
       'Building_ownership_full_ownership', 'Building_ownership_share',
       'offer_date_q_2020Q3', 'offer_date_q_2020Q4'],
      dtype='object', length=104)

# Selecting best features and splitting data

In [46]:
import plotly.express as px

# Histogram of unit prices in buckets of 1000; each bar is placed at the upper
# edge of its bucket (the first edge is dropped).
counts, bins = np.histogram(ml_data.unit_price, bins=range(0, 30000, 1000))
bins = bins[1:]
fig = px.bar(
    x=bins,
    y=counts,
    labels={'x': 'unit price PLN/m2', 'y': 'count'},
    title='Unit price distribution',
)
fig.show()

We can see that our price distribution is slightly skewed; this might make a log model more reliable.

In [47]:
# Preparing explained variable and model features.
# Keep only offers with a unit price between 5000 and 30000, take unit_price as
# the target and remove it, together with the identifier/metadata columns, from
# the features. `query` and `drop` each return a new frame, so `ml_data` stays
# untouched without an up-front copy, and nothing is dropped in place on a
# filtered frame (which can raise pandas' SettingWithCopyWarning).
filtered = ml_data.query(" unit_price<=30000 and unit_price>=5000")
y = filtered.unit_price
X = filtered.drop(columns=["unit_price", "offer_date", "Id", "url", "index"])

In [48]:
X.isna().sum().sort_values()

Area                                     0
Building_material_silikat                0
Building_material_reinforced_concrete    0
Building_material_other                  0
Building_material_hydroton               0
                                        ..
Media_types_water                        0
Media_types_sewage                       0
Media_types_phone                        0
Media_types_electricity                  0
offer_date_q_2020Q4                      0
Length: 99, dtype: int64

## Selecting K Best features

To select the best features before working with linear models, it is easiest to use an off-the-shelf selector such as `SelectKBest`. As we are facing a regression problem, `f_regression` is a suitable scoring function.

In [49]:
from sklearn.feature_selection import SelectKBest, f_regression

In [50]:
bestfeatures = SelectKBest(score_func=f_regression, k="all")

In [51]:
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)


invalid value encountered in true_divide


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in less_equal



In [52]:
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']

### Analyzing top features

In [53]:
featureScores.nlargest(50,'Score')

Unnamed: 0,Specs,Score
36,distance_driving,38536.379416
38,distance_transit,37951.868939
45,distance_to_subway,31867.446802
37,time_driving,31159.45383
39,time_transit,28073.459914
43,restaurant_count,20260.722345
50,district_Downtown,17006.789644
44,restaurant_ratings_count,15122.374279
42,restaurant_mean_popularity,14506.059305
46,east_bank,11903.217308


In [54]:
top_scores = featureScores.nlargest(30,'Score').sort_values(by="Score", ascending=False)

import plotly.express as px
fig = px.bar(top_scores, x='Specs', y='Score')
fig.show()

In [55]:
## To avoid overfitting we will use top 30 features for our linear models
top_features=featureScores.nlargest(30,'Score').Specs.unique()
top_features

array(['distance_driving', 'distance_transit', 'distance_to_subway',
       'time_driving', 'time_transit', 'restaurant_count',
       'district_Downtown', 'restaurant_ratings_count',
       'restaurant_mean_popularity', 'east_bank', 'district_Bialoleka',
       'restaurant_mean_rating', 'restaurant_price_level',
       'Building_type_block', 'Building_type_tenement',
       'district_Subburbs', 'Building_type_apartment', 'market_primary',
       'Windows_type_wooden', 'Construction_status_ready_to_use',
       'district_Wola', 'build_year', 'rooms_num', 'district_Targowek',
       'Extras_types_balcony', 'Building_ownership_full_ownership',
       'Extras_types_air_conditioning', 'floor_num',
       'Building_material_brick', 'Construction_status_to_completion'],
      dtype=object)

In [56]:
X_top=X[top_features]

In [57]:
## We will use wider set of 50 top features for our DNN models as it is able to learn more complex relations between features
top_features_DNN=featureScores.nlargest(50,'Score').Specs.unique()

## Splitting into train and test sets

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=10)

In [59]:
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

X_train.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)

# Statistical approach

## Base model

In [60]:
import statsmodels.api as sm


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.


The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version



In [61]:
X_stats = X_top.copy()
X_stats[['offer_date_q_2020Q4','offer_date_q_2020Q3']] = X[['offer_date_q_2020Q4','offer_date_q_2020Q3']]

In [62]:
X_stats.drop(columns = ["time_driving","distance_transit","district_Bialoleka"],inplace=True)

In [63]:
## Changing build year data as relational to 2000 to make it easier to interpret
X_stats.build_year = X_stats.build_year-2000

In [64]:
# Adding constant - important to make your model work well
X_stats=sm.add_constant(X_stats)

In [65]:
# Selecting WLS model due to price Heteroscedasticity
# NOTE(review): no `weights` argument is passed, so statsmodels falls back to
# its default weight of 1.0 for every observation and this fit is equivalent to
# OLS. To actually address heteroscedasticity, supply weights or request robust
# standard errors (e.g. `model.fit(cov_type="HC3")`) -- TODO confirm intent.
model = sm.WLS(y,X_stats)
results = model.fit()

In [66]:

results.summary()

0,1,2,3
Dep. Variable:,unit_price,R-squared:,0.52
Model:,WLS,Adj. R-squared:,0.519
Method:,Least Squares,F-statistic:,3210.0
Date:,"Thu, 07 Jan 2021",Prob (F-statistic):,0.0
Time:,20:16:09,Log-Likelihood:,-768830.0
No. Observations:,86090,AIC:,1538000.0
Df Residuals:,86060,BIC:,1538000.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5639.1540,296.412,19.025,0.000,5058.190,6220.118
distance_driving,-146.3122,3.531,-41.441,0.000,-153.232,-139.392
distance_to_subway,-215.8042,6.555,-32.925,0.000,-228.651,-202.957
time_transit,-6.2527,1.377,-4.542,0.000,-8.951,-3.554
restaurant_count,9.2051,0.618,14.884,0.000,7.993,10.417
district_Downtown,1334.0148,31.677,42.113,0.000,1271.929,1396.101
restaurant_ratings_count,-0.0130,0.001,-20.346,0.000,-0.014,-0.012
restaurant_mean_popularity,1.0130,0.061,16.583,0.000,0.893,1.133
east_bank,-803.4996,20.857,-38.524,0.000,-844.379,-762.620

0,1,2,3
Omnibus:,9493.717,Durbin-Watson:,1.812
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20256.983
Skew:,0.692,Prob(JB):,0.0
Kurtosis:,4.931,Cond. No.,2560000.0


In [67]:
# Checking if selected features are statistically significant 
alpha=0.05
results.pvalues<alpha

const                                True
distance_driving                     True
distance_to_subway                   True
time_transit                         True
restaurant_count                     True
district_Downtown                    True
restaurant_ratings_count             True
restaurant_mean_popularity           True
east_bank                            True
restaurant_mean_rating               True
restaurant_price_level               True
Building_type_block                  True
Building_type_tenement               True
district_Subburbs                    True
Building_type_apartment              True
market_primary                       True
Windows_type_wooden                  True
Construction_status_ready_to_use     True
district_Wola                        True
build_year                           True
rooms_num                            True
district_Targowek                    True
Extras_types_balcony                 True
Building_ownership_full_ownership 

## Log model

In [68]:
y_log=np.log(y)

In [69]:
# Start from the base-model design matrix: X_stats already holds the constant,
# the two quarter dummies and the re-based build_year, and has the columns
# dropped for the base model removed. The previous code first built X_stats_log
# from X_top and then overwrote it with sm.add_constant(X_stats), so the log
# model was fitted on X_stats all along; this states that directly and removes
# the dead assignments.
X_stats_log = X_stats.copy()

In [70]:
X_stats_log.drop(columns = ["time_transit"],inplace=True)

In [71]:
# Selecting WLS model due to price Heteroscedasticity
# NOTE(review): no `weights` argument is passed, so statsmodels falls back to
# its default weight of 1.0 for every observation and this fit is equivalent to
# OLS on the log-price. Supply weights or request robust standard errors (e.g.
# `model.fit(cov_type="HC3")`) if heteroscedasticity matters -- TODO confirm intent.
model = sm.WLS(y_log,X_stats_log)
results = model.fit()

In [72]:
results.summary()

0,1,2,3
Dep. Variable:,unit_price,R-squared:,0.553
Model:,WLS,Adj. R-squared:,0.553
Method:,Least Squares,F-statistic:,3798.0
Date:,"Thu, 07 Jan 2021",Prob (F-statistic):,0.0
Time:,20:16:10,Log-Likelihood:,40414.0
No. Observations:,86090,AIC:,-80770.0
Df Residuals:,86061,BIC:,-80500.0
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.7733,0.025,357.839,0.000,8.725,8.821
distance_driving,-0.0150,0.000,-61.763,0.000,-0.015,-0.014
distance_to_subway,-0.0220,0.001,-43.321,0.000,-0.023,-0.021
restaurant_count,0.0009,5.1e-05,17.386,0.000,0.001,0.001
district_Downtown,0.0922,0.003,35.221,0.000,0.087,0.097
restaurant_ratings_count,-1.298e-06,5.3e-08,-24.503,0.000,-1.4e-06,-1.19e-06
restaurant_mean_popularity,8.347e-05,5.01e-06,16.670,0.000,7.37e-05,9.33e-05
east_bank,-0.0797,0.002,-47.234,0.000,-0.083,-0.076
restaurant_mean_rating,0.1580,0.006,28.146,0.000,0.147,0.169

0,1,2,3
Omnibus:,2679.71,Durbin-Watson:,1.807
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6664.174
Skew:,0.124,Prob(JB):,0.0
Kurtosis:,4.34,Cond. No.,2560000.0


In [73]:
# Checking if selected features are statistically significant 
alpha=0.05
results.pvalues<alpha

const                                True
distance_driving                     True
distance_to_subway                   True
restaurant_count                     True
district_Downtown                    True
restaurant_ratings_count             True
restaurant_mean_popularity           True
east_bank                            True
restaurant_mean_rating               True
restaurant_price_level               True
Building_type_block                  True
Building_type_tenement               True
district_Subburbs                    True
Building_type_apartment              True
market_primary                       True
Windows_type_wooden                  True
Construction_status_ready_to_use     True
district_Wola                        True
build_year                           True
rooms_num                            True
district_Targowek                    True
Extras_types_balcony                 True
Building_ownership_full_ownership    True
Extras_types_air_conditioning     

# Neural networks model

## Splitting into train and test sets

In [74]:
X_dnn = X.query("offer_date_q_2020Q3==0 and offer_date_q_2020Q4==0")[top_features_DNN]

In [75]:
y_dnn = y[X.query("offer_date_q_2020Q3==0 and offer_date_q_2020Q4==0").index]

In [76]:
y_dnn[:5]

66991    14717.045455
66992    14290.910981
66993    10416.666667
66994     7000.000000
66995    12079.183376
Name: unit_price, dtype: float64

In [77]:
X_dnn.reset_index(inplace=True,drop=True)
y_dnn.reset_index(drop=True, inplace=True)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_dnn, y_dnn, test_size=0.2, random_state=10)

In [79]:
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

X_train.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)

## Transforming and scaling data for NN

In [80]:
def DNN_plot_loss(history, starting_epoch):
    """Plot training and validation loss per epoch, from ``starting_epoch`` onwards.

    ``history`` must expose ``history.history['loss']``,
    ``history.history['val_loss']`` and ``history.epoch``. The figure is shown
    with ``iplot``; nothing is returned.

    Note: a later cell in this notebook defines ``DNN_plot_loss`` again with an
    additional parameter, which replaces this version once that cell has run.
    """
    epochs = history.epoch[starting_epoch:]

    def loss_trace(key, colour, label):
        # One line trace for the requested entry of history.history.
        return go.Scatter(
            y=history.history[key][starting_epoch:],
            x=epochs,
            mode='lines',
            marker=dict(color=colour, size=5, opacity=0.5),
            name=label,
        )

    legend_font = dict(family="sans-serif", size=12, color="black")
    layout = go.Layout(
        title="Learning curve",
        yaxis=dict(title="Loss"),
        xaxis=dict(title="Epoch", range=(starting_epoch, history.epoch[-1])),
        legend=dict(x=1, y=1, traceorder="normal", font=legend_font, bgcolor=None),
    )
    figure = go.Figure(
        data=[
            loss_trace('loss', "blue", "Training Loss"),
            loss_trace('val_loss', "red", "Validation Loss"),
        ],
        layout=layout,
    )
    iplot(figure)

In [81]:
y_train=np.asarray(y_train).reshape(-1,1)
y_train.shape

(15279, 1)

In [82]:
y_test=np.asarray(y_test).reshape(-1,1)
y_test.shape

(3820, 1)

In [83]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [84]:
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [85]:
X_train.shape

(15279, 50)

In [86]:
X_test.shape

(3820, 50)

## DNN Model

In [87]:
def DNN_plot_loss(history, starting_epoch, previous_val_loss=None, loss_range=(900, 1300)):
    """Plot the learning curve, optionally with a reference line for earlier models.

    Parameters
    ----------
    history : object exposing ``history.history['loss']``,
        ``history.history['val_loss']`` and ``history.epoch``.
    starting_epoch : int
        First epoch (position in ``history.epoch``) to include in the plot.
    previous_val_loss : scalar or sequence of numbers, optional
        Validation losses of previous models. Their minimum is drawn as a
        horizontal grey line. When omitted (or empty) the line is skipped, so
        the function can also be called with just ``(history, starting_epoch)``
        like the earlier two-argument definition of the same name.
    loss_range : (low, high) or None, optional
        Fixed y-axis range. Defaults to the previously hard-coded (900, 1300);
        pass ``None`` to let plotly auto-scale the axis.

    The figure is shown with ``iplot``; nothing is returned.
    """
    epochs = history.epoch[starting_epoch:]

    def _line(values, colour, label, **marker_extra):
        # Line trace over the plotted epochs; marker_extra carries e.g. opacity.
        return go.Scatter(
            y=values,
            x=epochs,
            mode='lines',
            marker=dict(color=colour, size=5, **marker_extra),
            name=label,
        )

    data = [
        _line(history.history['loss'][starting_epoch:], "blue", "Training Loss", opacity=0.5),
        _line(history.history['val_loss'][starting_epoch:], "red", "Validation Loss", opacity=0.5),
    ]

    # Reference line: the best (lowest) validation loss of earlier models.
    # Skipped when no previous losses are supplied, because min() of an empty
    # array raises ValueError.
    previous = np.asarray([] if previous_val_loss is None else previous_val_loss).ravel()
    if previous.size:
        data.append(
            _line([previous.min()] * len(epochs), "grey", "Lowest error from previous models")
        )

    yaxis = dict(title="Loss")
    if loss_range is not None:
        yaxis["range"] = tuple(loss_range)

    figure = go.Figure(
        data=data,
        layout=go.Layout(
            title="Learning curve",
            yaxis=yaxis,
            xaxis=dict(title="Epoch", range=(starting_epoch, history.epoch[-1])),
            legend=dict(
                x=0.57,
                y=1,
                traceorder="normal",
                font=dict(family="sans-serif", size=12, color="black"),
                bgcolor=None,
            ),
        ),
    )
    iplot(figure)

In [88]:
def performance_summary(model, X_test, y_test ):
    """Build a per-observation error table for ``model`` and print headline metrics.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method returning one prediction per row
        (either shape ``(n,)`` or ``(n, 1)``).
    X_test : array-like
        Features handed to ``model.predict`` unchanged.
    y_test : array-like
        True target values, in the same row order as ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per observation with the columns ``y_hat``, ``y_true``,
        ``abs_error`` (``|y_true - y_hat|``), ``error`` (``y_hat - y_true``),
        ``relative_error``, ``relative_abs_error`` (both divided by
        ``y_true``), ``diff_to_benchmark`` (``y_true / y_hat - 1``) and
        ``variation_from_benchmark`` (``y_true - y_hat``).

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.
    """
    # Match predictions and targets by position. Assigning a pandas Series
    # would align on its index labels instead, which gives NaN / shuffled
    # targets whenever y_test keeps the labels of a shuffled split.
    y_hat = np.asarray(model.predict(X_test)).ravel()
    y_true = np.asarray(y_test).ravel()

    df_summary = pd.DataFrame({"y_hat": y_hat, "y_true": y_true})
    df_summary["abs_error"] = np.abs(df_summary.y_true - df_summary.y_hat)
    df_summary["error"] = df_summary.y_hat - df_summary.y_true
    df_summary["relative_error"] = df_summary["error"] / df_summary.y_true
    df_summary["relative_abs_error"] = df_summary["abs_error"] / df_summary.y_true
    df_summary["diff_to_benchmark"] = df_summary.y_true / df_summary.y_hat - 1
    df_summary["variation_from_benchmark"] = df_summary.y_true - df_summary.y_hat

    # Mean of a boolean mask is the share of True values; unlike count / n it
    # does not raise on an empty table.
    share_within_5pct = (df_summary["relative_abs_error"] < 0.05).mean() * 100

    print("{:.2f}% : Share of forecasts within 5% absolute error\n".format(share_within_5pct))
    print("{:.2f}   : Mean absolute error \n".format(df_summary.abs_error.mean()))
    print("{:.2f}% : Mean absolute percentage error\n".format(df_summary.relative_abs_error.mean()*100))

    return df_summary



In [89]:
# Reset Keras' global state and fix the TensorFlow seed before building the network.
tf.keras.backend.clear_session()
tf.random.set_seed(60)

# (units, dropout rate) of every Dense -> LeakyReLU -> BatchNormalization -> Dropout
# stage, listed in the order the stages are stacked.
hidden_stages = [(1024, 0.5), (512, 0.3), (512, 0.3), (256, 0.2), (256, 0.02)]

network_layers = []
for stage_idx, (n_units, drop_rate) in enumerate(hidden_stages):
    if stage_idx == 0:
        # Only the first Dense layer declares the input width (number of feature columns).
        dense_layer = keras.layers.Dense(n_units, input_dim=X_train.shape[1])
    else:
        dense_layer = keras.layers.Dense(n_units)
    network_layers.extend([
        dense_layer,
        keras.layers.LeakyReLU(),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(drop_rate),
    ])

# Final stage has no batch normalisation, followed by a single linear output unit.
network_layers.extend([
    keras.layers.Dense(128),
    keras.layers.LeakyReLU(),
    keras.layers.Dropout(0.01),
    keras.layers.Dense(1, activation="linear"),
])

model = keras.models.Sequential(network_layers, name="Learning_rate_decay")


In [90]:
model.summary()

Model: "Learning_rate_decay"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              52224     
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 1024)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 512)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 512)       

In [91]:
# Calculating learning rate decay
# Previews the inverse-time decay lr / (1 + decay * optimizer_steps), evaluated at
# the start of every epoch, for the learning rate and decay given to Adam in the training cell.
learning_rate = 0.005
decay = 5e-4
# NOTE(review): the preview spans 400 epochs while the training cell fits for 200 — confirm which is intended.
n_epochs=400
# Batch size 1024 as in model.fit. Keras also runs the last, partially filled
# batch, so one epoch is ceil(n / batch_size) optimizer steps, not floor.
n_steps_per_epoch = int(np.ceil(len(X_train) / 1024))
epochs = np.arange(n_epochs)
lrs = learning_rate / (1 + decay * epochs * n_steps_per_epoch)

In [92]:
# Visualizing learning rate decay

# Single red line: learning rate (y) for every epoch index (x).
trace0 = go.Scatter(
    x=epochs,
    y=lrs,
    mode='lines',
    marker={"color": "red", "size": 5, "opacity": 0.5},
)
data = [trace0]

legend_font = dict(family="sans-serif", size=12, color="black")
layout = go.Layout(
    title="Learning rate decay",
    xaxis=dict(title="Epoch"),
    yaxis=dict(title="Learning rate"),
    legend=dict(x=1, y=1, traceorder="normal", font=legend_font, bgcolor=None),
)

figure = go.Figure(data=data, layout=layout)
iplot(figure)

In [93]:
%%time
#Added learning rate decay to Adam optimizer
optimizer = keras.optimizers.Adam(lr=0.005, decay=5e-4)



model.compile(optimizer=optimizer, warm_start=False, 
            loss='mean_absolute_error')


history = model.fit(X_train, y_train,
                    epochs=200, batch_size=1024,
                    validation_data=(X_test, y_test), 
                    verbose=1)

Train on 15279 samples, validate on 3820 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200


Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200


Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
Wall time: 59.3 s


In [94]:
# Learning curve from epoch 10 onwards.
# NOTE(review): previous_val_loss=10 lies far below the loss axis that DNN_plot_loss
# fixes by default (900-1300), so the "previous models" reference line is off-screen;
# looks like a placeholder — confirm the intended baseline.
DNN_plot_loss(history,starting_epoch=10, previous_val_loss=10)

In [95]:
df_summary = performance_summary(model, X_test, y_test )

41.94% : Share of forecasts within 5% absolute error

998.89   : Mean absolute error 

8.88% : Mean absolute percentage error



In [96]:
df_summary.describe()

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error,diff_to_benchmark,variation_from_benchmark
count,3820.0,3820.0,3820.0,3820.0,3820.0,3820.0,3820.0,3820.0
mean,11039.555664,11108.809011,998.892819,-69.252725,0.008904,0.088777,0.006246,69.252725
std,2194.90625,2625.90009,1050.360156,1447.93211,0.127785,0.092331,0.124089,1447.93211
min,6179.633789,5083.056478,0.746462,-8812.225586,-0.440611,0.000117,-0.608214,-8969.494683
25%,9420.509277,9181.53699,281.160226,-692.385901,-0.059531,0.026716,-0.062338,-666.127876
50%,10873.270996,10863.034483,674.856529,17.549732,0.001703,0.06344,-0.0017,-17.549732
75%,12363.777832,12534.714388,1340.494061,666.127876,0.066482,0.120356,0.063299,692.385901
max,20203.171875,20000.0,8969.494683,8969.494683,1.552413,1.552413,0.787666,8812.225586


In [97]:
df_summary

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error,diff_to_benchmark,variation_from_benchmark
0,13287.498047,11414.285714,1873.212333,1873.212333,0.164111,0.164111,-0.140976,-1873.212333
1,8109.391113,8210.937500,101.546387,-101.546387,-0.012367,0.012367,0.012522,101.546387
2,11913.771484,11978.465680,64.694195,-64.694195,-0.005401,0.005401,0.005430,64.694195
3,11720.905273,10592.631414,1128.273859,1128.273859,0.106515,0.106515,-0.096262,-1128.273859
4,7944.010254,7992.916175,48.905921,-48.905921,-0.006119,0.006119,0.006156,48.905921
...,...,...,...,...,...,...,...,...
3815,14278.571289,13700.000000,578.571289,578.571289,0.042231,0.042231,-0.040520,-578.571289
3816,8423.924805,7600.000000,823.924805,823.924805,0.108411,0.108411,-0.097808,-823.924805
3817,10344.359375,10599.651655,255.292280,-255.292280,-0.024085,0.024085,0.024679,255.292280
3818,9394.470703,9169.550173,224.920530,224.920530,0.024529,0.024529,-0.023942,-224.920530


In [98]:
df_summary

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error,diff_to_benchmark,variation_from_benchmark
0,13287.498047,11414.285714,1873.212333,1873.212333,0.164111,0.164111,-0.140976,-1873.212333
1,8109.391113,8210.937500,101.546387,-101.546387,-0.012367,0.012367,0.012522,101.546387
2,11913.771484,11978.465680,64.694195,-64.694195,-0.005401,0.005401,0.005430,64.694195
3,11720.905273,10592.631414,1128.273859,1128.273859,0.106515,0.106515,-0.096262,-1128.273859
4,7944.010254,7992.916175,48.905921,-48.905921,-0.006119,0.006119,0.006156,48.905921
...,...,...,...,...,...,...,...,...
3815,14278.571289,13700.000000,578.571289,578.571289,0.042231,0.042231,-0.040520,-578.571289
3816,8423.924805,7600.000000,823.924805,823.924805,0.108411,0.108411,-0.097808,-823.924805
3817,10344.359375,10599.651655,255.292280,-255.292280,-0.024085,0.024085,0.024679,255.292280
3818,9394.470703,9169.550173,224.920530,224.920530,0.024529,0.024529,-0.023942,-224.920530


In [99]:
import plotly.express as px

# Express the test-set diff in percent and shift it by half a bin width (2.5),
# so that each 5-point bin, labelled by its right edge, is centred on that label.
shifted_pct_diff = df_summary.diff_to_benchmark*100-2.5
counts, bin_edges = np.histogram(shifted_pct_diff, bins=range(-100, 100, 5))
# Drop the left-most edge: one label (the right edge) per bin.
bins = bin_edges[1:]

fig = px.bar(x=bins, y=counts, labels={'x': 'diff to benchmark', 'y':'count'})
fig.show()

In [100]:
ypred=model.predict(X_test)

# Benchmarking for current offers batch

In [101]:
# Rows whose offer_date_q_2020Q4 flag equals 1, renumbered from 0 so that
# positional results (e.g. model predictions) line up with the frame's index.
X_current = ml_data.query("offer_date_q_2020Q4==1").reset_index(drop=True)

In [102]:
y_b=X_current.unit_price
X_b = X_current[top_features_DNN]

In [103]:
X_b=scaler.transform(X_b)

In [104]:
y_benchmark = model.predict(X_b)

In [105]:
# Identification and price columns of the current offers. Copied explicitly because
# later cells add columns to df_benchmark; writing into a column slice of X_current
# would trigger pandas' SettingWithCopyWarning.
df_benchmark = X_current[["Id","Area","unit_price","url"]].copy()

In [106]:
df_benchmark["unit_price_hat"] = model.predict(X_b)

In [107]:
# Keep offers with a unit price of at most 30000. The explicit copy makes the result
# an independent frame, so the column assignments in the following cells do not write
# into a filtered slice (SettingWithCopyWarning).
df_benchmark = df_benchmark.query("unit_price<=30000").copy()

In [108]:
df_benchmark["price"] = df_benchmark.unit_price * df_benchmark.Area
df_benchmark["price_benchmark"] = df_benchmark.unit_price_hat * df_benchmark.Area

In [109]:
df_benchmark["diff_to_benchmark"] = (df_benchmark.unit_price / df_benchmark.unit_price_hat -1)*100

In [110]:
df_benchmark.describe()

Unnamed: 0,Id,Area,unit_price,unit_price_hat,price,price_benchmark,diff_to_benchmark
count,35647.0,35647.0,35647.0,35647.0,35647.0,35647.0,35647.0
mean,60904190.0,59.535511,11525.259537,11215.125977,685707.7,668846.5,2.993845
std,1114287.0,27.208517,2606.509445,2147.241699,387274.5,360805.3,13.912943
min,23260980.0,20.0,5021.676301,6172.064941,195000.0,185387.9,-68.052388
25%,60930470.0,41.6,9674.881085,9627.102539,455000.0,444625.7,-5.348915
50%,61090750.0,53.7,11189.655172,11061.323242,578344.0,566148.9,1.815859
75%,61208210.0,68.615,12937.931034,12505.039062,770000.0,763796.0,9.951717
max,61332350.0,200.0,20000.0,20656.736328,3990000.0,3598857.0,118.908025


In [111]:
df_benchmark.head()

Unnamed: 0,Id,Area,unit_price,url,unit_price_hat,price,price_benchmark,diff_to_benchmark
0,61017653,198.0,9000.0,https://www.otodom.pl/pl/oferta/200-m2-2-pozio...,8685.285156,1782000.0,1719686.0,3.623541
1,61234785,75.0,10580.0,https://www.otodom.pl/pl/oferta/goclaw-bora-ko...,9912.948242,793500.0,743471.1,6.729096
2,60981957,47.6,13632.352941,https://www.otodom.pl/pl/oferta/3-pokoje-ochot...,10592.106445,648900.0,504184.3,28.702945
3,60822327,68.9,9143.686502,https://www.otodom.pl/pl/oferta/swietnie-skomu...,10245.456055,630000.0,705911.9,-10.753739
4,61247094,26.0,12688.461538,https://www.otodom.pl/pl/oferta/kawalerka-na-p...,11113.286133,329900.0,288945.4,14.173804


In [112]:
import plotly.express as px

# diff_to_benchmark is already in percent here. Shifting by half a bin width (2.5)
# centres each 5-point bin on the right-edge label used for the bars.
shifted_diff = df_benchmark.diff_to_benchmark - 2.5

# The fixed edges -100..95 silently dropped every offer priced more than ~97.5 %
# above its benchmark. Keep that grid as the minimum extent and widen it, in steps
# of 5, until it covers the observed values.
low_edge, high_edge = -100, 95
if shifted_diff.notna().any():
    low_edge = min(low_edge, int(np.floor(shifted_diff.min() / 5)) * 5)
    high_edge = max(high_edge, int(np.ceil(shifted_diff.max() / 5)) * 5)

counts, bins = np.histogram(shifted_diff, bins=np.arange(low_edge, high_edge + 5, 5))
bins = bins[1:]
fig = px.bar(x=bins, y=counts, labels={'x': 'diff to benchmark', 'y':'count'})
fig.show()

In [113]:
df_benchmark.sort_values(by="diff_to_benchmark", ascending=False)

Unnamed: 0,Id,Area,unit_price,url,unit_price_hat,price,price_benchmark,diff_to_benchmark
3228,59910643,164.00,14939.024390,https://www.otodom.pl/pl/oferta/6-pok-160-m-og...,6824.338379,2450000.0,1.119191e+06,118.908025
6497,60355470,50.00,20000.000000,https://www.otodom.pl/pl/oferta/3-pokoje-z-2-b...,9228.772461,1000000.0,4.614386e+05,116.713545
5401,61131693,90.59,18974.964124,https://www.otodom.pl/pl/oferta/apartament-ul-...,8858.880859,1718942.0,8.025260e+05,114.191436
10592,61221577,45.00,18444.444444,https://www.otodom.pl/pl/oferta/piekne-mieszka...,8619.814453,830000.0,3.878917e+05,113.977279
4446,61254740,90.56,18981.250000,https://www.otodom.pl/pl/oferta/mieszkanie-w-p...,9299.240234,1718942.0,8.421392e+05,104.116138
...,...,...,...,...,...,...,...,...
35007,61046235,112.00,5348.214286,not_specified,13293.588867,599000.0,1.488882e+06,-59.768469
7816,61332176,80.00,5487.500000,https://www.otodom.pl/pl/oferta/unikatowy-apar...,14102.091797,439000.0,1.128167e+06,-61.087333
18465,60753538,80.00,5487.500000,https://www.otodom.pl/oferta/unikatowy-apartam...,14715.865234,439000.0,1.177269e+06,-62.710314
30087,60902429,184.50,5414.634146,not_specified,14874.618164,999000.0,2.744367e+06,-63.598164


In [114]:
#df_benchmark.query("monthly_price>2500 and diff_to_benchmark >20 and Area < 60").to_excel("price_benchmarking.xlsx")

In [115]:
(df_benchmark["diff_to_benchmark"]*df_benchmark.unit_price/100).max()

23342.70908650902

In [116]:
(df_benchmark["diff_to_benchmark"]*df_benchmark.unit_price/100).mean()

543.2057369607129

In [117]:
# Largest and mean per-offer value of diff_to_benchmark (in %) * unit_price / 100,
# i.e. (unit_price / unit_price_hat - 1) * unit_price.
# NOTE(review): if the absolute gap unit_price - unit_price_hat is what is meant,
# the factor should be unit_price_hat rather than unit_price — confirm intent.
(df_benchmark["diff_to_benchmark"]*df_benchmark.unit_price/100).max()
(df_benchmark["diff_to_benchmark"]*df_benchmark.unit_price/100).mean()

23342.70908650902

543.2057369607129

In [118]:
df_benchmark.describe()

Unnamed: 0,Id,Area,unit_price,unit_price_hat,price,price_benchmark,diff_to_benchmark
count,35647.0,35647.0,35647.0,35647.0,35647.0,35647.0,35647.0
mean,60904190.0,59.535511,11525.259537,11215.125977,685707.7,668846.5,2.993845
std,1114287.0,27.208517,2606.509445,2147.241699,387274.5,360805.3,13.912943
min,23260980.0,20.0,5021.676301,6172.064941,195000.0,185387.9,-68.052388
25%,60930470.0,41.6,9674.881085,9627.102539,455000.0,444625.7,-5.348915
50%,61090750.0,53.7,11189.655172,11061.323242,578344.0,566148.9,1.815859
75%,61208210.0,68.615,12937.931034,12505.039062,770000.0,763796.0,9.951717
max,61332350.0,200.0,20000.0,20656.736328,3990000.0,3598857.0,118.908025


In [119]:
df_summary_q3 = performance_summary(model, X_b, y_b)

35.30% : Share of forecasts within 5% absolute error

1182.16   : Mean absolute error 

9.88% : Mean absolute percentage error



In [120]:
df_summary = performance_summary(model, X_test, y_test)

41.94% : Share of forecasts within 5% absolute error

998.89   : Mean absolute error 

8.88% : Mean absolute percentage error



In [121]:
df_summary_q3.describe()

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error,diff_to_benchmark,variation_from_benchmark
count,35647.0,35647.0,35647.0,35647.0,35647.0,35647.0,35647.0,35647.0
mean,11215.125977,11525.259537,1182.162886,-310.146633,-0.011629,0.09882299,0.029938,310.146633
std,2147.241699,2606.509445,1171.308806,1635.028741,0.134216,0.09155888,0.139129,1635.028741
min,6172.064941,5021.676301,0.001033,-10771.227539,-0.543187,1.210068e-07,-0.680524,-11502.671875
25%,9627.102539,9674.881085,367.843758,-1082.296829,-0.09051,0.03485531,-0.053489,-585.986465
50%,11061.323242,11189.655172,828.648112,-186.319851,-0.017835,0.07511787,0.018159,186.319851
75%,12505.039062,12937.931034,1601.314301,585.986465,0.056512,0.1378735,0.099517,1082.296829
max,20656.736328,20000.0,11502.671875,11502.671875,2.130124,2.130124,1.18908,10771.227539


In [122]:
df_summary.describe()

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error,diff_to_benchmark,variation_from_benchmark
count,3820.0,3820.0,3820.0,3820.0,3820.0,3820.0,3820.0,3820.0
mean,11039.555664,11108.809011,998.892819,-69.252725,0.008904,0.088777,0.006246,69.252725
std,2194.90625,2625.90009,1050.360156,1447.93211,0.127785,0.092331,0.124089,1447.93211
min,6179.633789,5083.056478,0.746462,-8812.225586,-0.440611,0.000117,-0.608214,-8969.494683
25%,9420.509277,9181.53699,281.160226,-692.385901,-0.059531,0.026716,-0.062338,-666.127876
50%,10873.270996,10863.034483,674.856529,17.549732,0.001703,0.06344,-0.0017,-17.549732
75%,12363.777832,12534.714388,1340.494061,666.127876,0.066482,0.120356,0.063299,692.385901
max,20203.171875,20000.0,8969.494683,8969.494683,1.552413,1.552413,0.787666,8812.225586


## Variation to benchmark distribution

In [123]:
# Overlaid, percent-normalised histograms of diff_to_benchmark (y_true / y_hat - 1,
# a fraction) for the test-set summary and the current-offers summary.
# Both traces share one binning so the bars are directly comparable.
shared_xbins = dict(size = 0.05, start = -0.525, end=0.5)

trace0 = go.Histogram(
    x = df_summary.diff_to_benchmark,
    histnorm = 'percent',
    xbins = shared_xbins,
    marker = dict(color="blue", opacity=0.5),
    name = "2020Q2",
)

trace1 = go.Histogram(
    x = df_summary_q3.diff_to_benchmark,
    histnorm = 'percent',
    xbins = shared_xbins,
    marker = dict(color="red", opacity=0.5),
    name = "2020Q4",
)

data = [trace0,trace1]

figure = go.Figure(
    data = data,
    layout = go.Layout(
        barmode = "overlay",
        # The plotted quantity is the variation to benchmark, not the price,
        # so the former title "Price distribution" was misleading.
        title = "Variation to benchmark distribution",
        xaxis = dict(title="diff to benchmark (y_true / y_hat - 1)"),
        yaxis = dict(title="Share of observations [%]"),
    ),
)

iplot(figure)