In [10]:
import pandas as pd
import numpy as np
import re
import json
import numpy as np
import timeit 
from datetime import datetime

In [11]:
import tensorflow as tf
from tensorflow import keras

In [12]:
# Cufflinks wrapper on plotly
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot
cufflinks.go_offline()

# Set global theme

import plotly.figure_factory as ff

import plotly.graph_objects as go

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
#Selecting a central city point to center all graphs around - Swietokrzyska Subway 
center_coors=52.235176, 21.008393

In [15]:
df=pd.read_excel("RE_models_input.xlsx")

In [16]:
df.columns

Index(['Id', 'Area', 'Price', 'latitude', 'longitude', 'build_year',
       'building_floors_num', 'rooms_num', 'City', 'subdistrict', 'market',
       'Building_material', 'Building_ownership', 'Building_type',
       'Construction_status', 'floor_no', 'Heating', 'Windows_type',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm

In [17]:
# Replace the non-numeric label "more" in rooms_num with 10; every other value is kept as is.
# NOTE(review): presumably "more" is the listing portal's open-ended top bucket — confirm that 10
# is a sensible stand-in. The resulting dtype depends on how read_excel parsed the column; if the
# remaining values are strings, cast explicitly (e.g. pd.to_numeric) before modelling.
df['rooms_num']=np.where(df['rooms_num']=="more",10,df['rooms_num'])

In [18]:
# Remove columns that are not used anywhere further down in this notebook.
columns_to_drop = ['floor_no', "City", "district_old", 'price_decrease_from_20k', 'price_decrease_per_10min']
df = df.drop(columns=columns_to_drop)

In [19]:
df.columns

Index(['Id', 'Area', 'Price', 'latitude', 'longitude', 'build_year',
       'building_floors_num', 'rooms_num', 'subdistrict', 'market',
       'Building_material', 'Building_ownership', 'Building_type',
       'Construction_status', 'Heating', 'Windows_type',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm',
       'Security_

In [20]:
# Average the 8AM outbound leg and the 5PM return leg into one distance and one time
# column per travel mode (distance_driving, distance_transit, time_driving, time_transit).
for metric in ("distance", "time"):
    for mode in ("driving", "transit"):
        outbound = df[metric + "_" + mode + "_8AM"]
        inbound = df[metric + "_return_" + mode + "_5PM"]
        df[metric + "_" + mode] = (outbound + inbound) / 2

In [21]:
# Drop the individual outbound (8AM) and return (5PM) commute legs.
commute_leg_columns = [
    'distance_transit_8AM', 'time_transit_8AM',
    'distance_driving_8AM', 'time_driving_8AM',
    'distance_return_transit_5PM', 'time_return_transit_5PM',
    'distance_return_driving_5PM', 'time_return_driving_5PM',
]
df = df.drop(columns=commute_leg_columns)

# Data exploration

## Category composition

In [22]:
# Categorical columns: explored one by one below and one-hot encoded for the models.
unique_features = [
    "district",
    "market",
    "Building_material",
    "Building_ownership",
    "Building_type",
    "Construction_status",
    "Heating",
    "Windows_type",
]

In [23]:
import plotly.express as px

In [24]:
for feature in unique_features:
    # Number of listings per category, largest first (groupby leaves out rows with a missing key).
    df_temp = (
        df[["Id", feature]]
        .groupby(feature, as_index=False)
        .count()
        .rename(columns={"Id": "share"})
        .sort_values(by="share", ascending=False)
    )
    # Convert counts to a percentage of ALL rows in df, rounded to one decimal place.
    df_temp["share"] = np.around(df_temp["share"] / df.shape[0], 3) * 100
    # Mark the "not_specified" category so it can be coloured differently in the chart.
    df_temp["colour"] = np.where(df_temp[feature] == "not_specified", "missing_data", "valid_data")

    fig = px.bar(
        df_temp,
        x=feature,
        y='share',
        color="colour",
        color_discrete_sequence=["blue", "red"],
        category_orders={"colour": ["valid_data", "missing_data"]},
    )
    print("\n Feature summary for {} - Share of category within whole sample".format(feature))
    fig.show()
    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")

    


 Feature summary for district - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for market - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Building_material - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Building_ownership - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Building_type - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Construction_status - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Heating - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Windows_type - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------





## Average unit price by feature 

In [25]:
for feature in unique_features:
    df_temp = df[["unit_price", feature]]

    print("\n Unit price distribution in split by {}".format(feature))

    # One box per category; only suspected outliers are drawn as individual points.
    fig = px.box(df_temp, y="unit_price", x=feature, points="suspectedoutliers")
    # Same y-range on every chart so the categories can be compared across features.
    fig.update_yaxes(range=[5000, 25000])
    # A figure created inside a loop body is not rendered automatically, so show it explicitly
    # (the original cell built the figure but never displayed it).
    fig.show()

    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")

    


 Unit price distribution in split by district


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by market


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Building_material


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Building_ownership


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Building_type


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Construction_status


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Heating


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Windows_type


----------------------------------------------------------------------------------------------------------------------------





# Transforming data for ML models

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
df_cat=df[unique_features]

In [28]:
# One-hot encode the categorical columns. fit_transform fits the encoder itself, so a
# separate enc.fit(...) call beforehand is redundant.
enc = OneHotEncoder()
one_hot_val = enc.fit_transform(df_cat).toarray().astype(int)

# Column labels of the form "<feature>_<category>". scikit-learn 1.0 renamed
# get_feature_names to get_feature_names_out and 1.2 removed the old name, so use
# whichever one this installation provides.
if hasattr(enc, "get_feature_names_out"):
    one_hot_columns = enc.get_feature_names_out(unique_features)
else:
    one_hot_columns = enc.get_feature_names(unique_features)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [29]:
df_one_hot=pd.DataFrame(one_hot_val, columns=one_hot_columns)

In [30]:
df_one_hot.columns

Index(['district_Bemowo', 'district_Bialoleka', 'district_Bielany',
       'district_Downtown', 'district_Mokotow', 'district_Ochota',
       'district_Praga', 'district_Southern Praga', 'district_Subburbs',
       'district_Targowek', 'district_Ursynow', 'district_Wawer',
       'district_Wilanow', 'district_Wlochy', 'district_Wola',
       'district_Zoliborz', 'market_primary', 'market_secondary',
       'Building_material_breezeblock', 'Building_material_brick',
       'Building_material_cellular_concrete', 'Building_material_concrete',
       'Building_material_concrete_plate', 'Building_material_hydroton',
       'Building_material_not_specified', 'Building_material_other',
       'Building_material_reinforced_concrete', 'Building_material_silikat',
       'Building_material_wood', 'Building_ownership_co_operative_ownership',
       'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe',
       'Building_ownership_full_ownership', 'Building_ownership_not_spec

In [31]:
# Positions of every one-hot column whose name contains "not_specified".
not_specified_idx = np.asarray(
    [position for position, column in enumerate(df_one_hot.columns) if "not_specified" in column]
)

Let's drop all "not_specified" columns, together with the `district_Mokotow` and `market_secondary` dummy columns, so that a secondary-market flat in Mokotow becomes the baseline category in the models.

In [32]:
# Dummy columns to leave out: every "not_specified" column plus the two chosen
# baseline categories (Mokotow district, secondary market).
drop_collinear_cols = list(df_one_hot.columns[not_specified_idx]) + ['district_Mokotow', 'market_secondary']

In [33]:
def list_diff(list1, list2):
    """Return the elements of ``list1`` that do not occur in ``list2``.

    Parameters
    ----------
    list1 : iterable
        Items to filter; their order and any duplicates are preserved.
    list2 : container
        Items to exclude; membership is checked with the ``in`` operator.

    Returns
    -------
    list
        A new list, leaving both inputs unchanged.
    """
    return [ele for ele in list1 if ele not in list2]

In [34]:
df_one_hot.columns

Index(['district_Bemowo', 'district_Bialoleka', 'district_Bielany',
       'district_Downtown', 'district_Mokotow', 'district_Ochota',
       'district_Praga', 'district_Southern Praga', 'district_Subburbs',
       'district_Targowek', 'district_Ursynow', 'district_Wawer',
       'district_Wilanow', 'district_Wlochy', 'district_Wola',
       'district_Zoliborz', 'market_primary', 'market_secondary',
       'Building_material_breezeblock', 'Building_material_brick',
       'Building_material_cellular_concrete', 'Building_material_concrete',
       'Building_material_concrete_plate', 'Building_material_hydroton',
       'Building_material_not_specified', 'Building_material_other',
       'Building_material_reinforced_concrete', 'Building_material_silikat',
       'Building_material_wood', 'Building_ownership_co_operative_ownership',
       'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe',
       'Building_ownership_full_ownership', 'Building_ownership_not_spec

In [35]:
df_cat_columns=list_diff(df_one_hot.columns,drop_collinear_cols)

In [36]:
df_cat=df_one_hot[df_cat_columns]

In [37]:
columns_base=list_diff(df.columns, unique_features)

In [38]:
# Non-categorical columns to keep out of the model input. Names that are no longer
# present in df (e.g. 'City', dropped in an earlier cell) are harmless here because
# the list is only used as an exclusion filter.
drop_columns_base = [
    'Price',
    'latitude',
    'longitude',
    'lon_mod',
    'lat_mod',
    'grid_price',
    'sample_size',
    'City',
    'subdistrict',
]

In [39]:
columns_base=list_diff(columns_base, drop_columns_base)

In [40]:
df_base=df[columns_base]

In [41]:
df_base.columns

Index(['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm',
       'Security_types_anti_burglary_door', 'Security_types_closed_area',
       'Security_types_entryphone', 'Security_types_monitoring',
       'Security_types_roller_shutters', 'floor_num', 'unit_price',


## Combining data for modeling

In [42]:
ml_data=pd.concat([df_base,df_cat],axis=1)

In [43]:
ml_data.shape

(11788, 89)

In [44]:
ml_data.head()

Unnamed: 0,Id,Area,build_year,building_floors_num,rooms_num,Equipment_types_dishwasher,Equipment_types_fridge,Equipment_types_furniture,Equipment_types_oven,Equipment_types_stove,...,Construction_status_to_completion,Construction_status_to_renovation,Heating_boiler_room,Heating_electrical,Heating_gas,Heating_other,Heating_urban,Windows_type_aluminium,Windows_type_plastic,Windows_type_wooden
0,60534950,80.1,2021,8,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60534789,52.27,2021,8,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60534636,64.63,2021,8,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,60441425,46.86,2020,14,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60225257,57.4,2019,11,3,1,1,1,1,1,...,0,0,0,0,0,0,1,0,1,0


In [45]:
ml_data.columns[:]

Index(['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm',
       'Security_types_anti_burglary_door', 'Security_types_closed_area',
       'Security_types_entryphone', 'Security_types_monitoring',
       'Security_types_roller_shutters', 'floor_num', 'unit_price',


# Selecting best features

In [46]:
# Histogram of unit prices in 1000-wide bins with edges 0, 1000, ..., 49000.
price_bin_edges = range(0, 50000, 1000)
counts, bins = np.histogram(ml_data.unit_price, bins=price_bin_edges)
# Label each bar with the upper edge of its bin.
bins = bins[1:]

fig = px.bar(x=bins, y=counts, labels={'x': 'Unit price', 'y': 'count'})
fig.show()

In [47]:
# Keep only listings with a unit price in (5000, 25000], then split the filtered frame
# into the target (unit_price) and the feature matrix (everything except target and Id).
priced = ml_data.query("unit_price<=25000 and unit_price>5000")
y = priced.unit_price
X = priced.drop(columns=["unit_price", "Id"])

In [48]:
from sklearn.feature_selection import SelectKBest, f_regression, chi2

In [49]:
bestfeatures = SelectKBest(score_func=f_regression, k="all")

In [50]:
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

In [51]:
# One row per feature: its name ("Specs") and its f_regression score ("Score").
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})

In [52]:
featureScores.nlargest(30,'Score')

Unnamed: 0,Specs,Score
35,distance_driving,6889.140723
36,distance_transit,6725.884253
38,time_transit,5360.201112
37,time_driving,4741.988679
42,district_Downtown,3516.937318
40,district_Bialoleka,1766.097573
75,Building_type_tenement,769.001425
54,market_primary,701.343385
70,Building_type_block,629.435651
46,district_Subburbs,582.320969


In [53]:
featureScores.query("Score>40").shape

(51, 2)

In [54]:
top_features=featureScores.nlargest(50,'Score').Specs.unique()

In [55]:
import plotly.express as px

# Order the features from the highest to the lowest score before plotting.
featureScores = featureScores.sort_values(by="Score", ascending=False)

fig = px.bar(featureScores, x='Specs', y='Score')
fig.show()

In [56]:
X=X[top_features]

## Splitting into train and test sets

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [128]:
# Renumber the rows of all four pieces from 0, discarding the row labels carried over
# from the frame that was split.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

# Training initial model

In [167]:
from sklearn.ensemble import RandomForestRegressor

## Hyperparameter selection

In [50]:
# Search space for the random-forest grid search: 4 * 5 * 3 * 3 * 4 = 720 combinations.
# NOTE(review): this grid fixes bootstrap=False, while the best_params_ output recorded
# further down has no "bootstrap" key and the final model is built with bootstrap=True —
# the recorded search was presumably run with an earlier version of this cell; confirm.
param_grid = [
    {
        "bootstrap": [False],
        "n_estimators": [300],
        "max_features": [10, 15, 20, 30],
        "max_depth": [10, 15, 20, 25, 30],
        "min_samples_leaf": [3, 6, 12],
        "min_samples_split": [6, 12, 24],
        "min_impurity_decrease": [0.01, 0.03, 0.1, 0.3],
    },
]

In [90]:
from sklearn.model_selection import GridSearchCV

In [91]:
forest_cl=RandomForestRegressor(random_state=10, n_jobs=2)

In [92]:
grid_search = GridSearchCV(forest_cl, param_grid, cv=4,
                          scoring="neg_mean_absolute_error",return_train_score=True)

In [93]:
grid_search.fit(X_train, y_train)
rf_clf_best_params = grid_search.best_estimator_


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=2,
                                             oob_score=False, random_...
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
  

In [94]:
grid_search.best_params_

{'max_depth': 20,
 'max_features': 30,
 'min_impurity_decrease': 0.01,
 'min_samples_leaf': 3,
 'min_samples_split': 6,
 'n_estimators': 300}

In [95]:
grid_search.best_params_.keys()

dict_keys(['max_depth', 'max_features', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'n_estimators'])

In [168]:
# Final random forest, written out explicitly with the hyper-parameters reported by the
# grid search so that the (slow) search does not have to be re-run; the alternative is
# rf_model = grid_search.best_estimator_.
#
# Only non-default arguments are passed. The original call also spelled out library
# defaults, including criterion='mse' and min_impurity_split=None, which newer
# scikit-learn releases no longer accept; leaving them out builds the same model on old
# versions and keeps the cell runnable on new ones.
rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    max_features=30,
    min_samples_leaf=3,
    min_samples_split=6,
    min_impurity_decrease=0.01,
    n_jobs=2,
    random_state=10,
)

In [169]:
rf_model.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features=30, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.01,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=6, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=2, oob_score=False,
                      random_state=10, verbose=0, warm_start=False)

In [170]:
rf_model.score(X_train,y_train)

0.8920943080190676

In [171]:
rf_model.score(X_test,y_test)

0.6994301554997804

In [172]:
# Feature importances of the fitted forest paired with the feature names, sorted
# ascending so the most important feature ends up last.
importance_df = pd.DataFrame(
    {"coefficients": rf_model.feature_importances_, "features": X.columns}
).sort_values(by="coefficients")

In [173]:

# Horizontal bar chart of the random-forest feature importances.
trace0 = go.Bar(
    x=importance_df.coefficients,
    y=importance_df.features,
    orientation="h",
    marker=dict(color="blue", opacity=0.5),
)

data = [trace0]
figure = go.Figure(
    data=data,
    layout=go.Layout(
        # The model is a RandomForestRegressor, so the title says "Regressor"
        # (it previously read "Classifier").
        title="RF Regressor - Feature importance",
        xaxis=dict(title="Importance"),
        yaxis=dict(title="Feature"),
    ),
)
iplot(figure)

In [174]:
def performance_summary(model, X_train,y_train, X_test, y_test ):
    """Build a per-observation error table for ``model`` on the test set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``; the prediction may be 1-D or a
        single-column 2-D array.
    X_train, y_train :
        Not used; kept so existing calls keep working.
    X_test :
        Features passed to ``model.predict``.
    y_test : array-like
        True target values, in the same row order as ``X_test``.

    Returns
    -------
    pandas.DataFrame
        Columns ``y_hat``, ``y_true``, ``predicted_rank``, ``abs_error``, ``error``
        (``y_hat - y_true``), ``relative_error`` and ``relative_abs_error`` (both divided
        by ``y_true``), sorted by ``predicted_rank`` with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of true values.
    """
    # Flatten both sides to plain 1-D arrays so they are paired by POSITION. Assigning a
    # Series directly would align on its index and silently produce NaN or mismatched rows
    # whenever y_test does not carry a default 0..n-1 index.
    y_hat = np.asarray(model.predict(X_test)).ravel()
    y_true = np.asarray(y_test).ravel()

    df_summary = pd.DataFrame({"y_hat": y_hat, "y_true": y_true})
    df_summary["predicted_rank"] = df_summary.y_hat.rank()
    df_summary["abs_error"] = np.abs(df_summary.y_true - df_summary.y_hat)
    df_summary["error"] = df_summary.y_hat - df_summary.y_true
    df_summary["relative_error"] = df_summary["error"] / df_summary.y_true
    df_summary["relative_abs_error"] = df_summary["abs_error"] / df_summary.y_true

    df_summary = df_summary.sort_values(by="predicted_rank").reset_index(drop=True)
    return df_summary
    

## Error analysis

In [175]:
df_summary=performance_summary(rf_model, X_train,y_train, X_test, y_test)

In [176]:
df_summary.describe()

Unnamed: 0,y_hat,y_true,predicted_rank,abs_error,error,relative_error,relative_abs_error
count,2330.0,2330.0,2330.0,2330.0,2330.0,2330.0,2330.0
mean,11119.714998,11115.604721,1165.5,1050.698057,4.110277,0.019464,0.092305
std,2413.967596,2986.134603,672.757328,1255.285166,1637.120753,0.140152,0.107228
min,5975.218922,5357.0,1.0,0.558841,-11321.638781,-0.480326,3.8e-05
25%,9417.251563,9007.75,583.25,259.89792,-551.654599,-0.048075,0.025321
50%,10915.928576,10698.0,1165.5,640.086259,87.887721,0.008678,0.058762
75%,12386.559233,12498.25,1747.75,1383.774089,735.65597,0.07162,0.12336
max,23179.475384,25000.0,2330.0,11321.638781,9695.495471,1.407592,1.407592


In [177]:
df_summary.head()

Unnamed: 0,y_hat,y_true,predicted_rank,abs_error,error,relative_error,relative_abs_error
0,5975.218922,6055,1.0,79.781078,-79.781078,-0.013176,0.013176
1,6376.263629,6745,2.0,368.736371,-368.736371,-0.054668,0.054668
2,6392.824857,5829,3.0,563.824857,563.824857,0.096728,0.096728
3,6439.048714,7726,4.0,1286.951286,-1286.951286,-0.166574,0.166574
4,6613.999964,6209,5.0,404.999964,404.999964,0.065228,0.065228


In [178]:


# Predicted unit price drawn as a line over the prediction rank ...
trace0 = go.Scatter(
    name="Predicted",
    x=df_summary.predicted_rank,
    y=df_summary.y_hat,
    mode='lines',
    marker=dict(size=5, opacity=0.3),
)

# ... with the actual values as faint markers; hovering a marker shows the signed error.
trace1 = go.Scatter(
    name="Actual",
    x=df_summary.predicted_rank,
    y=df_summary.y_true,
    mode='markers',
    marker=dict(color="blue", size=5, opacity=0.2),
    text=df_summary.error,
)

data = [trace0, trace1]
figure = go.Figure(
    data=data,
    layout=go.Layout(
        title="Predicted vs actual unit price",
        yaxis=dict(title="Unit price", range=(0, 25000)),
        xaxis=dict(title="Predicted rank"),
    ),
)
iplot(figure)

In [179]:
# Relative error in percent, binned in 2-point steps with edges -50, -48, ..., 48.
error_pct = df_summary.relative_error * 100
counts, bins = np.histogram(error_pct, bins=range(-50, 50, 2))
# Express each bin as a percentage of all test rows (not only the rows inside the bin range).
counts = counts / df_summary.shape[0] * 100
# Label each bar with the upper edge of its bin.
bins = bins[1:]

In [180]:

# x holds relative_error * 100 and counts was rescaled to a percentage of the test rows in
# the previous cell, so label the axes accordingly (they previously read
# "relative variance [%]" and "count").
fig = px.bar(x=bins, y=counts, labels={"x": "Relative error [%]", "y": "Share of forecasts [%]"})
fig.show()

In [109]:
df_summary

Unnamed: 0,y_hat,y_true,predicted_rank,abs_error,error,relative_error,relative_abs_error
0,5975.218922,6055,1.0,79.781078,-79.781078,-0.013176,0.013176
1,6376.263629,6745,2.0,368.736371,-368.736371,-0.054668,0.054668
2,6392.824857,5829,3.0,563.824857,563.824857,0.096728,0.096728
3,6439.048714,7726,4.0,1286.951286,-1286.951286,-0.166574,0.166574
4,6613.999964,6209,5.0,404.999964,404.999964,0.065228,0.065228
...,...,...,...,...,...,...,...
2325,20872.439647,23043,2326.0,2170.560353,-2170.560353,-0.094196,0.094196
2326,21043.187119,20204,2327.0,839.187119,839.187119,0.041536,0.041536
2327,21368.665650,19928,2328.0,1440.665650,1440.665650,0.072294,0.072294
2328,23091.050279,24000,2329.0,908.949721,-908.949721,-0.037873,0.037873


In [110]:
print("Share of forecasts within 25% absolute error {}".format(df_summary.query("relative_abs_error<0.25").shape[0]/df_summary.shape[0]))

Share of forecasts within 25% absolute error 0.934763948497854


In [111]:
print("Share of forecasts within 10% absolute error {}".format(df_summary.query("relative_abs_error<0.10").shape[0]/df_summary.shape[0]))

Share of forecasts within 10% absolute error 0.6759656652360515


In [112]:
print("Share of forecasts within 5% absolute error {}".format(df_summary.query("relative_abs_error<0.05").shape[0]/df_summary.shape[0]))

Share of forecasts within 5% absolute error 0.4497854077253219


# Neural networks models

In [129]:
# NumPy versions of the train/test split for the neural network. No cell visible in this
# notebook creates the *_ar variables (the original line reshaped a name that was never
# assigned), so they are derived here from the split itself to keep the notebook runnable
# on a fresh kernel.
X_train_ar = X_train.to_numpy()
X_test_ar = X_test.to_numpy()
# Targets as column vectors of shape (n_samples, 1).
y_train_ar = y_train.to_numpy().reshape(-1, 1)
y_test_ar = y_test.to_numpy().reshape(-1, 1)

In [130]:
y_train_ar.shape

(9317, 1)

In [131]:
type(y_train_ar)

numpy.ndarray

In [132]:
X_train_ar.shape

(9317, 50)

In [133]:
y_test_ar.shape

(2330, 1)

In [134]:
X_test_ar.shape

(2330, 50)

In [157]:
# Start from a clean Keras state and a fixed seed.
tf.keras.backend.clear_session()
tf.random.set_seed(60)

# Feed-forward regressor: 100 -> 50 -> 10 ReLU units with dropout after the first two
# hidden layers, and a single linear output unit for the predicted unit price.
n_features = X_train.shape[1]

DNN = keras.models.Sequential()
DNN.add(keras.layers.Dense(units=100, activation='relu', input_shape=[n_features]))
DNN.add(keras.layers.Dropout(0.5))
DNN.add(keras.layers.Dense(units=50, activation='relu'))
DNN.add(keras.layers.Dropout(0.5))
DNN.add(keras.layers.Dense(units=10, activation='relu'))
DNN.add(keras.layers.Dense(units=1, activation="linear"))

In [158]:
DNN.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               5100      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 10,671
Trainable params: 10,671
Non-trainable params: 0
____________________________________________________

In [159]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras
# releases reject.
optimizer = keras.optimizers.Adam(learning_rate=0.0003)

DNN.compile(optimizer=optimizer, loss='mse')

# NOTE(review): the test set doubles as validation data here, so the reported validation
# loss is not independent of the data used for the final evaluation below.
history = DNN.fit(
    X_train, y_train_ar,
    epochs=50,
    validation_data=(X_test, y_test_ar),
    verbose=1,
)

Train on 9317 samples, validate on 2330 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [163]:
df_summary_dnn=performance_summary(DNN, X_train,y_train, X_test, y_test)



In [164]:
df_summary_dnn.describe()

Unnamed: 0,y_hat,y_true,predicted_rank,abs_error,error,relative_error,relative_abs_error
count,2330.0,2330.0,2330.0,2330.0,2330.0,2330.0,2330.0
mean,10259.59668,11115.604721,1165.5,1701.022163,-856.011568,-0.044505,0.142915
std,1774.607178,2986.134603,672.757328,1760.851096,2293.958352,0.179248,0.116952
min,4939.857422,5357.0,1.0,0.272949,-14507.110352,-0.67275,3.6e-05
25%,8940.170898,9007.75,583.25,560.716553,-1836.387451,-0.157053,0.056831
50%,10229.019043,10698.0,1165.5,1205.521973,-536.831055,-0.051594,0.116704
75%,11589.226318,12498.25,1747.75,2174.33374,595.123291,0.063808,0.198656
max,14463.048828,25000.0,2330.0,14507.110352,6491.181641,0.94239,0.94239


In [165]:
# Divide by the DNN summary's own row count (the original divided by the random-forest
# summary, which only gave the right answer because both tables have the same length).
print("Share of forecasts within 5% absolute error {}".format(df_summary_dnn.query("relative_abs_error<0.05").shape[0]/df_summary_dnn.shape[0]))

Share of forecasts within 5% absolute error 0.22360515021459226


In [182]:


# Predicted unit price of the DNN drawn as a line over the prediction rank ...
trace0 = go.Scatter(
    name="Predicted",
    x=df_summary_dnn.predicted_rank,
    y=df_summary_dnn.y_hat,
    mode='lines',
    marker=dict(size=5, opacity=0.3),
)

# ... with the actual values as faint markers.
trace1 = go.Scatter(
    name="Actual",
    x=df_summary_dnn.predicted_rank,
    y=df_summary_dnn.y_true,
    mode='markers',
    marker=dict(color="blue", size=5, opacity=0.2),
    # Hover text must come from the DNN summary: the original used df_summary.error, i.e.
    # the random-forest errors, which are also sorted by a different prediction rank.
    text=df_summary_dnn.error,
)

data = [trace0, trace1]
figure = go.Figure(
    data=data,
    layout=go.Layout(
        title="Predicted vs actual unit price",
        yaxis=dict(title="Unit price", range=(0, 25000)),
        xaxis=dict(title="Predicted rank"),
    ),
)

iplot(figure)

In [184]:
# Relative error of the DNN in percent, binned in 2-point steps with edges -50, -48, ..., 48.
error_pct_dnn = df_summary_dnn.relative_error * 100
counts, bins = np.histogram(error_pct_dnn, bins=range(-50, 50, 2))
# Express each bin as a percentage of all test rows (not only the rows inside the bin range).
counts = counts / df_summary_dnn.shape[0] * 100
# Label each bar with the upper edge of its bin.
bins = bins[1:]

In [185]:

# x holds relative_error * 100 and counts was rescaled to a percentage of the test rows in
# the previous cell, so label the axes accordingly (they previously read
# "relative variance [%]" and "count").
fig = px.bar(x=bins, y=counts, labels={"x": "Relative error [%]", "y": "Share of forecasts [%]"})
fig.show()