In [1]:
import pandas as pd
import numpy as np
import re
import json
import numpy as np
import timeit 
from datetime import datetime

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint

In [3]:
# Cufflinks wrapper on plotly
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot
cufflinks.go_offline()

# Set global theme

import plotly.figure_factory as ff

import plotly.graph_objects as go
import plotly.express as px

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Central city point that all graphs are centred on: the Swietokrzyska subway station,
# stored as a two-element coordinate tuple.
center_coors = (52.235176, 21.008393)

In [6]:
# Load the modelling input table from the Excel workbook (path is relative to the working directory).
df=pd.read_excel("RE_models_input.xlsx")

In [7]:
df.columns

Index(['Id', 'Area', 'Price', 'latitude', 'longitude', 'build_year',
       'building_floors_num', 'rooms_num', 'City', 'subdistrict', 'market',
       'Building_material', 'Building_ownership', 'Building_type',
       'Construction_status', 'floor_no', 'Heating', 'Windows_type',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm

In [8]:
# rooms_num mixes numbers with the text label "more". Recode that label as 10 and
# coerce the whole column to a numeric dtype; np.where alone would leave the column
# object-typed, which breaks numeric summaries and plots.
rooms_raw = df['rooms_num']
df['rooms_num'] = pd.to_numeric(rooms_raw.mask(rooms_raw == "more", 10))

In [9]:
# Discard these columns from the working frame (in place).
unused_columns = [
    'floor_no',
    "City",
    "district_old",
    'price_decrease_from_20k',
    'price_decrease_per_10min',
]
df.drop(columns=unused_columns, inplace=True)

In [10]:
df.columns

Index(['Id', 'Area', 'Price', 'latitude', 'longitude', 'build_year',
       'building_floors_num', 'rooms_num', 'subdistrict', 'market',
       'Building_material', 'Building_ownership', 'Building_type',
       'Construction_status', 'Heating', 'Windows_type',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm',
       'Security_

In [11]:
# For each commute metric and travel mode, average the 8 AM leg and the 5 PM return leg
# into a single column named "<metric>_<mode>".
for metric in ("distance", "time"):
    for mode in ("driving", "transit"):
        outbound = df["{}_{}_8AM".format(metric, mode)]
        inbound = df["{}_return_{}_5PM".format(metric, mode)]
        df["{}_{}".format(metric, mode)] = (outbound + inbound) / 2

In [12]:
# Remove the per-leg commute columns (8 AM outbound and 5 PM return) from the frame in place.
commute_leg_columns = [
    'distance_transit_8AM',
    'time_transit_8AM',
    'distance_driving_8AM',
    'time_driving_8AM',
    'distance_return_transit_5PM',
    'time_return_transit_5PM',
    'distance_return_driving_5PM',
    'time_return_driving_5PM',
]
df.drop(columns=commute_leg_columns, inplace=True)

# Data exploration

## Category composition

In [13]:
# Categorical columns that are explored below and later one-hot encoded.
unique_features = [
    "district",
    "market",
    "Building_material",
    "Building_ownership",
    "Building_type",
    "Construction_status",
    "Heating",
    "Windows_type",
]

In [14]:
for feature in unique_features:
    # count() tallies the non-null Id values per category; sort so the largest category comes first.
    df_temp = (
        df[["Id", feature]]
        .groupby(feature, as_index=False)
        .count()
        .rename(columns={"Id": "share"})
        .sort_values(by="share", ascending=False)
    )
    # Convert to a percentage of all rows and round AFTER scaling: rounding first and then
    # multiplying by 100 leaves float artefacts such as 12.300000000000001 on the chart.
    df_temp["share"] = (df_temp["share"] / df.shape[0] * 100).round(1)
    # Flag the "not_specified" category so it can be coloured differently from real values.
    df_temp["colour"] = np.where(df_temp[feature] == "not_specified", "missing_data", "valid_data")

    fig = px.bar(
        df_temp,
        x=feature,
        y='share',
        color="colour",
        color_discrete_sequence=["blue", "red"],
        category_orders={"colour": ["valid_data", "missing_data"]},
    )
    print("\n Feature summary for {} - Share of category within whole sample".format(feature))
    fig.show()
    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")

    


 Feature summary for district - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for market - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Building_material - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Building_ownership - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Building_type - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Construction_status - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Heating - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------




 Feature summary for Windows_type - Share of category within whole sample


----------------------------------------------------------------------------------------------------------------------------





## Unit price distribution by feature

In [15]:
for feature in unique_features:
    df_temp = df[["unit_price", feature]]

    print("\n Unit price distribution in split by {}".format(feature))

    # One box per category of the current feature; only suspected outliers are drawn as points.
    fig = px.box(df_temp, y="unit_price", x=feature, points="suspectedoutliers")
    # Clip the y axis to the 5000-25000 band and display the figure explicitly.
    # .show() returns None, so the figure is rendered exactly once and the cell no longer
    # depends on the notebook-wide ast_node_interactivity setting to echo the figure.
    fig.update_yaxes(range=[5000, 25000]).show()

    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")

    


 Unit price distribution in split by district


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by market


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Building_material


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Building_ownership


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Building_type


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Construction_status


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Heating


----------------------------------------------------------------------------------------------------------------------------




 Unit price distribution in split by Windows_type


----------------------------------------------------------------------------------------------------------------------------





# Transforming data for ML models

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
df_cat=df[unique_features]

In [18]:
enc = OneHotEncoder()

# fit_transform fits the encoder itself, so a separate fit() call beforehand is redundant.
# The encoded matrix is densified and cast to 0/1 integers.
one_hot_val = enc.fit_transform(df_cat).toarray().astype(int)

# Newer scikit-learn releases replaced get_feature_names with get_feature_names_out;
# use whichever method this installation provides.
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
one_hot_columns = feature_namer(unique_features)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [19]:
# Carry df's index over to the encoded frame: the column-wise pd.concat performed later
# aligns on index labels, so the dummies must share labels with the rows they were built from.
df_one_hot = pd.DataFrame(one_hot_val, columns=one_hot_columns, index=df.index)

In [20]:
df_one_hot.columns

Index(['district_Bemowo', 'district_Bialoleka', 'district_Bielany',
       'district_Downtown', 'district_Mokotow', 'district_Ochota',
       'district_Praga', 'district_Southern Praga', 'district_Subburbs',
       'district_Targowek', 'district_Ursynow', 'district_Wawer',
       'district_Wilanow', 'district_Wlochy', 'district_Wola',
       'district_Zoliborz', 'market_primary', 'market_secondary',
       'Building_material_breezeblock', 'Building_material_brick',
       'Building_material_cellular_concrete', 'Building_material_concrete',
       'Building_material_concrete_plate', 'Building_material_hydroton',
       'Building_material_not_specified', 'Building_material_other',
       'Building_material_reinforced_concrete', 'Building_material_silikat',
       'Building_material_wood', 'Building_ownership_co_operative_ownership',
       'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe',
       'Building_ownership_full_ownership', 'Building_ownership_not_spec

In [21]:
# Positions of every one-hot column whose name contains "not_specified".
not_specified_idx = np.asarray(
    [position for position, column in enumerate(df_one_hot.columns) if "not_specified" in column]
)

Let's drop all "not_specified" dummy columns, together with `district_Mokotow` and `market_secondary`. A secondary-market flat in the Mokotow district then serves as the baseline category in the models.

In [22]:
# Columns to leave out of the model matrix: all "not_specified" dummies plus the two baseline categories.
drop_collinear_cols = [
    *df_one_hot.columns[not_specified_idx],
    'district_Mokotow',
    'market_secondary',
]

In [23]:
def list_diff(list1, list2):
    """Return the items of ``list1`` that do not occur in ``list2``.

    The order of ``list1`` is preserved and repeated items are kept; membership
    is checked with ``in``, so ``list2`` may be any container.
    """
    return [item for item in list1 if item not in list2]

In [24]:
df_one_hot.columns

Index(['district_Bemowo', 'district_Bialoleka', 'district_Bielany',
       'district_Downtown', 'district_Mokotow', 'district_Ochota',
       'district_Praga', 'district_Southern Praga', 'district_Subburbs',
       'district_Targowek', 'district_Ursynow', 'district_Wawer',
       'district_Wilanow', 'district_Wlochy', 'district_Wola',
       'district_Zoliborz', 'market_primary', 'market_secondary',
       'Building_material_breezeblock', 'Building_material_brick',
       'Building_material_cellular_concrete', 'Building_material_concrete',
       'Building_material_concrete_plate', 'Building_material_hydroton',
       'Building_material_not_specified', 'Building_material_other',
       'Building_material_reinforced_concrete', 'Building_material_silikat',
       'Building_material_wood', 'Building_ownership_co_operative_ownership',
       'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe',
       'Building_ownership_full_ownership', 'Building_ownership_not_spec

In [25]:
df_cat_columns=list_diff(df_one_hot.columns,drop_collinear_cols)

In [26]:
df_cat=df_one_hot[df_cat_columns]

In [27]:
columns_base=list_diff(df.columns, unique_features)

In [28]:
# Non-categorical columns that are left out of the base feature set.
drop_columns_base = [
    'Price',
    'latitude',
    'longitude',
    'lon_mod',
    'lat_mod',
    'grid_price',
    'sample_size',
    'City',
    'subdistrict',
]

In [29]:
columns_base=list_diff(columns_base, drop_columns_base)

In [30]:
df_base=df[columns_base]

In [31]:
df_base.columns

Index(['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm',
       'Security_types_anti_burglary_door', 'Security_types_closed_area',
       'Security_types_entryphone', 'Security_types_monitoring',
       'Security_types_roller_shutters', 'floor_num', 'unit_price',


## Combining data for modeling

In [32]:
ml_data=pd.concat([df_base,df_cat],axis=1)

In [33]:
ml_data.shape

(11788, 89)

In [34]:
ml_data.head()

Unnamed: 0,Id,Area,build_year,building_floors_num,rooms_num,Equipment_types_dishwasher,Equipment_types_fridge,Equipment_types_furniture,Equipment_types_oven,Equipment_types_stove,...,Construction_status_to_completion,Construction_status_to_renovation,Heating_boiler_room,Heating_electrical,Heating_gas,Heating_other,Heating_urban,Windows_type_aluminium,Windows_type_plastic,Windows_type_wooden
0,60534950,80.1,2021,8,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60534789,52.27,2021,8,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60534636,64.63,2021,8,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,60441425,46.86,2020,14,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60225257,57.4,2019,11,3,1,1,1,1,1,...,0,0,0,0,0,0,1,0,1,0


In [35]:
ml_data.columns[:]

Index(['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num',
       'Equipment_types_dishwasher', 'Equipment_types_fridge',
       'Equipment_types_furniture', 'Equipment_types_oven',
       'Equipment_types_stove', 'Equipment_types_tv',
       'Equipment_types_washing_machine', 'Extras_types_air_conditioning',
       'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement',
       'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift',
       'Extras_types_separate_kitchen', 'Extras_types_terrace',
       'Extras_types_two_storey', 'Extras_types_usable_room',
       'Media_types_cable-television', 'Media_types_electricity',
       'Media_types_internet', 'Media_types_phone', 'Media_types_sewage',
       'Media_types_water', 'Security_types_alarm',
       'Security_types_anti_burglary_door', 'Security_types_closed_area',
       'Security_types_entryphone', 'Security_types_monitoring',
       'Security_types_roller_shutters', 'floor_num', 'unit_price',


# Selecting best features

In [36]:
# Histogram of unit prices in bins 1000 wide. range() excludes its stop value, so the stop
# is 50001 to keep 50000 as the final edge (a stop of 50000 silently drops the 49000-50000 bin).
counts, bins = np.histogram(ml_data.unit_price, bins=range(0, 50001, 1000))
# np.histogram returns one more edge than counts; each bar is labelled with its bin's upper edge.
bins = bins[1:]

fig = px.bar(x=bins, y=counts, labels={'x':'Unit price', 'y':'count'})
fig.show()

In [37]:
# Keep only listings whose unit price lies in (5000, 25000]; query() returns a new frame,
# so ml_data itself is left untouched.
price_window = ml_data.query("unit_price<=25000 and unit_price>5000")
# Target: the unit price. Features: everything except the target and the listing Id.
y = price_window["unit_price"]
X = price_window.drop(columns=["unit_price", "Id"])

In [38]:
from sklearn.feature_selection import SelectKBest, f_regression, chi2

In [39]:
bestfeatures = SelectKBest(score_func=f_regression, k="all")

In [40]:
# Score every column of X against the target y with the selector built in the previous cell.
# NOTE(review): this runs on all rows of X, before the train/test split made further down,
# so rows that later end up in the test set influence which features are kept. Consider
# fitting the selector on the training split only.
fit = bestfeatures.fit(X,y)
# Wrap the scores and the matching column names in one-column frames for the cell that combines them.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

In [41]:
# Pair each feature name ('Specs') with its score in a single two-column frame.
featureScores = pd.DataFrame(
    {
        'Specs': dfcolumns.iloc[:, 0],
        'Score': dfscores.iloc[:, 0],
    }
)

In [42]:
featureScores.nlargest(30,'Score')

Unnamed: 0,Specs,Score
35,distance_driving,6889.140723
36,distance_transit,6725.884253
38,time_transit,5360.201112
37,time_driving,4741.988679
42,district_Downtown,3516.937318
40,district_Bialoleka,1766.097573
75,Building_type_tenement,769.001425
54,market_primary,701.343385
70,Building_type_block,629.435651
46,district_Subburbs,582.320969


In [43]:
featureScores.query("Score>40").shape

(51, 2)

In [44]:
top_features=featureScores.nlargest(30,'Score').Specs.unique()

In [45]:
# Order the features from highest to lowest score (in place) so the bars are drawn in that order.
featureScores.sort_values(by="Score", inplace=True, ascending=False)

# plotly.express is already imported as px in the notebook's import cells, so it is not re-imported here.
fig = px.bar(featureScores, x='Specs', y='Score')
fig.show()

In [46]:
X=X[top_features]

## Splitting into train and test sets

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [48]:
# Give every split a fresh 0..n-1 index (in place), discarding the row labels inherited from X / y.
for split_part in (y_train, y_test, X_train, X_test):
    split_part.reset_index(drop=True, inplace=True)

# Training initial model

In [49]:
from sklearn.ensemble import RandomForestRegressor

## Hyperparameter selection

In [50]:
# Search space handed to GridSearchCV: a single grid, one list of candidate values per hyperparameter.
param_grid = [
    {
        "bootstrap": [False],
        "n_estimators": [300],
        "max_features": [10, 15, 20, 30],
        "max_depth": [10, 15, 20, 25, 30],
        "min_samples_leaf": [3, 6, 12],
        "min_samples_split": [6, 12, 24],
        "min_impurity_decrease": [0.01, 0.03, 0.1, 0.3],
    },
]

In [51]:
from sklearn.model_selection import GridSearchCV

In [52]:
forest_cl=RandomForestRegressor(random_state=10, n_jobs=2)

In [53]:
grid_search = GridSearchCV(forest_cl, param_grid, cv=4,
                          scoring="neg_mean_absolute_error",return_train_score=True)

In [54]:
#grid_search.fit(X_train, y_train)
#rf_clf_best_params = grid_search.best_estimator_


In [55]:
#grid_search.best_params_

In [56]:
#grid_search.best_params_.keys()

In [57]:
#rf_model=grid_search.best_estimator_

# Only the hyperparameters that differ from scikit-learn's defaults are passed. The previous
# call was a pasted estimator repr that also spelled out defaults, including
# criterion='mse' and min_impurity_split=None, which newer scikit-learn releases no longer
# accept. Leaving those at their defaults builds the same forest and keeps the cell runnable
# across versions.
# NOTE(review): the search grid above lists bootstrap=[False], whereas this model uses
# bootstrap=True (the default, and what the pasted repr contained) - confirm which is intended.
rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    max_features=30,
    min_samples_leaf=3,
    min_samples_split=6,
    min_impurity_decrease=0.01,
    n_jobs=2,
    random_state=10,
)

In [58]:
rf_model.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features=30, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.01,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=6, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=2, oob_score=False,
                      random_state=10, verbose=0, warm_start=False)

In [59]:
rf_model.score(X_train,y_train)

0.8948822323367762

In [60]:
rf_model.score(X_test,y_test)

0.6942192231860257

In [61]:
# One row per feature with the forest's importance value (stored under the column name
# "coefficients"), ordered from least to most important.
importance_df = pd.DataFrame(
    {
        "coefficients": rf_model.feature_importances_,
        "features": X.columns,
    }
).sort_values(by="coefficients")

In [62]:

# Horizontal bar per feature; importance_df is sorted ascending, so the most important
# feature ends up at the top of the chart.
trace0 = go.Bar(
    x=importance_df.coefficients,
    y=importance_df.features,
    orientation="h",
    marker=dict(
        color="blue",
        opacity=0.5,
    ),
)

data = [trace0]
figure = go.Figure(
    data=data,
    layout=go.Layout(
        # The model is a RandomForestRegressor, so the title says "Regressor" (it previously read "Classifier").
        title="RF Regressor - Feature importance",
        xaxis=dict(title="Importance"),
        yaxis=dict(title="Feature"),
    ),
)
iplot(figure)

In [63]:
def performance_summary(model, X_test, y_test):
    """Predict on ``X_test`` and tabulate the per-sample errors against ``y_test``.

    Parameters
    ----------
    model : object with a ``predict(X_test)`` method.
    X_test : features, passed to ``model.predict`` unchanged.
    y_test : true target values, one per prediction. Any array-like is accepted
        (Series with an arbitrary index, 1-D array, or an ``(n, 1)`` column).

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``y_hat``, ``y_true``, ``abs_error``,
        ``error`` (``y_hat - y_true``), ``relative_error`` and
        ``relative_abs_error`` (both relative to ``y_true``).

    Also prints the share of forecasts whose relative absolute error is below
    25%, 10% and 5%.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of true values.
    """
    # Flatten both sides to plain 1-D arrays so they are paired by position. Assigning a
    # Series directly would align on index labels and yield NaN for any label that is not
    # 0..n-1, and (n, 1) shaped inputs are not reliably accepted as a single column.
    y_hat = np.asarray(model.predict(X_test)).ravel()
    y_true = np.asarray(y_test).ravel()

    df_summary = pd.DataFrame({"y_hat": y_hat, "y_true": y_true})
    df_summary["abs_error"] = np.abs(df_summary.y_true - df_summary.y_hat)
    df_summary["error"] = df_summary.y_hat - df_summary.y_true
    df_summary["relative_error"] = df_summary["error"] / df_summary.y_true
    df_summary["relative_abs_error"] = df_summary["abs_error"] / df_summary.y_true

    # The mean of a boolean mask is the share of rows below the threshold; unlike an explicit
    # count / n it yields NaN instead of a ZeroDivisionError when there are no rows.
    for percent, threshold in ((25, 0.25), (10, 0.10), (5, 0.05)):
        share = (df_summary["relative_abs_error"] < threshold).mean()
        print("Share of forecasts within {}% absolute error {:.3f}\n".format(percent, share))

    return df_summary
    

## Error analysis

In [64]:
df_summary=performance_summary(rf_model, X_test, y_test)

Share of forecasts within 25% absolute error 0.931

Share of forecasts within 10% absolute error 0.681

Share of forecasts within 5% absolute error 0.445



In [65]:
df_summary.describe()

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error
count,2330.0,2330.0,2330.0,2330.0,2330.0,2330.0
mean,11115.221709,11115.604721,1054.023484,-0.383012,0.018684,0.092704
std,2445.812283,2986.134603,1270.907009,1651.256188,0.142342,0.109603
min,5948.6967,5357.0,0.7858,-11647.491826,-0.4659,7.7e-05
25%,9409.605871,9007.75,250.442715,-572.386772,-0.049092,0.02393
50%,10895.196716,10698.0,651.088525,61.157643,0.006108,0.057948
75%,12400.003881,12498.25,1392.450294,716.76362,0.070896,0.122748
max,23270.928575,25000.0,11647.491826,10065.413455,1.455818,1.455818


In [66]:
df_summary.head()

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error
0,11549.840741,12105,555.159259,-555.159259,-0.045862,0.045862
1,12692.447437,12831,138.552563,-138.552563,-0.010798,0.010798
2,10393.201605,9224,1169.201605,1169.201605,0.126756,0.126756
3,9206.581258,9346,139.418742,-139.418742,-0.014917,0.014917
4,11905.078672,11906,0.921328,-0.921328,-7.7e-05,7.7e-05


In [67]:


# Reference line y = x: the actual price plotted against itself, i.e. where a perfect
# prediction would fall. (This trace was previously mislabelled "Predicted".)
trace0 = go.Scatter(
    name="Actual",
    y=df_summary.y_true,
    x=df_summary.y_true,
    mode='lines',
    marker=dict(
        size=5,
        opacity=0.3,
    ),
)

# Model output: predicted price against actual price, one marker per test sample, with the
# signed error as hover text. (This trace was previously mislabelled "Actual".)
trace1 = go.Scatter(
    name="Predicted",
    y=df_summary.y_hat,
    x=df_summary.y_true,
    mode='markers',
    marker=dict(
        color="blue",
        size=5,
        opacity=0.1,
    ),
    text=df_summary.error,
)

data = [trace0, trace1]
figure = go.Figure(
    data=data,
    layout=go.Layout(
        title="Predicted vs actual unit price",
        yaxis=dict(title="Predicted price", range=(0, 25000)),
        xaxis=dict(title="Actual price"),
    ),
)

iplot(figure)

In [68]:
# Histogram of the relative error in percent, in bins 2 points wide over [-50, 50].
# range() excludes its stop value, so the stop is 51 to keep +50 as the last edge
# (a stop of 50 ends at 48 and drops the 48-50 % bin, making the window asymmetric).
# Errors outside +/-50 % are not counted.
counts, bins = np.histogram(df_summary.relative_error*100, bins=range(-50, 51, 2))
# Express each bin as a percentage of all forecasts.
counts = counts/df_summary.shape[0]*100
# Label each bar with its bin's upper edge.
bins = bins[1:]

In [69]:

# x holds relative-error bin edges in percent and y the percentage of forecasts per bin
# (counts were normalised by the sample size), so the axes are labelled accordingly
# rather than as "variance" and "count".
fig = px.bar(x=bins, y=counts, labels={"x":"relative error [%]", "y":"share of forecasts [%]"})
fig.show()

# Neural networks models

## Transforming and scaling data for NN

In [70]:
def DNN_plot_loss(history, starting_epoch):
    """Draw the training and validation loss curves from a fit history.

    Parameters
    ----------
    history : object exposing ``history`` (a mapping with 'loss' and 'val_loss'
        sequences) and ``epoch`` (the sequence of epoch numbers).
    starting_epoch : int
        Position from which the curves are drawn; also the left end of the x axis.

    Returns nothing; the figure is rendered with ``iplot``.
    """
    epochs_shown = history.epoch[starting_epoch:]

    # (history key, line colour, legend label) for each curve, training first.
    curve_specs = (
        ('loss', "blue", "Training Loss"),
        ('val_loss', "red", "Validation Loss"),
    )
    data = [
        go.Scatter(
            y=history.history[key][starting_epoch:],
            x=epochs_shown,
            mode='lines',
            marker=dict(color=colour, size=5, opacity=0.5),
            name=label,
        )
        for key, colour, label in curve_specs
    ]

    legend_style = dict(
        x=1,
        y=1,
        traceorder="normal",
        font=dict(family="sans-serif", size=12, color="black"),
        bgcolor=None,
    )
    layout = go.Layout(
        title="Learning curve",
        yaxis=dict(title="Loss"),
        # The x axis runs from the chosen start to the last recorded epoch.
        xaxis=dict(title="Epoch", range=(starting_epoch, history.epoch[-1])),
        legend=legend_style,
    )
    iplot(go.Figure(data=data, layout=layout))

In [71]:
y_train=np.asarray(y_train).reshape(-1,1)
y_train.shape

(9317, 1)

In [72]:
y_test=np.asarray(y_test).reshape(-1,1)
y_test.shape

(2330, 1)

In [73]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [74]:
# Fit the scaler on the training features only, then apply that same fitted transform to the
# test features.
# Both names are rebound to the scaler's output, so re-running this cell would scale data
# that has already been scaled.
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [75]:
X_train.shape

(9317, 30)

In [76]:
X_test.shape

(2330, 30)

## Initial model

In [77]:
# Start from a clean Keras state and fix the TensorFlow seed.
tf.keras.backend.clear_session()
tf.random.set_seed(60)

# Hidden blocks as (units, dropout rate). Each block is Dense -> LeakyReLU -> Dropout -> BatchNormalization.
hidden_blocks = [(1024, 0.4), (512, 0.3), (512, 0.2), (256, 0.2), (256, 0.1)]

dnn_layers = []
for position, (width, drop_rate) in enumerate(hidden_blocks):
    if position == 0:
        # Only the first layer declares the input width (number of feature columns).
        dense = keras.layers.Dense(width, input_dim = X_train.shape[1])
    else:
        dense = keras.layers.Dense(units=width)
    dnn_layers.extend([
        dense,
        keras.layers.LeakyReLU(),
        keras.layers.Dropout(drop_rate),
        keras.layers.BatchNormalization(),
    ])

# Final hidden block has no batch normalisation and feeds a single linear output unit.
dnn_layers.extend([
    keras.layers.Dense(units=128),
    keras.layers.LeakyReLU(),
    keras.layers.Dropout(0.05),
    keras.layers.Dense(units=1, activation="linear"),
])

DNN = keras.models.Sequential(dnn_layers)

In [78]:
DNN.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              31744     
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 1024)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0

In [79]:
# File name template for saved checkpoints; the {epoch} and {val_loss} placeholders are
# filled in with the epoch number and its validation loss when a file is written.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
# Monitor val_loss and, with save_best_only=True, save only when it improves on the best value so far.
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
# Callback list passed to the fit call.
callbacks_list = [checkpoint]

In [80]:
t1=datetime.now()

In [81]:
# Adam with a learning rate starting at 0.01 and decaying as 0.01 / (1 + 8e-5 * step).
# This is the same rule the legacy `lr=0.01, decay=8e-5` arguments produced, written as an
# explicit schedule because `lr` is a deprecated alias of `learning_rate` and the `decay`
# argument is not accepted by newer Keras optimizers.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01,
    decay_steps=1,
    decay_rate=8e-5,
)
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

# Mean absolute error is both the training loss and the reported metric.
DNN.compile(optimizer=optimizer,
            loss='mean_absolute_error',
            metrics=['mean_absolute_error'])

# NOTE(review): validation_data is the test split, and the checkpoint callback keeps the
# epoch with the best val_loss, so the test set takes part in model selection. Consider
# carving a separate validation split out of the training data.
history = DNN.fit(X_train, y_train,
                  epochs=300, batch_size=512,
                  validation_data=(X_test, y_test),
                  callbacks=callbacks_list,
                  verbose=1)

Train on 9317 samples, validate on 2330 samples
Epoch 1/300
Epoch 00001: val_loss improved from inf to 6774.90706, saving model to Weights-001--6774.90706.hdf5
Epoch 2/300
Epoch 00002: val_loss improved from 6774.90706 to 5136.79004, saving model to Weights-002--5136.79004.hdf5
Epoch 3/300
Epoch 00003: val_loss did not improve from 5136.79004
Epoch 4/300
Epoch 00004: val_loss did not improve from 5136.79004
Epoch 5/300
Epoch 00005: val_loss did not improve from 5136.79004
Epoch 6/300
Epoch 00006: val_loss did not improve from 5136.79004
Epoch 7/300
Epoch 00007: val_loss did not improve from 5136.79004
Epoch 8/300
Epoch 00008: val_loss did not improve from 5136.79004
Epoch 9/300
Epoch 00009: val_loss improved from 5136.79004 to 2823.29307, saving model to Weights-009--2823.29307.hdf5
Epoch 10/300
Epoch 00010: val_loss improved from 2823.29307 to 2141.70752, saving model to Weights-010--2141.70752.hdf5
Epoch 11/300
Epoch 00011: val_loss did not improve from 2141.70752
Epoch 12/300
Epoch 

Epoch 00023: val_loss did not improve from 1361.56763
Epoch 24/300
Epoch 00024: val_loss improved from 1361.56763 to 1342.10056, saving model to Weights-024--1342.10056.hdf5
Epoch 25/300
Epoch 00025: val_loss did not improve from 1342.10056
Epoch 26/300
Epoch 00026: val_loss improved from 1342.10056 to 1321.08330, saving model to Weights-026--1321.08330.hdf5
Epoch 27/300
Epoch 00027: val_loss did not improve from 1321.08330
Epoch 28/300
Epoch 00028: val_loss improved from 1321.08330 to 1317.28772, saving model to Weights-028--1317.28772.hdf5
Epoch 29/300
Epoch 00029: val_loss did not improve from 1317.28772
Epoch 30/300
Epoch 00030: val_loss improved from 1317.28772 to 1308.28051, saving model to Weights-030--1308.28051.hdf5
Epoch 31/300
Epoch 00031: val_loss did not improve from 1308.28051
Epoch 32/300
Epoch 00032: val_loss improved from 1308.28051 to 1299.46177, saving model to Weights-032--1299.46177.hdf5
Epoch 33/300
Epoch 00033: val_loss did not improve from 1299.46177
Epoch 34/30

Epoch 46/300
Epoch 00046: val_loss improved from 1256.12451 to 1242.94881, saving model to Weights-046--1242.94881.hdf5
Epoch 47/300
Epoch 00047: val_loss did not improve from 1242.94881
Epoch 48/300
Epoch 00048: val_loss did not improve from 1242.94881
Epoch 49/300
Epoch 00049: val_loss did not improve from 1242.94881
Epoch 50/300
Epoch 00050: val_loss did not improve from 1242.94881
Epoch 51/300
Epoch 00051: val_loss did not improve from 1242.94881
Epoch 52/300
Epoch 00052: val_loss improved from 1242.94881 to 1240.22996, saving model to Weights-052--1240.22996.hdf5
Epoch 53/300
Epoch 00053: val_loss did not improve from 1240.22996
Epoch 54/300
Epoch 00054: val_loss did not improve from 1240.22996
Epoch 55/300
Epoch 00055: val_loss did not improve from 1240.22996
Epoch 56/300
Epoch 00056: val_loss did not improve from 1240.22996
Epoch 57/300
Epoch 00057: val_loss did not improve from 1240.22996
Epoch 58/300
Epoch 00058: val_loss did not improve from 1240.22996
Epoch 59/300
Epoch 0005

Epoch 00069: val_loss did not improve from 1219.60044
Epoch 70/300
Epoch 00070: val_loss did not improve from 1219.60044
Epoch 71/300
Epoch 00071: val_loss did not improve from 1219.60044
Epoch 72/300
Epoch 00072: val_loss did not improve from 1219.60044
Epoch 73/300
Epoch 00073: val_loss did not improve from 1219.60044
Epoch 74/300
Epoch 00074: val_loss improved from 1219.60044 to 1211.19238, saving model to Weights-074--1211.19238.hdf5
Epoch 75/300
Epoch 00075: val_loss did not improve from 1211.19238
Epoch 76/300
Epoch 00076: val_loss improved from 1211.19238 to 1210.96175, saving model to Weights-076--1210.96175.hdf5
Epoch 77/300
Epoch 00077: val_loss did not improve from 1210.96175
Epoch 78/300
Epoch 00078: val_loss did not improve from 1210.96175
Epoch 79/300
Epoch 00079: val_loss did not improve from 1210.96175
Epoch 80/300
Epoch 00080: val_loss did not improve from 1210.96175
Epoch 81/300
Epoch 00081: val_loss did not improve from 1210.96175
Epoch 82/300
Epoch 00082: val_loss d

Epoch 93/300
Epoch 00093: val_loss did not improve from 1204.98430
Epoch 94/300
Epoch 00094: val_loss did not improve from 1204.98430
Epoch 95/300
Epoch 00095: val_loss did not improve from 1204.98430
Epoch 96/300
Epoch 00096: val_loss did not improve from 1204.98430
Epoch 97/300
Epoch 00097: val_loss did not improve from 1204.98430
Epoch 98/300
Epoch 00098: val_loss did not improve from 1204.98430
Epoch 99/300
Epoch 00099: val_loss did not improve from 1204.98430
Epoch 100/300
Epoch 00100: val_loss did not improve from 1204.98430
Epoch 101/300
Epoch 00101: val_loss did not improve from 1204.98430
Epoch 102/300
Epoch 00102: val_loss did not improve from 1204.98430
Epoch 103/300
Epoch 00103: val_loss did not improve from 1204.98430
Epoch 104/300
Epoch 00104: val_loss improved from 1204.98430 to 1200.57403, saving model to Weights-104--1200.57403.hdf5
Epoch 105/300
Epoch 00105: val_loss did not improve from 1200.57403
Epoch 106/300
Epoch 00106: val_loss did not improve from 1200.57403
Ep

Epoch 117/300
Epoch 00117: val_loss did not improve from 1187.85802
Epoch 118/300
Epoch 00118: val_loss did not improve from 1187.85802
Epoch 119/300
Epoch 00119: val_loss did not improve from 1187.85802
Epoch 120/300
Epoch 00120: val_loss did not improve from 1187.85802
Epoch 121/300
Epoch 00121: val_loss did not improve from 1187.85802
Epoch 122/300
Epoch 00122: val_loss did not improve from 1187.85802
Epoch 123/300
Epoch 00123: val_loss improved from 1187.85802 to 1187.08522, saving model to Weights-123--1187.08522.hdf5
Epoch 124/300
Epoch 00124: val_loss did not improve from 1187.08522
Epoch 125/300
Epoch 00125: val_loss did not improve from 1187.08522
Epoch 126/300
Epoch 00126: val_loss did not improve from 1187.08522
Epoch 127/300
Epoch 00127: val_loss did not improve from 1187.08522
Epoch 128/300
Epoch 00128: val_loss did not improve from 1187.08522
Epoch 129/300
Epoch 00129: val_loss did not improve from 1187.08522
Epoch 130/300
Epoch 00130: val_loss improved from 1187.08522 to

Epoch 141/300
Epoch 00141: val_loss improved from 1180.57853 to 1178.67487, saving model to Weights-141--1178.67487.hdf5
Epoch 142/300
Epoch 00142: val_loss did not improve from 1178.67487
Epoch 143/300
Epoch 00143: val_loss did not improve from 1178.67487
Epoch 144/300
Epoch 00144: val_loss did not improve from 1178.67487
Epoch 145/300
Epoch 00145: val_loss did not improve from 1178.67487
Epoch 146/300
Epoch 00146: val_loss did not improve from 1178.67487
Epoch 147/300
Epoch 00147: val_loss did not improve from 1178.67487
Epoch 148/300
Epoch 00148: val_loss did not improve from 1178.67487
Epoch 149/300
Epoch 00149: val_loss did not improve from 1178.67487
Epoch 150/300
Epoch 00150: val_loss did not improve from 1178.67487
Epoch 151/300
Epoch 00151: val_loss did not improve from 1178.67487
Epoch 152/300
Epoch 00152: val_loss did not improve from 1178.67487
Epoch 153/300
Epoch 00153: val_loss did not improve from 1178.67487
Epoch 154/300
Epoch 00154: val_loss did not improve from 1178.6

Epoch 165/300
Epoch 00165: val_loss did not improve from 1178.67487
Epoch 166/300
Epoch 00166: val_loss did not improve from 1178.67487
Epoch 167/300
Epoch 00167: val_loss did not improve from 1178.67487
Epoch 168/300
Epoch 00168: val_loss did not improve from 1178.67487
Epoch 169/300
Epoch 00169: val_loss did not improve from 1178.67487
Epoch 170/300
Epoch 00170: val_loss did not improve from 1178.67487
Epoch 171/300
Epoch 00171: val_loss did not improve from 1178.67487
Epoch 172/300
Epoch 00172: val_loss did not improve from 1178.67487
Epoch 173/300
Epoch 00173: val_loss did not improve from 1178.67487
Epoch 174/300
Epoch 00174: val_loss did not improve from 1178.67487
Epoch 175/300
Epoch 00175: val_loss did not improve from 1178.67487
Epoch 176/300
Epoch 00176: val_loss did not improve from 1178.67487
Epoch 177/300
Epoch 00177: val_loss did not improve from 1178.67487
Epoch 178/300
Epoch 00178: val_loss did not improve from 1178.67487
Epoch 179/300
Epoch 00179: val_loss did not impr

Epoch 189/300
Epoch 00189: val_loss did not improve from 1173.20881
Epoch 190/300
Epoch 00190: val_loss did not improve from 1173.20881
Epoch 191/300
Epoch 00191: val_loss did not improve from 1173.20881
Epoch 192/300
Epoch 00192: val_loss did not improve from 1173.20881
Epoch 193/300
Epoch 00193: val_loss did not improve from 1173.20881
Epoch 194/300
Epoch 00194: val_loss did not improve from 1173.20881
Epoch 195/300
Epoch 00195: val_loss did not improve from 1173.20881
Epoch 196/300
Epoch 00196: val_loss did not improve from 1173.20881
Epoch 197/300
Epoch 00197: val_loss improved from 1173.20881 to 1167.23403, saving model to Weights-197--1167.23403.hdf5
Epoch 198/300
Epoch 00198: val_loss did not improve from 1167.23403
Epoch 199/300
Epoch 00199: val_loss did not improve from 1167.23403
Epoch 200/300
Epoch 00200: val_loss did not improve from 1167.23403
Epoch 201/300
Epoch 00201: val_loss did not improve from 1167.23403
Epoch 202/300
Epoch 00202: val_loss did not improve from 1167.2

Epoch 213/300
Epoch 00213: val_loss did not improve from 1167.23403
Epoch 214/300
Epoch 00214: val_loss did not improve from 1167.23403
Epoch 215/300
Epoch 00215: val_loss improved from 1167.23403 to 1166.01595, saving model to Weights-215--1166.01595.hdf5
Epoch 216/300
Epoch 00216: val_loss did not improve from 1166.01595
Epoch 217/300
Epoch 00217: val_loss did not improve from 1166.01595
Epoch 218/300
Epoch 00218: val_loss did not improve from 1166.01595
Epoch 219/300
Epoch 00219: val_loss did not improve from 1166.01595
Epoch 220/300
Epoch 00220: val_loss did not improve from 1166.01595
Epoch 221/300
Epoch 00221: val_loss did not improve from 1166.01595
Epoch 222/300
Epoch 00222: val_loss did not improve from 1166.01595
Epoch 223/300
Epoch 00223: val_loss did not improve from 1166.01595
Epoch 224/300
Epoch 00224: val_loss did not improve from 1166.01595
Epoch 225/300
Epoch 00225: val_loss did not improve from 1166.01595
Epoch 226/300
Epoch 00226: val_loss did not improve from 1166.0

Epoch 237/300
Epoch 00237: val_loss did not improve from 1166.01595
Epoch 238/300
Epoch 00238: val_loss did not improve from 1166.01595
Epoch 239/300
Epoch 00239: val_loss did not improve from 1166.01595
Epoch 240/300
Epoch 00240: val_loss did not improve from 1166.01595
Epoch 241/300
Epoch 00241: val_loss did not improve from 1166.01595
Epoch 242/300
Epoch 00242: val_loss did not improve from 1166.01595
Epoch 243/300
Epoch 00243: val_loss did not improve from 1166.01595
Epoch 244/300
Epoch 00244: val_loss did not improve from 1166.01595
Epoch 245/300
Epoch 00245: val_loss did not improve from 1166.01595
Epoch 246/300
Epoch 00246: val_loss did not improve from 1166.01595
Epoch 247/300
Epoch 00247: val_loss did not improve from 1166.01595
Epoch 248/300
Epoch 00248: val_loss did not improve from 1166.01595
Epoch 249/300
Epoch 00249: val_loss did not improve from 1166.01595
Epoch 250/300
Epoch 00250: val_loss did not improve from 1166.01595
Epoch 251/300
Epoch 00251: val_loss did not impr

Epoch 261/300
Epoch 00261: val_loss did not improve from 1163.87742
Epoch 262/300
Epoch 00262: val_loss did not improve from 1163.87742
Epoch 263/300
Epoch 00263: val_loss did not improve from 1163.87742
Epoch 264/300
Epoch 00264: val_loss did not improve from 1163.87742
Epoch 265/300
Epoch 00265: val_loss did not improve from 1163.87742
Epoch 266/300
Epoch 00266: val_loss did not improve from 1163.87742
Epoch 267/300
Epoch 00267: val_loss did not improve from 1163.87742
Epoch 268/300
Epoch 00268: val_loss did not improve from 1163.87742
Epoch 269/300
Epoch 00269: val_loss did not improve from 1163.87742
Epoch 270/300
Epoch 00270: val_loss did not improve from 1163.87742
Epoch 271/300
Epoch 00271: val_loss did not improve from 1163.87742
Epoch 272/300
Epoch 00272: val_loss did not improve from 1163.87742
Epoch 273/300
Epoch 00273: val_loss did not improve from 1163.87742
Epoch 274/300
Epoch 00274: val_loss did not improve from 1163.87742
Epoch 275/300
Epoch 00275: val_loss did not impr

Epoch 285/300
Epoch 00285: val_loss did not improve from 1163.87742
Epoch 286/300
Epoch 00286: val_loss did not improve from 1163.87742
Epoch 287/300
Epoch 00287: val_loss did not improve from 1163.87742
Epoch 288/300
Epoch 00288: val_loss did not improve from 1163.87742
Epoch 289/300
Epoch 00289: val_loss did not improve from 1163.87742
Epoch 290/300
Epoch 00290: val_loss did not improve from 1163.87742
Epoch 291/300
Epoch 00291: val_loss did not improve from 1163.87742
Epoch 292/300
Epoch 00292: val_loss did not improve from 1163.87742
Epoch 293/300
Epoch 00293: val_loss did not improve from 1163.87742
Epoch 294/300
Epoch 00294: val_loss did not improve from 1163.87742
Epoch 295/300
Epoch 00295: val_loss improved from 1163.87742 to 1151.01672, saving model to Weights-295--1151.01672.hdf5
Epoch 296/300
Epoch 00296: val_loss did not improve from 1151.01672
Epoch 297/300
Epoch 00297: val_loss did not improve from 1151.01672
Epoch 298/300
Epoch 00298: val_loss did not improve from 1151.0

In [82]:
# Restore a saved checkpoint into DNN, then re-compile with the MAE loss/metric.
# NOTE(review): the checkpoint name is hard-coded. The visible training log
# reports the best val_loss before epoch 237 as 1166.01595 and a later
# improvement to 1151.01672 at epoch 295, so a 236/1166.82550 file appears to
# come from a different run - confirm this is the intended checkpoint.
weights_file = 'Weights-236--1166.82550.hdf5' # checkpoint selected by hand
DNN.load_weights(weights_file) # load the stored weights into the existing model
DNN.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

In [83]:
# Stop timestamp for the run timer; it is compared against t1 in the next cell.
# NOTE(review): t1 is set outside this excerpt - presumably right before training.
t2=datetime.now()

In [84]:
# Elapsed time in whole seconds. total_seconds() covers the entire duration,
# whereas the .seconds attribute silently drops whole days from the timedelta.
int((t2 - t1).total_seconds())

79

In [85]:
# Plot the loss curves recorded in `history`.
# NOTE(review): starting_epoch=10 presumably skips the first epochs so the large
# initial losses do not dominate the scale - confirm in DNN_plot_loss.
DNN_plot_loss(history,starting_epoch=10)

In [92]:
# Evaluate the restored DNN on X_test / y_test. The call also emits the
# "Share of forecasts within N% absolute error" lines shown in the output.
df_summary_dnn=performance_summary(DNN,X_test, y_test)

Share of forecasts within 25% absolute error 0.926

Share of forecasts within 10% absolute error 0.638

Share of forecasts within 5% absolute error 0.381



In [93]:
# Summary statistics of the predictions (y_hat), targets (y_true) and error columns.
df_summary_dnn.describe()

Unnamed: 0,y_hat,y_true,abs_error,error,relative_error,relative_abs_error
count,2330.0,2330.0,2330.0,2330.0,2330.0,2330.0
mean,10909.083008,11115.604721,1166.825485,-206.516382,0.001588,0.099999
std,2373.523438,2986.134603,1353.992657,1775.583025,0.142053,0.100883
min,6166.135742,5357.0,0.512207,-13585.668945,-0.5505,6.7e-05
25%,9245.732666,9007.75,305.75293,-844.804688,-0.074616,0.02989
50%,10662.800781,10698.0,738.214844,-45.675293,-0.004523,0.069934
75%,12067.731934,12498.25,1517.845215,638.0896,0.065042,0.137207
max,22237.052734,25000.0,13585.668945,8470.246094,0.92984,0.92984


In [94]:
def plot_predictions(df_summary, y_range=(0, 25000)):
    """Plot predicted against actual unit prices as an interactive scatter.

    Every observation is drawn as a marker at (actual, predicted), together
    with a y = x reference line on which a perfect forecast would fall. The
    figure is rendered with ``iplot``; nothing is returned.

    Parameters
    ----------
    df_summary : object exposing ``y_true``, ``y_hat`` and ``error``
        The summary to plot, e.g. the frame produced by ``performance_summary``.
    y_range : tuple, optional
        Lower and upper bound of the y axis. Defaults to ``(0, 25000)``, the
        range that was previously hard-coded.
    """
    # A straight line only needs its two endpoints; drawing it through every
    # (unsorted) observation just retraces the same diagonal thousands of times.
    lowest, highest = df_summary.y_true.min(), df_summary.y_true.max()
    reference_line = go.Scatter(
        name="Perfect prediction",
        x=[lowest, highest],
        y=[lowest, highest],
        mode='lines',
    )

    predictions = go.Scatter(
        name="Predicted",
        x=df_summary.y_true,
        y=df_summary.y_hat,
        mode='markers',
        marker=dict(color="blue", size=5, opacity=0.1),
        # Hover text must come from the summary being plotted, not from the
        # global df_summary_dnn, otherwise other models show the wrong errors.
        text=df_summary.error,
    )

    figure = go.Figure(
        data=[reference_line, predictions],
        layout=go.Layout(
            title="Predicted vs actual unit price",
            yaxis=dict(title="Predicted price", range=y_range),
            xaxis=dict(title="Actual price"),
        ),
    )

    iplot(figure)

In [95]:
# Predicted-vs-actual scatter for the baseline DNN evaluated above.
plot_predictions(df_summary_dnn)

## Modified model

In [90]:
# Start from a clean Keras graph and a fixed seed before building the network.
tf.keras.backend.clear_session()
tf.random.set_seed(60)

# (units, dropout rate) of each hidden block, ordered from input to output.
# NOTE(review): 1034 units in the first block may be a typo for 1024 - kept as is.
hidden_blocks = [(1034, 0.5), (512, 0.2), (256, 0.2), (256, 0.1), (128, 0.1)]
last_block = len(hidden_blocks) - 1

dnn_mod_layers = []
for block_idx, (n_units, drop_rate) in enumerate(hidden_blocks):
    if block_idx == 0:
        # Only the first Dense layer declares the input width.
        dnn_mod_layers.append(keras.layers.Dense(n_units, input_dim = X_train.shape[1]))
    else:
        dnn_mod_layers.append(keras.layers.Dense(units=n_units))
    dnn_mod_layers.append(keras.layers.LeakyReLU())
    dnn_mod_layers.append(keras.layers.Dropout(drop_rate))
    # Every block except the last one is followed by batch normalisation.
    if block_idx != last_block:
        dnn_mod_layers.append(keras.layers.BatchNormalization())

# Single linear output unit for the regression target.
dnn_mod_layers.append(keras.layers.Dense(units=1, activation="linear"))

DNN_mod=keras.models.Sequential(dnn_mod_layers)

In [91]:
# Start the wall-clock timer for this training run.
t1 = datetime.now()

# `learning_rate` is the supported argument name; `lr` is only a deprecated alias.
# NOTE(review): `decay` is accepted by the older tf.keras optimizers; recent
# Keras releases expect a learning-rate schedule instead - confirm the TF version.
optimizer = keras.optimizers.Adam(learning_rate=0.01, decay=8e-5)

DNN_mod.compile(optimizer=optimizer,
                loss='mean_absolute_error',
                metrics=['mean_absolute_error'])

# NOTE(review): the test split doubles as validation data here, so the later
# test-set evaluation is not fully independent of training-time monitoring.
history2 = DNN_mod.fit(X_train, y_train,
                       epochs=300, batch_size=512,
                       validation_data=(X_test, y_test),
                       verbose=1)

Train on 9317 samples, validate on 2330 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300


KeyboardInterrupt: 

In [None]:
# Stop the timer and report the elapsed time in whole seconds. total_seconds()
# covers the entire duration, whereas .seconds drops whole days from the timedelta.
t2 = datetime.now()
int((t2 - t1).total_seconds())

In [None]:
# Plot the loss curves recorded in `history2` for the modified network.
# NOTE(review): history2 only exists if the fit cell above ran to completion.
DNN_plot_loss(history2,starting_epoch=10)

In [None]:
# Evaluate the modified network on X_test / y_test.
df_summary_dnn_mod=performance_summary(DNN_mod, X_test, y_test)
# Alternative kept for reference: evaluate on the training data instead.
#df_summary_dnn_mod=performance_summary(DNN_mod, X_train, y_train)

In [None]:
# Summary statistics of the modified network's evaluation frame.
df_summary_dnn_mod.describe()

In [None]:
# Predicted-vs-actual scatter for the modified network.
plot_predictions(df_summary_dnn_mod)