In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time


In [24]:
df = pd.read_pickle('files/preprocessed.pkl')
df = df.rename(columns={"price.mainValue": "Price"}).drop('Unnamed: 0.1', axis=1)


In [25]:

# We use the numpy fuction log1p which  applies log(1+x)
# to all elements of the column to fix skewed features
# Source: https://www.kaggle.com/erick5/predicting-house-prices-with-machine-learning

df["Price"] = np.log1p(df["Price"])


In [26]:
# from scipy.stats import skew
#
# numeric_feats = df.dtypes[df.dtypes != "object"].index
# skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna()))\
#     .sort_values(ascending=False)
# skewness = pd.DataFrame({'Skewed Features' :skewed_feats})
# skewness.head()


In [27]:
# skewness = skewness[abs(skewness) > 0.75]
# print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
#
# from scipy.special import boxcox1p
# skewed_features = skewness.index
# lam = 0.15
# for feat in skewed_features:
#     df[feat] = boxcox1p(df[feat], lam)
#     df[feat] += 1

In [28]:
df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]
from sklearn.model_selection import train_test_split
features =  list(df.drop(columns=['Price','id'],axis=1))
X = df[features]
y = df['Price']


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
#
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

In [31]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import cross_val_score


models = [
          DecisionTreeRegressor(criterion='mse',max_depth=11),
          GradientBoostingRegressor(n_estimators=200,max_depth=12, verbose=0),
            RandomForestRegressor(min_samples_leaf =1, n_estimators=100,criterion='mse',max_depth=30,verbose=0),
            xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                             learning_rate=0.05, max_depth=6,
                             min_child_weight=1.5, n_estimators=7200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.2),
            lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
        ]


learning_mods = pd.DataFrame()
temp = {}

In [32]:
from sklearn.feature_selection import SelectFromModel

#run through models
for model in models:
    start = time.time()

    sfm = SelectFromModel(model, threshold=0.5)
    sfm.fit(X_train,y_train)
    Xtrain = sfm.transform(X_train)
    Xtest   = sfm.transform(X_test)

    print(model)
    m = str(model)
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X_test, y_test, cv=5,scoring='r2')
    print('score on training',model.score(X_train, y_train))
    mean, std = scores.mean(), scores.std()
    print("r2 score: %0.2f (+/- %0.2f)" % (mean,std * 2),f'\nTook '
                                    f'{time.time() - start :.2f} 'f'seconds\n')



DecisionTreeRegressor(max_depth=11)
score on training 0.9503126526438037
r2 score: 0.51 (+/- 0.15) 
Took 0.81 seconds

GradientBoostingRegressor(max_depth=12, n_estimators=200)
score on training 0.9999522192184913
r2 score: 0.60 (+/- 0.20) 
Took 64.64 seconds

RandomForestRegressor(max_depth=30)
score on training 0.9740466190120177
r2 score: 0.72 (+/- 0.16) 
Took 27.46 seconds

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.2, gamma=0.0,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=0.05, max_delta_step=None, max_depth=6,
             min_child_weight=1.5, missing=nan, monotone_constraints=None,
             n_estimators=7200, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=0.9, reg_lambda=0.6,
             scale_pos_weight=None, subsample=0.2, tree_method=None,
             validate_parameters=None, verbosity=None)
scor



In [33]:
regressionTree_imp = model.feature_importances_

In [34]:
data_tuples = list(zip(features,regressionTree_imp.tolist()))
features_importance = pd.DataFrame(data_tuples, columns=['Feature','Value'])

In [35]:
print(len(features_importance))
unimportant_features = features_importance[features_importance['Value'] < 20]
features_importance = features_importance[features_importance['Value'] > 20]
print(len(features_importance))

215
33


In [36]:
features_importance = features_importance.sort_values(by=['Value'], ascending=False)
pd.set_option('display.float_format', lambda x: '%.1000f' % x)
features_importance['Feature']

1              property_netHabitableSurface
6      E_level__primary_energy_consumption_
9                               Living_area
4                         Construction_year
14                        Bedroom_1_surface
10                      Living_room_surface
18                          Terrace_surface
13                         Cadastral_income
21                         Number_of_floors
15                        Bedroom_2_surface
3                                 Bathrooms
20                                  Floor__
12                                  Toilets
16                   Covered_parking_spaces
0                     property_bedroomCount
7                              Energy_class
8                                   Facades
180       property_location_postalCode_2100
172       property_location_postalCode_2000
185       property_location_postalCode_2170
202       property_location_postalCode_2660
141                    property_subtype_KOT
143              property_subtyp

In [37]:
voorspelling1 = pd.DataFrame()
voorspelling2 = pd.DataFrame()

In [38]:
voorspelling1['Real'] = np.expm1(y_test).astype(int)
voorspelling2['Voorspelling'] = np.expm1(model.predict(X_test)).astype(int)

In [39]:
voorspelling2.to_csv('voor.csv')
voorspelling1.to_csv('jaja.csv')

In [40]:
model

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319,
              learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
              min_sum_hessian_in_leaf=11, n_estimators=720, num_leaves=5,
              objective='regression')

In [41]:
from sklearn.model_selection import GridSearchCV

gridParams = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16,24],
    'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['regression'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.64, 0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid = GridSearchCV(model, gridParams, verbose=1, cv=4, n_jobs=-1)
# Run the grid
grid.fit(X, y)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 3456 candidates, totalling 13824 fits
{'boosting_type': 'gbdt', 'colsample_bytree': 0.64, 'learning_rate': 0.01, 'max_bin': 255, 'n_estimators': 24, 'num_leaves': 16, 'objective': 'regression', 'random_state': 500, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
0.13326043028591775


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   41.8s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

In [42]:
print(grid.best_params_)
print(grid.best_score_)




{'boosting_type': 'gbdt', 'colsample_bytree': 0.64, 'learning_rate': 0.01, 'max_bin': 255, 'n_estimators': 24, 'num_leaves': 16, 'objective': 'regression', 'random_state': 500, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
0.13326043028591775
