In [2]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint

import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

### Experiment on Single Family

In [3]:
# get training, validation and test data for single family
X_train_sf = pd.read_csv('X_train_sf.csv')
X_train_sf.index = X_train_sf['Unnamed: 0']
X_train_sf = X_train_sf.drop(['Unnamed: 0'], axis=1)
X_val_sf = pd.read_csv('X_val_sf.csv')
X_val_sf.index = X_val_sf['Unnamed: 0']
X_val_sf = X_val_sf.drop(['Unnamed: 0'], axis=1)
X_test_sf = pd.read_csv('X_test_sf.csv')
X_test_sf.index = X_test_sf['Unnamed: 0']
X_test_sf = X_test_sf.drop(['Unnamed: 0'], axis=1)
y_train_sf = pd.read_csv('y_train_sf.csv')
y_train_sf.index = y_train_sf['Unnamed: 0']
y_train_sf = y_train_sf.drop(['Unnamed: 0'], axis=1)
y_val_sf = pd.read_csv('y_val_sf.csv')
y_val_sf.index = y_val_sf['Unnamed: 0']
y_val_sf = y_val_sf.drop(['Unnamed: 0'], axis=1)
y_test_sf = pd.read_csv('y_test_sf.csv')
y_test_sf.index = y_test_sf['Unnamed: 0']
y_test_sf = y_test_sf.drop(['Unnamed: 0'], axis=1)

In [4]:
# create a predefined validation set for random search
y_train_plus_val_sf = pd.concat([y_train_sf, y_val_sf])
y_train_plus_val_sf_copy = y_train_plus_val_sf.copy()
y_train_plus_val_sf_copy.columns = ['train_val_split']
y_train_plus_val_sf_copy.loc[y_train_sf.index,'train_val_split'] = -1
y_train_plus_val_sf_copy.loc[y_val_sf.index,'train_val_split'] = 0
val_fold_sf = np.array(y_train_plus_val_sf_copy)
ps_sf = PredefinedSplit(val_fold_sf)

# get training plus validation set
X_train_plus_val_sf = pd.concat([X_train_sf, X_val_sf])

In [5]:
# create a function to calculate baseline R squared, RMSE and MAE
def rmse(l):
    return np.sqrt((np.sum((l-np.mean(l))**2))/len(l))
def mae(l):
    return np.sum(np.abs(l-np.mean(l)))/len(l)

### Single Family: Linear regression

In [4]:
# random search for linear regression
param_lr_sf = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_sf = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_sf,
           n_iter=10,
           cv=ps_sf,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(lr_cv_sf.best_params_)
print(lr_cv_sf.best_estimator_)
print('Min RMSE for linear regression on Single Family is: {}'.format(-lr_cv_sf.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE for linear regression on Single Family is: 23479006242.85987


In [211]:
# R squared, MAE of best model
print(mean_absolute_error(y_val_sf, lr_cv_sf.best_estimator_.predict(X_val_sf)))
print(lr_cv_sf.best_estimator_.score(X_val_sf, y_val_sf))

# R squared, RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.65993138395191
0.3389078027320038
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: Lasso

In [12]:
# random search for lasso
param_la_sf = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_sf = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(la_cv_sf.best_params_)
print(la_cv_sf.best_estimator_)
print('Min RMSE for lasso on Single Family is: {}'.format(-la_cv_sf.best_score_))

{'alpha': 0.07684071705306554, 'fit_intercept': False, 'normalize': True}
Lasso(alpha=0.07684071705306554, fit_intercept=False, normalize=True)
Min RMSE for lasso on Single Family is: 237.02107253707626


In [212]:
# R squared, MAE of best model
print(mean_absolute_error(y_val_sf, la_cv_sf.best_estimator_.predict(X_val_sf)))
print(la_cv_sf.best_estimator_.score(X_val_sf, y_val_sf))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.50077190341187
0.3392291066302373
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: Ridge

In [14]:
# random search for ridge
param_rd_sf = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_sf = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# Find best model hyperparameters 
print(rd_cv_sf.best_params_)
print(rd_cv_sf.best_estimator_)
print('Min RMSE fort ridge on Single Family is: {}'.format(-rd_cv_sf.best_score_))

{'alpha': 76.66289057556017, 'fit_intercept': True, 'normalize': False}
Ridge(alpha=76.66289057556017)
Min RMSE fort ridge on Single Family is: 236.94456670464658


In [213]:
# R squared, MAE of best model
print(mean_absolute_error(y_val_sf, rd_cv_sf.best_estimator_.predict(X_val_sf)))
print(rd_cv_sf.best_estimator_.score(X_val_sf, y_val_sf))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.58155691099871
0.3395546745409962
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: KNeighborsRegressor

In [16]:
# random search for KNeighborsRegressor
param_knn_sf = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_sf = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_sf,
         n_iter=10,
         cv=ps_sf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(knn_cv_sf.best_params_)
print(knn_cv_sf.best_estimator_)
print('Min RMSE for Knn regressor on Single Family is: {}'.format(-knn_cv_sf.best_score_))

{'leaf_size': 5, 'n_neighbors': 18, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=5, n_neighbors=18, weights='distance')
Min RMSE for Knn regressor on Single Family is: 170.7286967851921


In [214]:
# R squared, MAE of best model
print(mean_absolute_error(y_val_sf, knn_cv_sf.best_estimator_.predict(X_val_sf)))
print(knn_cv_sf.best_estimator_.score(X_val_sf, y_val_sf))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

0.07647382492493975
0.9988083233217291
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: RandomForestRegressor

In [18]:
# random search for RandomForestRegressor
param_rf_sf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_sf = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(rf_cv_sf.best_params_)
print(rf_cv_sf.best_estimator_)
print('Min RMSE is for random forest regressor on Single Family is: {}'.format(-rf_cv_sf.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
Min RMSE is for random forest regressor on Single Family is: 136.8469916397392


In [215]:
# R squared, MAE of best model
print(mean_absolute_error(y_val_sf, rf_cv_sf.best_estimator_.predict(X_val_sf)))
print(rf_cv_sf.best_estimator_.score(X_val_sf, y_val_sf))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

34.78543529085287
0.885310985348233
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: MLPRegressor

In [20]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_sf = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_sf = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_sf,
         n_iter=10,
         cv=ps_sf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(mlp_cv_sf.best_params_)
print(mlp_cv_sf.best_estimator_)
print('Min RMSE for MLP regressor on Single Family is: {}'.format(-mlp_cv_sf.best_score_))

{'alpha': 1.8792349823745838, 'batch_size': 500, 'hidden_layer_sizes': 30, 'learning_rate_init': 0.08964140621713512}
MLPRegressor(alpha=1.8792349823745838, batch_size=500, hidden_layer_sizes=30,
             learning_rate_init=0.08964140621713512)
Min RMSE for MLP regressor on Single Family is: 178.02757159982698


In [217]:
# R squared, MAE of best model
print(mean_absolute_error(y_val_sf, mlp_cv_sf.best_estimator_.predict(X_val_sf)))
print(mlp_cv_sf.best_estimator_.score(X_val_sf, y_val_sf))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

73.1474578825477
0.620135671225518
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: XGBoost

In [22]:
# random search for XGBoost
param_xg_sf = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_sf = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_sf,
            n_iter=10,
            cv=ps_sf,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(xg_cv_sf.best_params_)
print(xg_cv_sf.best_estimator_)
print('Min RMSE for XGBoost on Single Family is: {}'.format(-xg_cv_sf.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 0.15522637752063606, 'max_depth': 11, 'min_child_weight': 3, 'n_estimators': 100}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=0.15522637752063606, max_delta_step=0,
             max_depth=11, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=16, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
       

In [216]:
# R squared, MAE of best model
print(mean_absolute_error(y_val_sf, xg_cv_sf.best_estimator_.predict(X_val_sf)))
print(xg_cv_sf.best_estimator_.score(X_val_sf, y_val_sf))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

31.23552542508914
0.9581277114423088
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Experiment on Multi Family

In [6]:
# MULTIFAMILY get training, validation and test data
X_train_mf = pd.read_csv('X_train_mf.csv')
X_train_mf.index = X_train_mf['Unnamed: 0']
X_train_mf = X_train_mf.drop(['Unnamed: 0'], axis=1)

X_val_mf = pd.read_csv('X_val_mf.csv')
X_val_mf.index = X_val_mf['Unnamed: 0']
X_val_mf = X_val_mf.drop(['Unnamed: 0'], axis=1)

X_test_mf = pd.read_csv('X_test_mf.csv')
X_test_mf.index = X_test_mf['Unnamed: 0']
X_test_mf = X_test_mf.drop(['Unnamed: 0'], axis=1)

y_train_mf = pd.read_csv('y_train_mf.csv')
y_train_mf.index = y_train_mf['Unnamed: 0']
y_train_mf = y_train_mf.drop(['Unnamed: 0'], axis=1)

y_val_mf = pd.read_csv('y_val_mf.csv')
y_val_mf.index = y_val_mf['Unnamed: 0']
y_val_mf = y_val_mf.drop(['Unnamed: 0'], axis=1)

y_test_mf = pd.read_csv('y_test_mf.csv')
y_test_mf.index = y_test_mf['Unnamed: 0']
y_test_mf = y_test_mf.drop(['Unnamed: 0'], axis=1)

In [7]:
# Multifamily
y_train_plus_val_mf = pd.concat([y_train_mf, y_val_mf])
y_train_plus_val_mf_copy = y_train_plus_val_mf.copy()
y_train_plus_val_mf_copy.columns = ['train_val_split']
y_train_plus_val_mf_copy.loc[y_train_mf.index,'train_val_split'] = -1
y_train_plus_val_mf_copy.loc[y_val_mf.index,'train_val_split'] = 0
val_fold_mf = np.array(y_train_plus_val_mf_copy)
ps_mf = PredefinedSplit(val_fold_mf)

# Multifamily training plus validation set
X_train_plus_val_mf = pd.concat([X_train_mf, X_val_mf])

### Multi Family: Linear regression

In [30]:
# random search for linear regression
param_lr_mf = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_mf = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_mf,
           n_iter=20,
           cv=ps_mf,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(lr_cv_mf.best_params_)
print(lr_cv_mf.best_estimator_)
print('MULTIFAMILY Linear min RMSE is: {}'.format(-lr_cv_mf.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
MULTIFAMILY Linear min RMSE is: 3656416892829.176


In [218]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mf, lr_cv_mf.best_estimator_.predict(X_val_mf)))
print(lr_cv_mf.best_estimator_.score(X_val_mf, y_val_mf))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

266.992634992259
0.19850866999144323
sale_price    587.030511
dtype: float64
sale_price    313.600881
dtype: float64


### Multi Family: Lasso

In [35]:
# random search for lasso
param_la_mf = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_mf = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_mf,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(la_cv_mf.best_params_)
print(la_cv_mf.best_estimator_)
print('MULTI FAMILY LASSO min RMSE is: {}'.format(-la_cv_mf.best_score_))

{'alpha': 1.5094374246471327, 'fit_intercept': True, 'normalize': True}
Lasso(alpha=1.5094374246471327, normalize=True)
MULTI FAMILY LASSO min RMSE is: 654.6961994584565


In [219]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mf, la_cv_mf.best_estimator_.predict(X_val_mf)))
print(la_cv_mf.best_estimator_.score(X_val_mf, y_val_mf))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

309.15303303433683
-0.00020552166424159246
sale_price    587.030511
dtype: float64
sale_price    313.600881
dtype: float64


### Multi Family: Ridge

In [225]:
# random search for ridge
param_rd = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_mf = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# Find best model hyperparameters 
print(rd_cv_mf.best_params_)
print(rd_cv_mf.best_estimator_)
print('MULTIFAMILY RIDGE min RMSE is: {}'.format(-rd_cv_mf.best_score_))

{'alpha': 1.5094374246471327, 'fit_intercept': True, 'normalize': True}
Ridge(alpha=1.5094374246471327, normalize=True)
MULTIFAMILY RIDGE min RMSE is: 1655.6082440221626


In [226]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mf, rd_cv_mf.best_estimator_.predict(X_val_mf)))
print(rd_cv_mf.best_estimator_.score(X_val_mf, y_val_mf))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

264.4113647181971
0.14286609062361233
sale_price    587.030511
dtype: float64
sale_price    313.600881
dtype: float64


### Multi Family: KNN Regressor

In [227]:
# random search for KNeighborsRegressor
param_knn = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_mf = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn,
         n_iter=10,
         cv=ps_mf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(knn_cv_mf.best_params_)
print(knn_cv_mf.best_estimator_)
print('MULTIFAMILY KNN min RMSE is: {}'.format(-knn_cv_mf.best_score_))

{'leaf_size': 3, 'n_neighbors': 7, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=3, n_neighbors=7, weights='distance')
MULTIFAMILY KNN min RMSE is: 410.45699249166245


In [229]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mf, knn_cv_mf.best_estimator_.predict(X_val_mf)))
print(knn_cv_mf.best_estimator_.score(X_val_mf, y_val_mf))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

2.771917422644056e-05
0.9999999999973338
sale_price    587.030511
dtype: float64
sale_price    313.600881
dtype: float64


### Multi Family: Random Forest Regressor

In [232]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_mf = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(rf_cv_mf.best_params_)
print(rf_cv_mf.best_estimator_)
print('RF MULTIFAMILY min RMSE is: {}'.format(-rf_cv_mf.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
RF MULTIFAMILY min RMSE is: 366.3338564046838


In [233]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mf, rf_cv_mf.best_estimator_.predict(X_val_mf)))
print(rf_cv_mf.best_estimator_.score(X_val_mf, y_val_mf))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

103.596674462354
0.8219045426649344
sale_price    587.030511
dtype: float64
sale_price    313.600881
dtype: float64


### Multi Family: MLP Regressor

In [234]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv_mf = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp,
         n_iter=10,
         cv=ps_mf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(mlp_cv_mf.best_params_)
print(mlp_cv_mf.best_estimator_)
print('MLPRegressor MULTIFAMILY min RMSE is: {}'.format(-mlp_cv_mf.best_score_))

{'alpha': 2.1610275095525036, 'batch_size': 5, 'hidden_layer_sizes': 2, 'learning_rate_init': 0.005784745785308777}
MLPRegressor(alpha=2.1610275095525036, batch_size=5, hidden_layer_sizes=2,
             learning_rate_init=0.005784745785308777)
MLPRegressor MULTIFAMILY min RMSE is: 587.1315626758294


In [236]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mf, mlp_cv_mf.best_estimator_.predict(X_val_mf)))
print(mlp_cv_mf.best_estimator_.score(X_val_mf, y_val_mf))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

239.80926186905052
0.23933739132286203
sale_price    587.030511
dtype: float64
sale_price    313.600881
dtype: float64


### Multi Family: XGboost Regressor

In [238]:
# random search for XGBoost
param_xg_mf = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_mf = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_mf,
            n_iter=10,
            cv=ps_mf,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(xg_cv_mf.best_params_)
print(xg_cv_mf.best_estimator_)
print('Min RMSE for XGBoost on Multi Family is: {}'.format(-xg_cv_mf.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 2.1610275095525036, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 1000}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=2.1610275095525036, max_delta_step=0,
             max_depth=4, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=16,
             nthread=-1, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha

In [239]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mf, xg_cv_mf.best_estimator_.predict(X_val_mf)))
print(xg_cv_mf.best_estimator_.score(X_val_mf, y_val_mf))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

44.782822914829595
0.9866987343552989
sale_price    587.030511
dtype: float64
sale_price    313.600881
dtype: float64


### Experiment on Commercial

In [8]:
# Commercial-- training, validation and test data
X_train_cm = pd.read_csv('X_train_cm.csv')
X_train_cm.index = X_train_cm['Unnamed: 0']
X_train_cm = X_train_cm.drop(['Unnamed: 0'], axis=1)

X_val_cm = pd.read_csv('X_val_cm.csv')
X_val_cm.index = X_val_cm['Unnamed: 0']
X_val_cm = X_val_cm.drop(['Unnamed: 0'], axis=1)

X_test_cm = pd.read_csv('X_test_cm.csv')
X_test_cm.index = X_test_cm['Unnamed: 0']
X_test_cm = X_test_cm.drop(['Unnamed: 0'], axis=1)

y_train_cm = pd.read_csv('y_train_cm.csv')
y_train_cm.index = y_train_cm['Unnamed: 0']
y_train_cm = y_train_cm.drop(['Unnamed: 0'], axis=1)

y_val_cm = pd.read_csv('y_val_cm.csv')
y_val_cm.index = y_val_cm['Unnamed: 0']
y_val_cm = y_val_cm.drop(['Unnamed: 0'], axis=1)

y_test_cm = pd.read_csv('y_test_cm.csv')
y_test_cm.index = y_test_cm['Unnamed: 0']
y_test_cm = y_test_cm.drop(['Unnamed: 0'], axis=1)

In [9]:
# Commercial Y train+val,--concate and prepare for "PredefinedSplit"
y_train_plus_val_cm = pd.concat([y_train_cm, y_val_cm])
y_train_plus_val_cm_copy = y_train_plus_val_cm.copy()
y_train_plus_val_cm_copy.columns = ['train_val_split']
y_train_plus_val_cm_copy.loc[y_train_cm.index,'train_val_split'] = -1
y_train_plus_val_cm_copy.loc[y_val_cm.index,'train_val_split'] = 0
val_fold_cm = np.array(y_train_plus_val_cm_copy)
ps_cm = PredefinedSplit(val_fold_cm)

# Commercial X train+val
X_train_plus_val_cm = pd.concat([X_train_cm, X_val_cm])

### Commercial: Linear Regression

In [52]:
# random search for linear regression
param_lr_cm = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_cm = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_cm,
           n_iter=20,
           cv=ps_cm,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(lr_cv_cm.best_params_)
print(lr_cv_cm.best_estimator_)
print('Commercial Linear min RMSE is: {}'.format(-lr_cv_cm.best_score_))

{'normalize': False, 'fit_intercept': True}
LinearRegression()
Commercial Linear min RMSE is: 766.4339369144163


In [240]:
# R2, MAE of best model
print(mean_absolute_error(y_val_cm, lr_cv_cm.best_estimator_.predict(X_val_cm)))
print(lr_cv_cm.best_estimator_.score(X_val_cm, y_val_cm))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

489.94002500000005
0.21716019679148635
sale_price    844.097943
dtype: float64
sale_price    555.724382
dtype: float64


### Commercial: Lasso

In [103]:
# random search for lasso
param_la_cm = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_cm = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_cm,
        n_iter=10,
        cv=ps_cm,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(la_cv_cm.best_params_)
print(la_cv_cm.best_estimator_)
print('Commercial LASSO min RMSE is: {}'.format(-la_cv_cm.best_score_))

{'alpha': 0.15463515822289586, 'fit_intercept': True, 'normalize': False}
Lasso(alpha=0.15463515822289586)
Commercial LASSO min RMSE is: 763.8345863711204


In [241]:
# R2, MAE of best model
print(mean_absolute_error(y_val_cm, la_cv_cm.best_estimator_.predict(X_val_cm)))
print(la_cv_cm.best_estimator_.score(X_val_cm, y_val_cm))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

485.474126120736
0.22080804536397258
sale_price    844.097943
dtype: float64
sale_price    555.724382
dtype: float64


### Commercial: Ridge

In [244]:
# random search for ridge
param_rd = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_cm = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd,
        n_iter=10,
        cv=ps_cm,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# Find best model hyperparameters 
print(rd_cv_cm.best_params_)
print(rd_cv_cm.best_estimator_)
print('Commercial RIDGE min RMSE is: {}'.format(-rd_cv_cm.best_score_))

{'alpha': 2.074024196289186, 'fit_intercept': False, 'normalize': False}
Ridge(alpha=2.074024196289186, fit_intercept=False)
Commercial RIDGE min RMSE is: 763.6040526368225


In [245]:
# R2, MAE of best model
print(mean_absolute_error(y_val_cm, rd_cv_cm.best_estimator_.predict(X_val_cm)))
print(rd_cv_cm.best_estimator_.score(X_val_cm, y_val_cm))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

485.9283472196909
0.2195952755094519
sale_price    844.097943
dtype: float64
sale_price    555.724382
dtype: float64


### Commercial: KNN Regressor

In [247]:
# random search for KNeighborsRegressor
param_knn = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_cm = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn,
         n_iter=10,
         cv=ps_cm,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(knn_cv_cm.best_params_)
print(knn_cv_cm.best_estimator_)
print('Commercial KNN min RMSE is: {}'.format(-knn_cv_cm.best_score_))

{'leaf_size': 5, 'n_neighbors': 18, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=5, n_neighbors=18, weights='distance')
Commercial KNN min RMSE is: 693.8594700478346


In [248]:
# R2, MAE of best model
print(mean_absolute_error(y_val_cm, knn_cv_cm.best_estimator_.predict(X_val_cm)))
print(knn_cv_cm.best_estimator_.score(X_val_cm, y_val_cm))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

3.070960508813651e-05
0.999999999999969
sale_price    844.097943
dtype: float64
sale_price    555.724382
dtype: float64


### Commercial: Random Forest Regressor

In [250]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_cm = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps_cm,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(rf_cv_cm.best_params_)
print(rf_cv_cm.best_estimator_)
print('RF Commercial min RMSE is: {}'.format(-rf_cv_cm.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
RF Commercial min RMSE is: 607.2888451618489


In [251]:
# R2, MAE of best model
print(mean_absolute_error(y_val_cm, rf_cv_cm.best_estimator_.predict(X_val_cm)))
print(rf_cv_cm.best_estimator_.score(X_val_cm, y_val_cm))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

221.43018079552252
0.7726925875474369
sale_price    844.097943
dtype: float64
sale_price    555.724382
dtype: float64


### Commercial: MLP Regressor

In [252]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv_cm = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp,
         n_iter=10,
         cv=ps_cm,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(mlp_cv_cm.best_params_)
print(mlp_cv_cm.best_estimator_)
print('MLPRegressor Commercial min RMSE is: {}'.format(-mlp_cv_cm.best_score_))

{'alpha': 0.03847481851752228, 'batch_size': 10, 'hidden_layer_sizes': (15, 15), 'learning_rate_init': 0.0002900807334178909}
MLPRegressor(alpha=0.03847481851752228, batch_size=10,
             hidden_layer_sizes=(15, 15),
             learning_rate_init=0.0002900807334178909)
MLPRegressor Commercial min RMSE is: 693.0975825470381


In [253]:
# R2, MAE of best model
print(mean_absolute_error(y_val_cm, mlp_cv_cm.best_estimator_.predict(X_val_cm)))
print(mlp_cv_cm.best_estimator_.score(X_val_cm, y_val_cm))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

449.60815793890663
0.26514292279413565
sale_price    844.097943
dtype: float64
sale_price    555.724382
dtype: float64


### Commercial: XGBoost

In [64]:
# random search for XGBoost
param_xg_cm = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_cm = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_cm,
            n_iter=10,
            cv=ps_cm,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(xg_cv_cm.best_params_)
print(xg_cv_cm.best_estimator_)
print('Min RMSE for XGBoost on Commercial is: {}'.format(-xg_cv_cm.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 1.474113973879948, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=1.474113973879948, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=16, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             

In [254]:
# R2, MAE of best model
print(mean_absolute_error(y_val_cm, xg_cv_cm.best_estimator_.predict(X_val_cm)))
print(xg_cv_cm.best_estimator_.score(X_val_cm, y_val_cm))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

112.68372798265496
0.9660649607132512
sale_price    844.097943
dtype: float64
sale_price    555.724382
dtype: float64


### Experiment on Industrial

In [10]:
#INDUSTRIAL get training, validation and test data
X_train_id = pd.read_csv('X_train_id.csv')
X_train_id.index = X_train_id['Unnamed: 0']
X_train_id = X_train_id.drop(['Unnamed: 0'], axis=1)

X_val_id = pd.read_csv('X_val_id.csv')
X_val_id.index = X_val_id['Unnamed: 0']
X_val_id = X_val_id.drop(['Unnamed: 0'], axis=1)

X_test_id = pd.read_csv('X_test_id.csv')
X_test_id.index = X_test_id['Unnamed: 0']
X_test_id = X_test_id.drop(['Unnamed: 0'], axis=1)

y_train_id = pd.read_csv('y_train_id.csv')
y_train_id.index = y_train_id['Unnamed: 0']
y_train_id = y_train_id.drop(['Unnamed: 0'], axis=1)

y_val_id = pd.read_csv('y_val_id.csv')
y_val_id.index = y_val_id['Unnamed: 0']
y_val_id = y_val_id.drop(['Unnamed: 0'], axis=1)

y_test_id = pd.read_csv('y_test_id.csv')
y_test_id.index = y_test_id['Unnamed: 0']
y_test_id = y_test_id.drop(['Unnamed: 0'], axis=1)

In [11]:
#Industrial
y_train_plus_val_id = pd.concat([y_train_id, y_val_id])
y_train_plus_val_id_copy = y_train_plus_val_id.copy()
y_train_plus_val_id_copy.columns = ['train_val_split']
y_train_plus_val_id_copy.loc[y_train_id.index,'train_val_split'] = -1
y_train_plus_val_id_copy.loc[y_val_id.index,'train_val_split'] = 0
val_fold_id = np.array(y_train_plus_val_id_copy)
ps_id = PredefinedSplit(val_fold_id)

# Industrial
X_train_plus_val_id = pd.concat([X_train_id, X_val_id])

### Industrial: Linear regression

In [68]:
# random search for linear regression
param_lr_id = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_id = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_id,
           n_iter=20,
           cv=3,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(lr_cv_id.best_params_)
print(lr_cv_id.best_estimator_)
print('INDUSTRIAL Linear min RMSE is: {}'.format(-lr_cv_id.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
INDUSTRIAL Linear min RMSE is: 136392782443401.25


In [255]:
# R2, MAE of best model
print(mean_absolute_error(y_val_id, lr_cv_id.best_estimator_.predict(X_val_id)))
print(lr_cv_id.best_estimator_.score(X_val_id, y_val_id))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

450.9776948362564
0.24756535504214017
sale_price    898.514899
dtype: float64
sale_price    613.953988
dtype: float64


### Industrial: Lasso

In [73]:
# random search for lasso
param_la_id = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_id = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_id,
        n_iter=10,
        cv=3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(la_cv_id.best_params_)
print(la_cv_id.best_estimator_)
print('INDUSTRIAL LASSO min RMSE is: {}'.format(-la_cv_id.best_score_))

{'alpha': 0.15463515822289586, 'fit_intercept': True, 'normalize': False}
Lasso(alpha=0.15463515822289586)
INDUSTRIAL LASSO min RMSE is: 715.861996559595


In [256]:
# R2, MAE of best model
print(mean_absolute_error(y_val_id, la_cv_id.best_estimator_.predict(X_val_id)))
print(la_cv_id.best_estimator_.score(X_val_id, y_val_id))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

470.55275997428225
0.27768140280979503
sale_price    898.514899
dtype: float64
sale_price    613.953988
dtype: float64


### Industrial: Ridge

In [75]:
# random search for ridge
param_id = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_id = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_id,
        n_iter=10,
        cv= 3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# Find best model hyperparameters 
print(rd_cv_id.best_params_)
print(rd_cv_id.best_estimator_)
print('INDUSTRIAL RIDGE min RMSE is: {}'.format(-rd_cv_id.best_score_))

{'alpha': 2.074024196289186, 'fit_intercept': False, 'normalize': False}
Ridge(alpha=2.074024196289186, fit_intercept=False)
INDUSTRIAL RIDGE min RMSE is: 715.6038223254677


In [257]:
# R2, MAE of best model
print(mean_absolute_error(y_val_id, rd_cv_id.best_estimator_.predict(X_val_id)))
print(rd_cv_id.best_estimator_.score(X_val_id, y_val_id))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

474.56352066651135
0.27959024969941126
sale_price    898.514899
dtype: float64
sale_price    613.953988
dtype: float64


### Industria: KNN

In [77]:
# random search for KNeighborsRegressor
param_knn_id = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_id = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_id,
         n_iter=10,
         cv= 3,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(knn_cv_id.best_params_)
print(knn_cv_id.best_estimator_)
print('INDUSTRIAL KNN min RMSE is: {}'.format(-knn_cv_id.best_score_))

{'leaf_size': 3, 'n_neighbors': 7, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=3, n_neighbors=7, weights='distance')
INDUSTRIAL KNN min RMSE is: 713.6699387011404


In [258]:
# R2, MAE of best model
print(mean_absolute_error(y_val_id, knn_cv_id.best_estimator_.predict(X_val_id)))
print(knn_cv_id.best_estimator_.score(X_val_id, y_val_id))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

2.8646295403808134e-06
0.9999999999999997
sale_price    898.514899
dtype: float64
sale_price    613.953988
dtype: float64


### Industrial: Random Forest

In [79]:
# random search for RandomForestRegressor
param_rf_id = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_id = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_id,
        n_iter=10,
        cv= 3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(rf_cv_id.best_params_)
print(rf_cv_id.best_estimator_)
print('RF MULTIFAMILY min RMSE is: {}'.format(-rf_cv_id.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
RF MULTIFAMILY min RMSE is: 639.9228466424411


In [259]:
# R2, MAE of best model
print(mean_absolute_error(y_val_id, rf_cv_id.best_estimator_.predict(X_val_id)))
print(rf_cv_id.best_estimator_.score(X_val_id, y_val_id))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

251.95317001457693
0.7381088168799455
sale_price    898.514899
dtype: float64
sale_price    613.953988
dtype: float64


### Industrial: MLP Regressor

In [81]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_id = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv_id = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_id,
         n_iter=10,
         cv= 3,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(mlp_cv_id.best_params_)
print(mlp_cv_id.best_estimator_)
print('MLPRegressor INDUSTRIAL min RMSE is: {}'.format(-mlp_cv_id.best_score_))

{'alpha': 2.1610275095525036, 'batch_size': 5, 'hidden_layer_sizes': 2, 'learning_rate_init': 0.005784745785308777}
MLPRegressor(alpha=2.1610275095525036, batch_size=5, hidden_layer_sizes=2,
             learning_rate_init=0.005784745785308777)
MLPRegressor INDUSTRIAL min RMSE is: 692.1191846844391


In [260]:
# R2, MAE of best model
print(mean_absolute_error(y_val_id, mlp_cv_id.best_estimator_.predict(X_val_id)))
print(mlp_cv_id.best_estimator_.score(X_val_id, y_val_id))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

444.42761757600897
0.38659829095330733
sale_price    898.514899
dtype: float64
sale_price    613.953988
dtype: float64


### Industrial: XGBoost

In [83]:
# random search for XGBoost
param_xg_id = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_id = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_id,
            n_iter=10,
            cv=3,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(xg_cv_id.best_params_)
print(xg_cv_id.best_estimator_)
print('Min RMSE for XGBoost on Industral is: {}'.format(-xg_cv_id.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 1.5094374246471327, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 1000}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=1.5094374246471327, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=16,
             nthread=-1, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha

In [261]:
# R2, MAE of best model
print(mean_absolute_error(y_val_id, xg_cv_id.best_estimator_.predict(X_val_id)))
print(xg_cv_id.best_estimator_.score(X_val_id, y_val_id))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

0.0016676647596619296
0.9999999999922164
sale_price    898.514899
dtype: float64
sale_price    613.953988
dtype: float64


### Experiment on Mixed Use

In [12]:
# get training, validation and test data for mixed use
X_train_mx = pd.read_csv('X_train_mx.csv')
X_train_mx.index = X_train_mx['Unnamed: 0']
X_train_mx = X_train_mx.drop(['Unnamed: 0'], axis=1)
X_val_mx = pd.read_csv('X_val_mx.csv')
X_val_mx.index = X_val_mx['Unnamed: 0']
X_val_mx = X_val_mx.drop(['Unnamed: 0'], axis=1)
X_test_mx = pd.read_csv('X_test_mx.csv')
X_test_mx.index = X_test_mx['Unnamed: 0']
X_test_mx = X_test_mx.drop(['Unnamed: 0'], axis=1)
y_train_mx = pd.read_csv('y_train_mx.csv')
y_train_mx.index = y_train_mx['Unnamed: 0']
y_train_mx = y_train_mx.drop(['Unnamed: 0'], axis=1)
y_val_mx = pd.read_csv('y_val_mx.csv')
y_val_mx.index = y_val_mx['Unnamed: 0']
y_val_mx = y_val_mx.drop(['Unnamed: 0'], axis=1)
y_test_mx = pd.read_csv('y_test_mx.csv')
y_test_mx.index = y_test_mx['Unnamed: 0']
y_test_mx = y_test_mx.drop(['Unnamed: 0'], axis=1)

In [13]:
# get training plus validation set
X_train_plus_val_mx = pd.concat([X_train_mx, X_val_mx])
y_train_plus_val_mx = pd.concat([y_train_mx, y_val_mx])

### Mixed Use: Linear regression

In [111]:
# random search for linear regression
param_lr_mx = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_mx = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_mx,
           n_iter=10,
           cv=5,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(lr_cv_mx.best_params_)
print(lr_cv_mx.best_estimator_)
print('Min RMSE for linear regression is: {}'.format(-lr_cv_mx.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE for linear regression is: 10122191127754.484


In [262]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mx, lr_cv_mx.best_estimator_.predict(X_val_mx)))
print(lr_cv_mx.best_estimator_.score(X_val_mx, y_val_mx))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

166.25926778386443
0.37279578013247006
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Mixed Use: Lasso

In [113]:
# random search for lasso
param_la_mx = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_mx = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(la_cv_mx.best_params_)
print(la_cv_mx.best_estimator_)
print('Min RMSE for lasso fro Mixed Use is: {}'.format(-la_cv_mx.best_score_))

{'alpha': 0.15463515822289586, 'fit_intercept': True, 'normalize': False}
Lasso(alpha=0.15463515822289586)
Min RMSE for lasso fro Mixed Use is: 261.77672139034263


In [269]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mx, la_cv_mx.best_estimator_.predict(X_val_mx)))
print(lr_cv_mx.best_estimator_.score(X_val_mx, y_val_mx))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

166.23560605571157
0.37279578013247006
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Mixed Use: Ridge

In [91]:
# random search for ridge
param_rd_mx = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_mx = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# Find best model hyperparameters 
print(rd_cv_mx.best_params_)
print(rd_cv_mx.best_estimator_)
print('Min RMSE for ridge on Mixed Use is: {}'.format(-rd_cv_mx.best_score_))

{'alpha': 76.66289057556017, 'fit_intercept': True, 'normalize': False}
Ridge(alpha=76.66289057556017)
Min RMSE for ridge on Mixed Use is: 261.77449765235895


In [270]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mx, rd_cv_mx.best_estimator_.predict(X_val_mx)))
print(rd_cv_mx.best_estimator_.score(X_val_mx, y_val_mx))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

165.30173216469058
0.37078952054923975
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Mixed Use: KNeighborsRegressor

In [115]:
# random search for KNeighborsRegressor
param_knn_mx = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_mx = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(knn_cv_mx.best_params_)
print(knn_cv_mx.best_estimator_)
print('Min RMSE for Knn regressor on Mixed Use is: {}'.format(-knn_cv_mx.best_score_))

{'leaf_size': 20, 'n_neighbors': 11, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=20, n_neighbors=11, weights='distance')
Min RMSE for Knn regressor on Mixed Use is: 240.56197162525683


In [271]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mx, knn_cv_mx.best_estimator_.predict(X_val_mx)))
print(knn_cv_mx.best_estimator_.score(X_val_mx, y_val_mx))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

3.2142814392896205e-06
0.9999999999999951
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Mixed Use: RandomForestRegressor

In [95]:
# random search for RandomForestRegressor
param_rf_mx = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_mx = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(rf_cv_mx.best_params_)
print(rf_cv_mx.best_estimator_)
print('Min RMSE is for random forest regressor on Mixed Use is: {}'.format(-rf_cv_mx.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
Min RMSE is for random forest regressor on Mixed Use is: 207.72343038919698


In [272]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mx, rf_cv_mx.best_estimator_.predict(X_val_mx)))
print(rf_cv_mx.best_estimator_.score(X_val_mx, y_val_mx))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

73.97172490228284
0.7960058326802726
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Mixed Use: MLPRegressor

In [97]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_mx = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_mx = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(mlp_cv_mx.best_params_)
print(mlp_cv_mx.best_estimator_)
print('Min RMSE for MLP regressor on Mixed Use is: {}'.format(-mlp_cv_mx.best_score_))

{'alpha': 2.765529811671396, 'batch_size': 20, 'hidden_layer_sizes': 4, 'learning_rate_init': 0.008664699052148592}
MLPRegressor(alpha=2.765529811671396, batch_size=20, hidden_layer_sizes=4,
             learning_rate_init=0.008664699052148592)
Min RMSE for MLP regressor on Mixed Use is: 236.55352605521776


In [273]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mx, mlp_cv_mx.best_estimator_.predict(X_val_mx)))
print(mlp_cv_mx.best_estimator_.score(X_val_mx, y_val_mx))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

136.4327826735772
0.5068681123950971
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Mixed Use: XGBoost

In [99]:
# random search for XGBoost
param_xg_mx = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_mx = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_mx,
            n_iter=10,
            cv=5,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(xg_cv_mx.best_params_)
print(xg_cv_mx.best_estimator_)
print('Min RMSE for XGBoost on Mixed Use is: {}'.format(-xg_cv_mx.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 0.20318358298265976, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 500}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=0.20318358298265976, max_delta_step=0,
             max_depth=6, min_child_weight=5, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=16, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
         

In [274]:
# R2, MAE of best model
print(mean_absolute_error(y_val_mx, xg_cv_mx.best_estimator_.predict(X_val_mx)))
print(xg_cv_mx.best_estimator_.score(X_val_mx, y_val_mx))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

4.2681320396697835
0.9996029786738734
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Experiment on Vacant Land

In [14]:
# vacant-- training, validation and test data
X_train_va = pd.read_csv('X_train_va.csv')
X_train_va.index = X_train_va['Unnamed: 0']
X_train_va = X_train_va.drop(['Unnamed: 0'], axis=1)

X_val_va = pd.read_csv('X_val_va.csv')
X_val_va.index = X_val_va['Unnamed: 0']
X_val_va = X_val_va.drop(['Unnamed: 0'], axis=1)

X_test_va = pd.read_csv('X_test_va.csv')
X_test_va.index = X_test_va['Unnamed: 0']
X_test_va = X_test_va.drop(['Unnamed: 0'], axis=1)

y_train_va = pd.read_csv('y_train_va.csv')
y_train_va.index = y_train_va['Unnamed: 0']
y_train_va = y_train_va.drop(['Unnamed: 0'], axis=1)

y_val_va = pd.read_csv('y_val_va.csv')
y_val_va.index = y_val_va['Unnamed: 0']
y_val_va = y_val_va.drop(['Unnamed: 0'], axis=1)

y_test_va = pd.read_csv('y_test_va.csv')
y_test_va.index = y_test_va['Unnamed: 0']
y_test_va = y_test_va.drop(['Unnamed: 0'], axis=1)

In [15]:
# Vacant Y train+val,--concate and prepare for "PredefinedSplit"
y_train_plus_val_va = pd.concat([y_train_va, y_val_va])
y_train_plus_val_va_copy = y_train_plus_val_va.copy()
y_train_plus_val_va_copy.columns = ['train_val_split']
y_train_plus_val_va_copy.loc[y_train_va.index,'train_val_split'] = -1
y_train_plus_val_va_copy.loc[y_val_va.index,'train_val_split'] = 0
val_fold_va = np.array(y_train_plus_val_va_copy)
ps_va = PredefinedSplit(val_fold_va)

# Vacant X train+val
X_train_plus_val_va = pd.concat([X_train_va, X_val_va])

### Vacant Land: Linear regression

In [119]:
# random search for linear regression
param_lr_va = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_va = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_va,
           n_iter=10,
           cv=5,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(lr_cv_va.best_params_)
print(lr_cv_va.best_estimator_)
print('Min RMSE for linear regression is: {}'.format(-lr_cv_va.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE for linear regression is: 10051643301709.084


In [276]:
# R2, MAE of best model
print(mean_absolute_error(y_val_va, lr_cv_va.best_estimator_.predict(X_val_va)))
print(lr_cv_va.best_estimator_.score(X_val_va, y_val_va))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

156.21638625937769
0.09859158429206194
sale_price    329.708793
dtype: float64
sale_price    205.839057
dtype: float64


### Vacant Land: Lasso

In [121]:
# random search for lasso
param_la_va = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_va = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_va,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(la_cv_va.best_params_)
print(la_cv_va.best_estimator_)
print('Min RMSE for lasso fro Mixed Use is: {}'.format(-la_cv_va.best_score_))

{'alpha': 0.07684071705306554, 'fit_intercept': False, 'normalize': True}
Lasso(alpha=0.07684071705306554, fit_intercept=False, normalize=True)
Min RMSE for lasso fro Mixed Use is: 331.1095738473251


In [277]:
# R2, MAE of best model
print(mean_absolute_error(y_val_va, la_cv_va.best_estimator_.predict(X_val_va)))
print(la_cv_va.best_estimator_.score(X_val_va, y_val_va))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

155.89813219865084
0.09730745187488532
sale_price    366.61416
dtype: float64
sale_price    170.903947
dtype: float64


### Vacant Land: Ridge

In [123]:
# random search for ridge
param_rd_va = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_va = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_va,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# Find best model hyperparameters 
print(rd_cv_va.best_params_)
print(rd_cv_va.best_estimator_)
print('Min RMSE for ridge on Vacant Land is: {}'.format(-rd_cv_va.best_score_))

{'alpha': 1.5094374246471327, 'fit_intercept': True, 'normalize': True}
Ridge(alpha=1.5094374246471327, normalize=True)
Min RMSE for ridge on Vacant Land is: 329.3994836555059


In [278]:
# R2, MAE of best model
print(mean_absolute_error(y_val_va, rd_cv_va.best_estimator_.predict(X_val_va)))
print(rd_cv_va.best_estimator_.score(X_val_va, y_val_va))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

160.61124378646315
0.05043984161035908
sale_price    366.61416
dtype: float64
sale_price    170.903947
dtype: float64


### Vacant Land: KNN Regressor

In [125]:
# random search for KNeighborsRegressor
param_knn_va = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_va = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_va,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(knn_cv_va.best_params_)
print(knn_cv_va.best_estimator_)
print('Min RMSE for Knn regressor on Mixed Use is: {}'.format(-knn_cv_va.best_score_))

{'leaf_size': 20, 'n_neighbors': 15, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=20, n_neighbors=15, weights='distance')
Min RMSE for Knn regressor on Mixed Use is: 267.6925559640128


In [279]:
# R2, MAE of best model
print(mean_absolute_error(y_val_va, knn_cv_va.best_estimator_.predict(X_val_va)))
print(knn_cv_va.best_estimator_.score(X_val_va, y_val_va))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

1.4553803074263698e-05
0.9999999999999154
sale_price    366.61416
dtype: float64
sale_price    170.903947
dtype: float64


### Vacant Land: Random Forest Regressor

In [127]:
# random search for RandomForestRegressor
param_rf_va = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_va = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_va,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(rf_cv_va.best_params_)
print(rf_cv_va.best_estimator_)
print('Min RMSE is for random forest regressor on Mixed Use is: {}'.format(-rf_cv_va.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
Min RMSE is for random forest regressor on Mixed Use is: 253.87659968943603


In [280]:
# R2, MAE of best model
print(mean_absolute_error(y_val_va, rf_cv_va.best_estimator_.predict(X_val_va)))
print(rf_cv_va.best_estimator_.score(X_val_va, y_val_va))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

68.28386125058998
0.6509289778863712
sale_price    366.61416
dtype: float64
sale_price    170.903947
dtype: float64


### Vacant Land: MLP Regressor

In [129]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_va = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_va = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(mlp_cv_va.best_params_)
print(mlp_cv_va.best_estimator_)
print('Min RMSE for MLP regressor on Mixed Use is: {}'.format(-mlp_cv_va.best_score_))

{'alpha': 0.0011051954732269518, 'batch_size': 100, 'hidden_layer_sizes': (15, 15), 'learning_rate_init': 0.0002900807334178909}
MLPRegressor(alpha=0.0011051954732269518, batch_size=100,
             hidden_layer_sizes=(15, 15),
             learning_rate_init=0.0002900807334178909)
Min RMSE for MLP regressor on Mixed Use is: 317.7144101127273


In [281]:
# R2, MAE of best model
print(mean_absolute_error(y_val_va, mlp_cv_va.best_estimator_.predict(X_val_va)))
print(mlp_cv_va.best_estimator_.score(X_val_va, y_val_va))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

128.59049613054012
0.23827270262750644
sale_price    366.61416
dtype: float64
sale_price    170.903947
dtype: float64


### Vacant Land: XGBoost

In [131]:
# random search for XGBoost
param_xg_va = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_va = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_mx,
            n_iter=10,
            cv=5,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(xg_cv_va.best_params_)
print(xg_cv_va.best_estimator_)
print('Min RMSE for XGBoost on Mixed Use is: {}'.format(-xg_cv_va.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 1.5094374246471327, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 1000}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=1.5094374246471327, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=16,
             nthread=-1, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha

In [282]:
# R2, MAE of best model
print(mean_absolute_error(y_val_va, xg_cv_va.best_estimator_.predict(X_val_va)))
print(xg_cv_va.best_estimator_.score(X_val_va, y_val_va))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

10.912587082613774
0.9980158591177141
sale_price    366.61416
dtype: float64
sale_price    170.903947
dtype: float64


### Best model retraining and testing

### Single Family

In [287]:
# retrain and compute r2, rmse and mae
best_sf = xg_cv_sf.best_estimator_.fit(X_train_plus_val_sf, y_train_plus_val_sf)
print(mean_squared_error(y_test_sf, best_sf.predict(X_test_sf), squared=False))
print(mean_absolute_error(y_test_sf, best_sf.predict(X_test_sf)))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


266.5430907257662
95.60268747334048


### Multi Family

In [286]:
# retrain and compute rmse and mae
best_mf = xg_cv_mf.best_estimator_.fit(X_train_plus_val_mf, y_train_plus_val_mf)
print(mean_squared_error(y_test_mf, best_mf.predict(X_test_mf), squared=False))
print(mean_absolute_error(y_test_mf, best_mf.predict(X_test_mf)))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


360.5010708535746
167.06683443636098


### Commercial

In [294]:
# retrain and compute rmse and mae
best_cm = xg_cv_cm.best_estimator_.fit(X_train_plus_val_cm, y_train_plus_val_cm)

print(mean_squared_error(y_test_cm, best_cm.predict(X_test_cm),squared=False))
print(mean_absolute_error(y_test_cm, best_cm.predict(X_test_cm)))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


692.8512568351597
416.4571727100758


### Industrial

In [293]:
# retrain and compute rmse and mae
best_id = rf_cv_id.best_estimator_.fit(X_train_plus_val_id, y_train_plus_val_id)
print(mean_squared_error(y_test_id, best_id.predict(X_test_id),squared=False))
print(mean_absolute_error(y_test_id, best_id.predict(X_test_id)))

649.6924673878802
404.5770816435752


### Mixed Use

In [289]:
# retrain and compute rmse and mae
best_mx = rf_cv_mx.best_estimator_.fit(X_train_plus_val_mx, y_train_plus_val_mx)
print(r2_score(y_test_mx, best_mx.predict(X_test_mx)))
print(mean_squared_error(y_test_mx, best_mx.predict(X_test_mx),squared=False))
print(mean_absolute_error(y_test_mx, best_mx.predict(X_test_mx)))

0.5879191185749639
210.38400528985744
101.5975210098866


### Vacant Land

In [292]:
# retrain and compute rmse and mae
best_va = xg_cv_va.best_estimator_.fit(X_train_plus_val_va, y_train_plus_val_va)
print(mean_squared_error(y_test_va, best_va.predict(X_test_va),squared=False))
print(mean_absolute_error(y_test_va, best_va.predict(X_test_va)))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


295.2098092945048
139.06310158084332


In [206]:
# pickle best model out
import pickle
with open('best_sf.pickle', 'wb') as file:
    pickle.dump(best_sf, file)
with open('best_mf.pickle','wb') as file:
    pickle.dump(best_mf, file)
with open('best_cm.pickle','wb') as file:
    pickle.dump(best_cm, file)
with open('best_id.pickle','wb') as file:
    pickle.dump(best_id, file)
with open('best_mx.pickle','wb') as file:
    pickle.dump(best_mx, file)
with open('best_va.pickle','wb') as file:
    pickle.dump(best_va, file)                