In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import pickle
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import src.utils as utils

Using TensorFlow backend.


# Condos

In [2]:
data_filename = 'data/features/CON_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [3]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [4]:
train_features = X_train_dict['zillow_redfin']
val_features = X_val_dict['zillow_redfin']

In [5]:
# Ridge
filename = 'models/soldprice/data_models/condo_price_zillow-redfin_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(1, 1.5, 2, 2.5, 3))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  1.0
----- Training scores -----
R2 on log scale:  0.6382971002880207
MAE on log scale:  0.3185230138089127
MAE on original $ scale:  160144.8875919427
----- Validation scores -----
R2 on log scale:  0.6292163815737918
MAE on log scale:  0.3171481689170869
MAE on original $ scale:  192083.79349019701


In [6]:
# XGBoost
filename = 'models/soldprice/data_models/condo_price_zillow-redfin_XGBoost.pkl'
params = {
    'max_depth':range(21,28,2),
    'gamma':[i/10.0 for i in range(0,3)],
    'reg_alpha':[1e-2, 0.1, 1, 10]
}

print("XGBoost model: ")
# t0 = time.time()

# model
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.0, learning_rate=0.1, max_delta_step=0,
       max_depth=23, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.9865527050239752
MAE on log scale:  0.057272628423993964
MAE on original $ scale:  26947.47150444139
----- Validation scores -----
R2 on log scale:  0.9452556746101984
MAE on log scale:  0.11473856539000461
MAE on original $ scale:  57163.18539364009


In [7]:
# Light GBM
filename = 'models/soldprice/data_models/condo_price_zillow-redfin_LGBM.pkl'
params = {'num_leaves': [30, 100, 200], 
          'max_depth':[-1, 16, 32, 64], 
          'learning_rate':[0.01, 0.1, 1], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
# t0 = time.time()

# model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=16, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=2048,
       n_jobs=-1, num_leaves=30, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.9832844145177274
MAE on log scale:  0.059777141672348155
MAE on original $ scale:  26655.79428002409
----- Validation scores -----
R2 on log scale:  0.9475539824570391
MAE on log scale:  0.11431590043015316
MAE on original $ scale:  57069.10543402096


### Test-Train Split, Response = 'DOM'

In [8]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [9]:
train_features = X_train_dict['zillow_redfin']
val_features = X_val_dict['zillow_redfin']

In [10]:
# Ridge
filename = 'models/dom/data_models/condo_dom_zillow-redfin_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(0.5, 1, 1.5, 2))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  1.0
----- Training scores -----
R2 on log scale:  0.07316034102448887
MAE on log scale:  0.19855581322303706
MAE on original $ scale:  0.7072533653093103
----- Validation scores -----
R2 on log scale:  0.05862482536363722
MAE on log scale:  0.2050005368842763
MAE on original $ scale:  0.7290693080585253


In [11]:
# XGBoost
filename = 'models/dom/data_models/condo_dom_zillow-redfin_XGBoost.pkl'
params = {
    'max_depth':range(18,23,2),
    'gamma':[i/10.0 for i in range(0,3)],
    'reg_alpha':[1e-2, 0.1, 1, 10]
}

print("XGBoost model: ")
# t0 = time.time()

# # model
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.1, max_delta_step=0,
       max_depth=18, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.4750179091072324
MAE on log scale:  0.15055135281140122
MAE on original $ scale:  0.5364716456178968
----- Validation scores -----
R2 on log scale:  0.1779568551006866
MAE on log scale:  0.19025634498462451
MAE on original $ scale:  0.6762753617847805


In [14]:
# Light GBM
filename = 'models/dom/data_models/condo_dom_zillow-redfin_LGBM.pkl'
params = {'num_leaves': [30, 100, 200], 
          'max_depth':[-1, 16, 32, 64], 
          'learning_rate':[0.01, 0.1, 1], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
t0 = time.time()

# model
model = model = LGBMRegressor(random_state=9001)
grid = GridSearchCV(model, params, verbose=1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=1)]: Done 324 out of 324 | elapsed:  8.8min finished


training time:  535.0616109371185
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.01, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=512,
       n_jobs=-1, num_leaves=200, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.4235505747964915
MAE on log scale:  0.1572258950778728
MAE on original $ scale:  0.5590334326958408
----- Validation scores -----
R2 on log scale:  0.18789981237809472
MAE on log scale:  0.1894706139118776
MAE on original $ scale:  0.6738703637160046


# Multi Families

In [15]:
data_filename = 'data/features/MF_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [16]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [17]:
train_features = X_train_dict['zillow_redfin']
val_features = X_val_dict['zillow_redfin']

In [18]:
# Ridge
filename = 'models/soldprice/data_models/mf_price_zillow-redfin_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  2.5
----- Training scores -----
R2 on log scale:  0.5417342593516201
MAE on log scale:  0.34693197029251505
MAE on original $ scale:  241020.27993794376
----- Validation scores -----
R2 on log scale:  0.5362543705460012
MAE on log scale:  0.3503044007103347
MAE on original $ scale:  165764.26074559786


In [19]:
# XGBoost
filename = 'models/soldprice/data_models/mf_price_zillow-redfin_XGBoost.pkl'
params = {
    'max_depth':range(21,28,2),
    'gamma':[i/10.0 for i in range(0,3)],
    'reg_alpha':[1e-2, 0.1, 1, 10]
}

print("XGBoost model: ")
# t0 = time.time()

# model
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.3, learning_rate=0.1, max_delta_step=0,
       max_depth=27, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.926527391605652
MAE on log scale:  0.14671760425120417
MAE on original $ scale:  66488.96989508197
----- Validation scores -----
R2 on log scale:  0.8108434310530985
MAE on log scale:  0.21652329641836707
MAE on original $ scale:  94344.66500234966


In [20]:
# Light GBM
filename = 'models/soldprice/data_models/mf_price_zillow-redfin_LGBM.pkl'
params = {'num_leaves': [30, 100, 200], 
          'max_depth':[-1, 16, 32, 64], 
          'learning_rate':[0.01, 0.1, 1], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
# t0 = time.time()

# model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.01, max_depth=8, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=1024,
       n_jobs=-1, num_leaves=100, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.8931280770060035
MAE on log scale:  0.15417419475347055
MAE on original $ scale:  65270.60297681228
----- Validation scores -----
R2 on log scale:  0.8197024078169651
MAE on log scale:  0.21232030963747797
MAE on original $ scale:  91166.02958921483


### Test-Train Split, Response = 'DOM'

In [21]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [22]:
# Ridge
filename = 'models/dom/data_models/mf_dom_zillow-redfin_ridge.pkl'

print("Ridge model: ")
t0 = time.time()

# train and save model
model = RidgeCV(alphas=(0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4))
utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
training time:  0.10653996467590332
best alpha:  1.5
----- Training scores -----
R2 on log scale:  0.05096114416778208
MAE on log scale:  0.20394676057522335
MAE on original $ scale:  0.7484476129422661
----- Validation scores -----
R2 on log scale:  0.056166591716771186
MAE on log scale:  0.20408252074843689
MAE on original $ scale:  0.750006292696872


In [27]:
# XGBoost
filename = 'models/dom/data_models/mf_dom_zillow-redfin_XGBoost.pkl'
params = {
    'max_depth':range(13,22,2),
    'gamma':[i/10.0 for i in range(0,3)],
    'reg_alpha':[1, 10, 15]
}

print("XGBoost model: ")
t0 = time.time()

# model
model = XGBRegressor(random_seed=9001)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  2.3min finished


training time:  142.94370102882385
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.1, max_delta_step=0,
       max_depth=17, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=10, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.127076427344058
MAE on log scale:  0.1951283940892945
MAE on original $ scale:  0.7163421743382833
----- Validation scores -----
R2 on log scale:  0.09335287704717898
MAE on log scale:  0.19958877655409699
MAE on original $ scale:  0.7333152267167388


In [26]:
# Light GBM
filename = 'models/dom/data_models/mf_dom_zillow-redfin_LGBM.pkl'
params = {'num_leaves': [15, 30, 100], 
          'max_depth':[-1, 8, 16, 32], 
          'learning_rate':[0.001, 0.01, 0.1], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
t0 = time.time()

# model
model = model = LGBMRegressor(random_state=9001)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 34.8min finished


training time:  2091.482544183731
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.01, max_depth=8, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=256,
       n_jobs=-1, num_leaves=30, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.15583573819524577
MAE on log scale:  0.19176706626067455
MAE on original $ scale:  0.7031587964246946
----- Validation scores -----
R2 on log scale:  0.09327083797444657
MAE on log scale:  0.20005387893895954
MAE on original $ scale:  0.7350927332839327
