In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import pickle
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import src.utils as utils

Using TensorFlow backend.


# Condos

In [2]:
data_filename = 'data/features/CON_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [3]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [4]:
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

In [5]:
# Ridge
filename = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(0.5, 1, 1.5, 2, 2.5, 3))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  1.0
----- Training scores -----
R2 on log scale:  0.7587561565525761
MAE on log scale:  0.2584223390492803
MAE on original $ scale:  130983.78878844756
----- Validation scores -----
R2 on log scale:  0.7712947586642951
MAE on log scale:  0.25271014378069906
MAE on original $ scale:  132988.4900187464


In [6]:
# XGBoost
filename = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_XGBoost.pkl'
params = {
    'max_depth':range(27,35,2),
    'gamma':[i/10.0 for i in range(1,4)],
    'reg_alpha':[0, 1e-3, 1e-2],
    'reg_lambda': [0, 1e-2, 1, 10]
}

print("XGBoost model: ")
# t0 = time.time()

# # model
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.1, max_delta_step=0,
       max_depth=31, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=0.001, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.9887095718184126
MAE on log scale:  0.06005614430488262
MAE on original $ scale:  28661.09152060865
----- Validation scores -----
R2 on log scale:  0.9441967158606474
MAE on log scale:  0.11978942918052694
MAE on original $ scale:  59636.02004450111


In [7]:
# Light GBM
filename = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_LGBM.pkl'
params = {'num_leaves': [15, 30, 100], 
          'max_depth':[-1, 8, 16, 32], 
          'n_estimators':[256, 512, 1024]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=1024,
       n_jobs=-1, num_leaves=30, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.9833154140777688
MAE on log scale:  0.06524832844490958
MAE on original $ scale:  30449.832169494464
----- Validation scores -----
R2 on log scale:  0.9502263365705589
MAE on log scale:  0.11522078474979952
MAE on original $ scale:  56385.44643313938


### Test-Train Split, Response = 'DOM'

In [8]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [9]:
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

In [10]:
# Ridge
filename = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(0.5, 1, 1.5, 2, 2.5, 3))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  1.0
----- Training scores -----
R2 on log scale:  0.09391412332366034
MAE on log scale:  0.19593081642740193
MAE on original $ scale:  0.6978604053440208
----- Validation scores -----
R2 on log scale:  0.0805092942425577
MAE on log scale:  0.20153137067438764
MAE on original $ scale:  0.7168191172931464


In [12]:
# XGBoost
filename = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_XGBoost.pkl'
params = {
    'max_depth':[13, 15, 17, 19],
    'reg_lambda':[10, 100, 1000],
    'n_estimators': [100, 128, 256],
    'gamma':[0, 0.1, 0.2]
}

print("XGBoost model: ")
# t0 = time.time()

# # model
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.1, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=nan, n_estimators=128,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=0, reg_lambda=100, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.5766418579767049
MAE on log scale:  0.13365997794084788
MAE on original $ scale:  0.47641091119407675
----- Validation scores -----
R2 on log scale:  0.2116770931417471
MAE on log scale:  0.18631890659683883
MAE on original $ scale:  0.6626776763184433


In [13]:
# Light GBM
filename = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_LGBM.pkl'
params = {'num_leaves': [5, 15, 30], 
          'n_estimators':[256, 512, 1024],
          'reg_lambda': [10, 100, 1000]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=512,
       n_jobs=-1, num_leaves=15, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=100, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.35650885971385937
MAE on log scale:  0.1656770789661144
MAE on original $ scale:  0.5891632116867122
----- Validation scores -----
R2 on log scale:  0.20091545278203393
MAE on log scale:  0.18754303170396325
MAE on original $ scale:  0.6670959743919369


# Multi Families

In [14]:
data_filename = 'data/features/MF_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [15]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [16]:
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

In [17]:
# Ridge
filename = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(1.5, 2, 2.5, 3, 3.5, 4))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  2.0
----- Training scores -----
R2 on log scale:  0.646918645149337
MAE on log scale:  0.30800555546826375
MAE on original $ scale:  188322.00845235246
----- Validation scores -----
R2 on log scale:  0.6463869875368184
MAE on log scale:  0.3136232311077448
MAE on original $ scale:  153748.89968234452


In [18]:
# XGBoost
filename = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_XGBoost.pkl'
params = {
    'max_depth':range(11,19,2),
    'reg_lambda':[10, 100, 1000],
    'n_estimators':[256, 512, 1024]
}

print("XGBoost model: ")
t0 = time.time()

# model
model = XGBRegressor(random_seed=9001, gamma=0.2)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 17.4min finished


training time:  1141.4351761341095
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.1, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=nan, n_estimators=1024,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=0, reg_lambda=100, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.9551695422095061
MAE on log scale:  0.1073301908582671
MAE on original $ scale:  54577.89475918716
----- Validation scores -----
R2 on log scale:  0.8643345656117478
MAE on log scale:  0.18299987729945838
MAE on original $ scale:  88428.55361254701


In [19]:
# Light GBM
filename = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_LGBM.pkl'
params = {'reg_lambda': [10, 100, 500], 
          'learning_rate':[0.01, 0.1, 1], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=256,
       n_jobs=-1, num_leaves=31, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=100, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.9262942270322592
MAE on log scale:  0.12721437334501387
MAE on original $ scale:  63813.53537647143
----- Validation scores -----
R2 on log scale:  0.866623468802058
MAE on log scale:  0.1837046222519254
MAE on original $ scale:  88828.51452154995


### Test-Train Split, Response = 'DOM'

In [20]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [21]:
# Ridge
filename = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  3.0
----- Training scores -----
R2 on log scale:  0.07096800075294174
MAE on log scale:  0.2007738442777016
MAE on original $ scale:  0.7368750429455887
----- Validation scores -----
R2 on log scale:  0.07454910913310497
MAE on log scale:  0.202025197088306
MAE on original $ scale:  0.742053105983663


In [22]:
# XGBoost
filename = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_XGBoost.pkl'
params = {
    'max_depth':[15, 17, 19, 21],
    'reg_alpha':[1, 10, 100],
    'reg_lambda':[1, 10, 100]
}

print("XGBoost model: ")
t0 = time.time()

# model
model = XGBRegressor(random_seed=9001, n_estimators=1024, gamma=0.2)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 32.7min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 58.2min finished


training time:  3615.5078217983246
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.1, max_delta_step=0,
       max_depth=17, min_child_weight=1, missing=nan, n_estimators=1024,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=10, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.258027782823984
MAE on log scale:  0.1790049879985472
MAE on original $ scale:  0.6577509580107286
----- Validation scores -----
R2 on log scale:  0.10823341540566633
MAE on log scale:  0.19775306616692492
MAE on original $ scale:  0.7259870751366181


In [23]:
# Light GBM
filename = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_LGBM.pkl'
params = {'reg_lambda': [1, 10, 100], 
          'learning_rate':[0.001, 0.01, 0.1], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.01, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=256,
       n_jobs=-1, num_leaves=31, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=10, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.19092918582589513
MAE on log scale:  0.18737007378188933
MAE on original $ scale:  0.6877907178985545
----- Validation scores -----
R2 on log scale:  0.10777679896716319
MAE on log scale:  0.19830314917487604
MAE on original $ scale:  0.7288005479257166
