In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import pickle
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import src.utils as utils

Using TensorFlow backend.


# Condos

In [12]:
data_filename = 'data/features/CON_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [13]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [14]:
train_features = X_train_dict['zillow_remarks']
val_features = X_val_dict['zillow_remarks']

In [15]:
# Ridge
filename = 'models/soldprice/data_models/condo_price_zillow-remarks_ridge.pkl'

print("Ridge model: ")
t0 = time.time()

# train and save model
model = RidgeCV(alphas=(1, 1.5, 2, 2.5, 3))
utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
training time:  0.15759015083312988
best alpha:  2.0
----- Training scores -----
R2 on log scale:  0.6705712283553439
MAE on log scale:  0.3078468628727439
MAE on original $ scale:  153936.71136244186
----- Validation scores -----
R2 on log scale:  0.6864780959299577
MAE on log scale:  0.30140393307309715
MAE on original $ scale:  154081.03146970808


In [16]:
# XGBoost
filename = 'models/soldprice/data_models/condo_price_zillow-remarks_XGBoost.pkl'
params = {
    'max_depth':range(21,28,2),
    'gamma':[0, 0.1, 0.2, 0.3],
    'reg_alpha':[0, 0.1, 1, 10]
}

print("XGBoost model: ")
t0 = time.time()

# model
model = XGBRegressor(random_seed=9001)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 18.3min finished


training time:  1139.8139328956604
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.1, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=nan, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.9771803247712602
MAE on log scale:  0.08189391623228845
MAE on original $ scale:  39509.99570418232
----- Validation scores -----
R2 on log scale:  0.9395334183956024
MAE on log scale:  0.1269122734582407
MAE on original $ scale:  62236.18614504459


In [19]:
# Light GBM
filename = 'models/soldprice/data_models/condo_price_zillow-remarks_LGBM.pkl'
params = {'num_leaves': [30, 100, 200], 
          'max_depth':[-1, 16, 32, 64], 
          'learning_rate':[0.01, 0.1, 1], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
t0 = time.time()

# model
model = model = LGBMRegressor(random_state=9001)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 46.0min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 79.9min finished


training time:  4799.0780737400055
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=16, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=512,
       n_jobs=-1, num_leaves=100, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.9895552173922343
MAE on log scale:  0.04976102906136195
MAE on original $ scale:  23754.32347306006
----- Validation scores -----
R2 on log scale:  0.9439960222083035
MAE on log scale:  0.12278620320321153
MAE on original $ scale:  60610.97039935996


### Test-Train Split, Response = 'DOM'

In [20]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [21]:
train_features = X_train_dict['zillow_remarks']
val_features = X_val_dict['zillow_remarks']

In [32]:
# Ridge
filename = 'models/dom/data_models/condo_dom_zillow-remarks_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
training time:  0.18402409553527832
best alpha:  1.5
----- Training scores -----
R2 on log scale:  0.06900401921636756
MAE on log scale:  0.1988136406183637
MAE on original $ scale:  0.7082016501478575
----- Validation scores -----
R2 on log scale:  0.06020260159779933
MAE on log scale:  0.2038915604531875
MAE on original $ scale:  0.7253309398254354


In [30]:
# XGBoost
filename = 'models/dom/data_models/condo_dom_zillow-remarks_XGBoost.pkl'
params = {
    'max_depth':[9, 11, 13, 15],
    'n_estimators':[128, 256, 512],
    'reg_alpha':[0, 1e1, 1e2],
}

print("XGBoost model: ")
# t0 = time.time()

# # model
# model = XGBRegressor(random_seed=9001, gamma=0.1)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 15.0min finished


training time:  952.4606530666351
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.1, max_delta_step=0,
       max_depth=11, min_child_weight=1, missing=nan, n_estimators=256,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=10.0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.44451199115986195
MAE on log scale:  0.1535530765408908
MAE on original $ scale:  0.5466891659291848
----- Validation scores -----
R2 on log scale:  0.1818231394125681
MAE on log scale:  0.18938881288405754
MAE on original $ scale:  0.6737049686241011


In [25]:
# Light GBM
filename = 'models/dom/data_models/condo_dom_zillow-remarks_LGBM.pkl'
params = {'num_leaves': [15, 30, 100], 
          'reg_lambda':[0, 1, 1.5], 
          'n_estimators':[128, 256, 512]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  8.5min finished


training time:  508.57879877090454
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=256,
       n_jobs=-1, num_leaves=30, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=1, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.4038676230451548
MAE on log scale:  0.1594986525650244
MAE on original $ scale:  0.5667214106786268
----- Validation scores -----
R2 on log scale:  0.18477574153877074
MAE on log scale:  0.18936397267693889
MAE on original $ scale:  0.6737052929500423


# Multi Families

In [2]:
data_filename = 'data/features/MF_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [3]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [4]:
train_features = X_train_dict['zillow_remarks']
val_features = X_val_dict['zillow_remarks']

In [5]:
# Ridge
filename = 'models/soldprice/data_models/mf_price_zillow-remarks_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  2.0
----- Training scores -----
R2 on log scale:  0.5022367730434779
MAE on log scale:  0.3779949827210046
MAE on original $ scale:  267389.53535967384
----- Validation scores -----
R2 on log scale:  0.508103454920327
MAE on log scale:  0.3812680345591399
MAE on original $ scale:  184963.9127876768


In [6]:
# XGBoost
filename = 'models/soldprice/data_models/mf_price_zillow-remarks_XGBoost.pkl'
params = {
    'max_depth':range(11,19,2),
    'reg_lambda':[10, 100, 1000],
    'n_estimators':[256, 512, 1024],
    'gamma': [0.1, 0.2, 0.3]
}

print("XGBoost model: ")
# t0 = time.time()

# # model
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.1, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=nan, n_estimators=1024,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=0, reg_lambda=100, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.9505125076788117
MAE on log scale:  0.11508254165735947
MAE on original $ scale:  58425.09039159831
----- Validation scores -----
R2 on log scale:  0.8570005583944709
MAE on log scale:  0.18749794501050573
MAE on original $ scale:  91700.97600740132


In [7]:
# Light GBM
filename = 'models/soldprice/data_models/mf_price_zillow-remarks_LGBM.pkl'
params = {'num_leaves': [15, 30, 100], 
          'n_estimators':[128, 256, 512],
          'reg_lambda': [10, 100, 500]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=256,
       n_jobs=-1, num_leaves=30, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=100, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.9197747384894404
MAE on log scale:  0.13467552790882598
MAE on original $ scale:  68230.39772788587
----- Validation scores -----
R2 on log scale:  0.8605809576883381
MAE on log scale:  0.18687709526328
MAE on original $ scale:  93666.54603681735


### Test-Train Split, Response = 'DOM'

In [8]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [9]:
# Ridge
filename = 'models/dom/data_models/mf_dom_zillow-remarks_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  1.5
----- Training scores -----
R2 on log scale:  0.05102712155595335
MAE on log scale:  0.20273738433744393
MAE on original $ scale:  0.7442335973806892
----- Validation scores -----
R2 on log scale:  0.05741820025684541
MAE on log scale:  0.20381250432600204
MAE on original $ scale:  0.7487210419393997


In [10]:
# XGBoost
filename = 'models/dom/data_models/mf_dom_zillow-remarks_XGBoost.pkl'
params = {
    'max_depth':[7, 9, 11, 13],
    'reg_lambda':[1e2, 1e3, 1e4],
    'n_estimators':[128, 256, 512],
    'gamma': [0.1, 0.2, 0.3]
}

print("XGBoost model: ")
# t0 = time.time()

# # model
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

XGBoost model: 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=nan, n_estimators=256,
       n_jobs=1, nthread=None, objective='reg:linear', random_seed=9001,
       random_state=0, reg_alpha=0, reg_lambda=1000.0, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
----- Training scores -----
R2 on log scale:  0.181442749951906
MAE on log scale:  0.188024205563013
MAE on original $ scale:  0.6905133609703671
----- Validation scores -----
R2 on log scale:  0.10943112872434158
MAE on log scale:  0.1977497892944632
MAE on original $ scale:  0.7263210134623477


In [11]:
# Light GBM
filename = 'models/dom/data_models/mf_dom_zillow-remarks_LGBM.pkl'
params = {'num_leaves': [5, 15, 30], 
          'max_depth': [-1, 4, 8, 16],
          'n_estimators':[16, 32, 64, 128],
          'reg_lambda': [10, 100, 1000]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=8, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=64,
       n_jobs=-1, num_leaves=15, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=100, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.1755450084919049
MAE on log scale:  0.18877253264569727
MAE on original $ scale:  0.692890830578461
----- Validation scores -----
R2 on log scale:  0.1077989487199289
MAE on log scale:  0.19831213696285724
MAE on original $ scale:  0.7284938625887678
