In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

import keras
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD, Adam
from keras.models import load_model

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle
import re
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import src.utils as utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Condos

In [2]:
# Path of the condo feature table (.pkl) that utils.read_preprocess_df reads for this section.
data_filename = 'data/features/CON_feats_remarks.pkl'

## Response = 'DOM'

In [3]:
# Settings for this section: the response column and the hold-out split
# parameters handed to utils.split_normalize_df below.
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# Read the feature table and preprocess it for the chosen response column.
# NOTE(review): the preprocessing steps live in src/utils.py and are not visible here.
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# Train/validation split. The two X_* results are indexed by feature-set name
# ('zillow', 'zillow_redfin', 'zillow_remarks', 'zillow_redfin_remarks', 'img').
# Presumably the features are also normalized here (per the helper's name) -- TODO confirm.
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


## Zillow 

In [4]:
# Condo / Zillow-only feature set for the train and validation splits.
train_features = X_train_dict['zillow']
val_features = X_val_dict['zillow']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/condo_dom_zillow_ridge.pkl'
xgboost = 'models/dom/data_models/condo_dom_zillow_XGBoost.pkl'
lgbm = 'models/dom/data_models/condo_dom_zillow_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [5]:
# Output path of the fitted ensemble model for condo / Zillow features.
filename = 'models/dom/ensemble_models/condo_dom_zillow.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.3398477624325116
MAE on log log scale:  0.16781279992242848
MAE on original scale:  0.5956409192537575
----- Validation scores -----
R2 on log log scale:  0.12479274315185596
MAE on log log scale:  0.19472271612503908
MAE on original scale:  0.6914220932897549


## Zillow + Redfin

In [7]:
# Condo / Zillow + Redfin feature set for the train and validation splits.
train_features = X_train_dict['zillow_redfin']
val_features = X_val_dict['zillow_redfin']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/condo_dom_zillow-redfin_ridge.pkl'
xgboost = 'models/dom/data_models/condo_dom_zillow-redfin_XGBoost.pkl'
lgbm = 'models/dom/data_models/condo_dom_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [8]:
# Output path of the fitted ensemble model for condo / Zillow + Redfin features.
filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.5286715397347033
MAE on log log scale:  0.14139525205668996
MAE on original scale:  0.5019954432545306
----- Validation scores -----
R2 on log log scale:  0.1076578708560667
MAE on log log scale:  0.19691778911275706
MAE on original scale:  0.6965939908721506


## Zillow + Redfin / Images

In [9]:
# Part 1: Zillow + Redfin base models (same feature set and model files as the
# plain "Zillow + Redfin" section; the ensemble features are recomputed here).
train_features = X_train_dict['zillow_redfin']
val_features = X_val_dict['zillow_redfin']

# File names of the saved tabular base models (ridge, XGBoost, LightGBM).
ridge = 'models/dom/data_models/condo_dom_zillow-redfin_ridge.pkl'
xgboost = 'models/dom/data_models/condo_dom_zillow-redfin_XGBoost.pkl'
lgbm = 'models/dom/data_models/condo_dom_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Despite the "zilred" suffix, these simply hold the tabular-model ensemble features.
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_val_zilred = utils.generate_ensemble_features(model_filenames, val_features)

# Part 2: image base models. train_features / val_features are rebound to the
# image feature set from here on.
train_features = X_train_dict['img']
val_features = X_val_dict['img']

# File names of the saved image base models: ridge and LightGBM pickles plus a
# neural network stored as .h5.
img_ridge = 'models/dom/image_models/condo_dom_img_ridge.pkl'
img_lgbm = 'models/dom/image_models/condo_dom_img_LGBM.pkl'
img_nn = 'models/dom/image_models/condo_dom_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# NOTE(review): the helper receives both .pkl and .h5 files here; presumably it
# picks the loader from the file extension -- TODO confirm against src/utils.py.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_val_img = utils.generate_ensemble_features(img_model_filenames, val_features)

# Put the tabular-model and image-model features side by side (column-wise).
# NOTE(review): pd.concat(axis=1) aligns rows on index labels, so this assumes
# both inputs carry the same index -- TODO confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilred, X_val_img], axis=1)

In [10]:
# Output path of the fitted ensemble model for condo / Zillow + Redfin plus image models.
filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin_img.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
# NOTE(review): the recorded run shows training R2 of about 0.75 against validation
# R2 of about 0.11. If the base models were fit on these same training rows, their
# in-sample predictions would inflate the training score; out-of-fold predictions
# may be worth considering -- TODO confirm how the base models were trained.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7484467384102397
MAE on log log scale:  0.09753184158898864
MAE on original scale:  0.35225254933680317
----- Validation scores -----
R2 on log log scale:  0.10560300852408289
MAE on log log scale:  0.19980440832900387
MAE on original scale:  0.7106761729615927


## Zillow + Remarks

In [12]:
# Condo / Zillow + remarks feature set for the train and validation splits.
train_features = X_train_dict['zillow_remarks']
val_features = X_val_dict['zillow_remarks']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/condo_dom_zillow-remarks_ridge.pkl'
xgboost = 'models/dom/data_models/condo_dom_zillow-remarks_XGBoost.pkl'
lgbm = 'models/dom/data_models/condo_dom_zillow-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [13]:
# Output path of the fitted ensemble model for condo / Zillow + remarks features.
filename = 'models/dom/ensemble_models/condo_dom_zillow-remarks.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.5069212335582984
MAE on log log scale:  0.14356732597774535
MAE on original scale:  0.5085166266604516
----- Validation scores -----
R2 on log log scale:  0.1268709995305034
MAE on log log scale:  0.19376574390002899
MAE on original scale:  0.6872967528904693


## Zillow + Redfin + Remarks

In [14]:
# Condo / Zillow + Redfin + remarks feature set for the train and validation splits.
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [15]:
# Output path of the fitted ensemble model for condo / Zillow + Redfin + remarks features.
filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin-remarks.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.5859737985531802
MAE on log log scale:  0.13104493870701325
MAE on original scale:  0.4642343678928489
----- Validation scores -----
R2 on log log scale:  0.16534761244044172
MAE on log log scale:  0.18954181203241038
MAE on original scale:  0.6725467198100459


## Zillow + Redfin + Remarks / Images

In [16]:
# Part 1: Zillow + Redfin + remarks base models (same feature set and model files
# as the plain "Zillow + Redfin + Remarks" section; recomputed here).
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

# File names of the saved tabular base models (ridge, XGBoost, LightGBM).
ridge = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/dom/data_models/condo_dom_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# The "zilred" suffix is a carry-over name: these features come from the
# Zillow + Redfin + remarks models listed above.
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_val_zilred = utils.generate_ensemble_features(model_filenames, val_features)

# Part 2: image base models. train_features / val_features are rebound to the
# image feature set from here on.
train_features = X_train_dict['img']
val_features = X_val_dict['img']

# File names of the saved image base models: ridge and LightGBM pickles plus a
# neural network stored as .h5.
img_ridge = 'models/dom/image_models/condo_dom_img_ridge.pkl'
img_lgbm = 'models/dom/image_models/condo_dom_img_LGBM.pkl'
img_nn = 'models/dom/image_models/condo_dom_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# NOTE(review): the helper receives both .pkl and .h5 files here; presumably it
# picks the loader from the file extension -- TODO confirm against src/utils.py.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_val_img = utils.generate_ensemble_features(img_model_filenames, val_features)

# Put the tabular-model and image-model features side by side (column-wise).
# NOTE(review): pd.concat(axis=1) aligns rows on index labels, so this assumes
# both inputs carry the same index -- TODO confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilred, X_val_img], axis=1)

In [17]:
# Output path of the fitted ensemble model for condo / Zillow + Redfin + remarks plus image models.
filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin-remarks_img.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
# NOTE(review): the recorded run shows training R2 of about 0.76 against validation
# R2 of about 0.14. If the base models were fit on these same training rows, their
# in-sample predictions would inflate the training score; out-of-fold predictions
# may be worth considering -- TODO confirm how the base models were trained.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7631986879665615
MAE on log log scale:  0.09505704770325671
MAE on original scale:  0.34224102371289283
----- Validation scores -----
R2 on log log scale:  0.14359947976190135
MAE on log log scale:  0.1950757426743516
MAE on original scale:  0.6944675427683112


# Multi Families

In [18]:
# Multi-family section: feature table path plus the same response column and
# split parameters as the condo section. These rebind the condo-section globals.
data_filename = 'data/features/MF_feats_remarks.pkl'
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# Read the feature table and preprocess it for the chosen response column.
# NOTE(review): the preprocessing steps live in src/utils.py and are not visible here.
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# Train/validation split. The two X_* results are indexed by feature-set name
# ('zillow', 'zillow_redfin', 'zillow_remarks', 'zillow_redfin_remarks', 'img').
# Presumably the features are also normalized here (per the helper's name) -- TODO confirm.
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


## Zillow

In [19]:
# Multi-family / Zillow-only feature set for the train and validation splits.
train_features = X_train_dict['zillow']
val_features = X_val_dict['zillow']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/mf_dom_zillow_ridge.pkl'
xgboost = 'models/dom/data_models/mf_dom_zillow_XGBoost.pkl'
lgbm = 'models/dom/data_models/mf_dom_zillow_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [20]:
# Output path of the fitted ensemble model for multi-family / Zillow features.
filename = 'models/dom/ensemble_models/mf_dom_zillow.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.13444073263721212
MAE on log log scale:  0.19368852352255275
MAE on original scale:  0.7096305342863652
----- Validation scores -----
R2 on log log scale:  0.08248752570772833
MAE on log log scale:  0.20056313476082568
MAE on original scale:  0.7370553627016123


## Zillow + Redfin

In [21]:
# Multi-family / Zillow + Redfin feature set for the train and validation splits.
train_features = X_train_dict['zillow_redfin']
val_features = X_val_dict['zillow_redfin']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/mf_dom_zillow-redfin_ridge.pkl'
xgboost = 'models/dom/data_models/mf_dom_zillow-redfin_XGBoost.pkl'
lgbm = 'models/dom/data_models/mf_dom_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [22]:
# Output path of the fitted ensemble model for multi-family / Zillow + Redfin features.
filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.20592303571762505
MAE on log log scale:  0.18463802810563487
MAE on original scale:  0.6753947697595227
----- Validation scores -----
R2 on log log scale:  0.05267997456617679
MAE on log log scale:  0.20239027659880218
MAE on original scale:  0.744585674160795


## Zillow + Redfin / Images

In [23]:
# Part 1: Zillow + Redfin base models (same feature set and model files as the
# plain multi-family "Zillow + Redfin" section; recomputed here).
train_features = X_train_dict['zillow_redfin']
val_features = X_val_dict['zillow_redfin']

# File names of the saved tabular base models (ridge, XGBoost, LightGBM).
ridge = 'models/dom/data_models/mf_dom_zillow-redfin_ridge.pkl'
xgboost = 'models/dom/data_models/mf_dom_zillow-redfin_XGBoost.pkl'
lgbm = 'models/dom/data_models/mf_dom_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Despite the "zilred" suffix, these simply hold the tabular-model ensemble features.
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_val_zilred = utils.generate_ensemble_features(model_filenames, val_features)

# Part 2: image base models. train_features / val_features are rebound to the
# image feature set from here on.
train_features = X_train_dict['img']
val_features = X_val_dict['img']

# File names of the saved image base models: ridge and LightGBM pickles plus a
# neural network stored as .h5.
img_ridge = 'models/dom/image_models/mf_dom_img_ridge.pkl'
img_lgbm = 'models/dom/image_models/mf_dom_img_LGBM.pkl'
img_nn = 'models/dom/image_models/mf_dom_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# NOTE(review): the helper receives both .pkl and .h5 files here; presumably it
# picks the loader from the file extension -- TODO confirm against src/utils.py.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_val_img = utils.generate_ensemble_features(img_model_filenames, val_features)

# Put the tabular-model and image-model features side by side (column-wise).
# NOTE(review): pd.concat(axis=1) aligns rows on index labels, so this assumes
# both inputs carry the same index -- TODO confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilred, X_val_img], axis=1)

In [24]:
# Output path of the fitted ensemble model for multi-family / Zillow + Redfin plus image models.
filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin_img.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
# NOTE(review): the recorded run shows training R2 of about 0.75 while validation
# R2 is slightly negative (about -0.03). If the base models were fit on these same
# training rows, their in-sample predictions would inflate the training score;
# out-of-fold predictions may be worth considering -- TODO confirm how the base
# models were trained.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7527477597380803
MAE on log log scale:  0.08718618176543402
MAE on original scale:  0.3254538419079815
----- Validation scores -----
R2 on log log scale:  -0.030647903974755142
MAE on log log scale:  0.21275019922368063
MAE on original scale:  0.7811051112218914


## Zillow + Remarks

In [25]:
# Multi-family / Zillow + remarks feature set for the train and validation splits.
train_features = X_train_dict['zillow_remarks']
val_features = X_val_dict['zillow_remarks']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/mf_dom_zillow-remarks_ridge.pkl'
xgboost = 'models/dom/data_models/mf_dom_zillow-remarks_XGBoost.pkl'
lgbm = 'models/dom/data_models/mf_dom_zillow-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [26]:
# Output path of the fitted ensemble model for multi-family / Zillow + remarks features.
filename = 'models/dom/ensemble_models/mf_dom_zillow-remarks.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.21940038250998217
MAE on log log scale:  0.18338606488112816
MAE on original scale:  0.6718662450228235
----- Validation scores -----
R2 on log log scale:  0.0787563569816625
MAE on log log scale:  0.20061414639803185
MAE on original scale:  0.7355257817276962


## Zillow + Redfin + Remarks

In [27]:
# Multi-family / Zillow + Redfin + remarks feature set for the train and validation splits.
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

# File names of the saved base models (ridge, XGBoost, LightGBM) for this feature set.
ridge = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Build the inputs of the ensemble model from the base models.
# Presumably each saved model is loaded and its predictions on the given
# features form one column -- TODO confirm against src/utils.py.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_val_features = utils.generate_ensemble_features(model_filenames, val_features)

In [28]:
# Output path of the fitted ensemble model for multi-family / Zillow + Redfin + remarks features.
filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin-remarks.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.3557079085547513
MAE on log log scale:  0.16619164440274772
MAE on original scale:  0.6092872011232693
----- Validation scores -----
R2 on log log scale:  0.013582444175546105
MAE on log log scale:  0.20511149043380278
MAE on original scale:  0.7508287517226986


## Zillow + Redfin + Remarks / Images

In [29]:
# Part 1: Zillow + Redfin + remarks base models (same feature set and model files
# as the plain multi-family "Zillow + Redfin + Remarks" section; recomputed here).
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

# File names of the saved tabular base models (ridge, XGBoost, LightGBM).
ridge = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# The "zilred" suffix is a carry-over name: these features come from the
# Zillow + Redfin + remarks models listed above.
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_val_zilred = utils.generate_ensemble_features(model_filenames, val_features)

# Part 2: image base models. train_features / val_features are rebound to the
# image feature set from here on.
train_features = X_train_dict['img']
val_features = X_val_dict['img']

# File names of the saved image base models: ridge and LightGBM pickles plus a
# neural network stored as .h5.
img_ridge = 'models/dom/image_models/mf_dom_img_ridge.pkl'
img_lgbm = 'models/dom/image_models/mf_dom_img_LGBM.pkl'
img_nn = 'models/dom/image_models/mf_dom_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# NOTE(review): the helper receives both .pkl and .h5 files here; presumably it
# picks the loader from the file extension -- TODO confirm against src/utils.py.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_val_img = utils.generate_ensemble_features(img_model_filenames, val_features)

# Put the tabular-model and image-model features side by side (column-wise).
# NOTE(review): pd.concat(axis=1) aligns rows on index labels, so this assumes
# both inputs carry the same index -- TODO confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilred, X_val_img], axis=1)

In [30]:
# Output path of the fitted ensemble model for multi-family / Zillow + Redfin + remarks plus image models.
filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin-remarks_img.pkl'
# The ensemble combiner is an ordinary linear regression with default settings.
model = LinearRegression()

# Train the model on the ensemble training features and save it to `filename`.
utils.train_save_model(model, X=new_train_features, y=y_train, filename=filename)
# Load the saved model and evaluate its performance; per the recorded output this
# prints the model, then R2 and MAE for the training and validation sets.
# NOTE(review): the recorded run shows training R2 of about 0.77 against validation
# R2 of about 0.005. If the base models were fit on these same training rows, their
# in-sample predictions would inflate the training score; out-of-fold predictions
# may be worth considering -- TODO confirm how the base models were trained.
utils.load_eval_model(filename=filename, X_train=new_train_features, X_val=new_val_features, y_train=y_train, y_val=y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7670185198979312
MAE on log log scale:  0.08698988809583881
MAE on original scale:  0.3238160709424381
----- Validation scores -----
R2 on log log scale:  0.005213910972757363
MAE on log log scale:  0.20883064747644875
MAE on original scale:  0.766750166302227
