In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

import keras
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD, Adam
from keras.models import load_model

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle
import re
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import src.utils as utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Condos

In [2]:
# Condos: load the train/test feature tables and build the per-source
# feature dictionaries consumed by the condo evaluation cells below.
response_col = 'DOM'
val_size = 0.1
random_state = 9001

train_data_filename = 'data/features/TRAIN_CON.pkl'
test_data_filename = 'data/features/TEST_CON.pkl'

# Read both feature tables; the test read passes print_info=False.
train_df = utils.read_preprocess_df(train_data_filename, response_col=response_col)
test_df = utils.read_preprocess_df(
    test_data_filename,
    response_col=response_col,
    print_info=False,
)

# NOTE(review): the test frame is passed before the train frame, and val_size
# is forwarded even though this notebook scores the test set -- confirm both
# against utils.process_test_data.
X_train_dict, X_test_dict, y_train, y_test = utils.process_test_data(
    test_df, train_df, response_col, val_size, random_state
)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of test samples:  3563


## Zillow 

In [3]:
# Condos / Zillow features only.
train_features = X_train_dict['zillow']
test_features = X_test_dict['zillow']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
model_filenames = [
    'models/dom/data_models/condo_dom_zillow_ridge.pkl',
    'models/dom/data_models/condo_dom_zillow_XGBoost.pkl',
    'models/dom/data_models/condo_dom_zillow_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/condo_dom_zillow.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.3398477624325116
MAE on log log scale:  0.16781279992242848
MAE on original scale:  0.5956409192537575
----- Test scores -----
R2 on log log scale:  -0.028729719213153704
MAE on log log scale:  0.21744430375541943
MAE on original scale:  0.7703313062629066


## Zillow + Redfin

In [4]:
# Condos / Zillow + Redfin features.
train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
model_filenames = [
    'models/dom/data_models/condo_dom_zillow-redfin_ridge.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin_XGBoost.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.5286715397347033
MAE on log log scale:  0.14139525205668996
MAE on original scale:  0.5019954432545306
----- Test scores -----
R2 on log log scale:  -0.014445607769800528
MAE on log log scale:  0.2159311428438548
MAE on original scale:  0.7653476137033777


## Zillow + Redfin / Images

In [5]:
# Condos / Zillow + Redfin tabular models combined with image models.

# --- Tabular (Zillow + Redfin) base models ---
model_filenames = [
    'models/dom/data_models/condo_dom_zillow-redfin_ridge.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin_XGBoost.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilred = utils.generate_ensemble_features(model_filenames, test_features)

# --- Image base models (ridge, LGBM and a Keras .h5 network) ---
img_model_filenames = [
    'models/dom/image_models/condo_dom_img_ridge.pkl',
    'models/dom/image_models/condo_dom_img_LGBM.pkl',
    'models/dom/image_models/condo_dom_img_nn.h5',
]
img_ridge, img_lgbm, img_nn = img_model_filenames

# train_features / test_features are rebound to the image features from here on.
train_features = X_train_dict['img']
test_features = X_test_dict['img']
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)

# Column-wise join of the two feature groups.
# NOTE(review): assumes both frames share the same row index -- confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilred, X_test_img], axis=1)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin_img.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7484467384102397
MAE on log log scale:  0.09753184158898864
MAE on original scale:  0.35225254933680317
----- Test scores -----
R2 on log log scale:  0.031166037106677313
MAE on log log scale:  0.2119237375683688
MAE on original scale:  0.7510986413677274


## Zillow + Remarks

In [6]:
# Condos / Zillow + Remarks features.
train_features = X_train_dict['zillow_remarks']
test_features = X_test_dict['zillow_remarks']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
model_filenames = [
    'models/dom/data_models/condo_dom_zillow-remarks_ridge.pkl',
    'models/dom/data_models/condo_dom_zillow-remarks_XGBoost.pkl',
    'models/dom/data_models/condo_dom_zillow-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/condo_dom_zillow-remarks.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.5069212335582984
MAE on log log scale:  0.14356732597774535
MAE on original scale:  0.5085166266604516
----- Test scores -----
R2 on log log scale:  0.020320638705354388
MAE on log log scale:  0.20958950332203907
MAE on original scale:  0.7431651858413999


## Zillow + Redfin + Remarks

In [7]:
# Condos / Zillow + Redfin + Remarks features.
train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
model_filenames = [
    'models/dom/data_models/condo_dom_zillow-redfin-remarks_ridge.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin-remarks_XGBoost.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin-remarks.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.5859737985531802
MAE on log log scale:  0.13104493870701325
MAE on original scale:  0.4642343678928489
----- Test scores -----
R2 on log log scale:  0.06423103744671577
MAE on log log scale:  0.20538882051288931
MAE on original scale:  0.7278025406301263


## Zillow + Redfin + Remarks / Images

In [8]:
# Condos / Zillow + Redfin + Remarks tabular models combined with image models.

# --- Tabular (Zillow + Redfin + Remarks) base models ---
model_filenames = [
    'models/dom/data_models/condo_dom_zillow-redfin-remarks_ridge.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin-remarks_XGBoost.pkl',
    'models/dom/data_models/condo_dom_zillow-redfin-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']
X_train_zilredrmk = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilredrmk = utils.generate_ensemble_features(model_filenames, test_features)

# --- Image base models (ridge, LGBM and a Keras .h5 network) ---
img_model_filenames = [
    'models/dom/image_models/condo_dom_img_ridge.pkl',
    'models/dom/image_models/condo_dom_img_LGBM.pkl',
    'models/dom/image_models/condo_dom_img_nn.h5',
]
img_ridge, img_lgbm, img_nn = img_model_filenames

# train_features / test_features are rebound to the image features from here on.
train_features = X_train_dict['img']
test_features = X_test_dict['img']
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)

# Column-wise join of the two feature groups.
# NOTE(review): assumes both frames share the same row index -- confirm.
new_train_features = pd.concat([X_train_zilredrmk, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilredrmk, X_test_img], axis=1)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/condo_dom_zillow-redfin-remarks_img.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7631986879665615
MAE on log log scale:  0.09505704770325671
MAE on original scale:  0.34224102371289283
----- Test scores -----
R2 on log log scale:  0.06130265742469787
MAE on log log scale:  0.20742324349228675
MAE on original scale:  0.7348165424580724


# Multi Families

In [9]:
# Multi-family homes: load the train/test feature tables and build the
# per-source feature dictionaries consumed by the multi-family cells below.
# This rebinds train_df, test_df, X_train_dict, X_test_dict, y_train and
# y_test, replacing the condo data loaded earlier in the notebook.
response_col = 'DOM'
val_size = 0.1
random_state = 9001

train_data_filename = 'data/features/TRAIN_MF.pkl'
test_data_filename = 'data/features/TEST_MF.pkl'

# Read both feature tables; the test read passes print_info=False.
train_df = utils.read_preprocess_df(train_data_filename, response_col=response_col)
test_df = utils.read_preprocess_df(
    test_data_filename,
    response_col=response_col,
    print_info=False,
)

# NOTE(review): the test frame is passed before the train frame, and val_size
# is forwarded even though this notebook scores the test set -- confirm both
# against utils.process_test_data.
X_train_dict, X_test_dict, y_train, y_test = utils.process_test_data(
    test_df, train_df, response_col, val_size, random_state
)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of test samples:  1324


## Zillow

In [10]:
# Multi-family / Zillow features only.
train_features = X_train_dict['zillow']
test_features = X_test_dict['zillow']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
model_filenames = [
    'models/dom/data_models/mf_dom_zillow_ridge.pkl',
    'models/dom/data_models/mf_dom_zillow_XGBoost.pkl',
    'models/dom/data_models/mf_dom_zillow_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/mf_dom_zillow.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.13444073263721212
MAE on log log scale:  0.19368852352255275
MAE on original scale:  0.7096305342863652
----- Test scores -----
R2 on log log scale:  -0.053182911076012696
MAE on log log scale:  0.19060219534339531
MAE on original scale:  0.6878817885697619


## Zillow + Redfin

In [11]:
# Multi-family / Zillow + Redfin features.
train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
model_filenames = [
    'models/dom/data_models/mf_dom_zillow-redfin_ridge.pkl',
    'models/dom/data_models/mf_dom_zillow-redfin_XGBoost.pkl',
    'models/dom/data_models/mf_dom_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.20592303571762505
MAE on log log scale:  0.18463802810563487
MAE on original scale:  0.6753947697595227
----- Test scores -----
R2 on log log scale:  -0.13146675608212166
MAE on log log scale:  0.1978789690549227
MAE on original scale:  0.7201404880514913


## Zillow + Redfin / Images

In [12]:
# Multi-family / Zillow + Redfin tabular models combined with image models.

# --- Tabular (Zillow + Redfin) base models ---
model_filenames = [
    'models/dom/data_models/mf_dom_zillow-redfin_ridge.pkl',
    'models/dom/data_models/mf_dom_zillow-redfin_XGBoost.pkl',
    'models/dom/data_models/mf_dom_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilred = utils.generate_ensemble_features(model_filenames, test_features)

# --- Image base models (ridge, LGBM and a Keras .h5 network) ---
img_model_filenames = [
    'models/dom/image_models/mf_dom_img_ridge.pkl',
    'models/dom/image_models/mf_dom_img_LGBM.pkl',
    'models/dom/image_models/mf_dom_img_nn.h5',
]
img_ridge, img_lgbm, img_nn = img_model_filenames

# train_features / test_features are rebound to the image features from here on.
train_features = X_train_dict['img']
test_features = X_test_dict['img']
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)

# Column-wise join of the two feature groups.
# NOTE(review): assumes both frames share the same row index -- confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilred, X_test_img], axis=1)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin_img.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7527477597380803
MAE on log log scale:  0.08718618176543402
MAE on original scale:  0.3254538419079815
----- Test scores -----
R2 on log log scale:  -0.13327521722096836
MAE on log log scale:  0.19828243644286822
MAE on original scale:  0.712999517913087


## Zillow + Remarks

In [13]:
# Multi-family / Zillow + Remarks features.
train_features = X_train_dict['zillow_remarks']
test_features = X_test_dict['zillow_remarks']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
model_filenames = [
    'models/dom/data_models/mf_dom_zillow-remarks_ridge.pkl',
    'models/dom/data_models/mf_dom_zillow-remarks_XGBoost.pkl',
    'models/dom/data_models/mf_dom_zillow-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/mf_dom_zillow-remarks.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.21940038250998217
MAE on log log scale:  0.18338606488112816
MAE on original scale:  0.6718662450228235
----- Test scores -----
R2 on log log scale:  -0.08691592920004432
MAE on log log scale:  0.19279030221937335
MAE on original scale:  0.6968939172695326


## Zillow + Redfin + Remarks

In [14]:
# Multi-family / Zillow + Redfin + Remarks features.
train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']

# Saved base-model files (ridge, XGBoost, LGBM) for this feature set.
ridge = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/dom/data_models/mf_dom_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Ensemble inputs derived from the base models (presumably their stacked
# predictions -- confirm in utils.generate_ensemble_features).
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# Load the saved ensemble model and report train/test scores.
# val_set='Test' is passed like every other evaluation cell in this notebook:
# X_val / y_val here are the held-out test split, and without the argument
# the report is labelled "Validation scores".
ensemble_model_filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin-remarks.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.3557079085547513
MAE on log log scale:  0.16619164440274772
MAE on original scale:  0.6092872011232693
----- Validation scores -----
R2 on log log scale:  -0.18389444949267975
MAE on log log scale:  0.20209471594920647
MAE on original scale:  0.7342074219443181


## Zillow + Redfin + Remarks / Images

In [15]:
# Multi-family / Zillow + Redfin + Remarks tabular models combined with image models.

# --- Tabular (Zillow + Redfin + Remarks) base models ---
model_filenames = [
    'models/dom/data_models/mf_dom_zillow-redfin-remarks_ridge.pkl',
    'models/dom/data_models/mf_dom_zillow-redfin-remarks_XGBoost.pkl',
    'models/dom/data_models/mf_dom_zillow-redfin-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']
X_train_zilredrmk = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilredrmk = utils.generate_ensemble_features(model_filenames, test_features)

# --- Image base models (ridge, LGBM and a Keras .h5 network) ---
img_model_filenames = [
    'models/dom/image_models/mf_dom_img_ridge.pkl',
    'models/dom/image_models/mf_dom_img_LGBM.pkl',
    'models/dom/image_models/mf_dom_img_nn.h5',
]
img_ridge, img_lgbm, img_nn = img_model_filenames

# train_features / test_features are rebound to the image features from here on.
train_features = X_train_dict['img']
test_features = X_test_dict['img']
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)

# Column-wise join of the two feature groups.
# NOTE(review): assumes both frames share the same row index -- confirm.
new_train_features = pd.concat([X_train_zilredrmk, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilredrmk, X_test_img], axis=1)

# Load the saved ensemble model and report train/test scores.
ensemble_model_filename = 'models/dom/ensemble_models/mf_dom_zillow-redfin-remarks_img.pkl'
utils.load_eval_model(
    filename=ensemble_model_filename,
    X_train=new_train_features,
    X_val=new_test_features,
    y_train=y_train,
    y_val=y_test,
    val_set='Test',
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log log scale:  0.7670185198979312
MAE on log log scale:  0.08698988809583881
MAE on original scale:  0.3238160709424381
----- Test scores -----
R2 on log log scale:  -0.11397505989913514
MAE on log log scale:  0.19670349969553405
MAE on original scale:  0.707627610950544
