In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

import keras
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD, Adam
from keras.models import load_model

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle
import re
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import src.utils as utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Condos

In [2]:
# Condos
# Load the condo train/test feature tables and build the feature dictionaries
# and targets that the condo evaluation cells read from.
train_data_filename = 'data/features/TRAIN_CON.pkl'
test_data_filename = 'data/features/TEST_CON.pkl'

# Name of the target column, plus two settings that are forwarded unchanged to
# utils.process_test_data below.
response_col = 'SOLDPRICE'
val_size = 0.1
random_state = 9001

# read in data
# print_info=False is passed only for the test frame -- presumably it suppresses
# the summary printout so it is shown once; confirm in src/utils.
train_df = utils.read_preprocess_df(train_data_filename, response_col=response_col)
test_df = utils.read_preprocess_df(test_data_filename, response_col=response_col, print_info=False)

# process test data
# Note the argument order: the test frame is passed first, then the train frame.
# The two returned dicts are indexed by feature-set name in the cells that follow
# (e.g. 'zillow', 'zillow_redfin', 'img'); y_train / y_test are the targets.
# NOTE(review): the recorded scores are labelled "log scale", so the targets are
# presumably log-transformed inside src/utils -- confirm.
X_train_dict, X_test_dict, y_train, y_test = utils.process_test_data(test_df, train_df, response_col, val_size, random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of test samples:  3563


## Zillow 

In [3]:
# zillow models
# Evaluate the saved condo ensemble built on the Zillow-only feature set.
train_features = X_train_dict['zillow']
test_features = X_test_dict['zillow']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/condo_price_zillow_ridge.pkl'
xgboost = 'models/soldprice/data_models/condo_price_zillow_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/condo_price_zillow_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/condo_price_zillow.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9770709789938143
MAE on log scale:  0.08293147045887168
MAE on original scale:  39276.541893723486
----- Test scores -----
R2 on log scale:  0.8844218927870328
MAE on log scale:  0.1590807113964601
MAE on original scale:  77875.50272889803


## Zillow + Redfin

In [4]:
# zillow + redfin models
# Evaluate the saved condo ensemble built on the Zillow + Redfin feature set.
train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/condo_price_zillow-redfin_ridge.pkl'
xgboost = 'models/soldprice/data_models/condo_price_zillow-redfin_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/condo_price_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9872015695369737
MAE on log scale:  0.056293575670782
MAE on original scale:  26143.74749291712
----- Test scores -----
R2 on log scale:  0.8929908587808036
MAE on log scale:  0.14891410013422246
MAE on original scale:  72815.13313015123


## Zillow + Redfin / Images

In [5]:
# zillow + redfin models, combined with image models
# Evaluate the saved condo ensemble that stacks the Zillow + Redfin base models
# together with the image base models.
train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']

# model file names (saved tabular base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/condo_price_zillow-redfin_ridge.pkl'
xgboost = 'models/soldprice/data_models/condo_price_zillow-redfin_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/condo_price_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Ensemble inputs derived from the tabular base models.
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilred = utils.generate_ensemble_features(model_filenames, test_features)

# image models
# train_features / test_features are rebound here to the image features; the
# tabular features above are no longer reachable through these names.
train_features = X_train_dict['img']
test_features = X_test_dict['img']

# model file names (saved image base models; the .h5 file is presumably a
# Keras network -- confirm how src/utils loads it)
img_ridge = 'models/soldprice/image_models/condo_price_img_ridge.pkl'
img_lgbm = 'models/soldprice/image_models/condo_price_img_LGBM.pkl'
img_nn = 'models/soldprice/image_models/condo_price_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# Ensemble inputs derived from the image base models.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)

# Place the tabular and image ensemble inputs side by side (column-wise).
# pd.concat with axis=1 aligns rows on the index, so this assumes both pieces
# carry the same index -- TODO confirm, otherwise NaN rows are introduced.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilred, X_test_img], axis=1)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin_img.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.987675306965182
MAE on log scale:  0.05472775981973835
MAE on original scale:  25826.53708718056
----- Test scores -----
R2 on log scale:  0.898236503535547
MAE on log scale:  0.14382700978161156
MAE on original scale:  72541.2937249036


## Zillow + Remarks

In [6]:
# zillow + remarks models
# Evaluate the saved condo ensemble built on the Zillow + remarks feature set.
train_features = X_train_dict['zillow_remarks']
test_features = X_test_dict['zillow_remarks']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/condo_price_zillow-remarks_ridge.pkl'
xgboost = 'models/soldprice/data_models/condo_price_zillow-remarks_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/condo_price_zillow-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/condo_price_zillow-remarks.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9902656313300511
MAE on log scale:  0.04789157698570431
MAE on original scale:  22882.34761821519
----- Test scores -----
R2 on log scale:  0.8985446277354523
MAE on log scale:  0.15185953583416362
MAE on original scale:  73848.7471576203


## Zillow + Redfin + Remarks

In [7]:
# zillow + redfin + remarks models
# Evaluate the saved condo ensemble built on the Zillow + Redfin + remarks
# feature set.
train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin-remarks.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9894383123318782
MAE on log scale:  0.05804396351855273
MAE on original scale:  27769.000150056254
----- Test scores -----
R2 on log scale:  0.9066367346009893
MAE on log scale:  0.14113818614082488
MAE on original scale:  69197.37831905279


## Zillow + Redfin + Remarks / Images

In [8]:
# zillow + redfin + remarks models, combined with image models
# Evaluate the saved condo ensemble that stacks the Zillow + Redfin + remarks
# base models together with the image base models.
train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']

# model file names (saved tabular base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/condo_price_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Ensemble inputs derived from the tabular base models.
X_train_zilredrmk = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilredrmk = utils.generate_ensemble_features(model_filenames, test_features)

# image models
# train_features / test_features are rebound here to the image features; the
# tabular features above are no longer reachable through these names.
train_features = X_train_dict['img']
test_features = X_test_dict['img']

# model file names (saved image base models; the .h5 file is presumably a
# Keras network -- confirm how src/utils loads it)
img_ridge = 'models/soldprice/image_models/condo_price_img_ridge.pkl'
img_lgbm = 'models/soldprice/image_models/condo_price_img_LGBM.pkl'
img_nn = 'models/soldprice/image_models/condo_price_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# Ensemble inputs derived from the image base models.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)


# Place the tabular and image ensemble inputs side by side (column-wise).
# pd.concat with axis=1 aligns rows on the index, so this assumes both pieces
# carry the same index -- TODO confirm, otherwise NaN rows are introduced.
new_train_features = pd.concat([X_train_zilredrmk, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilredrmk, X_test_img], axis=1)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin-remarks_img.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9899918233287566
MAE on log scale:  0.05631835852917399
MAE on original scale:  27492.519217898032
----- Test scores -----
R2 on log scale:  0.9105807706747923
MAE on log scale:  0.13701467606278672
MAE on original scale:  68900.19438034944


# Multi Families

In [9]:
# MF (multi-family)
# Load the multi-family train/test feature tables and build the feature
# dictionaries and targets that the multi-family evaluation cells read from.
# This rebinds train_df, test_df, X_train_dict, X_test_dict, y_train and y_test,
# so the condo data loaded earlier in the notebook is replaced from here on.
train_data_filename = 'data/features/TRAIN_MF.pkl'
test_data_filename = 'data/features/TEST_MF.pkl'

# Name of the target column, plus two settings that are forwarded unchanged to
# utils.process_test_data below.
response_col = 'SOLDPRICE'
val_size = 0.1
random_state = 9001

# read in data
# print_info=False is passed only for the test frame -- presumably it suppresses
# the summary printout so it is shown once; confirm in src/utils.
train_df = utils.read_preprocess_df(train_data_filename, response_col=response_col)
test_df = utils.read_preprocess_df(test_data_filename, response_col=response_col, print_info=False)

# process test data
# Note the argument order: the test frame is passed first, then the train frame.
# The two returned dicts are indexed by feature-set name in the cells that follow
# (e.g. 'zillow', 'zillow_redfin', 'img'); y_train / y_test are the targets.
X_train_dict, X_test_dict, y_train, y_test = utils.process_test_data(test_df, train_df, response_col, val_size, random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of test samples:  1324


## Zillow

In [10]:
# zillow models
# Evaluate the saved multi-family ensemble built on the Zillow-only feature set.
train_features = X_train_dict['zillow']
test_features = X_test_dict['zillow']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/mf_price_zillow_ridge.pkl'
xgboost = 'models/soldprice/data_models/mf_price_zillow_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/mf_price_zillow_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/mf_price_zillow.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9027004422624682
MAE on log scale:  0.17135323267619057
MAE on original scale:  75952.14815279737
----- Test scores -----
R2 on log scale:  0.7182566276311096
MAE on log scale:  0.26008178601128484
MAE on original scale:  101126.66360186638


## Zillow + Redfin

In [11]:
# zillow + redfin models
# Evaluate the saved multi-family ensemble built on the Zillow + Redfin
# feature set.
train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/mf_price_zillow-redfin_ridge.pkl'
xgboost = 'models/soldprice/data_models/mf_price_zillow-redfin_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/mf_price_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.930253658634392
MAE on log scale:  0.1456717206921096
MAE on original scale:  66506.33034253815
----- Test scores -----
R2 on log scale:  0.7357555377976674
MAE on log scale:  0.2518097315758395
MAE on original scale:  97230.56782923614


## Zillow + Redfin / Images

In [12]:
# zillow + redfin models, combined with image models
# Evaluate the saved multi-family ensemble that stacks the Zillow + Redfin base
# models together with the image base models.
train_features = X_train_dict['zillow_redfin']
test_features = X_test_dict['zillow_redfin']

# model file names (saved tabular base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/mf_price_zillow-redfin_ridge.pkl'
xgboost = 'models/soldprice/data_models/mf_price_zillow-redfin_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/mf_price_zillow-redfin_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Ensemble inputs derived from the tabular base models.
X_train_zilred = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilred = utils.generate_ensemble_features(model_filenames, test_features)

# image models
# train_features / test_features are rebound here to the image features; the
# tabular features above are no longer reachable through these names.
train_features = X_train_dict['img']
test_features = X_test_dict['img']

# model file names (saved image base models; the .h5 file is presumably a
# Keras network -- confirm how src/utils loads it)
img_ridge = 'models/soldprice/image_models/mf_price_img_ridge.pkl'
img_lgbm = 'models/soldprice/image_models/mf_price_img_LGBM.pkl'
img_nn = 'models/soldprice/image_models/mf_price_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# Ensemble inputs derived from the image base models.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)

# Place the tabular and image ensemble inputs side by side (column-wise).
# pd.concat with axis=1 aligns rows on the index, so this assumes both pieces
# carry the same index -- TODO confirm, otherwise NaN rows are introduced.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilred, X_test_img], axis=1)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin_img.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9471486824675418
MAE on log scale:  0.11704951196902373
MAE on original scale:  56183.38219709972
----- Test scores -----
R2 on log scale:  0.7239930341074621
MAE on log scale:  0.2610282107065534
MAE on original scale:  116158.35625683986


## Zillow + Remarks

In [13]:
# zillow + remarks models
# Evaluate the saved multi-family ensemble built on the Zillow + remarks
# feature set.
train_features = X_train_dict['zillow_remarks']
test_features = X_test_dict['zillow_remarks']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/mf_price_zillow-remarks_ridge.pkl'
xgboost = 'models/soldprice/data_models/mf_price_zillow-remarks_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/mf_price_zillow-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/mf_price_zillow-remarks.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9564145854782279
MAE on log scale:  0.11450561702492648
MAE on original scale:  57115.794783186495
----- Test scores -----
R2 on log scale:  0.8008042256893431
MAE on log scale:  0.21934103594190799
MAE on original scale:  94264.0503251318


## Zillow + Redfin + Remarks

In [14]:
# zillow + redfin + remarks models
# Evaluate the saved multi-family ensemble built on the Zillow + Redfin +
# remarks feature set.
train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']

# model file names (saved base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Turn the raw features into ensemble inputs using the saved base models.
# Presumably one column of predictions per base model -- confirm in src/utils.
new_train_features = utils.generate_ensemble_features(model_filenames, train_features)
new_test_features = utils.generate_ensemble_features(model_filenames, test_features)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin-remarks.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9613549877512337
MAE on log scale:  0.10687082950530435
MAE on original scale:  53881.29413289305
----- Test scores -----
R2 on log scale:  0.8034245895854857
MAE on log scale:  0.21651884141599156
MAE on original scale:  93034.05277888056


## Zillow + Redfin + Remarks / Images

In [15]:
# zillow + redfin + remarks models, combined with image models
# Evaluate the saved multi-family ensemble that stacks the Zillow + Redfin +
# remarks base models together with the image base models.
train_features = X_train_dict['zillow_redfin_remarks']
test_features = X_test_dict['zillow_redfin_remarks']

# model file names (saved tabular base models: ridge, XGBoost, LightGBM)
ridge = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Ensemble inputs derived from the tabular base models.
X_train_zilredrmk = utils.generate_ensemble_features(model_filenames, train_features)
X_test_zilredrmk = utils.generate_ensemble_features(model_filenames, test_features)

# image models
# train_features / test_features are rebound here to the image features; the
# tabular features above are no longer reachable through these names.
train_features = X_train_dict['img']
test_features = X_test_dict['img']

# model file names (saved image base models; the .h5 file is presumably a
# Keras network -- confirm how src/utils loads it)
img_ridge = 'models/soldprice/image_models/mf_price_img_ridge.pkl'
img_lgbm = 'models/soldprice/image_models/mf_price_img_LGBM.pkl'
img_nn = 'models/soldprice/image_models/mf_price_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

# Ensemble inputs derived from the image base models.
X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_test_img = utils.generate_ensemble_features(img_model_filenames, test_features)


# Place the tabular and image ensemble inputs side by side (column-wise).
# pd.concat with axis=1 aligns rows on the index, so this assumes both pieces
# carry the same index -- TODO confirm, otherwise NaN rows are introduced.
new_train_features = pd.concat([X_train_zilredrmk, X_train_img], axis=1)
new_test_features = pd.concat([X_test_zilredrmk, X_test_img], axis=1)

# load saved model and evaluate model performance
# The held-out test set is passed through the X_val / y_val parameters;
# val_set='Test' is presumably just the label used in the printed report.
ensemble_model_filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin-remarks_img.pkl'
utils.load_eval_model(filename=ensemble_model_filename, X_train=new_train_features, X_val=new_test_features, y_train=y_train, y_val=y_test, val_set='Test')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9668761916169364
MAE on log scale:  0.09550969021625136
MAE on original scale:  47907.29883272742
----- Test scores -----
R2 on log scale:  0.8000592542274705
MAE on log scale:  0.217104449965957
MAE on original scale:  100540.5263474181
