In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

import keras
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD, Adam
from keras.models import load_model

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle
import re
# NOTE(review): `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23; on newer versions this line raises ImportError and the
# standalone `import joblib` is the replacement. Confirm the pinned version.
from sklearn.externals import joblib
import warnings
# Installs a blanket filter that hides all subsequent Python warnings.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import src.utils as utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Condos

In [2]:
# Condos
# Target column and split settings used for the condo experiments.
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001
data_filename = 'data/features/CON_feats_remarks.pkl'

# Load and preprocess the condo feature table.
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# Train / validation split. The returned X_* objects are dicts keyed by
# feature-set name ('zillow', 'zillow_redfin', 'img', ...), as used below.
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(
    df=df,
    response_col=response_col,
    test_size=test_size,
    random_state=random_state,
)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


## Zillow 

In [3]:
# Zillow-only feature set for both splits.
train_features, val_features = X_train_dict['zillow'], X_val_dict['zillow']

# Saved first-level condo models (ridge, XGBoost, LightGBM) to be stacked.
model_filenames = [
    'models/soldprice/data_models/condo_price_zillow_ridge.pkl',
    'models/soldprice/data_models/condo_price_zillow_XGBoost.pkl',
    'models/soldprice/data_models/condo_price_zillow_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
# (Presumably one prediction column per model -- confirm in src.utils.)
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [4]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/condo_price_zillow.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9770709789938143
MAE on log scale:  0.08293147045887168
MAE on original scale:  39276.541893723486
----- Validation scores -----
R2 on log scale:  0.9358461493224051
MAE on log scale:  0.12716408270035465
MAE on original scale:  63049.58397297901


## Zillow + Redfin

In [5]:
# Zillow + Redfin feature set for both splits.
train_features, val_features = (
    X_train_dict['zillow_redfin'],
    X_val_dict['zillow_redfin'],
)

# Saved first-level condo models (ridge, XGBoost, LightGBM) to be stacked.
model_filenames = [
    'models/soldprice/data_models/condo_price_zillow-redfin_ridge.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin_XGBoost.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [6]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9872015695369737
MAE on log scale:  0.056293575670782
MAE on original scale:  26143.74749291712
----- Validation scores -----
R2 on log scale:  0.9465467733790236
MAE on log scale:  0.1139449257351419
MAE on original scale:  56211.03081559749


## Zillow + Redfin / Images

In [7]:
# --- Tabular branch: Zillow + Redfin features ---
train_features, val_features = (
    X_train_dict['zillow_redfin'],
    X_val_dict['zillow_redfin'],
)

# Saved first-level condo models for the tabular features.
model_filenames = [
    'models/soldprice/data_models/condo_price_zillow-redfin_ridge.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin_XGBoost.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

X_train_zilred = utils.generate_ensemble_features(
    model_filenames, train_features
)
X_val_zilred = utils.generate_ensemble_features(
    model_filenames, val_features
)

# --- Image branch: image features ---
# train_features / val_features are rebound to the image feature set here.
train_features, val_features = X_train_dict['img'], X_val_dict['img']

# Saved first-level condo image models (ridge, LightGBM, and an .h5 net).
img_model_filenames = [
    'models/soldprice/image_models/condo_price_img_ridge.pkl',
    'models/soldprice/image_models/condo_price_img_LGBM.pkl',
    'models/soldprice/image_models/condo_price_img_nn.h5',
]
img_ridge, img_lgbm, img_nn = img_model_filenames

X_train_img = utils.generate_ensemble_features(
    img_model_filenames, train_features
)
X_val_img = utils.generate_ensemble_features(
    img_model_filenames, val_features
)

# Place the two branches side by side. Column-wise concat aligns rows on the
# index, so both frames are assumed to share one -- TODO confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilred, X_val_img], axis=1)

In [8]:
# Second-level model: linear regression over the combined tabular + image
# ensemble features produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin_img.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.987675306965182
MAE on log scale:  0.05472775981973835
MAE on original scale:  25826.53708718056
----- Validation scores -----
R2 on log scale:  0.9487411960406639
MAE on log scale:  0.11068116930198636
MAE on original scale:  55161.96707491294


## Zillow + Remarks

In [9]:
# Zillow + remarks feature set for both splits.
train_features, val_features = (
    X_train_dict['zillow_remarks'],
    X_val_dict['zillow_remarks'],
)

# Saved first-level condo models (ridge, XGBoost, LightGBM) to be stacked.
model_filenames = [
    'models/soldprice/data_models/condo_price_zillow-remarks_ridge.pkl',
    'models/soldprice/data_models/condo_price_zillow-remarks_XGBoost.pkl',
    'models/soldprice/data_models/condo_price_zillow-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [10]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/condo_price_zillow-remarks.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9902656313300511
MAE on log scale:  0.04789157698570431
MAE on original scale:  22882.34761821519
----- Validation scores -----
R2 on log scale:  0.9417857172419306
MAE on log scale:  0.12553223439604957
MAE on original scale:  61789.05129383452


## Zillow + Redfin + Remarks

In [11]:
# Zillow + Redfin + remarks feature set for both splits.
train_features, val_features = (
    X_train_dict['zillow_redfin_remarks'],
    X_val_dict['zillow_redfin_remarks'],
)

# Saved first-level condo models (ridge, XGBoost, LightGBM) to be stacked.
model_filenames = [
    'models/soldprice/data_models/condo_price_zillow-redfin-remarks_ridge.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin-remarks_XGBoost.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [12]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin-remarks.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9894383123318782
MAE on log scale:  0.05804396351855273
MAE on original scale:  27769.000150056254
----- Validation scores -----
R2 on log scale:  0.9468740564064093
MAE on log scale:  0.11765100411973349
MAE on original scale:  57442.09678942026


## Zillow + Redfin + Remarks / Images

In [13]:
# --- Tabular branch: Zillow + Redfin + remarks features ---
train_features, val_features = (
    X_train_dict['zillow_redfin_remarks'],
    X_val_dict['zillow_redfin_remarks'],
)

# Saved first-level condo models for the tabular features.
model_filenames = [
    'models/soldprice/data_models/condo_price_zillow-redfin-remarks_ridge.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin-remarks_XGBoost.pkl',
    'models/soldprice/data_models/condo_price_zillow-redfin-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

X_train_zilredrmk = utils.generate_ensemble_features(
    model_filenames, train_features
)
X_val_zilredrmk = utils.generate_ensemble_features(
    model_filenames, val_features
)

# --- Image branch: image features ---
# train_features / val_features are rebound to the image feature set here.
train_features, val_features = X_train_dict['img'], X_val_dict['img']

# Saved first-level condo image models (ridge, LightGBM, and an .h5 net).
img_model_filenames = [
    'models/soldprice/image_models/condo_price_img_ridge.pkl',
    'models/soldprice/image_models/condo_price_img_LGBM.pkl',
    'models/soldprice/image_models/condo_price_img_nn.h5',
]
img_ridge, img_lgbm, img_nn = img_model_filenames

X_train_img = utils.generate_ensemble_features(
    img_model_filenames, train_features
)
X_val_img = utils.generate_ensemble_features(
    img_model_filenames, val_features
)

# Place the two branches side by side. Column-wise concat aligns rows on the
# index, so both frames are assumed to share one -- TODO confirm.
new_train_features = pd.concat([X_train_zilredrmk, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilredrmk, X_val_img], axis=1)

In [14]:
# Second-level model: linear regression over the combined tabular + image
# ensemble features produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/condo_price_zillow-redfin-remarks_img.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9899918233287566
MAE on log scale:  0.05631835852917399
MAE on original scale:  27492.519217898032
----- Validation scores -----
R2 on log scale:  0.9480963429251249
MAE on log scale:  0.11531666716356166
MAE on original scale:  56973.9875373285


# Multi Families

In [15]:
# Multi-family homes: same target column and split settings as the condo run.
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001
data_filename = 'data/features/MF_feats_remarks.pkl'

# Load and preprocess the multi-family feature table. This rebinds `df`,
# replacing the condo table loaded earlier in the notebook.
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# Train / validation split. The X_* dicts and y_* targets are likewise
# rebound, so every later cell works on multi-family data.
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(
    df=df,
    response_col=response_col,
    test_size=test_size,
    random_state=random_state,
)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


## Zillow

In [16]:
# Zillow-only feature set for both splits.
train_features, val_features = X_train_dict['zillow'], X_val_dict['zillow']

# Saved first-level multi-family models (ridge, XGBoost, LightGBM) to stack.
model_filenames = [
    'models/soldprice/data_models/mf_price_zillow_ridge.pkl',
    'models/soldprice/data_models/mf_price_zillow_XGBoost.pkl',
    'models/soldprice/data_models/mf_price_zillow_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [17]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/mf_price_zillow.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9027004422624682
MAE on log scale:  0.17135323267619057
MAE on original scale:  75952.14815279737
----- Validation scores -----
R2 on log scale:  0.7818259251270351
MAE on log scale:  0.23125811479508848
MAE on original scale:  102274.21503579343


## Zillow + Redfin

In [18]:
# Zillow + Redfin feature set for both splits.
train_features, val_features = (
    X_train_dict['zillow_redfin'],
    X_val_dict['zillow_redfin'],
)

# Saved first-level multi-family models (ridge, XGBoost, LightGBM) to stack.
model_filenames = [
    'models/soldprice/data_models/mf_price_zillow-redfin_ridge.pkl',
    'models/soldprice/data_models/mf_price_zillow-redfin_XGBoost.pkl',
    'models/soldprice/data_models/mf_price_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [19]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.930253658634392
MAE on log scale:  0.1456717206921096
MAE on original scale:  66506.33034253815
----- Validation scores -----
R2 on log scale:  0.8065647889361993
MAE on log scale:  0.22189878710806116
MAE on original scale:  95183.12813496303


## Zillow + Redfin / Images

In [20]:
# --- Tabular branch: Zillow + Redfin features ---
train_features, val_features = (
    X_train_dict['zillow_redfin'],
    X_val_dict['zillow_redfin'],
)

# Saved first-level multi-family models for the tabular features.
model_filenames = [
    'models/soldprice/data_models/mf_price_zillow-redfin_ridge.pkl',
    'models/soldprice/data_models/mf_price_zillow-redfin_XGBoost.pkl',
    'models/soldprice/data_models/mf_price_zillow-redfin_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

X_train_zilred = utils.generate_ensemble_features(
    model_filenames, train_features
)
X_val_zilred = utils.generate_ensemble_features(
    model_filenames, val_features
)

# --- Image branch: image features ---
# train_features / val_features are rebound to the image feature set here.
train_features, val_features = X_train_dict['img'], X_val_dict['img']

# Saved first-level multi-family image models (ridge, LightGBM, .h5 net).
img_model_filenames = [
    'models/soldprice/image_models/mf_price_img_ridge.pkl',
    'models/soldprice/image_models/mf_price_img_LGBM.pkl',
    'models/soldprice/image_models/mf_price_img_nn.h5',
]
img_ridge, img_lgbm, img_nn = img_model_filenames

X_train_img = utils.generate_ensemble_features(
    img_model_filenames, train_features
)
X_val_img = utils.generate_ensemble_features(
    img_model_filenames, val_features
)

# Place the two branches side by side. Column-wise concat aligns rows on the
# index, so both frames are assumed to share one -- TODO confirm.
new_train_features = pd.concat([X_train_zilred, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilred, X_val_img], axis=1)

In [21]:
# Second-level model: linear regression over the combined tabular + image
# ensemble features produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin_img.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9471486824675418
MAE on log scale:  0.11704951196902373
MAE on original scale:  56183.38219709972
----- Validation scores -----
R2 on log scale:  0.776545832678107
MAE on log scale:  0.23883339857670968
MAE on original scale:  110294.26905187263


## Zillow + Remarks

In [22]:
# Zillow + remarks feature set for both splits.
train_features, val_features = (
    X_train_dict['zillow_remarks'],
    X_val_dict['zillow_remarks'],
)

# Saved first-level multi-family models (ridge, XGBoost, LightGBM) to stack.
model_filenames = [
    'models/soldprice/data_models/mf_price_zillow-remarks_ridge.pkl',
    'models/soldprice/data_models/mf_price_zillow-remarks_XGBoost.pkl',
    'models/soldprice/data_models/mf_price_zillow-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [23]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/mf_price_zillow-remarks.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9564145854782279
MAE on log scale:  0.11450561702492648
MAE on original scale:  57115.794783186495
----- Validation scores -----
R2 on log scale:  0.8412633530056879
MAE on log scale:  0.2000494989388186
MAE on original scale:  95432.94113230876


## Zillow + Redfin + Remarks

In [24]:
# Zillow + Redfin + remarks feature set for both splits.
train_features, val_features = (
    X_train_dict['zillow_redfin_remarks'],
    X_val_dict['zillow_redfin_remarks'],
)

# Saved first-level multi-family models (ridge, XGBoost, LightGBM) to stack.
model_filenames = [
    'models/soldprice/data_models/mf_price_zillow-redfin-remarks_ridge.pkl',
    'models/soldprice/data_models/mf_price_zillow-redfin-remarks_XGBoost.pkl',
    'models/soldprice/data_models/mf_price_zillow-redfin-remarks_LGBM.pkl',
]
ridge, xgboost, lgbm = model_filenames

# Turn the raw features into ensemble inputs using the saved models.
new_train_features = utils.generate_ensemble_features(
    model_filenames, train_features
)
new_val_features = utils.generate_ensemble_features(
    model_filenames, val_features
)

In [25]:
# Second-level model: linear regression over the ensemble features
# produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin-remarks.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9613549877512337
MAE on log scale:  0.10687082950530435
MAE on original scale:  53881.29413289305
----- Validation scores -----
R2 on log scale:  0.8491355349702072
MAE on log scale:  0.19637866786965497
MAE on original scale:  93711.03639775746


## Zillow + Redfin + Remarks / Images

In [26]:
# --- Tabular branch: Zillow + Redfin + remarks features ---
train_features = X_train_dict['zillow_redfin_remarks']
val_features = X_val_dict['zillow_redfin_remarks']

# Saved first-level multi-family models for the tabular features.
ridge = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_ridge.pkl'
xgboost = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_XGBoost.pkl'
lgbm = 'models/soldprice/data_models/mf_price_zillow-redfin-remarks_LGBM.pkl'

model_filenames = [ridge, xgboost, lgbm]

# Named *_zilredrmk (not *_zilred) because these frames come from the
# zillow+redfin+remarks models. This matches the parallel condo cell and
# avoids overwriting the zillow+redfin frames built in the earlier
# "Zillow + Redfin / Images" cell.
X_train_zilredrmk = utils.generate_ensemble_features(model_filenames, train_features)
X_val_zilredrmk = utils.generate_ensemble_features(model_filenames, val_features)

# --- Image branch: image features ---
# train_features / val_features are rebound to the image feature set here.
train_features = X_train_dict['img']
val_features = X_val_dict['img']

# Saved first-level multi-family image models (ridge, LightGBM, .h5 net).
img_ridge = 'models/soldprice/image_models/mf_price_img_ridge.pkl'
img_lgbm = 'models/soldprice/image_models/mf_price_img_LGBM.pkl'
img_nn = 'models/soldprice/image_models/mf_price_img_nn.h5'

img_model_filenames = [img_ridge, img_lgbm, img_nn]

X_train_img = utils.generate_ensemble_features(img_model_filenames, train_features)
X_val_img = utils.generate_ensemble_features(img_model_filenames, val_features)

# Place the two branches side by side. Column-wise concat aligns rows on the
# index, so both frames are assumed to share one -- TODO confirm.
new_train_features = pd.concat([X_train_zilredrmk, X_train_img], axis=1)
new_val_features = pd.concat([X_val_zilredrmk, X_val_img], axis=1)

In [27]:
# Second-level model: linear regression over the combined tabular + image
# ensemble features produced by the preceding cell.
model = LinearRegression()
filename = 'models/soldprice/ensemble_models/mf_price_zillow-redfin-remarks_img.pkl'

# Fit the model and save it to `filename`.
utils.train_save_model(
    model,
    X=new_train_features,
    y=y_train,
    filename=filename,
)

# Load the saved model back and report training / validation scores.
utils.load_eval_model(
    filename=filename,
    X_train=new_train_features,
    X_val=new_val_features,
    y_train=y_train,
    y_val=y_val,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----- Training scores -----
R2 on log scale:  0.9668761916169364
MAE on log scale:  0.09550969021625136
MAE on original scale:  47907.29883272742
----- Validation scores -----
R2 on log scale:  0.8365160441762507
MAE on log scale:  0.20393784006537008
MAE on original scale:  98759.78435332692
